##Imports

In [1]:
!pip install peft evaluate -q
#peft: This is the name of the first library being installed. PEFT (Parameter-Efficient Fine-Tuning) is a library that enables efficient fine-tuning of large language models.

#evaluate: This is the name of the second library being installed. Evaluate is a library for evaluating the performance of machine learning models.

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
# Import functions and classes to handle datasets, such as loading prebuilt datasets or creating new ones.

from transformers import (
    AutoTokenizer,              # Automatically loads the appropriate tokenizer for a model.
    AutoConfig,                 # Retrieves model configuration details, such as architecture or parameters.
    AutoModelForSequenceClassification,  # Loads a pre-trained model for sequence classification tasks.
    TrainingArguments,          # Configures training parameters like batch size, learning rate, etc.
    Trainer,                    # High-level API for training and evaluation of transformers models.
    DataCollatorWithPadding)    # Dynamically pads sequences to the same length during batching.

from sklearn.model_selection import train_test_split
# Imports the function to split data into training and test sets.

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
# PEFT (Parameter-Efficient Fine-Tuning) tools to fine-tune large models efficiently, such as with LoRA (Low-Rank Adaptation).

import evaluate
# Library for evaluation metrics like accuracy, precision, recall, etc.

import torch
# PyTorch framework for deep learning, supporting GPU-accelerated computations.

import numpy as np
# NumPy library for numerical operations, such as arrays and mathematical computations.

import pandas as pd
# Pandas library for data manipulation and analysis, particularly useful for tabular data.

from tqdm import tqdm
# Library for creating progress bars in loops or processes.

tqdm.pandas()
# Extends Pandas operations to display progress bars when processing DataFrames or Series.


##Load Dataset

In [3]:
!kaggle datasets download deepcontractor/supreme-court-judgment-prediction
!unzip supreme-court-judgment-prediction.zip
CSV_PATH = 'justice.csv'


Dataset URL: https://www.kaggle.com/datasets/deepcontractor/supreme-court-judgment-prediction
License(s): CC0-1.0
Downloading supreme-court-judgment-prediction.zip to /content
  0% 0.00/1.33M [00:00<?, ?B/s]
100% 1.33M/1.33M [00:00<00:00, 84.3MB/s]
Archive:  supreme-court-judgment-prediction.zip
  inflating: justice.csv             


In [4]:
def readFromCsv(filePath):
    # Defines a function to read data from a CSV file and preprocess it.

    df = pd.read_csv(filePath)
    # Reads the CSV file at the specified file path into a Pandas DataFrame.

    # clean Unnamed col
    df.drop(columns=["Unnamed: 0"], inplace=True)
    # Removes the column named "Unnamed: 0" from the DataFrame (commonly an index column from saving).

    # take a sneak peek
    display(df.head())
    # Displays the first few rows of the DataFrame for a quick preview of the data.

    return df
    # Returns the cleaned DataFrame to the caller.


In [5]:
org_df = readFromCsv(CSV_PATH)
org_df.info()

Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  3303 non-null   int64 
 1   name                3303 non-null   object
 2   href                3303 non-null   object
 3   docket              3292 non-null   object
 4   term                3303 non-null   object
 5   first_party         3302 non-null   object
 6   second_party        3302 non-null   object
 7   facts               3303 non-null   object
 8   facts_len           3303 non-null   int64 
 9   majority_vote       3303 non-null   int64 
 10  minority_vote       3303 non-null   int64 
 11  first_party_winner  3288 non-null   object
 12  decision_type       3296 non-null   object
 13  disposition         3231 non-null   object
 14  issue_area          3161 non-null   object
dtypes: int64(4), object(11)
memory usage: 387.2+ KB


In [8]:
# Preprocess the data
# just keep facts and first_party_winner

#drop all rows with na
df = org_df.dropna()
df = df[['facts', 'first_party_winner']]
df['first_party_winner'] = df['first_party_winner'].astype(int)

#remname facts to text and first_party_winner to label
df = df.rename(columns={'first_party_winner': 'label', 'facts': 'text'})

# remove the p tag from the text
df['text'] = df['text'].str.replace('<p>', '')

print(df)

                                                   text  label
1     Joan Stanley had three children with Peter Sta...      1
2     John Giglio was convicted of passing forged mo...      1
3     The Idaho Probate Code specified that "males m...      1
4     Miller, after conducting a mass mailing campai...      1
5     Ernest E. Mandel was a Belgian professional jo...      1
...                                                 ...    ...
3297  For over a century after the Alaska Purchase i...      1
3298  Refugio Palomar-Santiago, a Mexican national, ...      1
3299  Tarahrick Terry pleaded guilty to one count of...      0
3300  Joshua James Cooley was parked in his pickup t...      1
3302  The Natural Gas Act (NGA), 15 U.S.C. §§ 717–71...      1

[3098 rows x 2 columns]


##validation split

In [11]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the dataframes to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.select_columns(['label', 'text'])
validation_dataset = validation_dataset.select_columns(['label', 'text'])

# Remove the index column if it exists
if '__index_level_0__' in train_dataset.features:
    train_dataset = train_dataset.remove_columns(['__index_level_0__'])
if '__index_level_0__' in validation_dataset.features:
    validation_dataset = validation_dataset.remove_columns(['__index_level_0__'])


#print(train_dataset)
#print(validation_dataset)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset
})
dataset



DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 2478
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 620
    })
})

##Classification Approach: TF-IDF

In [12]:
from sklearn.naive_bayes import MultinomialNB
# Imports the Multinomial Naive Bayes classifier, often used for text classification and discrete data.

from sklearn.linear_model import LogisticRegression
# Imports the Logistic Regression model, a popular linear model for binary and multi-class classification.

from sklearn.svm import LinearSVC
# Imports the Linear Support Vector Classifier, suitable for high-dimensional feature spaces, such as text data.

from sklearn.ensemble import RandomForestClassifier
# Imports the Random Forest classifier, an ensemble method using multiple decision trees for robust classification.

from xgboost import XGBClassifier
# Imports the XGBoost classifier, a gradient boosting method known for high performance on structured data.

from sklearn.feature_extraction.text import TfidfVectorizer
# Imports the TfidfVectorizer, which transforms text data into TF-IDF feature vectors for model training.

from sklearn.metrics import accuracy_score, classification_report
# Imports `accuracy_score` to measure the percentage of correct predictions.
# Imports `classification_report` to generate a detailed report of precision, recall, and F1-score for each class.


In [15]:
X_train = train_df['text'].str.lower().values
# Converts the 'text' column in the training DataFrame to lowercase to ensure uniformity,
# then extracts the values as a NumPy array for vectorization.

y_train = train_df['label']
# Extracts the target labels from the training DataFrame.

X_test = test_df['text'].str.lower().values
# Converts the 'text' column in the test DataFrame to lowercase for uniformity,
# then extracts the values as a NumPy array for vectorization.

y_test = test_df['label']
# Extracts the target labels from the test DataFrame.

tfidf_vectorizer = TfidfVectorizer()
# Initializes a TF-IDF Vectorizer to transform the text into numerical feature vectors.

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Fits the TF-IDF Vectorizer to the training text data and transforms it into a sparse matrix of features.

X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Transforms the test text data into the same feature space as the training data,
# ensuring consistency for model predictions.

# Train the classifier
classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
# Initializes an XGBoost classifier with specific settings:
# - `use_label_encoder=False`: Avoids unnecessary warnings about label encoding.
# - `eval_metric='mlogloss'`: Sets the evaluation metric to multi-class log loss.

classifier.fit(X_train_tfidf, y_train)
# Trains the XGBoost classifier using the TF-IDF feature matrix and the training labels.

# Make predictions and evaluate
y_pred = classifier.predict(X_test_tfidf)
# Uses the trained classifier to predict labels for the test dataset.

accuracy = accuracy_score(y_test, y_pred)
# Calculates the accuracy score by comparing the predicted labels (`y_pred`) with the actual labels (`y_test`).

print(f'Accuracy: {accuracy:.2f}')
# Prints the accuracy of the model as a percentage with two decimal places.

print(classification_report(y_test, y_pred))
# Generates and prints a detailed classification report showing precision, recall, F1-score, and support for each class.


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.65
              precision    recall  f1-score   support

           0       0.41      0.18      0.25       202
           1       0.69      0.88      0.77       418

    accuracy                           0.65       620
   macro avg       0.55      0.53      0.51       620
weighted avg       0.60      0.65      0.60       620



##Prediction

In [17]:
# Define list of examples
text_list = test_df['text'][5:10].tolist()
# Extracts a slice of text data from the test dataset (rows 5 to 9) and converts it to a list.

actual_winner = test_df['label'][5:10].tolist()
# Extracts the actual labels (the true winners) for the selected text rows.

print("Trained model predictions:")
print("----------------------------")

# Loop through each example text in the list
for text in text_list:
    # Apply the pre-trained tokenizer to the text to convert it into input tokens
    inputs = tfidf_vectorizer.transform([text])
    # Transform the input text into TF-IDF features using the previously fitted vectorizer.

    # Predict the label using the trained classifier
    prediction = classifier.predict(inputs)
    # Use the trained classifier to predict the label for the transformed text.

    print(f"Predicted: {prediction[0]} - Actual Result: {actual_winner[text_list.index(text)]}")
    # Display the predicted label and the actual label for comparison.


Trained model predictions:
----------------------------
Predicted: 1 - Actual Result: 1
Predicted: 1 - Actual Result: 0
Predicted: 0 - Actual Result: 1
Predicted: 0 - Actual Result: 1
Predicted: 1 - Actual Result: 1
