## Imports

In [1]:
!pip install peft evaluate -q
#peft: This is the name of the first library being installed. PEFT (Parameter-Efficient Fine-Tuning) is a library that enables efficient fine-tuning of large language models.

#evaluate: This is the name of the second library being installed. Evaluate is a library for evaluating the performance of machine learning models.

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
# Import functions and classes to handle datasets, such as loading prebuilt datasets or creating new ones.

from transformers import (
    AutoTokenizer,              # Automatically loads the appropriate tokenizer for a model.
    AutoConfig,                 # Retrieves model configuration details, such as architecture or parameters.
    AutoModelForSequenceClassification,  # Loads a pre-trained model for sequence classification tasks.
    TrainingArguments,          # Configures training parameters like batch size, learning rate, etc.
    Trainer,                    # High-level API for training and evaluation of transformers models.
    DataCollatorWithPadding)    # Dynamically pads sequences to the same length during batching.

from sklearn.model_selection import train_test_split
# Imports the function to split data into training and test sets.

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
# PEFT (Parameter-Efficient Fine-Tuning) tools to fine-tune large models efficiently, such as with LoRA (Low-Rank Adaptation).

import evaluate
# Library for evaluation metrics like accuracy, precision, recall, etc.

import torch
# PyTorch framework for deep learning, supporting GPU-accelerated computations.

import numpy as np
# NumPy library for numerical operations, such as arrays and mathematical computations.

import pandas as pd
# Pandas library for data manipulation and analysis, particularly useful for tabular data.

from tqdm import tqdm
# Library for creating progress bars in loops or processes.

tqdm.pandas()
# Extends Pandas operations to display progress bars when processing DataFrames or Series.


## Load Dataset

In [7]:
!kaggle datasets download deepcontractor/supreme-court-judgment-prediction
!unzip supreme-court-judgment-prediction.zip
CSV_PATH = 'justice.csv'


Dataset URL: https://www.kaggle.com/datasets/deepcontractor/supreme-court-judgment-prediction
License(s): CC0-1.0
supreme-court-judgment-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  supreme-court-judgment-prediction.zip
replace justice.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [8]:
def readFromCsv(filePath):
    """Load a CSV into a DataFrame, drop the stray index column, and preview it.

    Parameters
    ----------
    filePath : str, path-like or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The file contents without an "Unnamed: 0" column.
    """
    df = pd.read_csv(filePath)

    # "Unnamed: 0" is commonly a saved index column. errors="ignore" keeps the
    # function usable on files that lack it instead of raising KeyError, and the
    # non-inplace drop avoids mutating a frame in place.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # take a sneak peek
    display(df.head())

    return df


In [9]:
# Read the extracted CSV into the base frame used by every later cell.
org_df = readFromCsv(filePath=CSV_PATH)

# Column dtypes and non-null counts, to spot incomplete columns early.
org_df.info()

Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  3303 non-null   int64 
 1   name                3303 non-null   object
 2   href                3303 non-null   object
 3   docket              3292 non-null   object
 4   term                3303 non-null   object
 5   first_party         3302 non-null   object
 6   second_party        3302 non-null   object
 7   facts               3303 non-null   object
 8   facts_len           3303 non-null   int64 
 9   majority_vote       3303 non-null   int64 
 10  minority_vote       3303 non-null   int64 
 11  first_party_winner  3288 non-null   object
 12  decision_type       3296 non-null   object
 13  disposition         3231 non-null   object
 14  issue_area          3161 non-null   object
dtypes: int64(4), object(11)
memory usage: 387.2+ KB


## Preprocessing dataset

In [10]:
# Per-column count of missing values, taken before any rows are dropped.
org_df.isna().sum()

Unnamed: 0,0
ID,0
name,0
href,0
docket,11
term,0
first_party,1
second_party,1
facts,0
facts_len,0
majority_vote,0


In [11]:
# Build one free-text field per case by joining the descriptive columns in a fixed order.
# Zipping the columns avoids the per-row Series construction of `iterrows()` and the
# cell-by-cell `.at` writes while producing the same strings (missing values still
# render as "nan", and the double space before the issue area is kept).
# NOTE(review): majority_vote, minority_vote, decision_type and disposition describe the
# ruling itself -- confirm they are meant to be model inputs when the target is
# first_party_winner.
org_df['augmented_text'] = [
    f"{name} {first_party} {second_party} {majority_vote} to {minority_vote} "
    f"{decision_type} {disposition}  {issue_area} {facts}"
    for (
        name, first_party, second_party, majority_vote, minority_vote,
        decision_type, disposition, issue_area, facts,
    ) in zip(
        org_df['name'],
        org_df['first_party'],
        org_df['second_party'],
        org_df['majority_vote'],
        org_df['minority_vote'],
        org_df['decision_type'],
        org_df['disposition'],
        org_df['issue_area'],
        org_df['facts'],
    )
]

# Preview the first rows to confirm the new column was filled in.
org_df.head()


Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area,augmented_text
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights,"Stanley v. Illinois Peter Stanley, Sr. Illino..."
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process,Giglio v. United States John Giglio United St...
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment,Miller v. California Marvin Miller California ...


In [12]:
# as we are primarily concerned about whether first party won or lost,
# discard only those nan rows for the moment

def removeNaN(df, colName):
    """Return a new DataFrame with the rows of ``df`` whose ``colName`` value is missing removed.

    Only ``colName`` is inspected, so missing values in other columns are kept.
    The input frame is copied first and is never modified.
    """
    return df.copy().dropna(subset=[colName])

# Only the target column has to be complete, so drop just the rows that lack
# 'first_party_winner'.
cleaned_df = removeNaN(df=org_df, colName='first_party_winner')

# Remaining missing values per column; 'first_party_winner' should now be 0.
cleaned_df.isna().sum()


Unnamed: 0,0
ID,0
name,0
href,0
docket,10
term,0
first_party,1
second_party,1
facts,0
facts_len,0
majority_vote,0


In [13]:
# get features and target
def getFeatureNTarget(df):
    """Build the two-column modelling frame from ``df``.

    Keeps 'augmented_text' and 'first_party_winner', casts the latter to int,
    renames them to 'text' and 'labels' respectively, and returns the result
    with a fresh 0..n-1 index. The input frame is not modified.
    """
    selected = df[['augmented_text', 'first_party_winner']]
    return (
        selected
        # assign() returns a new frame, so neither `selected` nor `df` is written to
        .assign(first_party_winner=selected['first_party_winner'].astype(int))
        .rename(columns={'first_party_winner': 'labels', 'augmented_text': 'text'})
        .reset_index(drop=True)
    )

# Reduce the cleaned frame to the model input ('text') and target ('labels').
train_df = getFeatureNTarget(df=cleaned_df)

# Show the resulting two-column frame.
train_df


Unnamed: 0,text,labels
0,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...,1
1,"Stanley v. Illinois Peter Stanley, Sr. Illino...",1
2,Giglio v. United States John Giglio United St...,1
3,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...,1
4,Miller v. California Marvin Miller California ...,1
...,...,...
3283,United States v. Palomar-Santiago United State...,1
3284,Terry v. United States Tarahrick Terry United ...,0
3285,United States v. Cooley United States Joshua J...,1
3286,Florida v. Georgia Florida Georgia 9 to 0 majo...,0


In [14]:
# now we have the base version of our train dataset with basic feature and its target
# here on, we will apply preprocessing if and where required

def preprocess_text(text):
    """Strip HTML paragraph tags from ``text``.

    Both the opening ``<p>`` and the closing ``</p>`` are removed; all other
    characters, including any other markup, are returned unchanged.

    Parameters
    ----------
    text : str
        Raw text that may contain paragraph tags.

    Returns
    -------
    str
        The text without ``<p>`` / ``</p>``.
    """
    # Removing only '<p>' would leave every closing '</p>' behind in the text.
    return text.replace('<p>', '').replace('</p>', '')

# apply preprocess on train_df
def apply_preprocess(df, colName):
    """Return a copy of ``df`` whose ``colName`` column was passed through ``preprocess_text``.

    The work is done on a copy so the caller's frame stays as it was.
    ``progress_apply`` is the tqdm-enabled variant of ``apply`` and is available
    because ``tqdm.pandas()`` is called in the imports cell.
    """
    result = df.copy()
    cleaned_column = result[colName].progress_apply(preprocess_text)
    result[colName] = cleaned_column
    return result

# Clean the 'text' column of the modelling frame.
processed_df = apply_preprocess(df=train_df, colName='text')

# Show the frame with the cleaned text.
processed_df


100%|██████████| 3288/3288 [00:00<00:00, 265102.01it/s]


Unnamed: 0,text,labels
0,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...,1
1,"Stanley v. Illinois Peter Stanley, Sr. Illino...",1
2,Giglio v. United States John Giglio United St...,1
3,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...,1
4,Miller v. California Marvin Miller California ...,1
...,...,...
3283,United States v. Palomar-Santiago United State...,1
3284,Terry v. United States Tarahrick Terry United ...,0
3285,United States v. Cooley United States Joshua J...,1
3286,Florida v. Georgia Florida Georgia 9 to 0 majo...,0


## Validation split

In [15]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split
# repeatable between runs.
df_train, df_test = train_test_split(processed_df, test_size=0.2, random_state=42)

# Wrap each pandas split as a Hugging Face Dataset and keep only the model input
# ('text') and the target ('labels').
train_dataset = Dataset.from_pandas(df_train).select_columns(['text', 'labels'])
validation_dataset = Dataset.from_pandas(df_test).select_columns(['text', 'labels'])

# Bundle both splits under the conventional 'train' / 'validation' keys.
dataset = DatasetDict({'train': train_dataset, 'validation': validation_dataset})

# Display the split names, features and row counts.
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2630
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 658
    })
})

## Classification Approach: TF-IDF

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Imports the Multinomial Naive Bayes classifier, often used for text classification and discrete data.

from sklearn.linear_model import LogisticRegression
# Imports the Logistic Regression model, a popular linear model for binary and multi-class classification.

from sklearn.svm import LinearSVC
# Imports the Linear Support Vector Classifier, suitable for high-dimensional feature spaces, such as text data.

from sklearn.ensemble import RandomForestClassifier
# Imports the Random Forest classifier, an ensemble method using multiple decision trees for robust classification.

from xgboost import XGBClassifier
# Imports the XGBoost classifier, a gradient boosting method known for high performance on structured data.

from sklearn.feature_extraction.text import TfidfVectorizer
# Imports the TfidfVectorizer, which transforms text data into TF-IDF feature vectors for model training.

from sklearn.metrics import accuracy_score, classification_report
# Imports `accuracy_score` to measure the percentage of correct predictions.
# Imports `classification_report` to generate a detailed report of precision, recall, and F1-score for each class.


In [17]:
# Lower-case the text of each split and pull it out as a NumPy array for vectorization.
X_train = df_train['text'].str.lower().values
y_train = df_train['labels']

X_test = df_test['text'].str.lower().values
y_test = df_test['labels']

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test
# text so both matrices share the same feature space.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the classifier
# - `use_label_encoder` is no longer passed: the installed XGBoost reports it as
#   "not used", so it only produced a warning.
# - `eval_metric='logloss'`: the labels are binary (0/1), so the binary log loss is
#   used instead of the multi-class 'mlogloss'.
classifier = XGBClassifier(eval_metric='logloss')
classifier.fit(X_train_tfidf, y_train)

# Make predictions and evaluate
y_pred = classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)

# Accuracy is printed as a fraction between 0 and 1, with two decimal places.
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall, F1-score and support.
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.94      0.73      0.83       252
           1       0.85      0.97      0.91       406

    accuracy                           0.88       658
   macro avg       0.90      0.85      0.87       658
weighted avg       0.89      0.88      0.88       658



## Prediction

In [18]:
# Define list of examples: five held-out rows (positions 5-9) and their true labels.
# `.iloc` makes the positional slicing explicit, since df_test keeps the shuffled
# index labels from the split.
text_list = df_test['text'].iloc[5:10].tolist()
actual_winner = df_test['labels'].iloc[5:10].tolist()

print("Trained model predictions:")
print("----------------------------")

# Pair every text with its label by position. Looking the label up with
# text_list.index(text) would return the first match, i.e. the wrong label whenever
# two examples have identical text, and rescans the list on every iteration.
for text, actual in zip(text_list, actual_winner):
    # Turn the raw text into TF-IDF features with the already-fitted vectorizer.
    inputs = tfidf_vectorizer.transform([text])

    # Predict the label using the trained classifier
    prediction = classifier.predict(inputs)

    print(f"Predicted: {prediction[0]} - Actual Result: {actual}")


Trained model predictions:
----------------------------
Predicted: 1 - Actual Result: 0
Predicted: 1 - Actual Result: 1
Predicted: 1 - Actual Result: 1
Predicted: 1 - Actual Result: 1
Predicted: 0 - Actual Result: 0
