##Import

In [93]:
!pip install peft evaluate -q
#peft: This is the name of the first library being installed. PEFT (Parameter-Efficient Fine-Tuning) is a library that enables efficient fine-tuning of large language models.

#evaluate: This is the name of the second library being installed. Evaluate is a library for evaluating the performance of machine learning models.

In [94]:
from datasets import load_dataset, Dataset, DatasetDict
# Import functions and classes to handle datasets, such as loading prebuilt datasets or creating new ones.

from transformers import (
    AutoTokenizer,              # Automatically loads the appropriate tokenizer for a model.
    AutoConfig,                 # Retrieves model configuration details, such as architecture or parameters.
    AutoModelForSequenceClassification,  # Loads a pre-trained model for sequence classification tasks.
    TrainingArguments,          # Configures training parameters like batch size, learning rate, etc.
    Trainer,                    # High-level API for training and evaluation of transformers models.
    DataCollatorWithPadding)    # Dynamically pads sequences to the same length during batching.

from sklearn.model_selection import train_test_split
# Imports the function to split data into training and test sets.

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, prepare_model_for_kbit_training
# PEFT (Parameter-Efficient Fine-Tuning) tools to fine-tune large models efficiently, such as with LoRA (Low-Rank Adaptation).

import evaluate
# Library for evaluation metrics like accuracy, precision, recall, etc.

import torch
# PyTorch framework for deep learning, supporting GPU-accelerated computations.

import numpy as np
# NumPy library for numerical operations, such as arrays and mathematical computations.

import pandas as pd
# Pandas library for data manipulation and analysis, particularly useful for tabular data.

from tqdm import tqdm
# Library for creating progress bars in loops or processes.

tqdm.pandas()
# Extends Pandas operations to display progress bars when processing DataFrames or Series.


##Load dataset

In [95]:
!kaggle datasets download deepcontractor/supreme-court-judgment-prediction
!unzip supreme-court-judgment-prediction.zip
CSV_PATH = 'justice.csv'


Dataset URL: https://www.kaggle.com/datasets/deepcontractor/supreme-court-judgment-prediction
License(s): CC0-1.0
supreme-court-judgment-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  supreme-court-judgment-prediction.zip
replace justice.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [96]:
def readFromCsv(filePath):
    # Defines a function to read data from a CSV file and preprocess it.

    df = pd.read_csv(filePath)
    # Reads the CSV file at the specified file path into a Pandas DataFrame.

    # clean Unnamed col
    df.drop(columns=["Unnamed: 0"], inplace=True)
    # Removes the column named "Unnamed: 0" from the DataFrame (commonly an index column from saving).

    # take a sneak peek
    display(df.head())
    # Displays the first few rows of the DataFrame for a quick preview of the data.

    return df
    # Returns the cleaned DataFrame to the caller.


In [97]:
org_df = readFromCsv(CSV_PATH)
org_df.info()

Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  3303 non-null   int64 
 1   name                3303 non-null   object
 2   href                3303 non-null   object
 3   docket              3292 non-null   object
 4   term                3303 non-null   object
 5   first_party         3302 non-null   object
 6   second_party        3302 non-null   object
 7   facts               3303 non-null   object
 8   facts_len           3303 non-null   int64 
 9   majority_vote       3303 non-null   int64 
 10  minority_vote       3303 non-null   int64 
 11  first_party_winner  3288 non-null   object
 12  decision_type       3296 non-null   object
 13  disposition         3231 non-null   object
 14  issue_area          3161 non-null   object
dtypes: int64(4), object(11)
memory usage: 387.2+ KB


##preprocessing dataset

In [98]:
# check for null before removing
org_df.isnull().sum()

Unnamed: 0,0
ID,0
name,0
href,0
docket,11
term,0
first_party,1
second_party,1
facts,0
facts_len,0
majority_vote,0


In [99]:
org_df['augmented_text'] = ""
# Creates a new column in the DataFrame named 'augmented_text' and initializes it with empty strings.

for idx, row in org_df.iterrows():
    # Iterates through each row in the DataFrame using the index (`idx`) and row data (`row`).

    org_df.at[idx, 'augmented_text'] = f"{row['name']} {row['first_party']} {row['second_party']} {row['majority_vote']} to {row['minority_vote']} {row['decision_type']} {row['disposition']}  {row['issue_area']} {row['facts']}"
    # Populates the 'augmented_text' column with a formatted string that concatenates various column values,
    # providing a combined textual representation of the case information.

org_df.head()
# Displays the first few rows of the updated DataFrame to verify the changes.


Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area,augmented_text
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights,"Stanley v. Illinois Peter Stanley, Sr. Illino..."
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process,Giglio v. United States John Giglio United St...
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment,Miller v. California Marvin Miller California ...


In [100]:
# as we are primarily concerned about whether first party won or lost,
# discard only those nan rows for the moment

def removeNaN(df, colName):
    # Defines a function to remove rows with NaN (missing) values in a specific column.

    df = df.copy()
    # Creates a copy of the input DataFrame to avoid modifying the original data.

    df = df.dropna(subset=[colName])
    # Drops rows where the specified column (`colName`) has NaN values.

    return df
    # Returns the cleaned DataFrame without NaN rows in the specified column.

cleaned_df = removeNaN(org_df, 'first_party_winner')
# Calls the function to remove rows with NaN values in the 'first_party_winner' column from `org_df`.

cleaned_df.isnull().sum()
# Counts and displays the number of remaining NaN values for each column in the cleaned DataFrame,
# verifying that 'first_party_winner' no longer contains NaNs.


Unnamed: 0,0
ID,0
name,0
href,0
docket,10
term,0
first_party,1
second_party,1
facts,0
facts_len,0
majority_vote,0


In [101]:
# get features and target
def getFeatureNTarget(df):
    # Defines a function to extract and preprocess features and the target variable from the DataFrame.

    df = df.copy()
    # Creates a copy of the input DataFrame to avoid modifying the original data.

    df = df[['augmented_text', 'first_party_winner']]
    # Selects only the 'augmented_text' and 'first_party_winner' columns for further processing.

    df['first_party_winner'] = df['first_party_winner'].astype(int)
    # Converts the 'first_party_winner' column to integer type (e.g., from boolean or float).

    # rename facts to text and first_party_winner to label
    df = df.rename(columns={'first_party_winner': 'labels', 'augmented_text': 'text'})
    # Renames 'first_party_winner' to 'labels' (for the target variable) and 'augmented_text' to 'text' (for features).

    return df.reset_index(drop=True)
    # Resets the index of the DataFrame and drops the old index to maintain a clean sequential order, then returns it.

train_df = getFeatureNTarget(cleaned_df)
# Calls the function to preprocess the cleaned DataFrame and extract the features ('text') and labels ('labels').

train_df
# Displays the resulting processed DataFrame, which includes columns 'text' (features) and 'labels' (target).


Unnamed: 0,text,labels
0,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...,1
1,"Stanley v. Illinois Peter Stanley, Sr. Illino...",1
2,Giglio v. United States John Giglio United St...,1
3,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...,1
4,Miller v. California Marvin Miller California ...,1
...,...,...
3283,United States v. Palomar-Santiago United State...,1
3284,Terry v. United States Tarahrick Terry United ...,0
3285,United States v. Cooley United States Joshua J...,1
3286,Florida v. Georgia Florida Georgia 9 to 0 majo...,0


In [102]:
# now we have the base version of our train dataset with basic feature and its target
# here on, we will apply preprocessing if and where required

def preprocess_text(text):
    # Defines a function to preprocess text by removing unnecessary HTML tags.

    # remove <p> tag
    text = text.replace('<p>', '')
    # Removes occurrences of the HTML `<p>` tag from the input text.

    return text
    # Returns the cleaned text.

# apply preprocess on train_df
def apply_preprocess(df, colName):
    # Defines a function to apply text preprocessing to a specified column in the DataFrame.

    df = df.copy()
    # Creates a copy of the input DataFrame to avoid altering the original data.

    # reason why I'm doing a copy with each utility function is that I don't wanna alter original data frame
    df[colName] = df[colName].progress_apply(preprocess_text)
    # Applies the `preprocess_text` function to every row in the specified column (`colName`)
    # and shows a progress bar using `tqdm`.

    return df
    # Returns the DataFrame with the preprocessed column.

processed_df = apply_preprocess(train_df, 'text')
# Calls the function to preprocess the 'text' column of `train_df` using `preprocess_text`.

processed_df
# Displays the DataFrame after applying the preprocessing, where the 'text' column is cleaned.


100%|██████████| 3288/3288 [00:00<00:00, 109362.83it/s]


Unnamed: 0,text,labels
0,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...,1
1,"Stanley v. Illinois Peter Stanley, Sr. Illino...",1
2,Giglio v. United States John Giglio United St...,1
3,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...,1
4,Miller v. California Marvin Miller California ...,1
...,...,...
3283,United States v. Palomar-Santiago United State...,1
3284,Terry v. United States Tarahrick Terry United ...,0
3285,United States v. Cooley United States Joshua J...,1
3286,Florida v. Georgia Florida Georgia 9 to 0 majo...,0


## validation split

In [103]:
# Split the dataset into training and testing sets
df_train, df_test = train_test_split(processed_df, test_size=0.2, random_state=42)
# Splits the processed DataFrame into training (80%) and testing (20%) sets.
# `random_state=42` ensures reproducibility of the split.

# Convert the dataframes to Hugging Face Datasets
train_dataset = Dataset.from_pandas(df_train)
# Converts the training DataFrame (`df_train`) into a Hugging Face Dataset object.

validation_dataset = Dataset.from_pandas(df_test)
# Converts the testing DataFrame (`df_test`) into a Hugging Face Dataset object.

# pick only feature and target column
train_dataset = train_dataset.select_columns(['text', 'labels'])
# Retains only the 'text' (features) and 'labels' (target) columns in the training dataset.

validation_dataset = validation_dataset.select_columns(['text', 'labels'])
# Retains only the 'text' and 'labels' columns in the validation dataset.

dataset = DatasetDict({
    'train': train_dataset,          # Assigns the training dataset to the 'train' key.
    'validation': validation_dataset # Assigns the validation dataset to the 'validation' key.
})
# Combines the training and validation datasets into a `DatasetDict` object for easier handling.

dataset
# Displays the resulting `DatasetDict`, showing the structure and content of the train and validation datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2630
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 658
    })
})

##Tokenize

In [104]:
# Define the base model name from Hugging Face Model Hub
model_name = "microsoft/deberta-v3-small"  # Using this base model for binary classification due to its small size, suitable for this machine

# Define label mappings for case analysis classification
# Map numeric IDs to human-readable labels
id2label = {0: "First Party Loses", 1: "First Party Wins"}  # The ID 0 corresponds to "First Party Loses" and ID 1 corresponds to "First Party Wins"
# Map human-readable labels to numeric IDs
label2id = {"First Party Loses": 0, "First Party Wins": 1}  # The label "First Party Loses" maps to ID 0 and "First Party Wins" maps to ID 1

# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,  # Load the model by its name from the Hugging Face Model Hub
    num_labels=2,  # The model is for binary classification (First Party Wins or Loses), so we set num_labels to 2
    # id2label=id2label,  # Optionally pass the id2label map (currently commented out)
    # label2id=label2id   # Optionally pass the label2id map (currently commented out)
)

# enable gradient check pointing
model.gradient_checkpointing_enable()

# enable quantized training
model = prepare_model_for_kbit_training(model)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [105]:
# Load the tokenizer from the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)  # This loads the tokenizer corresponding to the DeBERTa model from Hugging Face

# Function to tokenize dataset rows
def tokenize_function(examples):
    text = examples['text']  # Extract the 'text' column from the dataset

    # Tokenize and truncate the input text to a maximum length of 512 tokens
    # Truncation ensures that texts longer than 512 tokens are shortened from the left
    # Padding ensures that shorter texts are padded to a length of 512
    tokenized_inputs = tokenizer(text,
                                 return_tensors="np",  # Return tokenized output as numpy arrays
                                 max_length=512,       # Set the maximum length of input sequence to 512
                                 truncation=True,      # Truncate longer texts
                                 padding='max_length') # Pad shorter texts to the max length

    # Optionally, if padding token is missing, you can add it to the tokenizer and resize the model's token embeddings
    # This step is commented out as it's not always necessary
    # if tokenizer.pad_token is None:
    #     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    #     model.resize_token_embeddings(len(tokenizer))  # Resize the model's token embeddings accordingly (requires model object)

    return tokenized_inputs  # Return the tokenized inputs

# Tokenize the dataset using the map function, assuming the dataset is in a format compatible with .map() (e.g., Hugging Face datasets)
tokenized_dataset = dataset.map(tokenize_function, batched=True)  # Apply the tokenize function to the entire dataset in batches

# Use a collator to dynamically pad sequences during batching, ensuring that all sequences in a batch are padded to the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # This collator will handle padding dynamically during training

# Output the tokenized dataset
tokenized_dataset  # This will display the tokenized dataset, ready for use in training or evaluation




Map:   0%|          | 0/2630 [00:00<?, ? examples/s]

Map:   0%|          | 0/658 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2630
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 658
    })
})

##Evaluation metrics

In [106]:
# to import the performance of the model during training
# import accuracy evaluation metrics
accuracy = evaluate.load("accuracy")  # Loads the 'accuracy' metric from the Hugging Face `evaluate` library

# packaging accuracy metrics as a function, one for first party losses and first party wins class,
# whichever is larger will become model prediction.
# define an evaluation function to pass into trainer later
def compute_metrics(eval_pred):
    predictions, labels = eval_pred  # Unpack the eval_pred tuple into predictions and labels; predictions are logits, labels are the true values
    predictions = np.argmax(predictions, axis=1)  # Convert the logits to the predicted class by selecting the class with the highest logit value
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}  # Compute accuracy by comparing predicted labels with true labels and return it as a dictionary


##Test before training model

In [107]:
# define list of examples
text_list = df_test['text'][5:10].tolist()
actual_winner = df_test['labels'][5:10].tolist()
print("Untrained model predictions:")
print("----------------------------")

for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(id2label[predictions.tolist()]
          + " - Actual Result: " + id2label[actual_winner[text_list.index(text)]])

Untrained model predictions:
----------------------------
First Party Wins - Actual Result: First Party Loses
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Loses


##Fine Tune using LoRA

In [108]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", # sequence classification
    inference_mode=False,
    r=4, #intrinsic rank of trainable weight matrix
    lora_alpha=32, # learning rate
    lora_dropout=0.01, # probability of drop out, randomly 0 internal parameters during training
    target_modules = ["query_proj"] #, "value_proj"] # to see which modules to target, just print the layers
)  # apply lora to query layer

In [109]:
model = get_peft_model(model, peft_config) # get actual model and update it using the configuration of lora that we provided in previous step
model.print_trainable_parameters() # to see how much percentage of total parameters we actually need to model, as seen in result only 0.93% of the model will be trained, huge cost savings.

trainable params: 38,402 || all params: 141,934,852 || trainable%: 0.0271


In [110]:
# hyperparameters
lr = 1e-3 # size of optimization step
batch_size = 4 # number of rows in dataset processed per optimization step
num_epochs = 10 #number of times model runs through training data

In [111]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_name + "-qlora-text-classification", # defining where model to be saved
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch", # per epoch evaluate the model parameters
    save_strategy="epoch", # per epoch save the model parameters
    load_best_model_at_end=True, # at end return best version of the model
    fp16=True,  # Enable mixed precision training
)

In [112]:
# Create a Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3196,0.262444,{'accuracy': 0.958966565349544}
2,0.2108,0.286519,{'accuracy': 0.958966565349544}
3,0.1946,0.246155,{'accuracy': 0.958966565349544}
4,0.1622,0.258958,{'accuracy': 0.9574468085106383}
5,0.2107,0.267396,{'accuracy': 0.9574468085106383}
6,0.1489,0.269324,{'accuracy': 0.9604863221884499}
7,0.1769,0.2857,{'accuracy': 0.9574468085106383}
8,0.1667,0.272971,{'accuracy': 0.9604863221884499}
9,0.1683,0.280713,{'accuracy': 0.958966565349544}
10,0.1618,0.277363,{'accuracy': 0.958966565349544}


Trainer is attempting to log a value of "{'accuracy': 0.958966565349544}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.958966565349544}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.958966565349544}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9574468085106383}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9574468085106383}" of t

TrainOutput(global_step=6580, training_loss=0.18684608334828293, metrics={'train_runtime': 1380.9228, 'train_samples_per_second': 19.045, 'train_steps_per_second': 4.765, 'total_flos': 3487119319449600.0, 'train_loss': 0.18684608334828293, 'epoch': 10.0})

In [113]:
# evaluate
trainer.evaluate()

Trainer is attempting to log a value of "{'accuracy': 0.958966565349544}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.24615496397018433,
 'eval_accuracy': {'accuracy': 0.958966565349544},
 'eval_runtime': 8.491,
 'eval_samples_per_second': 77.494,
 'eval_steps_per_second': 19.432,
 'epoch': 10.0}

##Test after training model

In [114]:
# define list of examples
text_list = df_test['text'][5:10].tolist()
actual_winner = df_test['labels'][5:10].tolist()

print("Trained model predictions:")
print("----------------------------")

for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(id2label[predictions.tolist()]
          + " - Actual Result: " + id2label[actual_winner[text_list.index(text)]])

Trained model predictions:
----------------------------
First Party Loses - Actual Result: First Party Loses
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Loses - Actual Result: First Party Loses
