Load dataset

In [None]:
!kaggle datasets download deepcontractor/supreme-court-judgment-prediction
!unzip supreme-court-judgment-prediction.zip

import pandas as pd

df = pd.read_csv('justice.csv')

print(df)


Dataset URL: https://www.kaggle.com/datasets/deepcontractor/supreme-court-judgment-prediction
License(s): CC0-1.0
Downloading supreme-court-judgment-prediction.zip to /content
  0% 0.00/1.33M [00:00<?, ?B/s]
100% 1.33M/1.33M [00:00<00:00, 31.9MB/s]
Archive:  supreme-court-judgment-prediction.zip
  inflating: justice.csv             
      Unnamed: 0     ID                                 name  \
0              0  50606                          Roe v. Wade   
1              1  50613                  Stanley v. Illinois   
2              2  50623              Giglio v. United States   
3              3  50632                         Reed v. Reed   
4              4  50643                 Miller v. California   
...          ...    ...                                  ...   
3298        3298  63324    United States v. Palomar-Santiago   
3299        3299  63323               Terry v. United States   
3300        3300  63331              United States v. Cooley   
3301        3301  63332  

Preprocess dataset

In [None]:
# Preprocess the data: keep only the case facts (model input) and first_party_winner (target).
# NOTE: this cell rebinds `df`, so it must be run exactly once after the load cell.

# Drop every row that has a missing value in any column.
df = df.dropna()
# Take an explicit copy of the two needed columns so the assignments below write to an
# independent frame instead of a slice of the raw data (avoids SettingWithCopyWarning).
df = df[['facts', 'first_party_winner']].copy()
df['first_party_winner'] = df['first_party_winner'].astype(int)

# Rename facts to text and first_party_winner to label.
df = df.rename(columns={'first_party_winner': 'label', 'facts': 'text'})

# Remove the HTML paragraph tags from the text: the opening <p> and, if present, the closing </p>.
df['text'] = df['text'].str.replace(r'</?p>', '', regex=True)

print(df)

                                                   text  label
1     Joan Stanley had three children with Peter Sta...      1
2     John Giglio was convicted of passing forged mo...      1
3     The Idaho Probate Code specified that "males m...      1
4     Miller, after conducting a mass mailing campai...      1
5     Ernest E. Mandel was a Belgian professional jo...      1
...                                                 ...    ...
3297  For over a century after the Alaska Purchase i...      1
3298  Refugio Palomar-Santiago, a Mexican national, ...      1
3299  Tarahrick Terry pleaded guilty to one count of...      0
3300  Joshua James Cooley was parked in his pickup t...      1
3302  The Natural Gas Act (NGA), 15 U.S.C. §§ 717–71...      1

[3098 rows x 2 columns]



Use a lightweight language model to predict which party will win, based on records from the actual dataset

imports for supervised fine tuning

In [None]:
!pip install datasets # install the datasets library
!pip install peft # install the peft library
!pip install evaluate # install the evaluate library

!pip install bitsandbytes # install the bitsandbytes library
!pip install accelerate # install the accelerate library

import bitsandbytes as bnb

from datasets import load_dataset,  Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, prepare_model_for_kbit_training

import evaluate
import torch
import numpy as np

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

defining model to use or fine tune

In [None]:
model_checkpoint = "microsoft/deberta-v3-small" # base checkpoint for binary classification; chosen because it is small enough to run on this machine

# Goal: fine-tune this checkpoint to classify the input case text, so we need a label map for "First Party Wins" and "First Party Loses".

# define label maps (class id -> name and name -> class id); both are passed to from_pretrained below
id2label = {0: "First Party Loses", 1: "First Party Wins"}
label2id = {"First Party Loses": 0, "First Party Wins": 1}

# load model_checkpoint with a sequence-classification head for the two labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id)

# enable gradient checkpointing
model.gradient_checkpointing_enable()

# prepare the model for k-bit (quantized) training via peft
# NOTE(review): the model above is loaded without any quantization config, so what follows is
# LoRA on a non-quantized model rather than QLoRA -- confirm whether 4/8-bit loading was intended.
model = prepare_model_for_kbit_training(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Split the dataset into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert each split to a Hugging Face Dataset.
# preserve_index=False keeps the pandas index from being exported as an extra
# '__index_level_0__' column, so no separate clean-up step is needed afterwards.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
validation_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Fix the column set (and order) that the rest of the notebook relies on.
train_dataset = train_dataset.select_columns(['label', 'text'])
validation_dataset = validation_dataset.select_columns(['label', 'text'])

# Bundle both splits so they can be tokenized together and indexed by name.
dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset
})
dataset



DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 2478
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 620
    })
})

Preprocess (tokenize) the dataset for the model

In [None]:
# Create the tokenizer that belongs to the checkpoint we are using.
# Models don't understand text; it has to be converted to token ids before being fed to the model.
# NOTE(review): add_prefix_space is kept from the original call -- confirm it has an effect
# for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

# Truncate over-long texts from the left (i.e. keep their end). The attribute is
# `truncation_side`; assigning to a misspelt name only creates an unused attribute and
# leaves the tokenizer's default truncation side in place.
tokenizer.truncation_side = "left"

# Add a pad token if the tokenizer has none, and grow the embedding matrix to cover it.
# This must run at cell level, before any batching: placed after the function's `return`
# it would never execute.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


def tokenize_function(examples):
    """Tokenize one batch of dataset rows.

    Args:
        examples: batch handed over by ``dataset.map(..., batched=True)``; only its
            'text' column is read.

    Returns:
        The tokenizer output for the batch, with every text truncated to at most
        512 tokens. No padding is applied here; batches are padded dynamically by
        the data collator. Plain Python lists are returned (no ``return_tensors``)
        because the unpadded sequences have different lengths.
    """
    return tokenizer(examples['text'],
                     max_length=512,
                     truncation=True)


# tokenize training and validation dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Instead of padding every row up front, pad each batch dynamically with a collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/2478 [00:00<?, ? examples/s]

Map:   0%|          | 0/620 [00:00<?, ? examples/s]

Evaluation metrics

In [None]:
# Metrics used to monitor the performance of the model during training.
from sklearn.metrics import classification_report

# Accuracy metric from the `evaluate` library, loaded once for the whole notebook.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` as unpacked below; ``predictions``
            holds one score per class for every example and the class with the
            larger score becomes the model's prediction.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)

    # zero_division=0 reports 0 (the same value as the default) for a class that is never
    # predicted, but without emitting an undefined-metric warning on every evaluation.
    report = classification_report(labels, predictions, output_dict=True, zero_division=0)
    macro_avg = report['macro avg']

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'],
        "precision": macro_avg['precision'],
        "recall": macro_avg['recall'],
        "f1": macro_avg['f1-score']
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Applying untrained model to text

In [None]:
# Baseline check before fine-tuning: classify a few held-out examples with the model as loaded.
text_list = test_df['text'].iloc[5:10].tolist()
actual_winner = test_df['label'].iloc[5:10].tolist()

# Send inputs to whichever device the model currently lives on instead of assuming one.
device = next(model.parameters()).device
model.eval()  # inference mode: disables dropout

print("Untrained model predictions:")
print("----------------------------")
# Iterate texts and labels together; looking the label up with text_list.index(text)
# would return the wrong entry whenever the same text occurs twice.
for text, actual in zip(text_list, actual_winner):
    # tokenize text, capped at the same 512-token limit used for the training data
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    # compute logits (no gradients are needed for inference)
    with torch.no_grad():
        logits = model(inputs).logits
    # convert logits to a class id
    predicted_id = logits.argmax(dim=-1).item()

    print(id2label[predicted_id]
          + " - Actual Result: " + id2label[actual])

Untrained model predictions:
----------------------------
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Loses
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins


Train Model

In [None]:

peft_config = LoraConfig(
    task_type="SEQ_CLS",  # sequence classification
    r=4,  # rank of the trainable low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor (alpha) applied to the low-rank update -- this is NOT the learning rate
    lora_dropout=0.01,  # dropout probability used in the LoRA layers during training
    target_modules = ["query_proj"] # only "query_proj" is adapted ("value_proj" is deliberately left out); print the model's layers to see candidate module names

)

Use config setting to update model

In [None]:
# Wrap the model with the LoRA configuration defined in the previous step.
# NOTE(review): this rebinds `model`, so re-running the cell would wrap an already wrapped model.
model = get_peft_model(model, peft_config)
# Print how many parameters are trainable compared with the total; only a small fraction
# is trained, which is where the cost saving comes from (see the cell output for the figures).
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [None]:
# hyperparameters (consumed by TrainingArguments)
lr = 1e-3 # learning rate: size of each optimization step
batch_size = 4 # rows processed per device per step; used for both the train and the eval batch size
num_epochs = 10 # number of full passes over the training data

In [None]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-qlora-text-classification", # where checkpoints are saved; model_checkpoint contains "/", so this is a nested path
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch", # evaluate the model at the end of every epoch
    save_strategy="epoch", # save a checkpoint at the end of every epoch (same cadence as evaluation)
    load_best_model_at_end=True, # reload the best checkpoint when training ends; NOTE(review): no metric_for_best_model is set -- confirm the default selection criterion is the intended one
    fp16=True,  # enable mixed precision training; NOTE(review): presumably requires a GPU runtime -- confirm
)

In [None]:
# Create a Trainer object that ties together the LoRA-wrapped model, the training arguments,
# the tokenized train/validation splits, the padding collator and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # pads each batch dynamically
    compute_metrics=compute_metrics,  # called on every evaluation pass
)

# Train the model
trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6395,0.64096,0.679032,0.696458,0.509984,0.427415
2,0.6566,0.651041,0.666129,0.447445,0.496577,0.408896
3,0.6476,0.665329,0.670968,0.461851,0.498887,0.406208
4,0.6459,0.672869,0.664516,0.519611,0.503056,0.433352
5,0.6161,0.645803,0.664516,0.577633,0.53887,0.520238
6,0.6184,0.702649,0.658065,0.532592,0.509783,0.462466
7,0.5984,0.708552,0.658065,0.540351,0.51362,0.472078
8,0.5937,0.751789,0.654839,0.544852,0.518902,0.487888
9,0.5737,0.750858,0.633871,0.526867,0.516142,0.499785
10,0.5301,0.778851,0.627419,0.514646,0.5088,0.490971


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

TrainOutput(global_step=6200, training_loss=0.607559441597231, metrics={'train_runtime': 441.6026, 'train_samples_per_second': 56.114, 'train_steps_per_second': 14.04, 'total_flos': 2211308716213008.0, 'train_loss': 0.607559441597231, 'epoch': 10.0})

In [None]:
# Re-run the same held-out examples through the fine-tuned model for a before/after comparison.
text_list = test_df['text'].iloc[5:10].tolist()
actual_winner = test_df['label'].iloc[5:10].tolist()

# Follow the model's actual device rather than hard-coding "cuda", so the cell also
# works on a CPU-only runtime.
device = next(model.parameters()).device
model.eval()  # make sure dropout is off after training

print("Trained model predictions:")
print("----------------------------")
# Iterate texts and labels together; looking the label up with text_list.index(text)
# would return the wrong entry whenever the same text occurs twice.
for text, actual in zip(text_list, actual_winner):
    # tokenize text, capped at the same 512-token limit used for the training data
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    # compute logits (no gradients are needed for inference)
    with torch.no_grad():
        logits = model(inputs).logits
    # convert logits to a class id
    predicted_id = logits.argmax(dim=-1).item()

    print(id2label[predicted_id]
          + " - Actual Result: " + id2label[actual])

Trained model predictions:
----------------------------
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Loses
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
