# Imports

In [1]:
!pip install peft evaluate -q

In [2]:
from datasets import load_dataset,  Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding)

from sklearn.model_selection import train_test_split

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Load dataset

In [3]:
CSV_PATH = '/kaggle/input/supreme-court-judgment-prediction/justice.csv'

In [4]:
def readFromCsv(filePath):
    df = pd.read_csv(filePath)

    # clean Unnamed col
    df.drop(columns=["Unnamed: 0"], inplace=True)

    # take a sneak peek
    display(df.head())

    return df

In [5]:
org_df = readFromCsv(CSV_PATH)
org_df.info()

Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  3303 non-null   int64 
 1   name                3303 non-null   object
 2   href                3303 non-null   object
 3   docket              3292 non-null   object
 4   term                3303 non-null   object
 5   first_party         3302 non-null   object
 6   second_party        3302 non-null   object
 7   facts               3303 non-null   object
 8   facts_len           3303 non-null   int64 
 9   majority_vote       3303 non-null   int64 
 10  minority_vote       3303 non-null   int64 
 11  first_party_winner  3288 non-null   object
 12  decision_type       3296 non-null   object
 13  disposition         3231 non-null   object
 14  issue_area          3161 non-null   object
dtypes: int64(4), object(11)
memory usage: 387.2+ KB


# Preprocess dataset

In [6]:
# check for null before removing
org_df.isnull().sum()

ID                      0
name                    0
href                    0
docket                 11
term                    0
first_party             1
second_party            1
facts                   0
facts_len               0
majority_vote           0
minority_vote           0
first_party_winner     15
decision_type           7
disposition            72
issue_area            142
dtype: int64

In [7]:
org_df['augmented_text'] = ""

for idx, row in org_df.iterrows():
    org_df.at[idx, 'augmented_text'] = f"{row['name']} {row['first_party']} {row['second_party']} {row['majority_vote']} to {row['minority_vote']} {row['decision_type']} {row['disposition']}  {row['issue_area']} {row['facts']}"

org_df.head()

Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area,augmented_text
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights,"Stanley v. Illinois Peter Stanley, Sr. Illino..."
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process,Giglio v. United States John Giglio United St...
3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...
4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment,Miller v. California Marvin Miller California ...


In [8]:
# as we are primarily concerned about whether first party won or lost,
# discard only those nan rows for the moment

def removeNaN(df, colName):
    df = df.copy()
    df = df.dropna(subset=[colName])
    return df
    
cleaned_df = removeNaN(org_df, 'first_party_winner')
cleaned_df.isnull().sum()

ID                      0
name                    0
href                    0
docket                 10
term                    0
first_party             1
second_party            1
facts                   0
facts_len               0
majority_vote           0
minority_vote           0
first_party_winner      0
decision_type           6
disposition            65
issue_area            129
augmented_text          0
dtype: int64

In [9]:
# get features and target
def getFeatureNTarget(df):
    df = df.copy()
    df = df[['augmented_text', 'first_party_winner']]
    df['first_party_winner'] = df['first_party_winner'].astype(int)
    
    #rename facts to text and first_party_winner to label
    df = df.rename(columns={'first_party_winner': 'labels', 'augmented_text': 'text'})

    return df.reset_index(drop=True)

train_df = getFeatureNTarget(cleaned_df)
train_df

Unnamed: 0,text,labels
0,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...,1
1,"Stanley v. Illinois Peter Stanley, Sr. Illino...",1
2,Giglio v. United States John Giglio United St...,1
3,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...,1
4,Miller v. California Marvin Miller California ...,1
...,...,...
3283,United States v. Palomar-Santiago United State...,1
3284,Terry v. United States Tarahrick Terry United ...,0
3285,United States v. Cooley United States Joshua J...,1
3286,Florida v. Georgia Florida Georgia 9 to 0 majo...,0


In [10]:
# now we have the base version of our train dataset with basic feature and it's target
# here on, we will apply preprocessing if and where required
def preprocess_text(text):
    # remove <p> tag
    text = text.replace('<p>', '')
    return text

# apply preprocess on train_df
def apply_preprocess(df, colName):
    df = df.copy() # reason why I'm doing a copy with each utility function is that I don't wanna alter original data frame
    df[colName] = df[colName].progress_apply(preprocess_text)
    return df

processed_df = apply_preprocess(train_df, 'text')
processed_df

100%|██████████| 3288/3288 [00:00<00:00, 263733.18it/s]


Unnamed: 0,text,labels
0,Roe v. Wade Jane Roe Henry Wade 7 to 2 majorit...,1
1,"Stanley v. Illinois Peter Stanley, Sr. Illino...",1
2,Giglio v. United States John Giglio United St...,1
3,Reed v. Reed Sally Reed Cecil Reed 7 to 0 majo...,1
4,Miller v. California Marvin Miller California ...,1
...,...,...
3283,United States v. Palomar-Santiago United State...,1
3284,Terry v. United States Tarahrick Terry United ...,0
3285,United States v. Cooley United States Joshua J...,1
3286,Florida v. Georgia Florida Georgia 9 to 0 majo...,0


# Validation Split

In [11]:
# Split the dataset into training and testing sets
df_train, df_test = train_test_split(processed_df, test_size=0.2, random_state=42)

# Convert the dataframes to Hugging Face Datasets
train_dataset = Dataset.from_pandas(df_train)
validation_dataset = Dataset.from_pandas(df_test)

# pick only feature and target column
train_dataset = train_dataset.select_columns(['text', 'labels'])
validation_dataset = validation_dataset.select_columns(['text', 'labels'])

dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2630
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 658
    })
})

# Classfical Approach: Tf-Idf

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [13]:
X_train = df_train['text'].str.lower().values
y_train = df_train['labels']
X_test = df_test['text'].str.lower().values
y_test = df_test['labels']

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the classifier
classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
classifier.fit(X_train_tfidf, y_train)

# Make predictions and evaluate
y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.94      0.73      0.83       252
           1       0.85      0.97      0.91       406

    accuracy                           0.88       658
   macro avg       0.90      0.85      0.87       658
weighted avg       0.89      0.88      0.88       658



# Tokenize

In [14]:
model_name = "microsoft/deberta-v3-small" # using this base model for doing binary classfication because it is the smallest parameter set, can run in this machine.

# we want to fine-tune this model to do case analysis on input text, for that we want to label map for First party wins and First party losses.
# define label maps
id2label = {0: "First Party Loses", 1: "First Party Wins"}
label2id = {"First Party Loses": 0, "First Party Wins": 1}

#generate classification model for model_checkpoint

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    # id2label=id2label,
    # label2id=label2id
)

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Load tokenizer from pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize dataset rows
def tokenize_function(examples):
    text = examples['text']  # Extract the text from the dataset

    # Tokenize and truncate, with max_length set to 512 and truncation from the left
    tokenized_inputs = tokenizer(text, 
                                 return_tensors="np", 
                                 max_length=512, 
                                 truncation=True,
                                padding='max_length')

    # Add a padding token if it's missing
    # if tokenizer.pad_token is None:
    #     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    #     model.resize_token_embeddings(len(tokenizer))  # Resize embeddings for model (requires model object)

    return tokenized_inputs

# Tokenize the dataset, assuming it's in a format compatible with the .map() method
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Use a collator to dynamically pad sequences during batching
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Output the tokenized dataset
tokenized_dataset

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/2630 [00:00<?, ? examples/s]

Map:   0%|          | 0/658 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2630
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 658
    })
})

# Evaluation metrics

In [16]:
# to import the performance of the model during training
# import accuracy evaluation metrics
accuracy = evaluate.load("accuracy")

# packaging accuracy metrics as a function, one for first party losses and first party losses class, whichever is larger will become model prediction.
# define an evaluation function to pass into trainer later
def compute_metrics(eval_pred):
  predictions, labels = eval_pred # predictions here are the logits, has 2 elements + and -, evaluating which element is larger and which is larger will be the label.
  predictions = np.argmax(predictions, axis=1)
  return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Zero-shot Inference

In [17]:
# define list of examples
text_list = df_test['text'][5:10].tolist()
actual_winner = df_test['labels'][5:10].tolist()
print("Untrained model predictions:")
print("----------------------------")

for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(id2label[predictions.tolist()]
          + " - Actual Result: " + id2label[actual_winner[text_list.index(text)]])

Untrained model predictions:
----------------------------
First Party Wins - Actual Result: First Party Loses
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Loses


# Fine-tune

In [18]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", # sequence classification
    inference_mode=False,
    r=4, #intrinsic rank of trainable weight matrix
    lora_alpha=32, # learning rate
    lora_dropout=0.01, # probability of drop out, randomly 0 internal parameters during training
    target_modules = ["query_proj"] #, "value_proj"] # to see which modules to target, just print the layers
)  # apply lora to query layer

In [19]:
model = get_peft_model(model, peft_config) # get actual model and update it using the configuration of lora that we provided in previous step
model.print_trainable_parameters() # to see how much percentage of total parameters we actually need to model, as seen in result only 0.93% of the model will be trained, huge cost savings.

trainable params: 38,402 || all params: 141,934,852 || trainable%: 0.0271


In [20]:
# hyperparameters
lr = 1e-3 # size of optimization step
batch_size = 4 # number of rows in dataset processed per optimization step
num_epochs = 10 #number of times model runs through training data

In [21]:
# define training arguments
training_args = TrainingArguments(
    output_dir= f'./{model_name}-lora-text-classification', # defining where model to be saved
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch", # per epoch evaluate the model parameters
    save_strategy="epoch", # per epoch save the model parameters
    load_best_model_at_end=True, # at end return best version of the model
)

In [22]:
# Create a Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112873600000562, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.231626,{'accuracy': 0.9559270516717325}
2,0.284900,0.239165,{'accuracy': 0.958966565349544}
3,0.284900,0.194412,{'accuracy': 0.9620060790273556}
4,0.170600,0.200408,{'accuracy': 0.958966565349544}
5,0.159800,0.186323,{'accuracy': 0.9620060790273556}
6,0.159800,0.200447,{'accuracy': 0.9620060790273556}
7,0.137900,0.220921,{'accuracy': 0.9604863221884499}
8,0.127400,0.213209,{'accuracy': 0.9604863221884499}
9,0.127400,0.210113,{'accuracy': 0.9650455927051672}
10,0.117200,0.214385,{'accuracy': 0.9620060790273556}


Trainer is attempting to log a value of "{'accuracy': 0.9559270516717325}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer is attempting to log a value of "{'accuracy': 0.958966565349544}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer is attempting to log a value of "{'accuracy': 0.9620060790273556}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer is attemp

TrainOutput(global_step=3290, training_loss=0.1623167484364611, metrics={'train_runtime': 1274.0732, 'train_samples_per_second': 20.642, 'train_steps_per_second': 2.582, 'total_flos': 3487119319449600.0, 'train_loss': 0.1623167484364611, 'epoch': 10.0})

In [25]:
# evaluate
trainer.evaluate()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Trainer is attempting to log a value of "{'accuracy': 0.9620060790273556}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.18632306158542633,
 'eval_accuracy': {'accuracy': 0.9620060790273556},
 'eval_runtime': 13.4433,
 'eval_samples_per_second': 48.946,
 'eval_steps_per_second': 6.174,
 'epoch': 10.0}

In [24]:
# define list of examples
text_list = df_test['text'][5:10].tolist()
actual_winner = df_test['labels'][5:10].tolist()

print("Trained model predictions:")
print("----------------------------")

for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(id2label[predictions.tolist()]
          + " - Actual Result: " + id2label[actual_winner[text_list.index(text)]])

Trained model predictions:
----------------------------
First Party Loses - Actual Result: First Party Loses
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Wins - Actual Result: First Party Wins
First Party Loses - Actual Result: First Party Loses


# End