In [2]:
from datasets import Dataset , load_dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding , AutoTokenizer
from transformers import AutoModelForSequenceClassification , TrainingArguments , AutoConfig
from peft import get_peft_model, LoraConfig, TaskType
import torch
import evaluate
import numpy as np
import os
from torch.utils.data import DataLoader

import wandb
import random 
from transformers import TrainingArguments, Trainer , AutoModelForSequenceClassification

roberta_checkpoint = "roberta-large"
mistral_checkpoint = "mistralai/Mistral-7B-v0.1"
llama_checkpoint = "meta-llama/Llama-2-7b-hf"
MAX_LEN = 512 


## Dataset

In [None]:
dataset = load_dataset("yelp_review_full")
dataset["train"][100]

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [48]:
def decision2label(decision):
    if  "grant" in decision:
        return 1
    elif "deny" in decision:
        return 0
    else:
        exit("Invalid decision")





def tokenize_function(briefs):
     return tokenizer(briefs["prompt"], padding="max_length", truncation=True)


TESTSET = "/u3/oqcardos/motion_prediction/dataset/testset.csv"

testset = pd.read_csv(TESTSET, index_col=0)

testset['labels'] = testset['completion'].apply(decision2label)

train = testset.loc[testset['data_type'] == 'train']
test = testset.loc[testset['data_type'] == 'test']




dataset_train = Dataset.from_pandas(train, preserve_index=False)
dataset_test = Dataset.from_pandas(test, preserve_index=False)
dataset = DatasetDict()
dataset['train'] = dataset_train
dataset['test'] = dataset_test

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


tokenized_datasets = dataset.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(200))


print(dataset)
testset

Map:   0%|          | 0/610 [00:00<?, ? examples/s]

Map:   0%|          | 0/605 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'brief_type', 'data_type', 'file_path', 'file_name', 'labels'],
        num_rows: 610
    })
    test: Dataset({
        features: ['prompt', 'completion', 'brief_type', 'data_type', 'file_path', 'file_name', 'labels'],
        num_rows: 605
    })
})


Unnamed: 0,prompt,completion,brief_type,data_type,file_path,file_name,labels
0,Case 9:11-cv-80416-KLR Document 737 Entered on...,deny,support,train,1013,gov.uscourts.flsd.377721.737.0.txt,0
1,Case 9:11-cv-80416-KLR Document 743 Entered on...,deny,opposition,train,1013,gov.uscourts.flsd.377721.743.0.txt,0
2,IN THE UNITED STATES DISTRICT COURT\nFOR THE S...,deny,support,train,909,gov.uscourts.mssd.95610.6.0.txt,0
3,"I i .\nt | | , : tl\n\n \n\n \n\n \n\n \n\n \n...",deny,opposition,train,909,gov.uscourts.mssd.95610.7.0.txt,0
4,Case 1:08-cv-11908-RWZ Document 27\n\nFiled 09...,grant,support,train,507,gov.uscourts.mad.118796.27.0.txt,1
...,...,...,...,...,...,...,...
1210,Case 6:10-cv-00111-LED Document 333\n\nFiled 0...,deny,opposition,test,621,gov.uscourts.txed.121829.333.0.txt,0
1211,"Helio LLC v. Palm, Inc. Doc. 3\n\n \n\n \n\n \...",deny,support,train,574,gov.uscourts.cand.187342.3.0.txt,0
1212,"Helio LLC v. Palm, lnc.\n\n60373/2021440.1\n\n...",deny,opposition,train,574,gov.uscourts.cand.187342.9.0.txt,0
1213,Case 1:11-cv-00696-RLW Document 4 Filed 04/08/...,grant,support,test,711,gov.uscourts.dcd.147524.4.0.txt,1


## Model

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Trainer API

In [None]:
training_args = TrainingArguments(output_dir="test_trainer")
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(output_dir="tutorial_trainer", evaluation_strategy="epoch",  report_to="wandb",)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,

)

trainer.train()


## Native pytorch API

In [49]:
tokenized_datasets = tokenized_datasets.remove_columns(["completion","prompt","brief_type","data_type", "file_path", "file_name"])
#tokenized_datasets = tokenized_datasets.rename_column("completion", "labels")
tokenized_datasets.set_format("torch")

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))



train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)



optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [50]:

# what does get_scheduler do?

from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)




In [44]:
train_dataloader.dataset[2]

{'labels': tensor(0),
 'input_ids': tensor([  101,  9060,   124,   131,  1405,   118,   172,  1964,   118,  3135,
          1571,  1559,  1559,   118,   147, 22689, 11387, 15447,  2227,  5581,
         25647,  1181,  5037,   120,  1479,   120,  1479,  8500,   122,  1104,
          3164,  8500,  9949,   108, 22196,  1477,  7414, 12150, 10069, 23676,
         13821,  9919,   141,  6258, 23313,  9741,  1942, 18732, 19556,  1942,
           143,  9565,  7462, 23616,  9272,  9637,  2249,   141,  6258, 23313,
          9741,  1942, 11345,  7118,  2069, 21065, 27451,  1592,  6110,  1784,
           153,  3048, 17656, 11410,   150,  9565, 20595,  1708,  3066, 15969,
          1658,   119,   117, 13823,  3121,  3101,   117,   191,   119,  6586,
          9637,  9741, 14962, 18396,  9919, 23161, 20595,  9919, 15969, 12880,
         15654, 21669, 11414, 12507,   117,   149,   119,   149,   119,   140,
           119,   117,  3177, 13488, 26977,   119,   114,   114,   114,   114,
           114,  

In [51]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/770 [00:00<?, ?it/s]

In [53]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Using the latest cached version of the module from /u3/oqcardos/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Fri Feb  9 23:00:53 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


{'accuracy': 0.48925619834710743}

In [52]:
loss

tensor(0.0949, device='cuda:0', grad_fn=<NllLossBackward0>)