In [1]:
from datasets import Dataset , load_dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding , AutoTokenizer
from transformers import AutoModelForSequenceClassification , TrainingArguments , AutoConfig
from peft import get_peft_model, LoraConfig, TaskType
import torch
import evaluate
import numpy as np
import os
from torch.utils.data import DataLoader

import wandb
import random 
from transformers import TrainingArguments, Trainer , AutoModelForSequenceClassification

# Hugging Face Hub checkpoint identifiers. Of these, only `mistral_checkpoint`
# is referenced by the cells visible in this notebook.
roberta_checkpoint = "roberta-large"

mistral_checkpoint = "mistralai/Mistral-7B-v0.1"

llama_checkpoint = "meta-llama/Llama-2-7b-hf"
# NOTE(review): MAX_LEN is not referenced by any visible cell - presumably an
# intended token limit; confirm or remove.
MAX_LEN = 512 
# Use the GPU when CUDA is available. The model cells further down rebind
# `device` to the plain string "cuda"/"cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell, so its boolean result is shown as the cell output.
torch.distributed.is_available()

True

## Dataset

In [2]:
# Tutorial data: the Yelp review dataset tokenized with the cased BERT tokenizer.
dataset = load_dataset("yelp_review_full")
dataset["train"][100]

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with padding="max_length" and truncation."""
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 100-example subsets of each split, drawn with a fixed shuffle seed.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split].shuffle(seed=42).select(range(100))
    for split in ("train", "test")
)

In [2]:
def decision2label(decision):
    """Map the text of a ruling to a binary label.

    Args:
        decision: Text describing the ruling (the ``completion`` column of the
            briefs table is passed here).

    Returns:
        1 if the text contains "grant", otherwise 0 if it contains "deny".
        "grant" is checked first, so a text containing both maps to 1.

    Raises:
        ValueError: If the value contains neither keyword, or does not support
            the ``in`` test at all (e.g. a NaN from a missing cell). An
            exception is raised instead of calling ``exit()`` so that one bad
            row is reported as a data error rather than terminating the
            interpreter / notebook kernel.
    """
    try:
        if "grant" in decision:
            return 1
        if "deny" in decision:
            return 0
    except TypeError as err:
        # Non-container values (float NaN, None, ...) cannot be searched.
        raise ValueError(f"Invalid decision: {decision!r}") from err
    raise ValueError(f"Invalid decision: {decision!r}")

def tokenize_function(briefs):
    """Tokenize the ``prompt`` field of a batch with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and truncation enabled.
    """
    prompts = briefs["prompt"]
    encoded = tokenizer(prompts, padding="max_length", truncation=True)
    return encoded

import evaluate


def test_metrics(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and report accuracy, precision and recall.

    Every batch is moved to the module-level ``device`` and run through the
    model without gradient tracking. The predicted class is the argmax of the
    logits over the last dimension and is compared with ``batch["labels"]``.

    The model is switched to eval mode for the pass and then restored to the
    mode it was in before the call. Without the restore, a training loop that
    calls this function between epochs would silently keep training in eval
    mode (e.g. with dropout disabled).

    Args:
        model: Model exposing ``training``, ``eval()`` and ``train(mode)`` (as
            ``torch.nn.Module`` does); called as ``model(**batch)`` and
            expected to return an object with a ``logits`` attribute.
        dataloader: Iterable of dict batches of tensors that include a
            ``"labels"`` entry.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
    """
    scorers = {name: evaluate.load(name) for name in ("accuracy", "precision", "recall")}

    was_training = model.training
    model.eval()
    try:
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            predictions = torch.argmax(outputs.logits, dim=-1)
            for scorer in scorers.values():
                scorer.add_batch(predictions=predictions, references=batch["labels"])
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    return {name: scorer.compute()[name] for name, scorer in scorers.items()}

TESTSET = "/u3/oqcardos/motion_prediction/dataset/testset.csv"

# Load the briefs table; the first CSV column is used as the index.
testset = pd.read_csv(TESTSET, index_col=0)

# Binary target derived from the text of the ruling.
testset['labels'] = testset['completion'].apply(decision2label)

# Partition by the `data_type` column ...
train = testset[testset['data_type'] == 'train']
test = testset[testset['data_type'] == 'test']

# ... and then by brief type within each partition.
support_train, oppo_train = (
    train[train['brief_type'] == kind] for kind in ("support", "opposition")
)
support_test, oppo_test = (
    test[test['brief_type'] == kind] for kind in ("support", "opposition")
)





In [5]:

model_type = "mistral"
# The frames passed below select which briefs are used (support briefs here).
dataset_train = Dataset.from_pandas(support_train, preserve_index=False)
dataset_test = Dataset.from_pandas(support_test, preserve_index=False)

dataset = DatasetDict()
dataset['train'] = dataset_train
dataset['test'] = dataset_test

# Class weights for the weighted loss: n_samples / (2 * n_samples_in_class).
# The training labels are materialised once instead of once per term.
train_labels = dataset['train'].to_pandas()['labels']
label_counts = train_labels.value_counts()
pos_weights = len(train_labels) / (2 * label_counts[1])
neg_weights = len(train_labels) / (2 * label_counts[0])


if model_type == "mistral":

    tokenizer = AutoTokenizer.from_pretrained(mistral_checkpoint, add_prefix_space=True, device=device)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

    config = AutoConfig.from_pretrained(mistral_checkpoint)
    max_input_size = 10000

    def tokenize_function(examples):
        """Tokenize ``prompt`` with truncation at ``max_input_size`` tokens (no padding)."""
        return tokenizer(examples['prompt'], truncation=True, max_length=max_input_size)

else:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    # Later cells read `max_input_size` (run name), so define it on this path too.
    max_input_size = tokenizer.model_max_length

    def tokenize_function(briefs):
        """Tokenize ``prompt`` with padding="max_length" and truncation."""
        return tokenizer(briefs["prompt"], padding="max_length", truncation=True)


# Built for whichever tokenizer was selected: the data loaders below and the
# Trainer cell use this name unconditionally, so it must exist on both paths.
mistral_data_collator = DataCollatorWithPadding(tokenizer)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(200))

# Keep only the tokenizer outputs and `labels`, returned as torch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["completion","prompt","brief_type","data_type", "file_path", "file_name"])
tokenized_datasets.set_format("torch")


train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=1, collate_fn=mistral_data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=1, collate_fn=mistral_data_collator)


print(dataset)
testset

Map:   0%|          | 0/321 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'brief_type', 'data_type', 'file_path', 'file_name', 'labels'],
        num_rows: 321
    })
    test: Dataset({
        features: ['prompt', 'completion', 'brief_type', 'data_type', 'file_path', 'file_name', 'labels'],
        num_rows: 309
    })
})


Unnamed: 0,prompt,completion,brief_type,data_type,file_path,file_name,labels
0,Case 9:11-cv-80416-KLR Document 737 Entered on...,deny,support,train,1013,gov.uscourts.flsd.377721.737.0.txt,0
1,Case 9:11-cv-80416-KLR Document 743 Entered on...,deny,opposition,train,1013,gov.uscourts.flsd.377721.743.0.txt,0
2,IN THE UNITED STATES DISTRICT COURT\nFOR THE S...,deny,support,train,909,gov.uscourts.mssd.95610.6.0.txt,0
3,"I i .\nt | | , : tl\n\n \n\n \n\n \n\n \n\n \n...",deny,opposition,train,909,gov.uscourts.mssd.95610.7.0.txt,0
4,Case 1:08-cv-11908-RWZ Document 27\n\nFiled 09...,grant,support,train,507,gov.uscourts.mad.118796.27.0.txt,1
...,...,...,...,...,...,...,...
1210,Case 6:10-cv-00111-LED Document 333\n\nFiled 0...,deny,opposition,test,621,gov.uscourts.txed.121829.333.0.txt,0
1211,"Helio LLC v. Palm, Inc. Doc. 3\n\n \n\n \n\n \...",deny,support,train,574,gov.uscourts.cand.187342.3.0.txt,0
1212,"Helio LLC v. Palm, lnc.\n\n60373/2021440.1\n\n...",deny,opposition,train,574,gov.uscourts.cand.187342.9.0.txt,0
1213,Case 1:11-cv-00696-RLW Document 4 Filed 04/08/...,grant,support,test,711,gov.uscourts.dcd.147524.4.0.txt,1


## Model

BERT

In [5]:
# BERT baseline: two-label sequence classifier placed on the available device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
num_params = model.num_parameters()
print("The model has {} parameters.".format(num_params))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 108311810 parameters.


Mistral

In [3]:
# Rebinds `device` as a plain string ("cuda"/"cpu").
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Mistral-7B with a 2-label sequence-classification head in bfloat16;
# `device_map="auto"` handles placement, hence the commented-out `.to(device)`.
# NOTE(review): newer transformers releases presumably expect
# `attn_implementation="flash_attention_2"` instead of `use_flash_attention_2`
# - confirm against the installed version.
model =  AutoModelForSequenceClassification.from_pretrained(
  pretrained_model_name_or_path=mistral_checkpoint,
  num_labels=2,
  use_flash_attention_2=True,
  torch_dtype= torch.bfloat16,
  device_map="auto"
)#.to(device)

# Use the end-of-sequence id as the padding id, matching the tokenizer setup
# in the dataset cell.
model.config.pad_token_id = model.config.eos_token_id

# LoRA adapters (rank 2) on the q_proj and v_proj modules for a
# sequence-classification task.
mistral_peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=2, lora_alpha=16, lora_dropout=0.1, bias="none", 
    target_modules=[
        "q_proj",
        "v_proj",
    ],
)

# Wrap the base model; the optimizer below must be created after this point
# so that it sees the adapter parameters.
model = get_peft_model(model, mistral_peft_config)
model.print_trainable_parameters()

# NOTE(review): the recorded native-loop run with this optimizer reports
# `loss : nan`; lr=0.01 is presumably too high for AdamW here - confirm.
# The Trainer cell later rebinds `lr` to 1e-4 without rebuilding `optimizer`.
lr = 0.01
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 860,160 || all params: 7,111,528,448 || trainable%: 0.012095290151611583


## Trainer API

In [None]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy for a ``(logits, labels)`` pair.

    The predicted class is the argmax of the logits over the last axis.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Single definition of the run configuration: evaluate once per epoch and
# report to Weights & Biases.
training_args = TrainingArguments(output_dir="tutorial_trainer", evaluation_strategy="epoch",  report_to="wandb",)

# NOTE(review): `small_train_dataset` / `small_eval_dataset` come from the
# yelp_review_full cell (tokenized with bert-base-cased), while every model
# cell in this notebook builds `model` with num_labels=2. That dataset
# presumably has five star-rating classes - confirm the label range matches
# the model head before running this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,

)

trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33moqcardoso[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
def compute_metrics(eval_pred):
    """Score predictions with precision, recall, F1 and accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict mapping ``"precision"``, ``"recall"``, ``"f1-score"`` and
        ``'accuracy'`` to their scores, which is the metric-name -> value
        mapping the trainer expects.
    """
    # Name of each metric in the HF `evaluate` package -> key used in the report.
    report_keys = {
        "precision": "precision",
        "recall": "recall",
        "f1": "f1-score",
        "accuracy": "accuracy",
    }
    scorers = {name: evaluate.load(name) for name in report_keys}

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        report_keys[name]: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in scorers.items()
    }



from transformers import Trainer

class WeightedCELossTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The weights come from the module-level ``neg_weights`` (class 0) and
    ``pos_weights`` (class 1).
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)`` after the
                labels have been removed from ``inputs``.
            inputs: Batch dict; its ``"labels"`` entry is popped (the dict is
                modified in place).
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Extra keyword arguments passed by the caller (newer
                ``Trainer`` versions pass e.g. ``num_items_in_batch``);
                accepted so the override keeps working, and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        # Get model's predictions
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights where the logits live (a wrapped or sharded model
        # may not expose a single `.device`) and compute the loss in float32
        # rather than in the half-precision dtype of the logits.
        class_weights = torch.tensor(
            [neg_weights, pos_weights], device=logits.device, dtype=torch.float32
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(
            logits.float().view(-1, self.model.config.num_labels),
            labels.view(-1).to(logits.device),
        )
        return (loss, outputs) if return_outputs else loss


In [None]:
# Show which device the current `model` reports being on.
model.device

device(type='cuda', index=0)

In [None]:
from transformers import TrainingArguments, Trainer

# mistral_model = mistral_model.cuda()

# Hyperparameters for the Trainer-based run. Note that this rebinds the
# module-level `lr` and `num_epochs`.
lr = 1e-4
batch_size = 1
num_epochs = 5

training_args = TrainingArguments(
    output_dir="mistral-lora-token-classification",
    learning_rate=lr,
    lr_scheduler_type= "constant",
    # NOTE(review): with the plain "constant" schedule the warmup ratio
    # presumably has no effect ("constant_with_warmup" is the warming variant)
    # - confirm which one is intended.
    warmup_ratio= 0.1,
    max_grad_norm= 0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    # The Mistral model is loaded with torch_dtype=torch.bfloat16, so train
    # with bf16 mixed precision rather than fp16 to match the weights' dtype.
    bf16=True,
    gradient_checkpointing=True,
)


# Weighted-loss trainer on the tokenized briefs; batches are padded by the
# shared data collator.
mistral_trainer = WeightedCELossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["test"],
    data_collator=mistral_data_collator,
    compute_metrics=compute_metrics
)

mistral_trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33moqcardoso[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,No log,0.693147,0.0,0.0,0.0,0.537217
2,No log,0.693147,0.0,0.0,0.0,0.537217
3,No log,0.693147,0.0,0.0,0.0,0.537217


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

## Native PyTorch API

In [5]:

# what does get_scheduler do?

from transformers import get_scheduler

num_epochs = 100
# One optimizer step per batch, so total steps = epochs * batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)

wandb.init(
    # set the wandb project where this run will be logged
    project="LLM_TOTURIAL",
    name=f"Support-mistral-7B-v0.1-1-Tokensize:{max_input_size}",
    # track hyperparameters and run metadata
    config={
        "optimizer": "AdamW",
        # Read the learning rate from the optimizer that will actually be
        # stepped: the global `lr` is rebound by other cells after the
        # optimizer is built and can therefore be stale.
        "lr": optimizer.param_groups[0]["lr"],

        "dataset": "single-supports",
        "epochs": num_epochs,
    }
)

# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
# )


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33moqcardoso[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

# i wonder if the outputs.loss is the same as loss_fn(outputs, labels)
# Try to log the values 


print("Training model")
try:
    for epoch in range(num_epochs):
        # test_metrics() puts the model in eval mode at the end of every
        # epoch, so training mode has to be re-enabled per epoch, not once.
        model.train()
        acc = evaluate.load("accuracy")
        average_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            average_loss += loss.item()
            loss.backward()

            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            # get the predictions
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            acc.add_batch(predictions=predictions, references=batch["labels"])

        accuracy_per_epoch = acc.compute()
        print(f"Accuracy: {accuracy_per_epoch}")
        avg_loss = average_loss / len(train_dataloader)
        print(f"loss : {avg_loss}")

        print("Evaluating model on test set")
        metrics = test_metrics(model, eval_dataloader)
        print(metrics )
        wandb.log({"loss_per_epoch": avg_loss ,
                   # Log the scalar, consistent with the test metrics below,
                   # instead of the {'accuracy': ...} dict.
                   "accuracy_per_epoch": accuracy_per_epoch["accuracy"],
                   "test_accuracy" :metrics["accuracy"],
                   "test_recall": metrics["recall"],
                   "test_precision": metrics["precision"],
                   })
finally:
    # Close the wandb run even when training is interrupted or fails.
    wandb.finish()

  0%|          | 0/32100 [00:00<?, ?it/s]

Training model


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Accuracy: {'accuracy': 0.4735202492211838}
loss : nan
Evaluating model on test set


KeyboardInterrupt: 

In [7]:
# Inspect the first tokenized training example: the cell displays the shapes
# of its token ids, attention mask and label.
data = train_dataloader.dataset[0]
data['input_ids'].shape , data['attention_mask'].shape , data['labels'].shape

(torch.Size([3623]), torch.Size([3623]), torch.Size([]))

In [8]:
# Scratch cell: evaluates the literal so the notebook displays its decimal form.
5e-3

0.005

In [6]:
    
# Debugging aid: run the model on the first batch the loader yields, then
# stop, leaving `batch` and `outputs` available for inspection.
for batch in train_dataloader:
    # Move every tensor of the batch onto `device`.
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    break

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
# del model

# Display the logits of the most recent `outputs`.
outputs.logits

tensor([[0., 0.]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<ToCopyBackward0>)

In [None]:
# Show the most recent loss as a NumPy value. Detach it from the autograd
# graph first and upcast to float32 before converting: the model in this
# notebook produces bfloat16 outputs, and NumPy has no bfloat16 dtype.
loss.detach().float().cpu().numpy()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Figure out whether there is a bug in BERT:
    # get prediction files for Mistral support and opposition.
    # show that the bug doesn't exist in support.

# Figure out why Mistral classification isn't working.
# Get prediction files for Deepset support and opposition.
