### Note 
You will have to run this notebook twice. One for Support and the Other for Opposition


In [5]:
from datasets import Dataset , load_dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding , AutoTokenizer
from transformers import AutoModelForSequenceClassification , TrainingArguments , AutoConfig
from peft import get_peft_model, LoraConfig, TaskType
import torch
import torch.nn.functional as f
import evaluate
import numpy as np
import os
from torch.utils.data import DataLoader
import evaluate
import evaluate
import time
import wandb
import random 
from transformers import TrainingArguments, Trainer , AutoModelForSequenceClassification
from transformers import get_scheduler
from datasets import Features , ClassLabel, Value, Sequence
from tqdm.auto import tqdm
roberta_checkpoint = "roberta-large"

mistral_checkpoint = "mistralai/Mistral-7B-v0.1"
bert_checkpoint = "allenai/longformer-large-4096"# "bert-base-uncased"  

llama_checkpoint = "meta-llama/Llama-2-7b-hf"
MAX_LEN = 512 
from accelerate import Accelerator

# device =  torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(torch.distributed.is_available())

torch.cuda.device_count()

True


3

### Dataset and Functions

In this tab retrive the dataset relevant for the experiment.
You only have to run the cell below once.
It is important know that the cell below produces two types of dataframes, "support" and "opposition".


This tab also contains most of the functinos used in this Notebook

In [6]:
def decision2label(decision):
    if  "grant" in decision:
        return 1
    elif "deny" in decision:
        return 0
    else:
        print(f"error occured with decision: {decision} ",)
        exit("Invalid decision")


def test_metrics(model, dataloader):
    acc = evaluate.load("accuracy")
    preci = evaluate.load("precision")
    recall = evaluate.load("recall")

    csv = {'brief':[],'predict':[], 'score':[], 'truth':[]}

    model.eval()
    for inputs in dataloader:
        briefs = inputs['file_name']
        inputs = {k: v for k, v in inputs.items() if k != "file_name"}
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        propabilities = f.softmax(logits, dim=-1)
        predictions = torch.argmax(logits, dim=-1)
        acc.add_batch(predictions=predictions, references=inputs["labels"])
        preci.add_batch(predictions=predictions, references=inputs["labels"])
        recall.add_batch(predictions=predictions, references=inputs["labels"])

        csv['brief'].extend(briefs)
        labels = lambda x: "grant" if x == 1 else "deny"
        predict = list(map(labels, predictions))
        csv['predict'].extend(predict) 
        csv['score'].extend(propabilities[:,1].cpu().numpy())
        csv['truth'].extend(list(map(labels, inputs["labels"].cpu().numpy())))

    return {'accuracy': acc.compute()['accuracy'],
            'precision': preci.compute()['precision'], 
            'recall': recall.compute()['recall'],
            'csv': csv}


model_type = "bert"

key = "support"

# Remember to change this

#TESTSET = "../dataset/testset.csv"

UNPAIRED_PATH = '../dataset/testset.csv'

testset = pd.read_csv(UNPAIRED_PATH, index_col=0)

testset = testset.loc[testset['data_type'] == 'train']

# randomly set 20% of the data to test
testset['data_type'] = testset['data_type'].apply(lambda x: "test" if random.random() < 0.2 else "train")

testset['labels'] = testset['completion'].apply(decision2label)

train = testset.loc[testset['data_type'] == 'train']
test = testset.loc[testset['data_type'] == 'test']

support_train = train.loc[train['brief_type'] == "support"]
support_test = test.loc[test['brief_type'] == "support"]

oppo_train = train.loc[train['brief_type'] == "opposition"]
oppo_test = test.loc[test['brief_type'] == "opposition"]




features = Features({ 'prompt' : Value(dtype='string'),
                    'completion': ClassLabel(num_classes=3, names=['deny', 'grant', 'TBD'],  id=None),
                    'brief_type' : ClassLabel(num_classes=2, names=["support", "opposition"], id=None),
                        'data_type' : ClassLabel(num_classes=2, names=["train", "test"], id=None),
                        'file_path' : Value(dtype='int64') ,
                        'file_name' : Value(dtype='string'),   
                        'labels' : Value(dtype='int64')
                    })

# can change the argument
if key == "support":    
    dataset_train = Dataset.from_pandas(support_train, preserve_index=False , features= features )
    dataset_test = Dataset.from_pandas(support_test, preserve_index=False,  features= features)
else: #  key == opposition

    dataset_train = Dataset.from_pandas(oppo_train, preserve_index=False, features= features)
    dataset_test = Dataset.from_pandas(oppo_test, preserve_index=False, features= features)



dataset = DatasetDict()


dataset['train'] = dataset_train
dataset['test'] = dataset_test


In [7]:

def training_function():


    # try putting support and train together

    lr = 1e-5
    
    accelerator = Accelerator()

    if model_type == "mistral":

        tokenizer = AutoTokenizer.from_pretrained(mistral_checkpoint, add_prefix_space=True,)
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token


        config = AutoConfig.from_pretrained(mistral_checkpoint)
        max_input_size =  1024

        def tokenize_function(examples):
            return tokenizer(examples['prompt'], truncation= True, padding="max_length" , max_length=max_input_size)

        #mistral_data_collator = DataCollatorWithPadding(tokenizer)

    else:
        tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

        def tokenize_function(briefs):
            return tokenizer(briefs["prompt"], padding="max_length", truncation=True)


    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["completion","prompt","brief_type","data_type", "file_path",  ])


    # tokenized_datasets = tokenized_datasets.remove_columns() # ])
    tokenized_datasets.set_format("torch")

    

    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=1, )
    eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=1, ) 




    num_epochs = 50
    num_training_steps = num_epochs * len(train_dataloader)

    name = "Support" if key == "support" else "Opposition"

    wandb.init(
        # set the wandb project where this run will be logged
        project="LLM_TUTORIAL",  
        name= f"{name}-{bert_checkpoint}",#f"Opposition-mistral-7B-v0.1-1-Tokensize:{max_input_size}",
        # track hyperparameters and run metadata
        config={
        "optimizer": "AdamW",
        "lr": lr,

        "dataset": "single-supports",
        "epochs": num_epochs,
        }
    )


    lr = 1e-5
    model = AutoModelForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2, )
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    num_params = model.num_parameters()
    print(f"The model has {num_params} parameters.")
    print(f"The model has a context window of {model.config.max_position_embeddings} tokens.")


    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)


    progress_bar = tqdm(range(num_training_steps))

    # i wonder if the outputs.loss is the same as loss_fn(outputs, labels)
    # Try to log the values 

    best_valid_acc = 0.0
    model.train()
    print("Training model")
    for epoch in range(num_epochs):
        acc = evaluate.load("accuracy")
        average_loss = 0
        for batch in train_dataloader:
            inputs = {k: v for k, v in batch.items() if k != "file_name"}
            outputs = model(**inputs)
            loss = outputs.loss
            # loss.backward()
            accelerator.backward(loss)
            
            average_loss += loss.item()
            

            optimizer.step()
        
            optimizer.zero_grad()
            progress_bar.update(1)

            # get the predictions
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            predictions = accelerator.gather(predictions)
            labels = accelerator.gather(inputs["labels"])
            acc.add_batch(predictions=predictions, references=labels)

        accuracy_per_epoch = acc.compute()
        print(f"Epoch {epoch} completed")
        print(f"Accuracy: {accuracy_per_epoch}")
        avg_loss = average_loss / len(train_dataloader)
        print(f"loss : {avg_loss}")

        print("Evaluating model on test set")
        metrics = test_metrics(model, eval_dataloader)
        csv = metrics["csv"]
        csv = pd.DataFrame(csv)
        print(metrics)
        
        wandb.log({"loss_per_epoch": avg_loss , 
                "accuracy_per_epoch": accuracy_per_epoch,
                "test_accuracy" :metrics["accuracy"],
                    "test_recall": metrics["recall"],
                    "test_precision": metrics["precision"],
                })
        
        if metrics["accuracy"] > best_valid_acc:
            best_valid_acc = metrics["accuracy"]
            print("Saving model")
            model.save_pretrained(f"../models/LLM-{model_type}-{key}-test")
            csv.to_csv(f"../predictions/LLM-{model_type}-{key}-test.csv", index=False)
        
    wandb.finish()




In [8]:
from accelerate import notebook_launcher
notebook_launcher(training_function, num_processes=2)

Launching training on 2 GPUs.


Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/251 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33moqcardoso[0m. Use [1m`wandb login --relogin`[0m to force relogin


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33moqcardoso[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-large-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 434603010 parameters.
The model has a context window of 4098 tokens.


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-large-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 434603010 parameters.
The model has a context window of 4098 tokens.


  0%|          | 0/12550 [00:00<?, ?it/s]

Training model

  0%|          | 0/12550 [00:00<?, ?it/s]


Training model


Initializing global attention on CLS token...
Initializing global attention on CLS token...


RuntimeError: An issue was found when launching the training: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 68, in _wrap
    fn(i, *args)
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/accelerate/utils/launch.py", line 570, in __call__
    self.launcher(*args)
  File "/tmp/ipykernel_24318/573297226.py", line 90, in training_function
    outputs = model(**inputs)
              ^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1523, in forward
    else self._run_ddp_forward(*inputs, **kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1359, in _run_ddp_forward
    return self.module(*inputs, **kwargs)  # type: ignore[index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/transformers/models/longformer/modeling_longformer.py", line 1925, in forward
    outputs = self.longformer(
              ^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/transformers/models/longformer/modeling_longformer.py", line 1738, in forward
    encoder_outputs = self.encoder(
                      ^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/transformers/models/longformer/modeling_longformer.py", line 1318, in forward
    layer_outputs = layer_module(
                    ^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/transformers/models/longformer/modeling_longformer.py", line 1246, in forward
    self_attn_outputs = self.attention(
                        ^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/transformers/models/longformer/modeling_longformer.py", line 1182, in forward
    self_outputs = self.self(
                   ^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratchB/oqcardoso/.pyenv/versions/3.11.4/lib/python3.11/site-packages/transformers/models/longformer/modeling_longformer.py", line 637, in forward
    attn_probs = torch.masked_fill(attn_probs, is_index_masked[:, :, None, None], 0.0)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 130.00 MiB. GPU 1 has a total capacity of 23.64 GiB of which 91.38 MiB is free. Including non-PyTorch memory, this process has 23.55 GiB memory in use. Of the allocated memory 22.05 GiB is allocated by PyTorch, and 650.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
