# Finetuning Code for Encoder LLMs


#### The fine-tuned model weights are stored epoch by epoch; refer to the notes to find the best-performing instance of each model.

## Dataset 

In [None]:
import os

import datasets

# Build the CSV paths with os.path.join so the same cell runs on Windows and on
# POSIX systems: a literal "..\input_data\..." string is a single file name on
# Linux/macOS and does not resolve to the input_data directory. On Windows the
# joined paths are identical to the original backslash literals.
input_dir = os.path.join("..", "input_data")
file_names = {
    "train": os.path.join(input_dir, "input_output_train.csv"),
    "test": os.path.join(input_dir, "input_output_test.csv"),
}

# Load both CSV files; the splits are addressed below as dataset["train"] and
# dataset["test"].
dataset = datasets.load_dataset("csv", data_files=file_names)

## Tokenize Data 

In [None]:
import transformers as tm
# Hugging Face Hub id of the pretrained checkpoint whose tokenizer is loaded.
model_path = "google-bert/bert-base-uncased"
# Tokenizer used by tokenizer_f to encode the "input" column.
tokenizer = tm.AutoTokenizer.from_pretrained(model_path)


In [None]:
def tokenizer_f(examples):
    """Tokenize the ``"input"`` field of a batch of examples.

    Sequences are truncated and padded to a fixed length of 512 tokens
    (``truncation=True``, ``padding="max_length"``, ``max_length=512``).

    Args:
        examples: Mapping with an ``"input"`` entry holding the text(s) to
            encode; used with ``Dataset.map(..., batched=True)``.

    Returns:
        The output of the module-level ``tokenizer`` call, unchanged.
    """
    return tokenizer(
        examples["input"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Tokenize the training split; batched=True hands tokenizer_f groups of rows at once.
tokenized_dataset = dataset["train"].map(tokenizer_f, batched=True)

In [None]:
# Notebook display: show the tokenized training split.
tokenized_dataset


In [None]:
# Rename the target column to "labels", the name listed in label_names of the TrainingArguments.
tokenized_dataset = tokenized_dataset.rename_column("output", "labels")

In [None]:
# Drop the raw "input" text column now that it has been tokenized.
tokenized_dataset = tokenized_dataset.remove_columns("input")

In [None]:
# Tokenize the test split with the same tokenizer_f used for the training split.
tokenized_eval_dataset = dataset["test"].map(tokenizer_f, batched=True)


In [None]:
# Notebook display: show the tokenized test split.
tokenized_eval_dataset

In [None]:
# Same column clean-up as for the training split: "output" becomes "labels"
# and the raw "input" text column is dropped.
tokenized_eval_dataset = (
    tokenized_eval_dataset
    .rename_column("output", "labels")
    .remove_columns("input")
)

## Model

In [None]:
import torch

In [None]:
model_path = "google-bert/bert-base-uncased"  # specify the model path as needed
# Use the current CUDA device index when a GPU is available, otherwise the CPU.
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
print(device)
# Re-create the tokenizer for this checkpoint and load the model with
# num_labels=3 (three output classes), then move it to the selected device.
tokenizer = tm.AutoTokenizer.from_pretrained(model_path)
model = tm.AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 3).to(device)

In [None]:
# Notebook display: report whether a CUDA device is visible to PyTorch.
torch.cuda.is_available()

## LoRA

In [None]:
from peft import LoraConfig, TaskType

# LoRA adapter configuration for the sequence-classification task:
# rank r=8, lora_alpha=16, applied to the modules named "query" and "value",
# with dropout 0.1 on the adapter and bias="none".
lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",  
    task_type=TaskType.SEQ_CLS)

In [None]:
from peft import PeftModel

# Remove LoRA adapters left over from a previous run of the wrapping cell so
# they are not applied twice. On a fresh run `model` is the plain transformers
# model, which has no `unload()` method, so only unwrap when it is actually a
# PEFT model. `unload()` returns the base model, so rebind `model` to it
# instead of discarding the result.
if isinstance(model, PeftModel):
    model = model.unload()

In [None]:
from peft import get_peft_model
# Wrap the classification model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Notebook display: print the (wrapped) model structure.
model

## Train

#### Evaluation Function

In [None]:
from evaluate import load


# F1 scorer from the `evaluate` library.
metric = load("f1")


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 of the argmax predictions.

    NOTE: a later cell defines another ``compute_metrics`` that also reports
    precision, recall and per-class scores; whichever cell ran last is the
    function handed to the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        The dict produced by ``metric.compute`` with ``average="macro"``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return metric.compute(average="macro", references=gold, predictions=predicted)

In [None]:
from evaluate import load


# Scorers from the `evaluate` library used by compute_metrics.
precision_metric = load("precision")
recall_metric = load("recall")
f1_metric = load("f1")


import numpy as np

def compute_metrics(eval_pred):
    """Compute macro and per-class precision, recall and F1 for evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``. The last axis of ``logits``
            holds one score per class; ``labels`` holds the reference class
            ids.

    Returns:
        dict with ``f1_macro``, ``precision_macro`` and ``recall_macro`` plus,
        for every class id ``i`` in ``range(num_classes)``,
        ``precision_class_{i}``, ``recall_class_{i}`` and ``f1_class_{i}``.
    """
    logits, references = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Always score against the full, fixed list of class ids. Without an
    # explicit label list the scorers only cover the ids that happen to occur
    # in the references/predictions, so position i of a per-class result is
    # not guaranteed to be class i, and the macro average is taken over a
    # varying set of classes from one evaluation to the next.
    class_ids = list(range(np.shape(logits)[-1]))

    def _score(scorer, key, average):
        # One scorer call with the shared predictions/references/label list.
        return scorer.compute(
            predictions=predictions,
            references=references,
            labels=class_ids,
            average=average,
        )[key]

    metrics = {
        'f1_macro': _score(f1_metric, 'f1', "macro"),
        'precision_macro': _score(precision_metric, 'precision', "macro"),
        'recall_macro': _score(recall_metric, 'recall', "macro"),
    }

    # atleast_1d keeps the per-class results iterable even if the scorer
    # returns a bare float for a single-class result.
    per_class_precision = np.atleast_1d(_score(precision_metric, 'precision', None))
    per_class_recall = np.atleast_1d(_score(recall_metric, 'recall', None))
    per_class_f1 = np.atleast_1d(_score(f1_metric, 'f1', None))

    for class_id, p, r, f in zip(class_ids, per_class_precision, per_class_recall, per_class_f1):
        metrics[f'precision_class_{class_id}'] = p
        metrics[f'recall_class_{class_id}'] = r
        metrics[f'f1_class_{class_id}'] = f

    return metrics

#### Parameters for training 

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: 10 epochs, with logging, evaluation and checkpoint
# saving all set to run once per epoch.
training_args = TrainingArguments(output_dir=r"trained_weights/bert-base-uncased/",
                                 num_train_epochs=10, 
                                 label_names=["labels"],  # the dataset's "output" column was renamed to "labels"
                                 learning_rate=3e-5,
                                 save_strategy = "epoch",   # save a checkpoint after every epoch
                                 per_device_train_batch_size=16,  
                                 weight_decay=0.01, # regularisation intended to limit overfitting

                                 logging_strategy="epoch",
                                 logging_dir=r"trained_weights/bert-base-uncased/",  # same directory as output_dir

                                 eval_strategy = "epoch",
                                 per_device_eval_batch_size=10,
                                 load_best_model_at_end=True,
                                 # compute_metrics reports "f1_macro"; the "eval_" prefix is
                                 # presumably added by the Trainer when logging — confirm.
                                 metric_for_best_model ="eval_f1_macro"
                                 )

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the tokenized train/test
# splits and the metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the configuration above.
trainer.train()

## Qualitative Testing 


#### A pipeline could also be used here, but the building blocks that a pipeline wraps are written out explicitly to have more control.

In [None]:
# Notebook display: show the raw test split.
dataset['test']

In [None]:
import transformers as tm
# The tokenizer is loaded from the Hub checkpoint id; the model weights are
# loaded separately from a local checkpoint in the next cell.
model_path_huggingfacehub = "google-bert/bert-base-uncased"
tokenizer = tm.AutoTokenizer.from_pretrained(model_path_huggingfacehub)

In [None]:
import torch
# Local checkpoint directory to evaluate (machine-specific absolute path).
# NOTE(review): training in this notebook wraps the model with LoRA; confirm
# that this checkpoint directory loads correctly through
# AutoModelForSequenceClassification.from_pretrained.
model_path_local = r"/srv/scratch/z5503831/trainer_bert_base_uncased_5/checkpoint-405/"
# Use the current CUDA device index when a GPU is available, otherwise the CPU.
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
print(device)
model = tm.AutoModelForSequenceClassification.from_pretrained(model_path_local, num_labels = 3).to(device)

In [None]:
# Pick test example 13 as the probe input and display its reference label.
test_input = dataset['test']['input'][13]
dataset['test']['output'][13]

In [None]:
# Encode the probe text with the same truncation/padding settings as in
# tokenizer_f, return PyTorch tensors and move them to the model's device.
inputs = tokenizer(test_input, truncation = True, padding = "max_length", max_length = 512,return_tensors="pt").to(device)

In [None]:
# Single forward pass without gradient tracking (inference only).
with torch.no_grad():
    outputs = model(**inputs)
    # Take the argmax over the class axis. Without `dim`, argmax indexes the
    # flattened logits, which is only a class id when the batch holds exactly
    # one example; dim=-1 matches the batched prediction loop further down.
    predictions = torch.argmax(outputs.logits, dim=-1)

# .item() requires a single prediction, i.e. exactly one input example.
print(f"Predicted class: {predictions.item()}")

## Base-Line

In [None]:
import transformers as tm

# Baseline: a "sentiment-analysis" pipeline built directly on the base checkpoint,
# truncating inputs to 512 tokens.
# NOTE(review): this is the same pretrained checkpoint used as the fine-tuning
# starting point above; confirm that its classification head is meaningful
# for a baseline comparison.
analyser = tm.pipeline("sentiment-analysis",  model = "google-bert/bert-base-uncased", truncation=True,max_length=512  )


In [None]:
# Notebook display: baseline prediction for test example 92.
analyser(dataset['test']['input'][92])


In [None]:
# Materialise all test inputs as a plain list and run the baseline pipeline on them.
input_test_list = list(dataset['test']['input'])
analyser(input_test_list)

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification

# Load the base checkpoint with its default configuration (no num_labels
# override) and print its id-to-label mapping.
# NOTE: this rebinds the notebook-global `model`.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

print(model.config.id2label)



In [None]:
from transformers import pipeline

# Zero-shot baseline: classify every test input against the three candidate labels.
# NOTE(review): zero-shot pipelines are typically used with NLI-tuned
# checkpoints; confirm that the base checkpoint is intended here.
zero_shot = pipeline("zero-shot-classification", model="google-bert/bert-base-uncased")
res = zero_shot(
    input_test_list,
    candidate_labels=["negative","neutral","positive"]
)

# Notebook display: show the zero-shot results.
res

In [None]:
# Notebook display: show the zero-shot results again.
res

## Saving the Outputs for Confusion Matrix

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# The tokenizer comes from the Hub checkpoint id; the model weights come from
# a local training checkpoint directory.
model_name = "google-bert/bert-base-uncased" 
model_path_local =r"trained_weights/bert-base-uncased/checkpoint-170"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForSequenceClassification.from_pretrained(
                 model_path_local, num_labels=3
             ).to(device)


# Materialise the test split columns as plain Python lists.
texts  = list(dataset["test"]["input"])      # list of 152 strings
labels = list(dataset["test"]["output"])     # list of ints


# Encode all texts at once, truncated and padded to 512 tokens, as PyTorch tensors.
encodings = tokenizer(
    texts,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
)


# Bundle input ids, attention masks and labels so they are batched together.
test_dataset = TensorDataset(
    encodings["input_ids"],
    encodings["attention_mask"],
    torch.tensor(labels)
)
# The sequential sampler keeps the original example order, so predictions
# line up with `labels` by position.
test_loader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=32,      # feel free to tune
    pin_memory=True
)


# Switch to evaluation mode and collect one predicted class id per test
# example, in loader order.
model.eval()
all_preds = []
with torch.no_grad():  # inference only: no gradient tracking
    for input_ids, attention_mask, _ in test_loader:
        # Move the batch to the model's device; the label tensor is not needed here.
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Highest-scoring class per example, brought back to the CPU as Python ints.
        preds = logits.argmax(dim=-1)
        all_preds += preds.cpu().tolist()

print(f"Predictions for all {len(all_preds)} test examples:\n", all_preds)
