# Sentiment Classification using LoRA

In [None]:
# importing libraries
import os

from enum import Enum
from functools import partial
import torch
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, default_data_collator, get_linear_schedule_with_warmup, TrainingArguments
from transformers import default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model, LoraConfig, TaskType, PeftModel, PeftType, PeftConfig
from datasets import load_dataset, DatasetDict
from trl import SFTTrainer
from torch.utils.data import DataLoader
from tqdm import tqdm
import wandb

In [None]:
# Start the Weights & Biases run used for logging.
wandb.init(name="lora", project="lora_learning_methods")

# Model and tokenizer are loaded from the same checkpoint identifier.
model_name_or_path = tokenizer_name_or_path = "mistralai/Mistral-7B-v0.1"

# Dataset columns: the text that is classified and the label to predict.
text_column, label_column = "input", "output"

# Token length every preprocessed example is padded/truncated to.
max_length = 64

# Run-level settings.
seed, device = 42, "cuda"
lr, num_epochs, batch_size = 1e-4, 10, 8

## Dataset Preparation

In [None]:
# Load the FinGPT sentiment training set.
dataset = load_dataset("FinGPT/fingpt-sentiment-train")

# Distinct label strings found in the "output" column of the train split.
classes = list({row["output"] for row in dataset["train"]})
print(dataset)
# Bare expression: shows the first training row as the cell output.
dataset["train"][0]

In [None]:
# dataset splitting
# Hold out 20% of the data. The notebook-level `seed` is passed so the split
# is identical on every run (without it each run shuffles differently).
train_testvalid = dataset['train'].train_test_split(test_size=0.2, seed=seed)
# Split the held-out 20% in half: 10% test, 10% validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=seed)
# Gather the three splits in a single DatasetDict.
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
})

In [None]:
# data preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)  # optionally: token=hf_token
# Padding below needs a pad token id; reuse the EOS id when none is set.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Token count of the longest class label, as produced by a plain tokenizer call.
target_max_length = max(len(tokenizer(name)["input_ids"]) for name in classes)
print(f"{target_max_length=}")

def preprocess_function(examples):
    """Turn a batch of raw rows into fixed-length causal-LM training features.

    Each row becomes a prompt of the form "<text_column> : <text>", a newline
    and "Label : ", followed by the label tokens and one EOS token. Prompt
    positions are set to -100 in ``labels`` (the default ``ignore_index`` of
    PyTorch's cross-entropy loss), so only the label and EOS are supervised.

    Every sequence ends up exactly ``max_length`` tokens long:

    * shorter sequences are left-padded (pad id in ``input_ids``, 0 in
      ``attention_mask``, -100 in ``labels``);
    * longer sequences lose tokens from the *front*. The label sits at the
      end of the sequence, so cutting from the end would throw the label away
      and leave a row whose labels are all -100.

    Args:
        examples: Batch mapping column names to lists of values; must contain
            the ``text_column`` and ``label_column`` columns.

    Returns:
        The tokenizer output for the prompts with ``input_ids``,
        ``attention_mask`` and ``labels`` replaced by lists of 1-D tensors of
        length ``max_length``.
    """
    prompts = [f"{text_column} : {x}\nLabel : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]

    model_inputs = tokenizer(prompts)
    # No special tokens for the labels: they are appended to a prompt that
    # was already tokenized with them.
    labels = tokenizer(targets, add_special_tokens=False)

    batch_input_ids, batch_attention, batch_labels = [], [], []
    for prompt_ids, target_ids in zip(model_inputs["input_ids"], labels["input_ids"]):
        target_ids = target_ids + [tokenizer.eos_token_id]
        input_ids = prompt_ids + target_ids
        label_ids = [-100] * len(prompt_ids) + target_ids  # mask the prompt
        attention_mask = [1] * len(input_ids)

        overflow = len(input_ids) - max_length
        if overflow > 0:
            # Too long: drop leading tokens so the label at the end survives.
            input_ids = input_ids[overflow:]
            attention_mask = attention_mask[overflow:]
            label_ids = label_ids[overflow:]
        else:
            # Too short (or exact): left-pad up to max_length.
            padding = -overflow
            input_ids = [tokenizer.pad_token_id] * padding + input_ids
            attention_mask = [0] * padding + attention_mask
            label_ids = [-100] * padding + label_ids

        batch_input_ids.append(torch.tensor(input_ids))
        batch_attention.append(torch.tensor(attention_mask))
        batch_labels.append(torch.tensor(label_ids))

    model_inputs["input_ids"] = batch_input_ids
    model_inputs["attention_mask"] = batch_attention
    model_inputs["labels"] = batch_labels
    return model_inputs


# Options shared by both tokenization passes.
tokenize_options = {
    "batched": True,
    "num_proc": 1,
    "load_from_cache_file": False,
    "desc": "Running tokenizer on dataset",
}
train_dataset = dataset["train"].map(preprocess_function, **tokenize_options)
eval_dataset = dataset["valid"].map(preprocess_function, **tokenize_options)

# Options shared by both loaders; only the training loader shuffles.
loader_options = {
    "collate_fn": default_data_collator,
    "batch_size": batch_size,
    "pin_memory": True,
}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_dataloader = DataLoader(eval_dataset, **loader_options)
# Bare expression: shows one collated training batch as the cell output.
next(iter(train_dataloader))

In [None]:
# pre-processing test dataset
def test_preprocess_function(examples):
    """Tokenize a batch of prompts (without labels) to exactly ``max_length`` tokens.

    Each row becomes a prompt of the form "<text_column> : <text>", a newline
    and "Label : ". Shorter prompts are left-padded (pad id in ``input_ids``,
    0 in ``attention_mask``). Longer prompts lose tokens from the *front*, so
    the trailing "Label : " cue that asks for a label is always kept; cutting
    from the end would remove that cue.

    Args:
        examples: Batch mapping column names to lists of values; must contain
            the ``text_column`` column.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask``
        replaced by lists of 1-D tensors of length ``max_length``.
    """
    prompts = [f"{text_column} : {x}\nLabel : " for x in examples[text_column]]
    model_inputs = tokenizer(prompts)

    batch_input_ids, batch_attention = [], []
    for input_ids, attention_mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"]):
        overflow = len(input_ids) - max_length
        if overflow > 0:
            # Too long: drop leading tokens, keep the end of the prompt.
            input_ids = input_ids[overflow:]
            attention_mask = attention_mask[overflow:]
        else:
            # Too short (or exact): left-pad up to max_length.
            padding = -overflow
            input_ids = [tokenizer.pad_token_id] * padding + input_ids
            attention_mask = [0] * padding + attention_mask

        batch_input_ids.append(torch.tensor(input_ids))
        batch_attention.append(torch.tensor(attention_mask))

    model_inputs["input_ids"] = batch_input_ids
    model_inputs["attention_mask"] = batch_attention
    return model_inputs


# Tokenize the held-out test split with the label-free preprocessing.
test_dataset = dataset["test"].map(
    test_preprocess_function,
    desc="Running tokenizer on dataset",
    load_from_cache_file=False,
    num_proc=1,
    batched=True,
)

# Unshuffled loader over the tokenized test split.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    pin_memory=True,
    collate_fn=default_data_collator,
)
# Bare expression: shows one collated test batch as the cell output.
next(iter(test_dataloader))

## Create the PEFT model, Optimizer and LR Scheduler


In [None]:
# LoRA settings for causal-LM fine-tuning: r=8, lora_alpha=16, dropout 0.1,
# applied to the modules named in target_modules.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "gate_proj",
        "q_proj",
        "lm_head",
        "o_proj",
        "k_proj",
        "embed_tokens",
        "down_proj",
        "up_proj",
        "v_proj",
    ],
)

In [None]:
# Load the base checkpoint, size its embeddings to the tokenizer, wrap it with LoRA.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Store every frozen (requires_grad == False) parameter in fp16;
# trainable parameters keep their dtype.
for param in filter(lambda prm: not prm.requires_grad, model.parameters()):
    param.data = param.to(torch.float16)

In [None]:
# Bare expression: the notebook renders the model's repr as the cell output.
model

## Training

In [None]:
# Output location for the run.
output_dir = "mistral_lora_sentiment"

# Batching: one example per device step, gradients accumulated over 8 steps.
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8

# Optimisation schedule.
learning_rate = 5e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
max_grad_norm = 1.0
num_train_epochs = 10

# Logging cadence.
logging_steps = 5

# Defined here but not passed to TrainingArguments below.
max_steps = 250
max_seq_length = 2048

training_arguments = TrainingArguments(
    output_dir=output_dir,
    # schedule
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=0.1,
    max_grad_norm=max_grad_norm,
    # batching and memory
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    fp16=True,
    # evaluation, checkpointing, logging
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_steps=logging_steps,
    report_to=["tensorboard", "wandb"],
    # Hub upload
    push_to_hub=True,
    hub_private_repo=True,
)

In [None]:
# Build the trainer from the pre-tokenized splits. The packing,
# dataset_text_field and max_seq_length options are intentionally left unset.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run the fine-tuning loop, then save the resulting model through the trainer.
trainer.train()
trainer.save_model()

In [None]:
# IPython shell escape: run `nvidia-smi` to print the current GPU status.
!nvidia-smi

## Inference

In [None]:
# Reload the fine-tuned adapter on top of its base model for inference.
device = "cuda"
peft_model_id = "anurag-kr/mistral_lora_sentiment"

# The adapter config names the base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
# Load the base weights in fp16, then attach the adapter weights.
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
# Generate a label for one test example.
model.to(device)
model.eval()
i = 36
# Read the text from `text_column`, the column the preprocessing functions
# use, so the prompt has the same format the model was trained on.
prompt = f'{text_column} : {test_dataset[i][text_column]}\nLabel : '
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Decoded text contains the prompt followed by the generated continuation.
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])