# LORA Finetuning
## Reference
- [Fine-Tuning Large Language Models (LLMs)](https://github.com/ShawhinT/YouTube-Blog/blob/main/LLMs/fine-tuning/ft-example.ipynb)

In [1]:
!pip install -q peft evaluate

In [2]:
# !pip install --ignore-installed -q datasets==2.15 # upgrade from datasets 2.10

In [3]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

# Parameter-Efficient Fine-Tuning (PEFT)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np



In [4]:
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda:0


## Dataset 

- Refer: [Dataset slice split to get more information about download dataset from HuggingFace](https://huggingface.co/docs/datasets/loading#slice-splits)

In [28]:
split="train"

dataset = load_dataset("imdb", split=split)

# 70% train, 15% test + 15%validation
train_testvalid = dataset.train_test_split(test_size=0.3)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({'train': train_testvalid['train'], 'test': test_valid['test'], 'validation': test_valid['train']})

In [29]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.4997714285714286

## Model

In [30]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokeniser

In [31]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [11]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [32]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

## Evaluation

In [33]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Untrained model performance
- Before training our model, we can evaluate how the base model with a randomly initialized classification head performs on some example inp

In [34]:
# Apply "untrained" model to text

# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(f"{text:40}: {id2label[predictions.tolist()]}")

Untrained model predictions:
----------------------------
It was good.                            : Positive
Not a fan, don't recommed.              : Negative
Better than the first one.              : Positive
This is not worth watching even once.   : Positive
This one is a pass.                     : Negative


## Model Training

In [35]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4, # intrinsic rank of trainable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.01, # probablity of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer only

In [36]:
# new version of our model that can be trained via PEFT.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


- define hyperparameters for model training

In [38]:
# hyperparameters
lr = 1e-3 # size of optimization step 
batch_size = 32 # number of examples processed per optimziation step
num_epochs = 5 # number of times model runs through training data

# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

- How to clear GPU memory without stopping the GPU runtime
- "!nvidia-smi" inside a cell in the notebook, and kill the process id for the GPU like "!kill process_id" run the command
- Tensorflow:

```python
# !pip install numba

from numba import cuda 
device = cuda.get_current_device()
device.reset()
```

- Pytorch:  `torch.cuda.clear_cache`


In [39]:
torch.cuda.empty_cache()

In [40]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sun Dec 31 10:28:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    32W / 250W |   1531MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [41]:
# creater trainer object
trainer = Trainer(
    model=peft_model, # our peft model
    args=training_args, # hyperparameters
    train_dataset=tokenized_dataset["train"], # training data
    eval_dataset=tokenized_dataset["validation"], # validation data
    tokenizer=tokenizer, # define tokenizer
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics, # evaluates model using compute_metrics() function from before
)

# train model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2827,0.229511,{'accuracy': 0.9072}
2,0.2253,0.215059,{'accuracy': 0.9149333333333334}
3,0.1926,0.222459,{'accuracy': 0.9154666666666667}
4,0.1643,0.228699,{'accuracy': 0.9168}
5,0.1423,0.237962,{'accuracy': 0.9149333333333334}


Trainer is attempting to log a value of "{'accuracy': 0.9072}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9149333333333334}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9154666666666667}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9168}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9149333333333334}" of type <class 'dict'> fo

TrainOutput(global_step=2735, training_loss=0.19604631804025152, metrics={'train_runtime': 1919.1478, 'train_samples_per_second': 45.593, 'train_steps_per_second': 1.425, 'total_flos': 1.175572898007168e+16, 'train_loss': 0.19604631804025152, 'epoch': 5.0})

In [45]:
peft_model.to(DEVICE)
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(DEVICE) # moving to mps for Mac (can alternatively do 'cpu')
    print(inputs)
    logits = peft_model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
tensor([[ 101, 2009, 2001, 2204, 1012,  102]], device='cuda:0')
It was good. - Positive
tensor([[  101,  2025,  1037,  5470,  1010,  2123,  1005,  1056, 28667,  5358,
          7583,  1012,   102]], device='cuda:0')
Not a fan, don't recommed. - Negative
tensor([[ 101, 2488, 2084, 1996, 2034, 2028, 1012,  102]], device='cuda:0')
Better than the first one. - Positive
tensor([[ 101, 2023, 2003, 2025, 4276, 3666, 2130, 2320, 1012,  102]],
       device='cuda:0')
This is not worth watching even once. - Positive
tensor([[ 101, 2023, 2028, 2003, 1037, 3413, 1012,  102]], device='cuda:0')
This one is a pass. - Positive
