### Fine-tuning
Code example: [ft-example.ipynb](https://github.com/ShawhinT/YouTube-Blog/blob/main/LLMs/fine-tuning/ft-example.ipynb)

In [75]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np


In [3]:
# Load the `shawhin/imdb-truncated` dataset and display its splits.
dataset = load_dataset("shawhin/imdb-truncated")
dataset

Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

Model

[Model card: distilbert-base-uncased](https://huggingface.co/distilbert/distilbert-base-uncased)

In [7]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [8]:
model_name = "distilbert-base-uncased"

# Class-index <-> label-name mappings passed to the model.
id2label = {0: "Negative", 1: "Positive"}
label2id = {label: idx for idx, label in id2label.items()}

# Load the pretrained checkpoint with a 2-class sequence-classification head,
# then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Display the model architecture.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [11]:
# Load the tokenizer for the `model_name` checkpoint.
# NOTE(review): `add_prefix_space` is normally a byte-level BPE tokenizer option;
# it presumably has no effect for this checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [12]:
# Make sure the tokenizer has a padding token before batches are padded.
if tokenizer.pad_token is None:
    # The tokenizer has no pad token, so one must be added.
    tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 
    # The model's token embeddings must also be resized to the new tokenizer length.
    model.resize_token_embeddings(len(tokenizer))

Tokenization function

Remember how the training dataset is displayed:

dataset

In [18]:
# Display the dataset splits again as a reminder of their structure.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [17]:
# Raw text of the first validation example (fetch the row, then its "text" field).
dataset["validation"][0]["text"]

'Disgused as an Asian Horror, "A Tale Of Two Sisters" is actually a complex character driven psychological drama, that engulfs the viewer into the problems of a seemingly normal family. I was really surprised at the depth of this movie. Director Ji-woon Kim\'s decision to focus more on telling a story rather than providing cheap scares, has proved a correct one. Creating one of the most ingenious new horror movies.<br /><br />"A Tale Of Two Sisters" tels the story, as it\'s name suggest of two sisters Su-mi and the younger Su-yeon, who after spending time in a mental institution return home to their father and apparently abusive stepmother. From then on we witness how the sisters deal with their stepmother\'s gradually rising aggression and erratic behavior. To say what would happen next would be to be spoil the entire experience. So I\'ll just leave it at that.<br /><br />The plot is very tightly written. With the characters nicely fleshed out. Ji-woon Kim\'s focus on a small cast off

In [41]:
def tokenizer_func(example: dict):
    """Tokenize a batch of examples for `Dataset.map(..., batched=True)`.

    Args:
        example: Batch dictionary with a "text" entry holding the raw texts.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text. Sequences are intentionally left unpadded: padding is applied
        per training batch by the data collator, so short texts are not padded
        up to the longest text of the whole `map` batch.
    """
    text = example["text"]

    # Truncate from the left so the end of an over-long text is kept.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,  # Texts of the current batch
        truncation=True,
        max_length=512,  # Matches the model's 512 position embeddings
    )
    return tokenized_inputs

In [42]:
# Apply `tokenizer_func` to every split in batches and display the resulting features.
tokenized_dataset = dataset.map(tokenizer_func, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [45]:
# Token ids of the first validation example (fetch the row, then its "input_ids" field).
example = tokenized_dataset["validation"][0]["input_ids"]
example

[101,
 21007,
 2015,
 1996,
 13972,
 2046,
 1996,
 3471,
 1997,
 1037,
 9428,
 3671,
 2155,
 1012,
 1045,
 2001,
 2428,
 4527,
 2012,
 1996,
 5995,
 1997,
 2023,
 3185,
 1012,
 2472,
 10147,
 1011,
 15854,
 2078,
 5035,
 1005,
 1055,
 3247,
 2000,
 3579,
 2062,
 2006,
 4129,
 1037,
 2466,
 2738,
 2084,
 4346,
 10036,
 29421,
 1010,
 2038,
 4928,
 1037,
 6149,
 2028,
 1012,
 4526,
 2028,
 1997,
 1996,
 2087,
 13749,
 18595,
 3560,
 2047,
 5469,
 5691,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 1000,
 1037,
 6925,
 1997,
 2048,
 5208,
 1000,
 10093,
 2015,
 1996,
 2466,
 1010,
 2004,
 2009,
 1005,
 1055,
 2171,
 6592,
 1997,
 2048,
 5208,
 10514,
 1011,
 2771,
 1998,
 1996,
 3920,
 10514,
 1011,
 6300,
 2239,
 1010,
 2040,
 2044,
 5938,
 2051,
 1999,
 1037,
 5177,
 5145,
 2709,
 2188,
 2000,
 2037,
 2269,
 1998,
 4593,
 20676,
 26959,
 1012,
 2013,
 2059,
 2006,
 2057,
 7409,
 2129,
 1996,
 5208,
 3066,
 2007,
 2037,
 26959,
 1005,
 1055,
 6360,
 4803,
 14974,
 1998,

In [47]:
# Check the Python type of a single stored token id.
type(example[0])

int

OK, we now have our dataset tokenized, together with the attention masks.

Collator to help us with dynamic padding

In [66]:
# Collator that pads the examples of each batch using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Evaluation

In [25]:
# Load the accuracy metric used by `compute_metrics`.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [101]:
# Other metrics (e.g. F1) can be added here if needed.
def compute_metrics(p: tuple) -> dict:
    """Compute accuracy from a (predictions, labels) pair.

    Args:
        p: Pair of model outputs and reference labels; the predicted class is
            the argmax of the outputs along axis 1.

    Returns:
        The result of `accuracy.compute`, returned unchanged (no extra
        wrapping dictionary).
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

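We are going to test the model before fine-tuning. The classification head is still randomly initialized, so these predictions are not meaningful yet.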

In [56]:
# Hand-written reviews used to sanity-check the model's predictions.
text_list = ["Horrible pelicula", "Excelente me gusto mucho",
             "This was a error", "This is a hard pass"]

# Inference only: eval mode turns off the dropout layers and no_grad avoids
# building an autograd graph for every forward pass.
model.eval()
with torch.no_grad():
    for text in text_list:
        # Encode one review as a single-sequence batch on the model's device.
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)
        logits = model(inputs).logits
        # Only one sequence per batch, so the flattened argmax is the class id.
        prediction = torch.argmax(logits)
        print(text + "-" + id2label[prediction.tolist()])

Horrible pelicula-Positive
Excelente me gusto mucho-Positive
This was a error-Positive
This is a hard pass-Positive


Train model

In [57]:
# LoRA configuration for the fine-tuning run.
peft_config = LoraConfig(
    task_type="SEQ_CLS",        # The task we want the model to perform (sequence classification)
    r=4,                        # LoRA rank
    lora_alpha=32,              # LoRA scaling parameter
    lora_dropout=0.01,          # Dropout applied in the LoRA layers
    target_modules=["q_lin"],   # Adapt only the attention query projections
)


In [58]:
# Display the full LoRA configuration, including values left at their defaults.
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

Wrap the model with the LoRA adapters to start fine-tuning

In [64]:
# Wrap the base model with the LoRA adapters exactly once. The guard keeps this
# cell idempotent: re-running it would otherwise wrap an already wrapped model.
# To apply a changed `peft_config`, reload the base model first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config).to(device)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [60]:
# Training hyperparameters consumed by `TrainingArguments`.
# NOTE(review): the training output recorded in this notebook ends at epoch 10.0
# (2500 steps), which does not match num_epochs = 20 — this value was presumably
# edited after the last run; re-run the notebook or confirm the intended value.
lr = 1e-3
batch_size = 4
num_epochs = 20

In [79]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="../models/" + model_name + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",  # Must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
    # Enable bf16 mixed precision only where the hardware supports it, so the
    # notebook also runs on CPU-only machines and GPUs without bf16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

In [102]:
# Build the Trainer from the LoRA-wrapped model, the tokenized train/validation
# splits, the padding collator and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

In [103]:
# Run fine-tuning; evaluation metrics are logged at the end of every epoch.
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.3868778944015503, 'eval_accuracy': 0.87, 'eval_runtime': 2.4688, 'eval_samples_per_second': 405.048, 'eval_steps_per_second': 101.262, 'epoch': 1.0}
{'loss': 0.1898, 'grad_norm': 1.4144700765609741, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.3392442464828491, 'eval_accuracy': 0.864, 'eval_runtime': 2.461, 'eval_samples_per_second': 406.332, 'eval_steps_per_second': 101.583, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.3182048797607422, 'eval_accuracy': 0.866, 'eval_runtime': 2.4653, 'eval_samples_per_second': 405.636, 'eval_steps_per_second': 101.409, 'epoch': 3.0}
{'loss': 0.192, 'grad_norm': 3.7050577361696924e-07, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.4299923181533813, 'eval_accuracy': 0.866, 'eval_runtime': 2.4782, 'eval_samples_per_second': 403.514, 'eval_steps_per_second': 100.878, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.6296740770339966, 'eval_accuracy': 0.871, 'eval_runtime': 2.5042, 'eval_samples_per_second': 399.337, 'eval_steps_per_second': 99.834, 'epoch': 5.0}
{'loss': 0.109, 'grad_norm': 0.0002020986139541492, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.6260613203048706, 'eval_accuracy': 0.873, 'eval_runtime': 2.4915, 'eval_samples_per_second': 401.366, 'eval_steps_per_second': 100.341, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.8962013721466064, 'eval_accuracy': 0.867, 'eval_runtime': 2.4873, 'eval_samples_per_second': 402.05, 'eval_steps_per_second': 100.513, 'epoch': 7.0}
{'loss': 0.033, 'grad_norm': 0.0015833841171115637, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.8780865669250488, 'eval_accuracy': 0.867, 'eval_runtime': 2.4873, 'eval_samples_per_second': 402.05, 'eval_steps_per_second': 100.513, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.8227405548095703, 'eval_accuracy': 0.868, 'eval_runtime': 2.4787, 'eval_samples_per_second': 403.44, 'eval_steps_per_second': 100.86, 'epoch': 9.0}
{'loss': 0.0212, 'grad_norm': 0.05187961831688881, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.7314329147338867, 'eval_accuracy': 0.872, 'eval_runtime': 2.5015, 'eval_samples_per_second': 399.759, 'eval_steps_per_second': 99.94, 'epoch': 10.0}
{'train_runtime': 82.4113, 'train_samples_per_second': 121.343, 'train_steps_per_second': 30.336, 'train_loss': 0.10900678215026856, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.10900678215026856, metrics={'train_runtime': 82.4113, 'train_samples_per_second': 121.343, 'train_steps_per_second': 30.336, 'total_flos': 1343996682240000.0, 'train_loss': 0.10900678215026856, 'epoch': 10.0})

In [107]:
# Hand-written reviews (Spanish and English) used to check the fine-tuned model.
text_list = [
    "Horrible pelicula",
    "Excelente me gusto mucho",
    "Nunca vi algo tan horrible",
    "De las mejores peliculas que e visto",
    "This was a error",
    "This is a hard pass",
    "It was good.",
    "Not a fan, don't recommed.",
    "Better than the first one.",
    "This is not worth watching even once.",
    "This one is a pass.",
]

# Inference only: eval mode turns off the dropout layers and no_grad avoids
# building an autograd graph for every forward pass.
model.eval()
with torch.no_grad():
    for text in text_list:
        # Encode one review as a single-sequence batch on the model's device.
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)
        logits = model(inputs).logits
        # Only one sequence per batch, so the flattened argmax is the class id.
        prediction = torch.argmax(logits)
        print(text + "-" + id2label[prediction.tolist()])

Horrible pelicula-Negative
Excelente me gusto mucho-Positive
Nunca vi algo tan horrible-Negative
De las mejores peliculas que e visto-Positive
This was a error-Negative
This is a hard pass-Positive
It was good.-Positive
Not a fan, don't recommed.-Negative
Better than the first one.-Positive
This is not worth watching even once.-Negative
This one is a pass.-Positive
