In [4]:
import numpy as np
import pandas as pd
import torch

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


# Loading The Dataset

In [6]:
from datasets import load_dataset
dataset = load_dataset("dair-ai/emotion")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [7]:
# Labels
label_mapping = {0:"sadness", 1:"joy", 2:"love", 3:"anger", 4:"fear", 5:"surprise"}

## Exploring The Dataset

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

# Loading The Model And Tokenizer

In [11]:
from transformers import AutoTokenizer

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
# Tokenize The Data
tokenized_datasets = {}

for split in dataset.keys():
    tokenized_datasets[split] = dataset[split].map(lambda x: tokenizer(x['text'], truncation=True, padding="max_length"), batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets

{'train': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 16000
 }),
 'validation': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 2000
 }),
 'test': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 2000
 })}

In [14]:
# Load The Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6,
                                 id2label=label_mapping)

# freeze Model Parameters
for param in model.base_model.parameters():
    param.requires_grad = False

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Exploring The Model

In [15]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# PEFT

In [16]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

2024-02-18 10:19:10.490438: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-18 10:19:10.716838: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [18]:
# Training Function

def training(model, tokenizer, datasets, compute_metrics):
    
    training_args = TrainingArguments(
        output_dir='./data',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        load_best_model_at_end=True,
        learning_rate=2e-5,
        evaluation_strategy='epoch',
        save_strategy='epoch')
    
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["test"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics)

In [19]:
# Create Lora Config File
config = LoraConfig(r=10, target_modules=['q_lin', 'k_lin', 'v_lin', 'lin1', 'lin2'], 
                    lora_alpha=16, lora_dropout=0.1, bias="none", 
                    task_type=TaskType.SEQ_CLS)

In [20]:
# Load Model With PEFT Config File
lora_model = get_peft_model(model, config)

In [21]:
lora_model.print_trainable_parameters()

trainable params: 1,332,486 || all params: 68,290,572 || trainable%: 1.9512005258939698


## Evaluate Model Prior To FineTuning

In [22]:
trainer = training(model, tokenizer, tokenized_datasets, compute_metrics)

In [23]:
# Train Without Finetuning
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9059,0.701501,0.7435
2,0.558,0.477719,0.838
3,0.4294,0.386786,0.864
4,0.3824,0.345996,0.874
5,0.3542,0.334149,0.879


TrainOutput(global_step=5000, training_loss=0.5932028839111328, metrics={'train_runtime': 1549.9086, 'train_samples_per_second': 51.616, 'train_steps_per_second': 3.226, 'total_flos': 1.09256196096e+16, 'train_loss': 0.5932028839111328, 'epoch': 5.0})

In [24]:
# Base Model Evaluations
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])

{'eval_loss': 0.3298896849155426,
 'eval_accuracy': 0.8915,
 'eval_runtime': 15.0157,
 'eval_samples_per_second': 133.194,
 'eval_steps_per_second': 8.325,
 'epoch': 5.0}

In [25]:
df = pd.DataFrame(tokenized_datasets["validation"])
df = df[["text", "label"]]
predictions = trainer.predict(tokenized_datasets["validation"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)
df.head(100)

Unnamed: 0,text,label,predicted_label
0,im feeling quite sad and sorry for myself but ...,0,0
1,i feel like i am still looking at a blank canv...,0,0
2,i feel like a faithful servant,2,2
3,i am just feeling cranky and blue,3,3
4,i can have for a treat or if i am feeling festive,1,1
...,...,...,...
95,i feel like i ve been having some issues with ...,1,0
96,im feeling more fucked up than last night,3,3
97,i can feel violent biff whole length is hit by...,3,3
98,i am reading something the saints have written...,2,2


## Train PEFT Model

In [26]:
peft_trainer = training(lora_model, tokenizer, tokenized_datasets, compute_metrics)

In [27]:
# Train
peft_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3124,0.266441,0.9
2,0.2503,0.24676,0.902
3,0.2333,0.234768,0.908
4,0.2255,0.227305,0.91
5,0.2086,0.223209,0.907


Checkpoint destination directory ./data/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/checkpoint-4000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/checkpoint-5000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=5000, training_loss=0.2510279769897461, metrics={'train_runtime': 1553.8289, 'train_samples_per_second': 51.486, 'train_steps_per_second': 3.218, 'total_flos': 1.09256196096e+16, 'train_loss': 0.2510279769897461, 'epoch': 5.0})

In [29]:
lora_model.save_pretrained("lora_model")

In [30]:
# PEFT Model Evaluations

In [31]:
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])

{'eval_loss': 0.21582446992397308,
 'eval_accuracy': 0.925,
 'eval_runtime': 14.7934,
 'eval_samples_per_second': 135.196,
 'eval_steps_per_second': 8.45,
 'epoch': 5.0}

In [32]:
df = pd.DataFrame(tokenized_datasets["validation"])
df = df[["text", "label"]]
predictions = peft_trainer.predict(tokenized_datasets["validation"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)
df.head(100)

Unnamed: 0,text,label,predicted_label
0,im feeling quite sad and sorry for myself but ...,0,0
1,i feel like i am still looking at a blank canv...,0,0
2,i feel like a faithful servant,2,2
3,i am just feeling cranky and blue,3,3
4,i can have for a treat or if i am feeling festive,1,1
...,...,...,...
95,i feel like i ve been having some issues with ...,1,1
96,im feeling more fucked up than last night,3,3
97,i can feel violent biff whole length is hit by...,3,3
98,i am reading something the saints have written...,2,2


# Inference

In [34]:
from peft import AutoPeftModelForSequenceClassification

inference_model = AutoPeftModelForSequenceClassification.from_pretrained("lora_model",  num_labels=6)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
text1 = 'I can\'t believe I saw a shark eat that seagull at the beach today!'

In [36]:
text2 = 'I hate when the bread gets soggy in my sandwiches.'

In [37]:
text3 = 'I love having eggs on toast for breakfast, especially when the yolks are runny.'

In [38]:
def run_inf(text):
    input = tokenizer(text, 
                  padding="max_length", truncation=True, return_tensors="pt")

    logits = inference_model(**input).logits
    predictions = torch.argmax(logits,dim=1).numpy()[0]

    print(f'Tweet: {text}')
    print()
    print(f'Prediction: {predictions} ({label_mapping[predictions]})')

In [39]:
run_inf(text1)

Tweet: I can't believe I saw a shark eat that seagull at the beach today!

Prediction: 5 (surprise)


In [40]:
run_inf(text2)

Tweet: I hate when the bread gets soggy in my sandwiches.

Prediction: 0 (sadness)


In [41]:
run_inf(text3)

Tweet: I love having eggs on toast for breakfast, especially when the yolks are runny.

Prediction: 3 (anger)
