# **Analytics 2 :** <font color=#DF4807>**Transformers**</font>

In [2]:
!pip install datasets --quiet
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
emotions = load_dataset("SetFit/emotion")
emotions_train = emotions['train']

Using the latest cached version of the dataset since SetFit/emotion couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Poorn\.cache\huggingface\datasets\SetFit___emotion\default\0.0.0\6c362e04d016f6b6a9377e85c3b944140f0b96c9 (last modified on Fri May  3 13:05:01 2024).


In [4]:
# emotions.set_format(type='pandas')
# df = emotions['train'][:]
# df.head(10)

## **Tasks**

1. Fine tune a classification model using the dataset provided.
2. Test your trained model on unseen data.
3. If you are having difficulties with the multiclass classification, you can tweak the labels into 2 i.e. positive or negative to simplify.

In [56]:
pip install  -q scikit-learn 


Note: you may need to restart the kernel to use updated packages.


In [57]:
#install requirements

!pip install transformers datasets torch  -q gwpy
!pip install sentencepiece -q gwpy

from datasets import load_dataset
from datasets import load_metric #for access to metrics like F1 score, precision, recall...

from transformers import pipeline
from transformers import DataCollatorWithPadding #help batch data and pad them to a common lenght

from transformers import AutoTokenizer #generic tokenizer that we can set to match a pre-trained model
from transformers import AutoModelForQuestionAnswering, DistilBertConfig #AutoModel class plus bert config
from transformers import Trainer, TrainingArguments #used for fine tuning a model

from sklearn.metrics import f1_score
import torch

In [58]:

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [60]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

In [61]:
tokenized_emotions = emotions.map(preprocess_function, batched=True)


[AAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

Map: 100%|██████████| 16000/16000 [00:00<00:00, 77974.08 examples/s]

Map: 100%|██████████| 2000/2000 [00:00<00:00, 71367.51 examples/s]

Map: 100%|██████████| 2000/2000 [00:00<00:00, 74109.55 examples/s]


In [62]:
tokenized_emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [63]:
# tokenized_test = emotions['test'].map(preprocess_function, batched=True)

# tokenized_train = emotions['train'].map(preprocess_function, batched=True)

In [64]:
# tokenized_train

In [65]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,padding=True)

In [66]:
id2label = {0: "sadness",1:'joy', 2: "love",3:'anger',4:'fear',5:'surprise'}

label2id = {"sadness": 0, "joy": 1,"love":2,"anger":3,"fear":4,"surprise":5}

In [67]:
from transformers import AutoModelForSequenceClassification #automodel class provided by HF, allows us to select models for classification tasks
#NOTE: sequence classification tasks include sentiment analysis, text categorization, and natural language inference.
from transformers import TrainingArguments, Trainer #classes that we need for fine tuning

In [68]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [69]:

model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=6,id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
pip install accelerate -U --quiet

Note: you may need to restart the kernel to use updated packages.


In [71]:
pip install transformers[torch] --quiet


Note: you may need to restart the kernel to use updated packages.


In [72]:
import os
os.environ['PYDEVD_USE_FRAME_EVAL'] = 'NO'


In [73]:
training_args = TrainingArguments(
    output_dir='./results',           # output directory where the model predictions and checkpoints will be written.
    learning_rate=2e-5,               # learning rate
    evaluation_strategy = "epoch",    # you can either define epoch or steps for your training cycles
    per_device_train_batch_size=16,   # batch size per GPU/TPU core/CPU for training
    per_device_eval_batch_size=16,    # batch size per GPU/TPU core/CPU for evaluation
    num_train_epochs=10,               # total number of training epochs
    weight_decay=0.01,  
    
                  # strength of weight decay (regularisation technique to reduce overfitting), higher values of weight_decay result in stronger regularization
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_emotions['train'],
    eval_dataset=tokenized_emotions['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()

 20%|██        | 4042/20000 [18:37<1:13:30,  3.62it/s]
  5%|▍         | 499/10000 [00:30<08:07, 19.48it/s]
  5%|▌         | 500/10000 [00:30<08:07, 19.48it/s] 

{'loss': 0.7169, 'grad_norm': 3.7223079204559326, 'learning_rate': 1.9e-05, 'epoch': 0.5}


 10%|▉         | 999/10000 [00:58<08:11, 18.33it/s]
 10%|█         | 1000/10000 [00:58<08:11, 18.33it/s]

{'loss': 0.2543, 'grad_norm': 0.8517903089523315, 'learning_rate': 1.8e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 10%|█         | 1000/10000 [01:02<08:11, 18.33it/s]
 10%|█         | 1001/10000 [01:02<1:33:24,  1.61it/s]

{'eval_loss': 0.21984118223190308, 'eval_runtime': 2.8708, 'eval_samples_per_second': 696.666, 'eval_steps_per_second': 43.542, 'epoch': 1.0}


 15%|█▌        | 1500/10000 [01:31<07:16, 19.47it/s]  
 15%|█▌        | 1500/10000 [01:31<07:16, 19.47it/s]

{'loss': 0.1547, 'grad_norm': 0.4876151382923126, 'learning_rate': 1.7e-05, 'epoch': 1.5}


 20%|█▉        | 1998/10000 [02:00<07:07, 18.70it/s]
 20%|██        | 2000/10000 [02:00<07:07, 18.70it/s]

{'loss': 0.1589, 'grad_norm': 17.280487060546875, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 20%|██        | 2000/10000 [02:04<07:07, 18.70it/s]
 20%|██        | 2001/10000 [02:04<1:05:19,  2.04it/s]

{'eval_loss': 0.1985936313867569, 'eval_runtime': 2.7654, 'eval_samples_per_second': 723.213, 'eval_steps_per_second': 45.201, 'epoch': 2.0}


 25%|██▌       | 2500/10000 [02:32<06:34, 19.00it/s]  
 25%|██▌       | 2500/10000 [02:32<06:34, 19.00it/s]

{'loss': 0.1214, 'grad_norm': 0.025412963703274727, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.5}


 30%|███       | 3000/10000 [03:01<06:05, 19.14it/s]
 30%|███       | 3000/10000 [03:01<06:05, 19.14it/s]

{'loss': 0.1156, 'grad_norm': 0.23864758014678955, 'learning_rate': 1.4e-05, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 30%|███       | 3000/10000 [03:05<06:05, 19.14it/s]
 30%|███       | 3002/10000 [03:05<1:09:44,  1.67it/s]

{'eval_loss': 0.1918976902961731, 'eval_runtime': 2.808, 'eval_samples_per_second': 712.247, 'eval_steps_per_second': 44.515, 'epoch': 3.0}


 35%|███▍      | 3499/10000 [03:33<05:46, 18.76it/s]  
 35%|███▌      | 3500/10000 [03:33<05:46, 18.76it/s]

{'loss': 0.0896, 'grad_norm': 8.426790237426758, 'learning_rate': 1.3000000000000001e-05, 'epoch': 3.5}


 40%|███▉      | 3999/10000 [04:03<05:10, 19.30it/s]
 40%|████      | 4000/10000 [04:03<05:10, 19.30it/s]

{'loss': 0.0943, 'grad_norm': 1.8521944284439087, 'learning_rate': 1.2e-05, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 40%|████      | 4000/10000 [04:06<05:10, 19.30it/s]
 40%|████      | 4001/10000 [04:06<57:07,  1.75it/s]

{'eval_loss': 0.2399238795042038, 'eval_runtime': 2.8312, 'eval_samples_per_second': 706.414, 'eval_steps_per_second': 44.151, 'epoch': 4.0}


 45%|████▍     | 4499/10000 [04:34<04:47, 19.13it/s]
 45%|████▌     | 4500/10000 [04:34<04:47, 19.13it/s]

{'loss': 0.0655, 'grad_norm': 5.214418888092041, 'learning_rate': 1.1000000000000001e-05, 'epoch': 4.5}


 50%|█████     | 5000/10000 [05:04<04:21, 19.11it/s]
 50%|█████     | 5000/10000 [05:04<04:21, 19.11it/s]

{'loss': 0.0684, 'grad_norm': 26.923316955566406, 'learning_rate': 1e-05, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 50%|█████     | 5000/10000 [05:07<04:21, 19.11it/s]
 50%|█████     | 5002/10000 [05:07<47:39,  1.75it/s]

{'eval_loss': 0.27388060092926025, 'eval_runtime': 2.8298, 'eval_samples_per_second': 706.752, 'eval_steps_per_second': 44.172, 'epoch': 5.0}


 55%|█████▌    | 5500/10000 [05:35<03:56, 19.04it/s]
 55%|█████▌    | 5500/10000 [05:35<03:56, 19.04it/s]

{'loss': 0.0511, 'grad_norm': 67.83209991455078, 'learning_rate': 9e-06, 'epoch': 5.5}


 60%|█████▉    | 5999/10000 [06:05<03:31, 18.88it/s]
 60%|██████    | 6000/10000 [06:05<03:31, 18.88it/s]

{'loss': 0.055, 'grad_norm': 0.8436415791511536, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 60%|██████    | 6000/10000 [06:08<03:31, 18.88it/s]
 60%|██████    | 6001/10000 [06:09<39:01,  1.71it/s]

{'eval_loss': 0.30287256836891174, 'eval_runtime': 2.8713, 'eval_samples_per_second': 696.539, 'eval_steps_per_second': 43.534, 'epoch': 6.0}


 65%|██████▌   | 6500/10000 [06:37<03:05, 18.88it/s]
 65%|██████▌   | 6500/10000 [06:37<03:05, 18.88it/s]

{'loss': 0.0347, 'grad_norm': 0.09074461460113525, 'learning_rate': 7e-06, 'epoch': 6.5}


 70%|██████▉   | 6999/10000 [07:06<02:35, 19.31it/s]
 70%|███████   | 7000/10000 [07:06<02:35, 19.31it/s]

{'loss': 0.0315, 'grad_norm': 0.003914815839380026, 'learning_rate': 6e-06, 'epoch': 7.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 70%|███████   | 7000/10000 [07:10<02:35, 19.31it/s]
 70%|███████   | 7001/10000 [07:10<28:54,  1.73it/s]

{'eval_loss': 0.3786075711250305, 'eval_runtime': 2.8448, 'eval_samples_per_second': 703.049, 'eval_steps_per_second': 43.941, 'epoch': 7.0}


 75%|███████▍  | 7499/10000 [07:38<02:09, 19.35it/s]
 75%|███████▌  | 7500/10000 [07:38<02:09, 19.35it/s]

{'loss': 0.0303, 'grad_norm': 0.20619796216487885, 'learning_rate': 5e-06, 'epoch': 7.5}


 80%|████████  | 8000/10000 [08:08<01:46, 18.85it/s]
 80%|████████  | 8000/10000 [08:08<01:46, 18.85it/s]

{'loss': 0.0247, 'grad_norm': 0.06029238924384117, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 80%|████████  | 8000/10000 [08:12<01:46, 18.85it/s]
 80%|████████  | 8002/10000 [08:12<20:14,  1.64it/s]

{'eval_loss': 0.3982737362384796, 'eval_runtime': 2.8315, 'eval_samples_per_second': 706.329, 'eval_steps_per_second': 44.146, 'epoch': 8.0}


 85%|████████▌ | 8500/10000 [08:40<01:18, 19.07it/s]
 85%|████████▌ | 8500/10000 [08:40<01:18, 19.07it/s]

{'loss': 0.0179, 'grad_norm': 0.0023340994957834482, 'learning_rate': 3e-06, 'epoch': 8.5}


 90%|████████▉ | 8999/10000 [09:09<00:52, 19.17it/s]
 90%|█████████ | 9000/10000 [09:09<00:52, 19.17it/s]

{'loss': 0.0145, 'grad_norm': 7.467386722564697, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                    
 90%|█████████ | 9000/10000 [09:13<00:52, 19.17it/s]
 90%|█████████ | 9001/10000 [09:13<10:04,  1.65it/s]

{'eval_loss': 0.42730310559272766, 'eval_runtime': 2.8616, 'eval_samples_per_second': 698.908, 'eval_steps_per_second': 43.682, 'epoch': 9.0}


 95%|█████████▍| 9498/10000 [09:41<00:25, 19.33it/s]
 95%|█████████▌| 9500/10000 [09:41<00:25, 19.33it/s]

{'loss': 0.012, 'grad_norm': 0.04790203645825386, 'learning_rate': 1.0000000000000002e-06, 'epoch': 9.5}


100%|█████████▉| 9999/10000 [10:11<00:00, 18.78it/s]
100%|██████████| 10000/10000 [10:11<00:00, 18.78it/s]

{'loss': 0.0142, 'grad_norm': 0.003499669022858143, 'learning_rate': 0.0, 'epoch': 10.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                           
                                                     
100%|██████████| 10000/10000 [10:14<00:00, 18.78it/s]
[A
100%|██████████| 10000/10000 [10:14<00:00, 16.26it/s]

{'eval_loss': 0.446134090423584, 'eval_runtime': 2.8087, 'eval_samples_per_second': 712.07, 'eval_steps_per_second': 44.504, 'epoch': 10.0}
{'train_runtime': 614.8484, 'train_samples_per_second': 260.227, 'train_steps_per_second': 16.264, 'train_loss': 0.10627782917022705, 'epoch': 10.0}





TrainOutput(global_step=10000, training_loss=0.10627782917022705, metrics={'train_runtime': 614.8484, 'train_samples_per_second': 260.227, 'train_steps_per_second': 16.264, 'total_flos': 1945612950228864.0, 'train_loss': 0.10627782917022705, 'epoch': 10.0})

In [74]:
from transformers import TextClassificationPipeline

myClassifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

myClassifier("i didnt feel humiliated")

[{'label': 'sadness', 'score': 0.9999728202819824}]

In [87]:
text = emotions['test'][20]['text']
print(text)

im not sure the feeling of loss will ever go away but it may dull to a sweet feeling of nostalgia at what i shared in this life with my dad and the luck i had to have a dad for years


In [88]:
myClassifier(text)

[{'label': 'sadness', 'score': 0.9999444484710693}]

In [90]:

emotions['test'][20]['label_text']

'sadness'