In [1]:
!pip install transformers
!pip install datasets
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

In [17]:
emotion_dataset = load_dataset('emotion')

print(emotion_dataset['train']['text'][0:10])

#This dataset contains labeled data for our emotions
print(emotion_dataset['train'].features)



  0%|          | 0/3 [00:00<?, ?it/s]

['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'i am feeling grouchy', 'ive been feeling a little burdened lately wasnt sure why that was', 'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny', 'i feel as confused about life as a teenager or as jaded as a year old man', 'i have been with petronas for years i feel that petronas has performed well and made a huge profit', 'i feel romantic too']
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}


We have the class labels, but we don't know yet, what actual labels our labels (represented with numbers) belong to. We can easily discover this

In [18]:
features = emotion_dataset['train'].features


#Let's do a mapping to all at once to get the label names associated for the int values
id2label = {id: features['label'].int2str(id) for id in range(6)}

label2id = {v:k for k, v in id2label.items()}

print(id2label)
print(label2id)

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}


We want to see whether our dataset is balanced or not. Let's convert our dataset into a pandas dataframe and check the labels frequency.

In [19]:
emotion_df = emotion_dataset['train'].to_pandas()

emotion_df['label'].value_counts(normalize=True).sort_index()

0    0.291625
1    0.335125
2    0.081500
3    0.134937
4    0.121063
5    0.035750
Name: label, dtype: float64

We can see that our dataset is pretty imbalanced. We have the highest frequency at our label 0 corresponding to sadness, but the label surprise has a very low frequency. We can ensure some balance, by duplicating the records in the 5th class, making sure we are having an even distribution. But the disadvantage is that if we do that, the model will memorize these duplicates and not going to generalize well. We have to modify the loss function of the model during the training, introducing a bias at the level of loss function.

## Tokenization

In [20]:
model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
#print(emotion_dataset['train']['text'][0:10])

['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'i am feeling grouchy', 'ive been feeling a little burdened lately wasnt sure why that was', 'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny', 'i feel as confused about life as a teenager or as jaded as a year old man', 'i have been with petronas for years i feel that petronas has performed well and made a huge profit', 'i feel romantic too']


In [21]:
#Applying tokenize function for all other texts
def tokenize_text(example):
  #every transformer has a maximum sequence size
  return tokenizer(example['text'], truncation=True, max_length=512)

emotion_dataset = emotion_dataset.map(tokenize_text, batched=True)





Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



## Working with imbalanced classes

We have to introduce some weights or coefficients for the loss function, which will multiply each one of the classes by an amount reflected in the data. If we want these coefficients to range from 0 to 1, then we might assign some higher weights to the rare classes and a lower weight to the common weights. That is how the model doesn't get biased.

In [22]:
import numpy as np

class_weights = np.array(1 - emotion_df['label'].value_counts(normalize=True).sort_index())
class_weights

array([0.708375 , 0.664875 , 0.9185   , 0.8650625, 0.8789375, 0.96425  ])

In [23]:
import torch

#All the weights should be as torch tensors, since the Trainer API is based on torch tensors
class_weights = torch.from_numpy(class_weights).float().to("cuda")

In [24]:
#Renaming the column label to labels for our model to recognize
emotion_dataset = emotion_dataset.rename_column("label","labels")

## Building the Weighted Loss function

In [25]:
from torch import nn
from transformers import Trainer, TrainingArguments

class WeightedLoss(Trainer):

  def compute_loss(self, model, inputs, return_outputs=False):

    outputs = model(**inputs)
    #Get the logits
    logits = outputs.logits
    #The labels
    labels = inputs.get('labels')
    #Apply the weights to our loss function
    loss_func = nn.CrossEntropyLoss(class_weights)
    #Calculate loss
    loss = loss_func(logits, labels)

    return (loss, outputs) if return_outputs else loss

In [26]:
#Instantiate the model
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                           num_labels=6,
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [27]:
#Metrics we have chosen is the f1 score
from sklearn.metrics import f1_score

def compute_metrics(pred):

  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average = "weighted")
  return {"f1":f1}

In [28]:
#initialize the  hyperparameters

batch_size = 32
#number of training steps
num_train_epochs = 3
num_of_samples = len(emotion_dataset['train'])
logging_steps = (num_train_epochs * num_of_samples) / batch_size

training_arguments = TrainingArguments('test-trainer',
                                       num_train_epochs= num_train_epochs,
                                       evaluation_strategy = "epoch",
                                       per_device_train_batch_size = batch_size,
                                       per_device_eval_batch_size = batch_size,
                                       logging_steps = logging_steps,
                                       weight_decay = 0.01,
                                       learning_rate = 2e-5,
                                       )

In [29]:
#Initializing the WeightedTrainer
weighted_trainer = WeightedLoss(model,
                                   training_arguments,
                                   train_dataset = emotion_dataset['train'],
                                   eval_dataset = emotion_dataset['validation'],
                                   tokenizer = tokenizer,
                                   compute_metrics = compute_metrics,
                                   )

In [30]:
weighted_trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.235065,0.918002
2,No log,0.14706,0.936465
3,0.319400,0.146839,0.937321


TrainOutput(global_step=1500, training_loss=0.31944307454427084, metrics={'train_runtime': 460.3286, 'train_samples_per_second': 104.273, 'train_steps_per_second': 3.259, 'total_flos': 1289931829907328.0, 'train_loss': 0.31944307454427084, 'epoch': 3.0})