In [None]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, set_seed

In [16]:
random_seed = 42
model_checkpoint = 'distilbert-base-uncased'
dataset_name = "sem_eval_2018_task_1"

# Seed the Python, NumPy and PyTorch RNGs up front so that the randomly
# initialised classification head (created when the model is loaded) and the
# training run are reproducible on Restart & Run All.
set_seed(random_seed)

# Get the data

In [5]:
# Fetch the "subtask5.english" configuration of the dataset (all splits)
dataset = load_dataset(dataset_name, "subtask5.english")

# Every column except the tweet ID and the tweet text is a label column
labels = [
    column
    for column in dataset['train'].features
    if column not in ('ID', 'Tweet')
]

In [12]:
# Two-way lookup between class indices and label names
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

## Tokenization

In [17]:
# Load the tokenizer published with `model_checkpoint`, so tweets are encoded
# with the same vocabulary the pretrained model was trained on
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_data(examples):
    """
    Tokenize a batch of tweets and attach their multi-hot label vectors.

    Keyword arguments:
    examples -- batch of dataset rows, as a mapping from column name to a list of values

    Returns the tokenizer encoding with an extra "labels" entry: one list per
    tweet holding one float per label, in the order of the module-level `labels`.
    """
    tweets = examples["Tweet"]

    # Pad/truncate every tweet to exactly 128 tokens
    encoding = tokenizer(tweets, padding="max_length", truncation=True, max_length=128)

    # One row per tweet, one column per label
    label_matrix = np.zeros((len(tweets), len(labels)))
    for column, name in enumerate(labels):
        label_matrix[:, column] = examples[name]

    encoding["labels"] = label_matrix.tolist()
    return encoding

In [7]:
# Tokenize every split in batches; the original columns (ID, Tweet and the
# per-label columns) are dropped so only what preprocess_data returns remains
dataset_tokenized = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)


Loading cached processed dataset at /Users/Amiros/.cache/huggingface/datasets/sem_eval_2018_task_1/subtask5.english/1.1.0/a7c0de8b805f1988b118882fb289ccfbbeb9085c7820b6f046b5887e234af182/cache-308da669712af75e.arrow
Loading cached processed dataset at /Users/Amiros/.cache/huggingface/datasets/sem_eval_2018_task_1/subtask5.english/1.1.0/a7c0de8b805f1988b118882fb289ccfbbeb9085c7820b6f046b5887e234af182/cache-e56b686f5b41d255.arrow
Loading cached processed dataset at /Users/Amiros/.cache/huggingface/datasets/sem_eval_2018_task_1/subtask5.english/1.1.0/a7c0de8b805f1988b118882fb289ccfbbeb9085c7820b6f046b5887e234af182/cache-f412a5b21a4d7c16.arrow


In [10]:
# Return PyTorch tensors when rows/columns are accessed.
# NOTE: set_format modifies `dataset_tokenized` in place.
dataset_tokenized.set_format("torch")

# Display the resulting splits and their columns
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 886
    })
})

# Build the Model

In [33]:
import torch
from transformers import (AutoModelForSequenceClassification,
                          Trainer, 
                          TrainingArguments
                         )
from transformers import EvalPrediction
from sklearn.metrics import  f1_score, classification_report

In [11]:
# Setting the hyperparameters, relying on the suggested values from https://arxiv.org/abs/1905.05583
epochs = 5              # number of passes over the training split
train_batch_size = 16   # per-device batch size during training
eval_batch_size = 16    # per-device batch size during evaluation/prediction
warmup_steps = 500      # learning-rate warm-up steps
learning_rate = 5e-5    # learning rate passed to TrainingArguments


We have to set `problem_type` to `multi_label_classification`. This makes sure the appropriate loss function, [`BCEWithLogitsLoss`](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html), is used. `BCEWithLogitsLoss` applies an independent sigmoid to each logit (one per label) and then computes the binary cross-entropy for every label.

## Get the base model

multi_label_classification

In [18]:
# Download the pretrained checkpoint from the model hub and put a sequence
# classification head on top (the head weights are newly initialised).
# - problem_type selects the multi-label setup
# - num_labels sizes the output layer to one logit per label
# - id2label / label2id are stored in the model config
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    problem_type="multi_label_classification", 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

## Fine-tuning configuration

I will freeze the DistilBERT encoder layers and train only the classification head on top (the `pre_classifier` and `classifier` dense layers), as the first layers encode universal features while the top layers are more task-specific. This also speeds up training and reduces memory usage during training.



In [19]:
# Freeze every parameter of the DistilBERT encoder to speed up fine-tuning
model.distilbert.requires_grad_(False)

# List the parameters that still require gradients (i.e. are trainable)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

pre_classifier.weight
pre_classifier.bias
classifier.weight
classifier.bias


## Define evaluation metric


Since we are dealing with imbalanced data, I will use a `weighted-averaged F1` score. The weighted-averaged F1 score is calculated by averaging the per-class F1 scores, with each class weighted by its support (the number of actual occurrences of the class in the dataset).



In [23]:
# compute metrics function for multi-label classification
def multi_label_metrics(predictions, labels=labels, threshold=0.5):
    """
    Weighted-average F1 score for multi-label predictions (used by the Trainer).

    The steps taken in this function are:
        1. Turn the logits into per-label probabilities with a sigmoid
        2. Binarise the probabilities with `threshold`
        3. Score the binary predictions against the true labels with a
           support-weighted F1

    Keyword arguments:
    predictions -- logits from the model, of shape (batch_size, num_labels)
    labels -- true labels in the same layout as `predictions`. The default
              (the module-level list of label names) is kept only for
              backward compatibility and is not a valid target; always pass
              the label matrix explicitly, as `compute_metrics` does.
    threshold -- probability cut-off (float) at or above which a label counts
                 as predicted, default value is 0.5

    Returns a dict with the single key 'f1_weighted'.
    """
    # sigmoid is applied element-wise, so every label gets its own probability
    probs = torch.sigmoid(torch.Tensor(predictions))
    # 1.0 where the probability reaches the threshold, 0.0 elsewhere
    y_pred = (probs >= threshold).numpy().astype(np.float64)

    f1_average = f1_score(y_true=labels, y_pred=y_pred, average='weighted')

    return {'f1_weighted': f1_average}

def compute_metrics(p: EvalPrediction):
    """
    Adapter between the Trainer's EvalPrediction and multi_label_metrics.

    When the predictions come as a tuple, only its first element is scored.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

## Define loss function

Since we are dealing with imbalanced data, I will override the default loss function so that `BCEWithLogitsLoss` receives a `pos_weight` for each label. As recommended by the [PyTorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss), `pos_weight` is computed as the ratio between the negative counts and the positive counts for each class.

In [27]:
def calculate_pos_weight(training_data):
    """
    Calculate the per-label `pos_weight` for BCEWithLogitsLoss.

    Keyword arguments:
    training_data -- mapping/dataset whose 'labels' entry is a 2-D tensor with
                     one row per example and one 0/1 column per label

    Returns a 1-D tensor holding, for every label, the number of negative
    examples divided by the number of positive examples.
    """
    # Fetch the column once; every access on a dataset materialises it again
    label_matrix = training_data['labels']
    num_positives = torch.sum(label_matrix, dim=0)
    num_negatives = len(label_matrix) - num_positives
    # A label without any positive example would give a division by zero
    # (inf/nan weight) and poison the loss, so the denominator is floored at 1
    pos_weight = num_negatives / torch.clamp(num_positives, min=1)

    return pos_weight

# Per-label weights computed on the training split only; read as a
# module-level variable by MultilabelTrainer.compute_loss
pos_weight = calculate_pos_weight(dataset_tokenized['train'])

## Model for multi-label classification

In [28]:
class MultilabelTrainer(Trainer):
    """Trainer whose loss is a BCEWithLogitsLoss weighted by the module-level `pos_weight`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the class-weighted multi-label loss for one batch.

        Keyword arguments:
        model -- the model being trained
        inputs -- batch dict; its "labels" entry is removed before the forward pass
        return_outputs -- when True, return (loss, outputs) instead of only the loss
        num_items_in_batch -- accepted (and ignored) because newer Trainer
                              versions pass it as a keyword argument
        """
        # Take the labels out so the model does not compute its own loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # pos_weight is built on the CPU; move it next to the logits so the
        # loss also works when training on GPU/MPS
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(logits.device))
        num_labels = self.model.config.num_labels
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

# Train

In [29]:
# define training args
training_args = TrainingArguments(
    output_dir='../output/',
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    # Evaluate and checkpoint on the same schedule so the best checkpoint can
    # be restored at the end of training.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - adjust when upgrading.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="../output/logs",
    logging_steps=2,
    learning_rate=float(learning_rate),
    # Use the seed from the configuration cell instead of leaving it unused
    seed=random_seed,
    # "Best" checkpoint = highest weighted F1 on the evaluation set
    metric_for_best_model="f1_weighted",
    load_best_model_at_end=True
)

In [34]:
# Create the Trainer instance (custom subclass with the weighted BCE loss).
# The validation split is used for the per-epoch evaluation; the test split
# is held back for the final evaluation. The tokenizer is passed along so it
# is saved next to each checkpoint.
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_tokenized['train'],
    eval_dataset=dataset_tokenized['validation'],
    tokenizer=tokenizer
)

In [35]:
# Run fine-tuning; with the arguments above it evaluates and saves a
# checkpoint after every epoch
trainer.train()

***** Running training *****
  Num examples = 6838
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2140
  Number of trainable parameters = 599051


Epoch,Training Loss,Validation Loss,F1 Weighted
1,1.0039,1.043456,0.561322
2,1.2092,0.966692,0.589762
3,0.9336,0.930037,0.594589
4,1.0038,0.905994,0.597023
5,0.7939,0.902524,0.600004


***** Running Evaluation *****
  Num examples = 886
  Batch size = 16
Saving model checkpoint to ../output/checkpoint-428
Configuration saved in ../output/checkpoint-428/config.json
Model weights saved in ../output/checkpoint-428/pytorch_model.bin
tokenizer config file saved in ../output/checkpoint-428/tokenizer_config.json
Special tokens file saved in ../output/checkpoint-428/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 886
  Batch size = 16
Saving model checkpoint to ../output/checkpoint-856
Configuration saved in ../output/checkpoint-856/config.json
Model weights saved in ../output/checkpoint-856/pytorch_model.bin
tokenizer config file saved in ../output/checkpoint-856/tokenizer_config.json
Special tokens file saved in ../output/checkpoint-856/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 886
  Batch size = 16
Saving model checkpoint to ../output/checkpoint-1284
Configuration saved in ../output/checkpoint-1284/config.json
Model we

TrainOutput(global_step=2140, training_loss=0.977554079361051, metrics={'train_runtime': 3089.0824, 'train_samples_per_second': 11.068, 'train_steps_per_second': 0.693, 'total_flos': 1132446821076480.0, 'train_loss': 0.977554079361051, 'epoch': 5.0})

# Evaluate

In [38]:
import evaluate

In [39]:
# Final evaluation on the held-out test split
eval_result = trainer.evaluate(eval_dataset=dataset_tokenized['test'])

# save the evaluation result together with the checkpoint name
hyperparams = {"model": model_checkpoint, "evaluation": eval_result}
# `e` is reused in the export section to name the saved model directory
# (presumably the path of the written results file - confirm against the
# `evaluate.save` documentation)
e = evaluate.save("../output/experiments/", **hyperparams)

eval_result

{'eval_loss': 0.920563280582428,
 'eval_f1_weighted': 0.5942335301853315,
 'eval_runtime': 233.5157,
 'eval_samples_per_second': 13.956,
 'eval_steps_per_second': 0.874,
 'epoch': 5.0}

## Classification report

In [40]:
# Run the trained model on the test split; the first element of the
# prediction output holds the raw logits (not hidden states)
outputs = trainer.predict(dataset_tokenized['test'])
logits = outputs[0]

***** Running Prediction *****
  Num examples = 3259
  Batch size = 16


Now I'll get the logits from the model output and apply `sigmoid` with threshold of 0.5 to predict which emotions are present.

The reason I am using sigmoid here is that a key property of sigmoid is that the probabilities produced by a sigmoid are independent, and therefore are not constrained to sum to one.

Since in a multi-label classification setting there can be more than one right answer (the outputs are non-exclusive), applying a sigmoid function to each element of the raw output independently makes sense.

In [41]:
def predict_emotions(model_output, threshold=0.5):
    """
    Turn raw model logits into 0/1 predictions per emotion.

    Keyword arguments:
    model_output -- logits (anything torch.Tensor() accepts); size-1 dimensions are squeezed away
    threshold -- probability at or above which an emotion counts as present, default 0.5

    Returns a float NumPy array with the squeezed shape of the input, holding
    1.0 where sigmoid(logit) >= threshold and 0.0 elsewhere.
    """
    squeezed_logits = torch.Tensor(model_output).squeeze().cpu()
    probabilities = torch.sigmoid(squeezed_logits)
    # boolean mask -> float array of zeros and ones
    is_present = (probabilities >= threshold).numpy()
    return is_present.astype(np.float64)

In [43]:
# Binarise the test logits and compare them with the true labels
# (second element of the prediction output), one report row per label
y_pred = predict_emotions(logits)
print(classification_report(outputs[1], y_pred, target_names=labels))

              precision    recall  f1-score   support

       anger       0.65      0.74      0.70      1101
anticipation       0.18      0.46      0.26       425
     disgust       0.63      0.75      0.69      1099
        fear       0.28      0.66      0.39       485
         joy       0.75      0.81      0.78      1442
        love       0.36      0.84      0.50       516
    optimism       0.61      0.76      0.67      1143
   pessimism       0.20      0.60      0.30       375
     sadness       0.47      0.70      0.57       960
    surprise       0.11      0.56      0.19       170
       trust       0.09      0.56      0.16       153

   micro avg       0.43      0.73      0.54      7869
   macro avg       0.39      0.68      0.47      7869
weighted avg       0.53      0.73      0.59      7869
 samples avg       0.43      0.72      0.51      7869



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Export

In [None]:
# Report the model size; the footprint is divided by 1e6 and rounded
print(f"The trained model is {round(model.get_memory_footprint()/1e6)} mb.")

In [None]:
# save locally, in a directory named after the evaluation-results file
# (its file name up to the first '.').
# NOTE(review): only the model is saved here, not the tokenizer.
model.save_pretrained(f"../output/model/{e.name.split('.')[0]}")

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub under this repository name
model.push_to_hub("amir_llm_multiclass")