# Load Dependencies

In [33]:
import pandas as pd
import numpy as np

import re
import string

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")


import logging
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

tensor([1.], device='mps:0')


  nonzero_finite_vals = torch.masked_select(


# Load Dataset

In [15]:
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

logging.info(f"the dataset distribution: {dataset.num_rows}")

2023-08-27 14:08:25,924 - INFO - the dataset distribution: {'train': 6838, 'test': 3259, 'validation': 886}


In [16]:
dataset["train"][0]

{'ID': '2017-En-21441',
 'Tweet': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'anger': False,
 'anticipation': True,
 'disgust': False,
 'fear': False,
 'joy': False,
 'love': False,
 'optimism': True,
 'pessimism': False,
 'sadness': False,
 'surprise': False,
 'trust': True}

# Preprocess data

In [17]:
logging.info(f"creating a mapping table for labels")

labels = [label for label in dataset["train"].features if label not in ["ID", "Tweet"]]

label2id = {idx:label for idx, label in enumerate(labels)}
id2label = {label:idx for idx, label in enumerate(labels)}

2023-08-27 14:08:25,934 - INFO - creating a mapping table for labels


In [18]:
model_id = "bert-base-uncased"
max_length = 128
batch_size = 16
metric_name = "f1"
num_labels = len(labels)

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [19]:
def text_preprocessing(text):
    if not isinstance(text, str):
        text = str(text)
    text = re.sub("\#", " ", text) # remove hashtag # symbol only 
    text = re.sub(r"\\n", " ", text) # remove newlines
    text = text.lower() # text normalization
    text = re.sub("\@\w+", " ", text) # remove @
    text = re.sub('\w*\d\w*', " ", text) # remove digits
    text = re.sub("([^\x00-\x7F])+", " ", text) # remove emoji
    text = re.sub('[%s]' % re.escape(string.punctuation), " ", text) # remove punctuation
    text = re.sub("\s+", " ", text) # remove excessive white space 
    return text.strip()

In [20]:
logging.info(f"measuring text length")

train_text_len = []

for text in dataset["train"]["Tweet"]:
    processed_text = text_preprocessing(text)
    len_ = len(text.split(" "))
    train_text_len.append(len_)

logging.info(f"max length is {max(train_text_len)}")

2023-08-27 14:08:29,038 - INFO - measuring text length
2023-08-27 14:08:29,191 - INFO - max length is 58


In [21]:
examples = dataset["train"][0]
item_label = {k: examples[k] for k in examples.keys() if k in labels}
for idx, label in enumerate(labels):
    print(item_label[label])

False
True
False
False
False
False
True
False
False
False
True


In [22]:
def encoding_text(item):

    # preprocessing text
    text = [text_preprocessing(t) for t in item["Tweet"]]

    # encoding text
    encoding = tokenizer(text, 
                       padding="max_length",
                       truncation=True,
                       max_length=max_length)

    # initiate an empty matrix to store both text and labels
    labels_matrix = np.zeros((len(text), len(labels)))

    # process labels
    item_label = {label:item[label] for label in labels}

    # fill the empty array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = item_label[label]

    # converting to list
    encoding["labels"] = labels_matrix.tolist()
    
    return encoding

In [23]:
logging.info(f"preprocessing text")

encoded_dataset = dataset.map(encoding_text, batched=True, remove_columns=dataset['train'].column_names)

2023-08-27 14:08:31,895 - INFO - preprocessing text
Map: 100%|████████████████████████| 3259/3259 [00:00<00:00, 18236.88 examples/s]


In [27]:
print(encoded_dataset["train"][0])

{'input_ids': [101, 4737, 2003, 1037, 2091, 7909, 2006, 1037, 3291, 2017, 2089, 2196, 2031, 11830, 11527, 14354, 4105, 4737, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Model Training

In [28]:
model = AutoModelForSequenceClassification.from_pretrained(model_id,
                                                           problem_type="multi_label_classification",
                                                           num_labels=num_labels,
                                                           id2label=id2label,
                                                           label2id=label2id
                                                          )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [35]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [36]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.322189,0.672363,0.769876,0.257336
2,0.369500,0.309953,0.694824,0.787536,0.285553
3,0.283500,0.305427,0.707035,0.798385,0.291196
4,0.245800,0.313761,0.700223,0.796378,0.267494
5,0.222900,0.311049,0.702757,0.796239,0.283296


TrainOutput(global_step=2140, training_loss=0.27597292962475356, metrics={'train_runtime': 1185.2641, 'train_samples_per_second': 28.846, 'train_steps_per_second': 1.806, 'total_flos': 2249123476753920.0, 'train_loss': 0.27597292962475356, 'epoch': 5.0})

In [38]:
trainer.evaluate()

{'eval_loss': 0.305427223443985,
 'eval_f1': 0.707035175879397,
 'eval_roc_auc': 0.7983848635714869,
 'eval_accuracy': 0.291196388261851,
 'eval_runtime': 7.9125,
 'eval_samples_per_second': 111.975,
 'eval_steps_per_second': 7.077,
 'epoch': 5.0}