Mount your Google Drive folder

In [None]:
# Mount Google Drive at /content/gdrive; the project path used further down
# points into this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Install the transformers library

In [None]:
%%capture
!pip install transformers

Log in to Hugging Face to upload the fine-tuned model (optional; uncomment the command to use it)

In [None]:
# !huggingface-cli login

Import the libraries and packages needed for this notebook and set the random seeds. Note that seeding alone will not make the experiment fully reproducible.

In [None]:
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from transformers import set_seed, TFAutoModelForSequenceClassification, AutoTokenizer
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.utils import class_weight

# Apply one shared seed to every seeding function used by this notebook:
# Python's `random`, NumPy, TensorFlow and the transformers helper, in that order.
seed = 101
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed, set_seed):
    _seed_fn(seed)

Read in the train and dev files (no test file is read in this notebook)

In [None]:
def read_corpus(corpus_file):
    '''Read a tab-separated corpus file into parallel lists.

    Each non-blank line is expected to hold the document text in the first
    column and its label in the second column; further columns are ignored.
    Leading and trailing whitespace of every line is stripped before splitting.

    Args:
        corpus_file: Path to a UTF-8 encoded, tab-separated file.

    Returns:
        A tuple (documents, labels) of two equally long lists of strings.

    Raises:
        ValueError: If a non-blank line has no second (label) column.
    '''
    documents = []
    labels = []

    with open(corpus_file, encoding='utf-8') as in_file:
        for line_number, line in enumerate(in_file, start=1):
            stripped = line.strip()
            # A blank line carries no example; skipping it avoids an opaque
            # IndexError on the label lookup below.
            if not stripped:
                continue

            tokens = stripped.split('\t')
            if len(tokens) < 2:
                raise ValueError(
                    f"{corpus_file}, line {line_number}: expected "
                    f"'<text>\\t<label>' but found no tab-separated label"
                )

            documents.append(tokens[0])
            labels.append(tokens[1])

    return documents, labels

# Root of the project folder on the mounted drive
path = "/content/gdrive/MyDrive/LFD_FP/"

# Select the corpus by leaving exactly one pair of file paths uncommented.

# Telegram data
# train_file = path + "data/telegram/train.tsv"
# dev_file = path + "data/telegram/dev.tsv"

# Twitter data
train_file = path + "data/twitter/train.tsv"
dev_file = path + "data/twitter/dev.tsv"

# Each split becomes a list of documents (X_*) and a list of labels (Y_*).
X_train, Y_train = read_corpus(train_file)
X_dev, Y_dev = read_corpus(dev_file)

Load the model and tokenizer from Hugging Face and tokenize the data

In [None]:
# Pretrained checkpoint to fine-tune; leave exactly one line uncommented.
# lm = "bert-base-cased"
# lm = "bert-base-uncased"
# lm = "distilbert-base-cased"
# lm = "distilbert-base-uncased"
lm = "roberta-base"
# lm = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(lm)
# num_labels=1: the classification head is created with a single output.
model = TFAutoModelForSequenceClassification.from_pretrained(lm, num_labels=1)
# Upper bound on the tokenized sequence length; longer inputs are truncated.
max_length = 300


def tokenize_texts(texts):
    '''Tokenize a list of strings with the notebook's tokenizer settings.

    Pads the batch, truncates to `max_length` tokens and returns the `.data`
    payload of the tokenizer output, requested as NumPy tensors.
    '''
    return tokenizer(texts, padding=True, max_length=max_length,
                     truncation=True, return_tensors="np").data


tokens_train = tokenize_texts(X_train)
tokens_dev = tokenize_texts(X_dev)
# No test set is read in this notebook, so tokenizing X_test raised a
# NameError on a fresh kernel. Load X_test first, then uncomment:
# tokens_test = tokenize_texts(X_test)

Set the loss function, the learning rate schedule and the optimizer

In [None]:
# The model is created with num_labels=1 and its raw 'logits' output is what
# gets used for prediction, so the loss is configured with from_logits=True.
loss_function = BinaryCrossentropy(from_logits=True)

In [None]:
# Polynomial (power 0.5) decay of the learning rate from 3e-5 towards 1e-7
# over 1000 optimizer steps.
# NOTE(review): decay_steps is fixed and independent of the dataset size —
# confirm it matches the intended number of training steps.
initial_learning_rate = 3e-5
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=1000,
    end_learning_rate=1e-7,
    power=0.5,
)

In [None]:
# Adam optimizer whose learning rate follows the schedule defined above.
optim = Adam(learning_rate=lr_schedule)

Transform string labels to binary (0/1) encodings

In [None]:
# Fit the label encoder on the training labels only and reuse that mapping
# for the other splits. Refitting per split (fit_transform) could silently
# give a split a different label-to-integer mapping, and it would also
# overwrite the mapping later used to decode predictions.
encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)
Y_dev_bin = encoder.transform(Y_dev)
# No test labels are read in this notebook, so encoding Y_test raised a
# NameError on a fresh kernel. Load Y_test first, then uncomment:
# Y_test_bin = encoder.transform(Y_test)

Calculate class weights

In [None]:
# 'balanced' class weights computed from the training labels; the dict is
# keyed by each class's position in np.unique(Y_train).
# NOTE(review): presumably this order matches the encoder's 0/1 assignment — confirm.
train_classes = np.unique(Y_train)
weights = class_weight.compute_class_weight(class_weight='balanced', classes=train_classes, y=Y_train)
weights_dict = dict(enumerate(weights))

Compile and fit the model using an early stopping callback

In [None]:
batch_size = 16
# Stop once the dev loss has not improved for 3 epochs and restore the
# weights of the best epoch.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# The model emits raw logits (the loss uses from_logits=True), so a positive
# prediction is logit > 0. The 'accuracy' string shortcut would apply a 0.5
# cut-off, which is meant for probabilities and misreports accuracy on logits.
# The metric keeps the name 'accuracy' so log/history keys stay the same.
model.compile(loss=loss_function, optimizer=optim,
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
# To use class weights, make sure the parameter class_weight=weights_dict is set
model.fit(tokens_train, Y_train_bin, verbose=1, epochs=5, callbacks=[callback],
          batch_size=batch_size, validation_data=(tokens_dev, Y_dev_bin), class_weight=weights_dict)

Print loss and accuracy on the dev set

In [None]:
# Evaluate the fitted model on the dev split and print the returned values
# (the loss and the metrics configured in compile()).
results = model.evaluate(x=tokens_dev, y=Y_dev_bin, batch_size=batch_size)
print(results)

Print classification report and confusion matrix on the dev set

In [None]:
def print_measures(Y_test, Y_pred, plot_cm=False):
    '''Print a classification report and a labelled confusion matrix.

    Args:
        Y_test: True labels.
        Y_pred: Predicted labels, aligned with Y_test.
        plot_cm: If True, additionally plot the confusion matrix with
            matplotlib (default: False).

    Returns:
        None. The report (precision, recall and F1 score per class, three
        digits) and the confusion matrix are printed.
    '''
    report = classification_report(Y_test, Y_pred, digits=3)

    # Build the label set from both the true and the predicted labels, so a
    # predicted class that never occurs in Y_test is not dropped from the matrix.
    labels = np.unique(np.concatenate((np.ravel(Y_test), np.ravel(Y_pred))))
    cm = confusion_matrix(Y_test, Y_pred, labels=labels)
    cm_labeled = pd.DataFrame(cm, index=labels, columns=labels)

    print("Classification report:\n\n", report)
    print("Confusion matrix:\n\n", cm_labeled)

    if plot_cm:
        # Plot confusion matrix using pyplot. The local is not called
        # `display` so that IPython's display() is not shadowed.
        cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        cm_display.plot()
        plt.show()

In [None]:
# The model returns raw logits. A logit of 0 corresponds to a probability of
# 0.5, so the decision threshold on logits must be 0. Without an explicit
# threshold, inverse_transform cuts halfway between the encoder's neg_label
# and pos_label (0.5 by default), which is only right for probabilities and
# would bias predictions towards the negative class.
predictions = model.predict(tokens_dev)['logits']
Y_pred = encoder.inverse_transform(predictions, threshold=0)

In [None]:
# Report on the dev split: Y_dev holds the true string labels and Y_pred the
# decoded predictions. Set plot_cm=True to also plot the confusion matrix.
print_measures(Y_dev, Y_pred, plot_cm=False)

Push model to HuggingFace

In [None]:
# model.push_to_hub("roberta-offense-telegram")
# model.push_to_hub("roberta-offense-twitter")