In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
import torch
from transformers import TrainingArguments, Trainer
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import SqueezeBertTokenizer, SqueezeBertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.metrics import confusion_matrix, classification_report

### Load Data Set

In [None]:
# Read the training, validation and testing CSV files into data frames,
# in that order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('training.csv', 'validation.csv', 'testing.csv')
)

# Preview the first rows of the training split.
df_train.head()

In [None]:
# Select the model you wish to train by setting `model_name` to one of the
# checkpoint identifiers listed below. ALBERT is used as the default.
#
# 'albert-base-v2'
# "bert-base-uncased"
# "vinai/bertweet-base"
# "distilbert-base-uncased"
# 'google/mobilebert-uncased'
# "roberta-base"
# 'squeezebert/squeezebert-uncased'

model_name = 'albert-base-v2'

# Load the tokenizer through AutoTokenizer so that it always matches the
# checkpoint named in `model_name`; only `model_name` has to be edited when
# switching between the models listed above.

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the matching sequence-classification model the same way.
# num_labels=3 requests a three-class classification head.

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
# ----- 1. Preprocess data -----#
# Pull the raw texts and labels out of each data frame as plain Python lists.

X_train, y_train = df_train["text"].tolist(), df_train["label"].tolist()
X_val, y_val = df_val["text"].tolist(), df_val["label"].tolist()

# Tokenise both splits with padding and truncation enabled and max_length=512.
X_train_tokenized = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=512,
)
X_val_tokenized = tokenizer(
    X_val,
    padding=True,
    truncation=True,
    max_length=512,
)

In [None]:
# Create a torch data set

class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. It must contain an "input_ids" entry, which defines
            the dataset length.
        labels: Optional per-example labels, indexable by position. Leave as
            None for unlabelled data (e.g. when only predicting).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, plus "labels" if labels were given."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None instead of relying on truthiness: array-like
        # labels (NumPy arrays, pandas Series, multi-element tensors) raise
        # when evaluated in a boolean context.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" entry."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenised training and validation splits, together with their
# labels, in torch data sets.

train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
# Metrics to log during training

def compute_metrics(p):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``. The predicted class is taken as
            the argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Precision, recall and F1 all share the same call signature.
    weighted = {"y_true": labels, "y_pred": predicted, "average": 'weighted'}

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
        "precision": precision_score(**weighted),
        "recall": recall_score(**weighted),
        "f1": f1_score(**weighted),
    }

In [None]:
# Define Trainer

args = TrainingArguments(
    # --- Run bookkeeping ---
    output_dir="######",  # Directory where model checkpoints are written
    run_name="albert",  # Name of the Weights and Biases run
    report_to="wandb",  # Log to Weights and Biases
    seed=0,
    # --- Training schedule ---
    num_train_epochs=2,  # Number of epochs
    per_device_train_batch_size=16,  # Training batch size
    per_device_eval_batch_size=16,  # Evaluation batch size
    # --- Evaluation and logging cadence ---
    evaluation_strategy="steps",  # Evaluate on a step schedule
    eval_steps=50,  # Evaluate every 50 steps
    logging_steps=50,  # Log every 50 steps
    load_best_model_at_end=True,
)

# Initialise the trainer with the model, the arguments above, the training and
# validation data sets, and the metric function.

trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Fine-tune the pre-trained model using the `trainer` configured with `args`,
# `train_dataset` for training and `val_dataset` for evaluation.
trainer.train()

In [None]:
# Extract the test texts and labels as plain Python lists, then tokenise the
# texts with the same settings as the training and validation splits.

X_test, y_test = df_test['text'].tolist(), df_test['label'].tolist()

X_test_tokenized = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=512,
)

In [None]:
# Wrap the tokenised test split in a torch data set; no labels are attached,
# so items carry only the tokenizer outputs.
test_dataset = Dataset(encodings=X_test_tokenized)

In [None]:
# Load trained model
model_path = "#####" # Load the best model from your directory
# Use the Auto class so the architecture is resolved from the saved checkpoint
# itself; this works for whichever of the supported models was fine-tuned,
# instead of assuming an ALBERT checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)

In [None]:
# Build a second Trainer around the reloaded model; it is only used for
# prediction on the test data.
test_trainer = Trainer(model=model)

In [None]:
# Make predictions on the test data set.
# The raw predictions are read by field name rather than by unpacking the
# result into `_`: assigning to `_` would rebind the variable IPython uses for
# the most recent cell output.
raw_pred = test_trainer.predict(test_dataset).predictions

# Predicted class = index of the highest score for each example.
y_pred = np.argmax(raw_pred, axis=1)

In [None]:
# Confusion matrix of the test labels against the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Classification report for the test predictions
print(classification_report(y_true=y_test, y_pred=y_pred))