Install dependencies:

In [None]:
#Import information 

! pip install transformers datasets evaluate
! pip install torch torchvision
! pip install tokenizers
! pip3 install emoji==0.6.0
! pip install -U accelerate
! pip install -U transformers
! pip install pandas
! pip install numpy
! pip install sklearn
! pip install scikit-learn

Required Hugging Face login:

In [None]:
# Required Hugging Face login: authenticate this notebook session with the Hub.
# NOTE(review): `push_to_hub` is commented out in the training arguments, so this
# login may only matter for gated/private resources — confirm whether it is needed.


from huggingface_hub import notebook_login

notebook_login()

In [None]:
#Starter imports


import torch
from transformers import AutoModel, AutoTokenizer, pipeline
from datasets import load_dataset


In [None]:
# Select the compute device: first CUDA GPU when available, otherwise the CPU.

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on the GPU' if device.type == 'cuda' else 'Running on the CPU')

# Bare expression so the notebook displays the chosen device.
device

In [None]:
#Loading dataset (bragging)
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split (and therefore every reported metric)
# is identical on each Restart & Run All.
SPLIT_SEED = 42

df = pd.read_csv("bragging_data.csv", engine = "python")

# One-hot encode the label column: one indicator column per distinct label value.
ohe_frame = pd.get_dummies(df['label'])
df = df.drop('label', axis = 1)
df = df.join(ohe_frame)

# Drop the columns that should not be carried into the datasets.
# Reassigning (instead of inplace=True) keeps the step explicit.
df = df.drop(columns = ["id", "sampling", "round"])

# 80/20 split, seeded for reproducibility.
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Convert to Hugging Face datasets; the pandas index is not kept as a column.
training = Dataset.from_pandas(train, preserve_index = False)
testing = Dataset.from_pandas(test, preserve_index = False)

In [None]:
#Label vocabulary and the index <-> name lookup tables built from it

labels = [
    'achievement',
    'action',
    'affiliation',
    'feeling',
    'not',
    'possession',
    'trait',
]
# Position in `labels` is the class id.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
#Data Encoding

from transformers import AutoTokenizer
import numpy as np


# Load the tokenizer for the "bert-base-uncased" checkpoint; preprocess_function
# below calls it on every batch of text.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_function(data):
    """Tokenize one batch of examples and attach its label matrix.

    Args:
        data: Mapping of column name -> list of values for the batch. Must
            contain "text" plus one column for every entry of the global
            ``labels`` list.

    Returns:
        The tokenizer output for the batch (padded/truncated to a length of
        128) with an added "labels" entry: one list of floats per example,
        ordered like ``labels``.

    Raises:
        KeyError: If a column named in ``labels`` is missing from ``data``.
    """
    texts = data["text"]
    batch = tokenizer(texts, padding="max_length", truncation=True, max_length=128)
    # One row per label column, in `labels` order; transposing gives one row
    # per example. dtype=float stores the indicators as floats.
    per_label_columns = [data[name] for name in labels]
    batch["labels"] = np.array(per_label_columns, dtype=float).T.tolist()
    return batch


# Run preprocess_function over both splits in batches, dropping each split's
# original columns so only the function's output remains.
tokenized_train, tokenized_test = [
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (training, testing)
]

# Switch both datasets to the "torch" output format.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the "bert-base-uncased" checkpoint through the sequence-classification
# auto class, requesting the multi-label problem type. num_labels is set to
# len(labels), and the id2label/label2id tables are passed along with it.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

In [None]:
import inspect

batch_size = 16
metric_name = "f1"

# transformers renamed `evaluation_strategy` to `eval_strategy`, and recent
# releases reject the old keyword. This notebook installs transformers with
# `-U`, so use whichever name the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    "bert-finetuned-sem_eval-english",  # output directory for checkpoints
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluation and save strategies are both "epoch"; the best checkpoint is
    # chosen by `metric_name`.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
    **{_eval_strategy_key: "epoch"},
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction


#Computing evaluation metrics for multi-label classification:

def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label output.

    Args:
        predictions: Raw model outputs (logits), array-like of shape
            (num_examples, num_labels).
        labels: Ground-truth indicator matrix with the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Apply a sigmoid to turn the raw predictions into per-label probabilities.
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions)).numpy()
    # Threshold the probabilities to get hard 0/1 predictions.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC measures ranking quality, so it is computed from the
    # probabilities. Thresholded predictions would reduce the ROC curve to a
    # single operating point.
    roc_auc = roc_auc_score(y_true, probs, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to multi_label_metrics.

    When ``p.predictions`` is a tuple, only its first element is scored;
    otherwise ``p.predictions`` is scored as-is, against ``p.label_ids``.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    raw = p.predictions
    logits = raw[0] if isinstance(raw, tuple) else raw
    return multi_label_metrics(predictions=logits, labels=p.label_ids)



In [None]:
import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword. Use whichever the installed Trainer
# supports so the cell works across versions.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Combine the model, training arguments, tokenized splits and metric callback.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()