#### Utilities and function implementations

In [None]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
# NOTE(review): the CSV files in this notebook are read from /content/, not from the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train / validation / test splits (one CSV per split) into DataFrames.
train_df, validation_df, testing_df = map(
    pd.read_csv,
    (
        '/content/train_df.csv',
        '/content/valid_df.csv',
        '/content/test_df.csv',
    ),
)

In [None]:
def _build_input_sentence(row):
    """Join a row's example (`ex`) and definition (`def`) into one marked-up string."""
    return '[CLS] ' + row['ex'] + ' [SEP] ' + row['def']


# NOTE(review): `new_df_pairs` is not defined in the visible cells, and the
# `input_sentence` column is not read by `tokenize_data` (which uses `text`) - confirm
# whether this cell is still needed.
new_df_pairs['input_sentence'] = new_df_pairs.apply(_build_input_sentence, axis=1)

In [None]:
def tokenize_data(data):
    """Tokenize the ``text`` column of ``data`` with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 512 tokens.

    Args:
        data: Table-like object where ``data['text']`` supports ``.tolist()``.

    Returns:
        The result of calling ``tokenizer`` on the list of texts.
    """
    # NOTE(review): only `text` is passed to the tokenizer; if an example/definition pair
    # is meant to be separated by [SEP], it presumably is already joined inside that column.
    texts = data['text'].tolist()
    return tokenizer(text=texts, truncation=True, padding="max_length", max_length=512)


class WSDDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence;
            ``value[idx]`` must yield the features of the example at position ``idx``.
        labels: One label per example. May be a list, an array, or a pandas
            Series with any index; labels are always looked up by position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _label_at(self, idx):
        """Return the label stored at position ``idx``."""
        labels = self.labels
        # `series[idx]` on a pandas Series is a lookup by index *label*. After a
        # split/shuffle/filter the index is no longer 0..n-1, so that lookup raises
        # KeyError or silently returns another row's label. `.iloc` is positional.
        if hasattr(labels, 'iloc'):
            return labels.iloc[idx]
        return labels[idx]

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including an integer ``labels`` tensor."""
        item = {key: torch.tensor(value[idx]) for key, value in self.encodings.items()}
        # Cast to a Python int so the label tensor has an integer dtype.
        item['labels'] = torch.tensor(int(self._label_at(idx)))
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

In [None]:
# Base directory for results, logs and the exported CSV. Paths are built by plain
# string concatenation (e.g. directory_path+'results'), so keep the trailing '/'.
directory_path = '/content/'

In [None]:
def train(model):
  """Fine-tune ``model`` with the Hugging Face ``Trainer`` and return that trainer.

  Reads the notebook-level ``train_dataset``, ``valid_dataset``, ``tokenizer`` and
  ``directory_path``; evaluation runs once per epoch on ``valid_dataset``.

  Args:
    model: The model instance handed to ``Trainer``.

  Returns:
    The ``Trainer`` object after ``trainer.train()`` has finished.
  """

  def compute_metrics(eval_pred):
    """Accuracy and macro-averaged precision / recall / F1 from argmax predictions."""
    predicted = np.argmax(eval_pred.predictions, axis=1)
    gold = eval_pred.label_ids
    return {
      "accuracy": (predicted == gold).mean(),
      "precision": precision_score(gold, predicted, average='macro'),
      "recall": recall_score(gold, predicted, average='macro'),
      "f1": f1_score(gold, predicted, average='macro'),
    }

  # Hyper-parameters are collected in one place before building TrainingArguments.
  hyperparams = dict(
    output_dir=directory_path+'results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=directory_path+'logs',
    logging_steps=10,
    evaluation_strategy="epoch",
  )

  trainer = Trainer(
    model=model,
    args=TrainingArguments(**hyperparams),
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )
  trainer.train()
  return trainer

In [None]:
def get_predictions_and_testing_result(trainer, model_name):
  """Predict on the notebook-level ``test_dataset`` and record the results in ``testing_df``.

  Writes two columns into the notebook-level ``testing_df``: ``label_ids`` (the labels
  returned by the trainer) and ``pred_from_<model_name>`` (argmax of the predictions),
  then prints the test metrics and the prediction column.

  Args:
    trainer: Object whose ``predict`` returns predictions, label ids and metrics.
    model_name: Name used to build the prediction column ``pred_from_<model_name>``.

  Returns:
    None. Results are stored in ``testing_df`` and printed.
  """
  prediction_output = trainer.predict(test_dataset)
  predicted_classes = np.argmax(prediction_output.predictions, axis=1)
  pred_column = f'pred_from_{model_name}'

  testing_df['label_ids'] = prediction_output.label_ids
  testing_df[pred_column] = predicted_classes

  print("Testing_result",prediction_output.metrics)
  print("---------------------------------------")
  print(testing_df[pred_column])

In [None]:
# AraBERT v0.2 base checkpoint. Hub repositories are addressed as "<namespace>/<name>";
# the bare name "bert-base-arabertv02" does not resolve to a Hub repository.
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 attaches a two-class classification head to the loaded checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the three splits in order: train, validation, test.
train_encodings, valid_encodings, test_encodings = (
    tokenize_data(split_df) for split_df in (train_df, validation_df, testing_df)
)

# Wrap each split's encodings together with its labels.
# NOTE(review): train_labels / valid_labels / test_labels are not defined in the
# visible cells - confirm where they are created before running on a fresh kernel.
train_dataset, valid_dataset, test_dataset = (
    WSDDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# Fine-tune the loaded model; the returned Trainer is reused below for test-set prediction.
trainer = train(model)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.071838,0.1,0.05,0.5,0.090909
2,1.094000,1.067238,0.1,0.05,0.5,0.090909
3,1.094000,1.059748,0.1,0.05,0.5,0.090909


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Predict on the test split; adds `label_ids` and `pred_from_<model_name>` columns to testing_df.
get_predictions_and_testing_result(trainer,model_name)

  _warn_prf(average, modifier, msg_start, len(result))


Testing_result {'test_loss': 1.0942115783691406, 'test_accuracy': 0.4, 'test_precision': 0.13333333333333333, 'test_recall': 0.3333333333333333, 'test_f1': 0.1904761904761905, 'test_runtime': 0.1218, 'test_samples_per_second': 41.063, 'test_steps_per_second': 8.213}
---------------------------------------
0    1
1    1
2    1
3    1
4    1
Name: pred_from_distilbert-base-uncased, dtype: int64


In [None]:
# Save the test frame (including the prediction columns) next to the other artifacts.
# `model_name` is the variable defined when the model was loaded; '/' is replaced so a
# namespaced model id cannot be interpreted as a sub-directory of directory_path.
filename = model_name.replace('/', '_') + "_result.csv"
testing_df.to_csv(directory_path+filename, index=False)