In [1]:
!pip install -qq transformers
!pip install -qq sentencepiece
!pip install -qq datasets


[K     |████████████████████████████████| 3.5 MB 14.0 MB/s 
[K     |████████████████████████████████| 67 kB 4.5 MB/s 
[K     |████████████████████████████████| 6.8 MB 81.9 MB/s 
[K     |████████████████████████████████| 895 kB 90.8 MB/s 
[K     |████████████████████████████████| 596 kB 58.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 14.5 MB/s 
[K     |████████████████████████████████| 311 kB 14.3 MB/s 
[K     |████████████████████████████████| 243 kB 86.2 MB/s 
[K     |████████████████████████████████| 133 kB 74.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 84.9 MB/s 
[K     |████████████████████████████████| 271 kB 87.5 MB/s 
[K     |████████████████████████████████| 94 kB 4.1 MB/s 
[K     |████████████████████████████████| 144 kB 63.8 MB/s 
[?25h

In [2]:
import transformers
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup,Trainer, TrainingArguments
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
import random
from textwrap import wrap
from datetime import datetime
from datasets import load_from_disk
from datasets import Dataset
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [3]:
from torch import nn

In [4]:
# Pretrained checkpoint that is fine-tuned: DistilBERT (base, uncased).
# Other text-classification models: https://huggingface.co/models?filter=text-classification
MODEL_NAME = "distilbert-base-uncased"

# Maximum sequence length for each document/sentence sample.
MAX_LENGTH = 64

# Optimisation hyper-parameters shared by every experiment in this notebook.
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 6.58e-5
WEIGHT_DECAY = 0.289
WARMUP_STEPS = 464
RANDOM_SEED = 22

# Per-layer multiplicative factor for layer-wise learning-rate decay (LLRD).
LEARNING_RATE_DECAY_MULTIPLIER = 0.95
# Number of final transformer layers to re-initialise in the weight re-initialisation experiments.
REINIT_LAYERS = 2

# Checkpoint of the intermediate-task model that `seq_model_init` loads.
QA_OUTPUT_PATH = "/content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad"

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and puts cuDNN into deterministic, non-benchmarking mode.
    Passing ``None`` is a no-op: nothing is seeded and the cuDNN flags are
    left exactly as they were.

    :param seed: integer seed, or ``None`` to leave all RNG state untouched
    """
    if seed is None:
        return

    # Framework-independent generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def compute_metrics(pred):
    """Compute overall and per-class classification metrics for a Trainer evaluation.

    :param pred: prediction object exposing ``label_ids`` (true class ids) and
        ``predictions`` (per-class scores; the argmax over the last axis is
        taken as the predicted class). This function is passed to ``Trainer``
        as its ``compute_metrics`` callback.
    :return: dict with ``accuracy``, macro-averaged ``f1`` / ``precision`` /
        ``recall``, and f1 / recall / precision for each class, where class id
        0 is reported as ``hate``, 1 as ``offensive`` and 2 as ``normal``.
    """
    class_ids = [0, 1, 2]
    class_names = ('hate', 'offensive', 'normal')

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    # Passing the label set explicitly keeps the macro average and the report
    # keys stable even when a class is missing from both `labels` and `preds`
    # (otherwise the per-class lookups below would raise KeyError).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, labels=class_ids, average='macro'
    )
    report = classification_report(
        labels, preds, labels=class_ids, digits=4, output_dict=True
    )

    metrics = {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
    # With output_dict=True the report is keyed by the stringified class id.
    for class_id, class_name in zip(class_ids, class_names):
        per_class = report[str(class_id)]
        metrics[f'{class_name}_f1'] = per_class["f1-score"]
        metrics[f'{class_name}_recall'] = per_class["recall"]
        metrics[f'{class_name}_precision'] = per_class["precision"]
    return metrics


def model_init():
    """Load the pretrained ``MODEL_NAME`` checkpoint with a 3-label classification head, moved to ``device``."""
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

# Code modified from Stabilizer library to handle DistilBERT architecture
#https://github.com/flowerpot-ai/stabilizer


def get_optimizer_parameters_with_llrd(model, peak_lr, multiplicative_factor):
    """Build optimizer parameter groups with layer-wise learning-rate decay (LLRD).

    The learning rate shrinks by ``multiplicative_factor`` for every step away
    from the task-specific head:

    * task-specific layers (every parameter whose name does not contain
      ``distilbert``) get ``peak_lr``;
    * encoder layer ``i`` of ``N`` gets ``peak_lr * multiplicative_factor ** (N - i)``,
      so the last encoder layer gets ``peak_lr * multiplicative_factor``;
    * the embeddings get ``peak_lr * multiplicative_factor ** (N + 1)``, i.e. the
      first encoder layer's rate times ``multiplicative_factor``.

    :param model: model exposing ``named_parameters()`` and
        ``model.distilbert.transformer.layer`` (the sequence of encoder layers)
    :param peak_lr: learning rate of the task-specific layers
    :param multiplicative_factor: per-layer decay factor
    :return: list of parameter-group dicts (keys ``params``, ``param_names``,
        ``lr``, ``name``) ordered head, encoder layers 0..N-1, embeddings
    """
    # Walk the parameters once instead of once per group.
    named_params = list(model.named_parameters())
    num_encoder_layers = len(model.distilbert.transformer.layer)

    def build_group(belongs, lr, group_name):
        """Collect the parameters whose name satisfies ``belongs`` into one group."""
        selected = [(name, param) for name, param in named_params if belongs(name)]
        return {
            "params": [param for _, param in selected],
            "param_names": [name for name, _ in selected],
            "lr": lr,
            "name": group_name,
        }

    # Task specific layer gets the peak_lr
    groups = [build_group(lambda name: 'distilbert' not in name, peak_lr, "tsl")]

    # Starting from the last encoder layer each encoder layer gets a lr defined by
    # current_layer_lr = prev_layer_lr * multiplicative_factor
    for layer_num in range(num_encoder_layers):
        # The trailing dot stops "layer.1" from also matching "layer.10", "layer.11", ...,
        # which would put the same parameter into several groups.
        layer_prefix = f"distilbert.transformer.layer.{layer_num}."
        groups.append(
            build_group(
                lambda name, prefix=layer_prefix: prefix in name,
                peak_lr * (multiplicative_factor ** (num_encoder_layers - layer_num)),
                f"layer_{layer_num}",
            )
        )

    # Embedding layer lr = first encoder layer lr * multiplicative_factor
    groups.append(
        build_group(
            lambda name: 'embeddings' in name,
            peak_lr * (multiplicative_factor ** (num_encoder_layers + 1)),
            "embedding",
        )
    )
    return groups

def reinit_autoencoder_model(model, reinit_num_layers=0):
    """Re-initialise the last ``reinit_num_layers`` DistilBERT transformer layers in place.

    Within each selected layer:

    * ``nn.Linear`` weights are redrawn from N(0, ``model.config.initializer_range``)
      and biases (when present) are zeroed;
    * ``nn.LayerNorm`` is reset to weight 1 and bias 0;
    * ``nn.Embedding`` weights are redrawn from the same normal distribution,
      but only if they require gradients.

    :param model: model exposing ``distilbert.transformer.layer`` and
        ``config.initializer_range``
    :param reinit_num_layers: how many layers, counted from the end, to reset;
        a falsy value leaves the model untouched
    :return: the same ``model`` object
    """
    if not reinit_num_layers:
        return model

    def redraw(tensor):
        # Same distribution the original checkpoint weights were initialised with.
        tensor.data.normal_(mean=0.0, std=model.config.initializer_range)

    for block in model.distilbert.transformer.layer[-reinit_num_layers:]:
        for submodule in block.modules():
            if isinstance(submodule, nn.Embedding):
                if submodule.weight.requires_grad:
                    redraw(submodule.weight)

            if isinstance(submodule, nn.Linear):
                redraw(submodule.weight)
                if submodule.bias is not None:
                    submodule.bias.data.zero_()
            elif isinstance(submodule, nn.LayerNorm):
                submodule.bias.data.zero_()
                submodule.weight.data.fill_(1.0)

    return model

def seq_model_init():
    """Load the checkpoint stored at ``QA_OUTPUT_PATH`` with a 3-label classification head, moved to ``device``."""
    return AutoModelForSequenceClassification.from_pretrained(QA_OUTPUT_PATH, num_labels=3).to(device)

def timestamp():
    """Print the current local date and time, e.g. ``13-Feb-2022 (19:32:52.432718)``."""
    print(datetime.now().strftime("%d-%b-%Y (%H:%M:%S.%f)"))

In [6]:
set_seed(RANDOM_SEED)


In [7]:
# Columns dropped from every split (presumably the BERT-specific tokenisation,
# judging by the `_bert` suffix -- TODO confirm against the dataset builder).
_BERT_COLUMNS = ["input_ids_bert", "attention_mask_bert", "token_type_ids_bert"]

hatetwit_dataset_dfs = load_from_disk('/content/drive/MyDrive/Dissertation/datasets/hatetwit_' + str(1))
train_dataset, eval_dataset, test_dataset = (
    hatetwit_dataset_dfs[split_name].remove_columns(_BERT_COLUMNS)
    for split_name in ("train", "validation", "test")
)

In [8]:
# Training configuration shared by every experiment below.
# NOTE: all Trainers reuse this object and therefore the same `output_dir`, so each
# experiment's checkpoints overwrite those of the previous experiment.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='/content/drive/MyDrive/Dissertation/disbert_optimal/results',
    logging_dir='./disbert_optimal/logs',
    # Schedule and optimisation.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes (the default selection metric is the loss).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Test-set metrics of each experiment, appended by the evaluation cells below.
results = []

In [9]:
# Experiment: Intermediate Task Transfer + Weight Reinitialization + LLRD

model = seq_model_init()
model = reinit_autoencoder_model(model, REINIT_LAYERS)
parameters = get_optimizer_parameters_with_llrd(model, LEARNING_RATE, LEARNING_RATE_DECAY_MULTIPLIER)
trainer_one = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    # Hand the LLRD optimizer to the Trainer directly instead of creating a default
    # optimizer and throwing it away; the scheduler slot is left as None so the
    # Trainer builds its own learning-rate scheduler around this optimizer in train().
    optimizers=(AdamW(parameters, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY), None),
)

Some weights of the model checkpoint at /content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad were not used when initializing DistilBertForSequenceClassification: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.w

In [10]:
trainer_one.train()


The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence.
***** Running training *****
  Num examples = 37265
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6990


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Hate F1,Hate Recall,Hate Precision,Offensive F1,Offensive Recall,Offensive Precision,Normal F1,Normal Recall,Normal Precision
1,0.568,0.532807,0.780425,0.737912,0.733917,0.745411,0.729302,0.788732,0.678201,0.85359,0.838578,0.869148,0.630844,0.608921,0.654404
2,0.4441,0.485599,0.806825,0.759553,0.768172,0.757841,0.772447,0.817907,0.731773,0.876027,0.88819,0.864193,0.630184,0.567427,0.708549
3,0.2953,0.546233,0.805967,0.762902,0.762476,0.765756,0.780698,0.821932,0.743403,0.87442,0.87264,0.876208,0.633588,0.602697,0.667816


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-4660
Configuration saved in /content/drive/MyDrive/Dissertation/d

TrainOutput(global_step=6990, training_loss=0.47184749052077746, metrics={'train_runtime': 352.4872, 'train_samples_per_second': 317.16, 'train_steps_per_second': 19.831, 'total_flos': 1851182116709760.0, 'train_loss': 0.47184749052077746, 'epoch': 3.0})

In [11]:
timestamp()


13-Feb-2022 (19:32:52.432718)


In [12]:
# Score trainer_one's model on the test split. The metric keys keep the `eval_`
# prefix even though this is the test set (see the output below).
# NOTE: this cell is not idempotent -- re-running it appends another row to `results`.
eval_results = trainer_one.evaluate(test_dataset)
results.append(eval_results)
eval_results

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4658
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.7930442249892657,
 'eval_f1': 0.7444253875529426,
 'eval_hate_f1': 0.7630827783063748,
 'eval_hate_precision': 0.7231740306582507,
 'eval_hate_recall': 0.8076535750251762,
 'eval_loss': 0.5203225612640381,
 'eval_normal_f1': 0.6066897347174163,
 'eval_normal_precision': 0.6840052015604682,
 'eval_normal_recall': 0.5450777202072539,
 'eval_offensive_f1': 0.8635036496350365,
 'eval_offensive_precision': 0.8510791366906475,
 'eval_offensive_recall': 0.8762962962962964,
 'eval_precision': 0.7527527896364554,
 'eval_recall': 0.7430091971762421,
 'eval_runtime': 3.7721,
 'eval_samples_per_second': 1234.863,
 'eval_steps_per_second': 77.411}

In [13]:
timestamp()

13-Feb-2022 (19:32:56.224873)


In [14]:
# Experiment: Intermediate Task Transfer + LLRD

model = seq_model_init()
parameters = get_optimizer_parameters_with_llrd(model, LEARNING_RATE, LEARNING_RATE_DECAY_MULTIPLIER)
trainer_two = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    # Hand the LLRD optimizer to the Trainer directly instead of creating a default
    # optimizer and throwing it away; the scheduler slot is left as None so the
    # Trainer builds its own learning-rate scheduler around this optimizer in train().
    optimizers=(AdamW(parameters, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY), None),
)

loading configuration file /content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best

In [15]:
trainer_two.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence.
***** Running training *****
  Num examples = 37265
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6990


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Hate F1,Hate Recall,Hate Precision,Offensive F1,Offensive Recall,Offensive Precision,Normal F1,Normal Recall,Normal Precision
1,0.573,0.544363,0.779996,0.733491,0.735124,0.735217,0.717536,0.761569,0.678315,0.855661,0.854869,0.856454,0.627278,0.589212,0.670602
2,0.4544,0.4928,0.804035,0.756241,0.765484,0.753694,0.764313,0.805835,0.72686,0.874225,0.887819,0.861041,0.630184,0.567427,0.708549
3,0.3078,0.535096,0.80983,0.768252,0.768975,0.769236,0.776971,0.807847,0.748369,0.876317,0.877453,0.875185,0.651466,0.622407,0.683371


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-4660
Configuration saved in /content/drive/MyDrive/Dissertation/d

TrainOutput(global_step=6990, training_loss=0.4787922172928402, metrics={'train_runtime': 326.241, 'train_samples_per_second': 342.676, 'train_steps_per_second': 21.426, 'total_flos': 1851182116709760.0, 'train_loss': 0.4787922172928402, 'epoch': 3.0})

In [16]:
# Score trainer_two's model on the test split. The metric keys keep the `eval_`
# prefix even though this is the test set (see the output below).
# NOTE: this cell is not idempotent -- re-running it appends another row to `results`.
eval_results = trainer_two.evaluate(test_dataset)
results.append(eval_results)
eval_results

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4658
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.7977672820953199,
 'eval_f1': 0.7523209872543338,
 'eval_hate_f1': 0.7608178792201616,
 'eval_hate_precision': 0.7207207207207207,
 'eval_hate_recall': 0.8056394763343404,
 'eval_loss': 0.5314891338348389,
 'eval_normal_f1': 0.6307956496851744,
 'eval_normal_precision': 0.7046035805626598,
 'eval_normal_recall': 0.5709844559585492,
 'eval_offensive_f1': 0.8653494328576654,
 'eval_offensive_precision': 0.8550253073029646,
 'eval_offensive_recall': 0.8759259259259259,
 'eval_precision': 0.7601165361954484,
 'eval_recall': 0.7508499527396052,
 'eval_runtime': 3.7954,
 'eval_samples_per_second': 1227.291,
 'eval_steps_per_second': 76.936}

In [17]:
# Experiment: Intermediate Task Transfer + Weight Reinitialization
# (no LLRD: the Trainer creates its default optimizer)

model = seq_model_init()
model = reinit_autoencoder_model(model, REINIT_LAYERS)
trainer_three = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

loading configuration file /content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best

In [18]:
trainer_three.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence.
***** Running training *****
  Num examples = 37265
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6990


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Hate F1,Hate Recall,Hate Precision,Offensive F1,Offensive Recall,Offensive Precision,Normal F1,Normal Recall,Normal Precision
1,0.5729,0.55415,0.769693,0.724715,0.721087,0.741015,0.712317,0.831992,0.622741,0.851023,0.816364,0.888755,0.610805,0.574689,0.651765
2,0.4359,0.499295,0.804679,0.7593,0.763546,0.761576,0.772536,0.831992,0.721011,0.873546,0.875972,0.871134,0.631818,0.576763,0.698492
3,0.2687,0.576627,0.807684,0.765112,0.765898,0.766351,0.783406,0.816901,0.752549,0.874723,0.876342,0.87311,0.637207,0.605809,0.672037


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-4660
Configuration saved in /content/drive/MyDrive/Dissertation/d

TrainOutput(global_step=6990, training_loss=0.4599053924516888, metrics={'train_runtime': 328.3502, 'train_samples_per_second': 340.475, 'train_steps_per_second': 21.288, 'total_flos': 1851182116709760.0, 'train_loss': 0.4599053924516888, 'epoch': 3.0})

In [19]:
# Score trainer_three's model on the test split. The metric keys keep the `eval_`
# prefix even though this is the test set (see the output below).
# NOTE: this cell is not idempotent -- re-running it appends another row to `results`.
eval_results = trainer_three.evaluate(test_dataset)
results.append(eval_results)
eval_results

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4658
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.8035637612709318,
 'eval_f1': 0.7583603592175008,
 'eval_hate_f1': 0.7806026365348399,
 'eval_hate_precision': 0.7329796640141468,
 'eval_hate_recall': 0.8348439073514602,
 'eval_loss': 0.5293312668800354,
 'eval_normal_f1': 0.6246418338108882,
 'eval_normal_precision': 0.6987179487179487,
 'eval_normal_recall': 0.5647668393782384,
 'eval_offensive_f1': 0.8698366073067743,
 'eval_offensive_precision': 0.8623953403713142,
 'eval_offensive_recall': 0.8774074074074074,
 'eval_precision': 0.7646976510344698,
 'eval_recall': 0.7590060513790352,
 'eval_runtime': 3.7701,
 'eval_samples_per_second': 1235.499,
 'eval_steps_per_second': 77.451}

In [20]:
# Experiment: Weight Reinitialization + LLRD (starting from the plain MODEL_NAME checkpoint)

model = model_init()
model = reinit_autoencoder_model(model, REINIT_LAYERS)
parameters = get_optimizer_parameters_with_llrd(model, LEARNING_RATE, LEARNING_RATE_DECAY_MULTIPLIER)
trainer_four = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    # Hand the LLRD optimizer to the Trainer directly instead of creating a default
    # optimizer and throwing it away; the scheduler slot is left as None so the
    # Trainer builds its own learning-rate scheduler around this optimizer in train().
    optimizers=(AdamW(parameters, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY), None),
)

https://huggingface.co/distilbert-base-uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpalumjqtq


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
creating metadata file for /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
creating metadata file for /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weigh

In [21]:
trainer_four.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence.
***** Running training *****
  Num examples = 37265
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6990


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Hate F1,Hate Recall,Hate Precision,Offensive F1,Offensive Recall,Offensive Precision,Normal F1,Normal Recall,Normal Precision
1,0.5709,0.544516,0.775918,0.731199,0.73004,0.740651,0.716204,0.804829,0.645161,0.852602,0.834136,0.871904,0.624792,0.582988,0.673054
2,0.4413,0.492655,0.804035,0.755705,0.766299,0.749104,0.765114,0.77666,0.753906,0.874593,0.895964,0.854218,0.627407,0.574689,0.690773
3,0.2938,0.567789,0.804035,0.76021,0.761015,0.761807,0.780019,0.816901,0.746324,0.872667,0.874121,0.871218,0.627945,0.594398,0.665505


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-2330/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4659
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_optimal/results/checkpoint-4660
Configuration saved in /content/drive/MyDrive/Dissertation/d

TrainOutput(global_step=6990, training_loss=0.46681851235582084, metrics={'train_runtime': 336.0465, 'train_samples_per_second': 332.677, 'train_steps_per_second': 20.801, 'total_flos': 1851182116709760.0, 'train_loss': 0.46681851235582084, 'epoch': 3.0})

In [22]:
# Score trainer_four's model on the test split. The metric keys keep the `eval_`
# prefix even though this is the test set (see the output below).
# NOTE: this cell is not idempotent -- re-running it appends another row to `results`.
eval_results = trainer_four.evaluate(test_dataset)
results.append(eval_results)
eval_results

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, __index_level_0__.
***** Running Evaluation *****
  Num examples = 4658
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.8037784456848432,
 'eval_f1': 0.7586395609940008,
 'eval_hate_f1': 0.7647637795275591,
 'eval_hate_precision': 0.7478344562078922,
 'eval_hate_recall': 0.7824773413897281,
 'eval_loss': 0.5146265029907227,
 'eval_normal_f1': 0.6422018348623852,
 'eval_normal_precision': 0.7188703465982028,
 'eval_normal_recall': 0.5803108808290155,
 'eval_offensive_f1': 0.8689530685920578,
 'eval_offensive_precision': 0.8475352112676057,
 'eval_offensive_recall': 0.8914814814814814,
 'eval_precision': 0.7714133380245669,
 'eval_recall': 0.7514232345667416,
 'eval_runtime': 3.8288,
 'eval_samples_per_second': 1216.575,
 'eval_steps_per_second': 76.264}

In [23]:
# Summary table: one row per evaluation appended to `results`, in append order.
# Assuming every evaluation cell above was run exactly once and in notebook order:
#   row 0 = Intermediate Task Transfer + Weight Reinitialization + LLRD (trainer_one)
#   row 1 = Intermediate Task Transfer + LLRD                           (trainer_two)
#   row 2 = Intermediate Task Transfer + Weight Reinitialization        (trainer_three)
#   row 3 = Weight Reinitialization + LLRD                              (trainer_four)
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_hate_f1,eval_hate_recall,eval_hate_precision,eval_offensive_f1,eval_offensive_recall,eval_offensive_precision,eval_normal_f1,eval_normal_recall,eval_normal_precision,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.520323,0.793044,0.744425,0.752753,0.743009,0.763083,0.807654,0.723174,0.863504,0.876296,0.851079,0.60669,0.545078,0.684005,3.7721,1234.863,77.411,3.0
1,0.531489,0.797767,0.752321,0.760117,0.75085,0.760818,0.805639,0.720721,0.865349,0.875926,0.855025,0.630796,0.570984,0.704604,3.7954,1227.291,76.936,3.0
2,0.529331,0.803564,0.75836,0.764698,0.759006,0.780603,0.834844,0.73298,0.869837,0.877407,0.862395,0.624642,0.564767,0.698718,3.7701,1235.499,77.451,3.0
3,0.514627,0.803778,0.75864,0.771413,0.751423,0.764764,0.782477,0.747834,0.868953,0.891481,0.847535,0.642202,0.580311,0.71887,3.8288,1216.575,76.264,3.0
