In [1]:
!pip install -qq transformers
!pip install -qq optuna
!pip install -qq sentencepiece
!pip install -qq datasets

[K     |████████████████████████████████| 3.5 MB 30.9 MB/s 
[K     |████████████████████████████████| 895 kB 58.6 MB/s 
[K     |████████████████████████████████| 6.8 MB 64.3 MB/s 
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
[K     |████████████████████████████████| 596 kB 76.1 MB/s 
[K     |████████████████████████████████| 308 kB 25.4 MB/s 
[K     |████████████████████████████████| 80 kB 9.8 MB/s 
[K     |████████████████████████████████| 210 kB 66.0 MB/s 
[K     |████████████████████████████████| 75 kB 3.6 MB/s 
[K     |████████████████████████████████| 113 kB 77.8 MB/s 
[K     |████████████████████████████████| 49 kB 6.0 MB/s 
[K     |████████████████████████████████| 149 kB 62.4 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2 MB 26.0 MB/s 
[K     |████████████████████████████████| 311 kB 30.8 MB/s 
[K     |████████████████████████████████| 243 kB 63.8 MB/s 
[K     |███████████████

In [2]:
import transformers
import datasets
from transformers import AutoTokenizer,AutoModelForQuestionAnswering, AutoModelForSequenceClassification,AdamW, get_linear_schedule_with_warmup,Trainer, TrainingArguments
from transformers import DataCollator, DataCollatorForLanguageModeling,default_data_collator
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
import random
from textwrap import wrap
from datetime import datetime
from datasets import load_from_disk
from datasets import load_dataset
from datasets import Dataset
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [3]:
# Checkpoint that gets fine-tuned: DistilBERT base (uncased).
# Other text-classification checkpoints: https://huggingface.co/models?filter=text-classification
MODEL_NAME = "distilbert-base-uncased"
# Maximum token length per sentence sample; tokenize_function truncates/pads to this length.
MAX_LENGTH = 64
# Hyperparameters passed to TrainingArguments in the 10-run classification loop.
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE= 6.58e-5
WEIGHT_DECAY = 0.289
WARMUP_STEPS = 464
# Seed handed to set_seed().
RANDOM_SEED=22




# Where the SQuAD-fine-tuned QA model is saved; the classification runs load their starting weights from here.
QA_OUTPUT_PATH= "/content/drive/MyDrive/Dissertation/disbert_hate_ml/results/best_model_squad"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and switches cuDNN to deterministic, non-benchmarking mode.

    :param seed: integer seed; when ``None`` the call is a no-op and the
        current RNG and cuDNN settings are left untouched
    """
    if seed is None:
        return

    # Seed each library's generator.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def compute_metrics(pred):
  """Compute overall and per-class metrics for the 3-class task.

  :param pred: object exposing ``label_ids`` (gold class ids) and
      ``predictions`` (per-class scores; the arg-max over the last axis is
      used as the predicted class)
  :return: dict with accuracy, macro-averaged precision/recall/F1, and
      F1/recall/precision for class 0 ("hate"), 1 ("offensive") and
      2 ("normal")
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)

  acc = accuracy_score(labels, preds)
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

  # Pin the label set so the "0"/"1"/"2" entries always exist. Without it, a
  # class that appears in neither the labels nor the predictions is missing
  # from the report and the lookups below raise KeyError. Such a class is
  # scored as 0 (zero_division=0) instead of emitting warnings.
  report = classification_report(
      labels, preds, labels=[0, 1, 2], digits=4, output_dict=True, zero_division=0
  )

  metrics = {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
  }
  # Same key order as before: <class>_f1, <class>_recall, <class>_precision.
  for class_id, class_name in enumerate(('hate', 'offensive', 'normal')):
    scores = report[str(class_id)]
    metrics[class_name + '_f1'] = scores['f1-score']
    metrics[class_name + '_recall'] = scores['recall']
    metrics[class_name + '_precision'] = scores['precision']
  return metrics


def seq_model_init(modelname_or_path):
  """Load a 3-label sequence-classification model and move it to ``device``.

  :param modelname_or_path: model name or path forwarded to ``from_pretrained``
  :return: the loaded model, placed on the notebook's ``device``
  """
  classifier = AutoModelForSequenceClassification.from_pretrained(
      modelname_or_path,
      num_labels=3,
  )
  return classifier.to(device)

def qa_model_init():
  """Load ``MODEL_NAME`` with a question-answering head and move it to ``device``.

  :return: the loaded model, placed on the notebook's ``device``
  """
  qa_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
  return qa_model.to(device)



def timestamp():
    """Print the current local date and time, e.g. ``13-Feb-2022 (12:33:27.772424)``."""
    print(datetime.now().strftime("%d-%b-%Y (%H:%M:%S.%f)"))


def preprocess_function(examples):
    """Tokenize question/context pairs and locate each answer span in tokens.

    Uses the module-level ``tokenizer``. Each question is paired with its
    context, truncated on the context side only and padded to 384 tokens.
    Only the first listed answer of each example is used.

    :param examples: batch with ``"question"``, ``"context"`` and
        ``"answers"`` columns; each answer has ``"answer_start"`` and
        ``"text"`` lists
    :return: the tokenizer output with ``start_positions`` and
        ``end_positions`` added (token indices of the answer). Both are 0
        when the example has no answer or the answer is not fully contained
        in the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: label (0, 0) instead of failing on the [0] lookups.
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token of the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The context must begin at or before the answer start AND end at or
        # after the answer end. Comparing the opposite bounds would only
        # catch answers lying entirely outside, and a partially truncated
        # answer would get an end position one past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions.
            # Start: last context token that begins at or before start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # End: first context token (scanning backwards) that ends at or
            # after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def tokenize_function(examples):
    """Tokenize the ``"sentence"`` column with the module-level ``tokenizer``.

    Sequences are truncated and padded to ``MAX_LENGTH`` tokens, and the
    special-tokens mask is included in the output.

    :param examples: batch with a ``"sentence"`` column
    :return: the tokenizer output for the batch
    """
    encoding_options = dict(
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_special_tokens_mask=True,
    )
    return tokenizer(examples["sentence"], **encoding_options)

In [5]:
# Seed the Python, NumPy and PyTorch RNGs once, before any model or data work.
# NOTE(review): Hugging Face's Trainer seeds again from TrainingArguments.seed when it is
# constructed — confirm that argument is set if RANDOM_SEED is meant to govern training.
set_seed(RANDOM_SEED)




In [6]:
# Tokenizer for MODEL_NAME; preprocess_function and tokenize_function read it as a global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [7]:

# SQuAD question-answering dataset, used for the intermediate fine-tuning stage.
dataset = load_dataset("squad")

Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Tokenize every split in batches; preprocess_function adds input_ids, attention_mask,
# start_positions and end_positions alongside the original columns.
tokenized_squad = dataset.map(preprocess_function, batched=True)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [10]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10570
    })
})

In [None]:
# preprocess_function already pads every example to a fixed length, so the default
# collator is used here rather than one that pads dynamically.
data_collator = default_data_collator

In [None]:


# Training configuration for the intermediate SQuAD fine-tuning stage.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Dissertation/disbert_int_qa/results', 
    evaluation_strategy="epoch",
    # NOTE(review): 1e-3 is far above the ~2e-5 to 5e-5 range normally used to fine-tune
    # BERT-style models and may make training diverge — confirm this value is intended.
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    # Trainer re-seeds all RNGs from this argument when it is constructed (default 42),
    # so without it the earlier set_seed(RANDOM_SEED) call does not govern training.
    seed=RANDOM_SEED,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fresh MODEL_NAME checkpoint with a question-answering head, placed on `device`.
model = qa_model_init()

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a

In [None]:
# Trainer for the SQuAD stage: trains on the tokenized train split and evaluates on the
# tokenized validation split. No compute_metrics is supplied here.
squad_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
timestamp()


13-Feb-2022 (12:33:27.772424)


In [None]:
# Fine-tune on SQuAD; checkpoints are written under training_args.output_dir.
squad_trainer.train()


The following columns in the training set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: title, id, question, answers, context.
***** Running training *****
  Num examples = 87599
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 8214


Epoch,Training Loss,Validation Loss


Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-500
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-500/special_tokens_map.json


Epoch,Training Loss,Validation Loss


Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1000
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1500
Configuration saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1500/config.json
Model weights saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Dissertation/disbert_int_qa/result

In [None]:
timestamp()


In [None]:
# Evaluate the fine-tuned QA model on the SQuAD validation split given to the Trainer.
squad_trainer.evaluate()

In [None]:
# Persist the SQuAD-fine-tuned model; the classification runs later load it from QA_OUTPUT_PATH.
squad_trainer.save_model(QA_OUTPUT_PATH)

In [None]:
# Train and test one classifier per pre-built dataset split (hatetwit_1 … hatetwit_10),
# each starting from the SQuAD-fine-tuned weights, and collect the test metrics.
result_list = []
# Columns removed from every split before training.
# NOTE(review): presumably inputs tokenized for a different (BERT) model — confirm.
bert_columns = ["input_ids_bert","attention_mask_bert","token_type_ids_bert"]

for i in range(1,11):

  training_args = TrainingArguments(
    output_dir=f'/content/drive/MyDrive/Dissertation/disbert_hate_task/results/{i}',          # output directory
    num_train_epochs=EPOCHS,              # total number of training epochs
    save_strategy ="epoch" ,
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    weight_decay= WEIGHT_DECAY,               # strength of weight decay
    learning_rate= LEARNING_RATE, 
    warmup_steps = WARMUP_STEPS,
    logging_dir='./disbert_hate_task/logs',     # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    evaluation_strategy="epoch",
    # Trainer re-seeds from this argument on construction (default 42); pass the
    # notebook's seed so RANDOM_SEED actually governs training.
    seed=RANDOM_SEED,
  )

  hatetwit_dataset_dfs = load_from_disk(f'/content/drive/MyDrive/Dissertation/datasets/hatetwit_{i}')
  train_dataset = hatetwit_dataset_dfs["train"].remove_columns(bert_columns)
  eval_dataset = hatetwit_dataset_dfs["validation"].remove_columns(bert_columns)
  test_dataset = hatetwit_dataset_dfs["test"].remove_columns(bert_columns)

  # Re-seed before building the model so the newly initialised classification head
  # does not depend on RNG state left over from the previous run.
  set_seed(RANDOM_SEED)
  seq_model = seq_model_init(QA_OUTPUT_PATH)

  trainer = Trainer(
      model=seq_model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset= train_dataset,         # training dataset
      eval_dataset=eval_dataset,          # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
  )
  trainer.train()
  trainer.save_model(f'/content/drive/MyDrive/Dissertation/disbert_hate_task/models/model_{i}')

  # Score the trained model on the held-out test split and tag the row with its run number.
  results = trainer.evaluate(test_dataset)
  results["model_run"] = i
  result_list.append(results)

In [None]:
# One row per run, holding the test-set metrics returned by trainer.evaluate plus model_run.
results_df = pd.DataFrame(result_list)
# Save the per-run metrics to Drive.
results_df.to_csv('/content/drive/MyDrive/Dissertation/results/distilbert_task.csv')

In [None]:
# Sort the runs by eval_f1, ascending (re-binds results_df to the sorted frame).
results_df = results_df.sort_values(by=['eval_f1'])
# Show the worst run: the row with the lowest eval_f1.
results_df.head(1)

In [None]:
# Show the best run: the row with the highest eval_f1.
# Relies on results_df already being sorted ascending by eval_f1 in an earlier cell.
results_df.tail(1)

In [None]:
#Print median f1
results_df["eval_f1"].median()

In [None]:
# Mean of every column across the runs.
# The model_run column is only a run index, so its mean carries no meaning.
results_df.mean()

In [None]:
# Standard deviation of every column across the runs.
results_df.std()

In [None]:
# Per-epoch losses entered by hand.
# NOTE(review): "min"/"max" presumably refer to the runs with the lowest/highest eval_f1 — confirm.
epoch_list = [1, 2, 3]
training_loss_min = [0.5717, 0.441300, 0.2683]
training_loss_max = [0.57460, 0.442, 0.2661]
val_loss_min = [0.570152, 0.525531, 0.596814]
val_loss_max = [0.5456659, 0.504951, 0.583835]

# Training vs. validation loss for the "min" run.
fig, ax = plt.subplots()
min_run_curves = (
    (training_loss_min, "Training Loss Min Run"),
    (val_loss_min, "Validation Loss Min Run"),
)
for losses, curve_label in min_run_curves:
    ax.plot(epoch_list, losses, label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_xticks(epoch_list)
ax.legend()
plt.show()

In [None]:
# Training vs. validation loss for the "max" run.
# Uses epoch_list, training_loss_max and val_loss_max defined in an earlier cell.
fig, ax = plt.subplots()
max_run_curves = (
    (training_loss_max, "Training Loss Max Run"),
    (val_loss_max, "Validation Loss Max Run"),
)
for losses, curve_label in max_run_curves:
    ax.plot(epoch_list, losses, label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_xticks(epoch_list)
ax.legend()
plt.show()