# importing libraries

In [1]:
import os, re, math, random, json, string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import wandb

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import TrainerCallback, AdamW, get_cosine_schedule_with_warmup
from transformers import DataCollatorForTokenClassification, PreTrainedModel, RobertaTokenizerFast

from datasets import load_dataset, ClassLabel, Sequence, load_metric

from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
# Need to log in to weights and biases to visualizze our training crosse loss and ephocs
wandb.login()


wandb: Currently logged in as: abdel_tracker (abdel_team). Use `wandb login --relogin` to force relogin


True

## configuration 

In [3]:
# Hugging Face models roberta 
models = dict(
    ROBERTA = "roberta-base",
    DISTILBERT_U = "distilbert-base-uncased",
    DISTILBERT_C = "distilbert-base-cased")

In [4]:
# Logging date for w&b"jsut to show data in dandb visualization
from datetime import date
today = date.today()
log_date = today.strftime("%d-%m-%Y")

In [5]:
# LOAD OR TRAIN MODEL
TRAIN = 1 # 1 to TRAIN WEIGHTS or 0 to LOAD WEIGHTS

# TRAIN/VALIDATION SPLIT
TRAIN_SPLIT = 0.90

# RANDOM SEED FOR REPRODUCIBILITY
RANDOM_SEED = 30

# BATCH SIZE

BATCH_SIZES = 1

# EPOCHS - TRANSFORMERS ARE TYPICALLY FINE-TUNED BETWEEN 1 AND 3 EPOCHS 
EPOCHS = 3

# our model of transformers
MODEL_CHECKPOINT = models['ROBERTA']

# SPECIFY THE WEIGHTS AND BIASES PROJECT NAME
%env WANDB_PROJECT = 'IC-NER' 

# DETERMINE WHETHER TO SAVE THE MODEL IN THE 100GB OF FREE W&B STORAGE
%env WANDB_LOG_MODEL = false 

env: WANDB_PROJECT='IC-NER'
env: WANDB_LOG_MODEL=false


#  File and dataset handling

In [6]:
#########paths
FEATURE_CLASS_LABELS = "feature_class_labels.json"
DATA_FILE = 'v1-annotated.json'
TEMP_MODEL_OUTPUT_DIR = 'temp_model_output_dir'
SAVED_MODEL = f"p2d-NER-Fine-Tune-Transformer-{MODEL_CHECKPOINT}" # Change for notebook version

In [7]:
data_files = DATA_FILE
datasets = load_dataset('json', data_files=data_files, field='data')

# Create train and validation datasets
datasets = datasets['train'].train_test_split(test_size=1-TRAIN_SPLIT, seed=RANDOM_SEED)
print(datasets)

Using custom data configuration default-221eed0ba180e81c
Found cached dataset json (C:/Users/ce pc/.cache/huggingface/datasets/json/default-221eed0ba180e81c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at C:\Users\ce pc\.cache\huggingface\datasets\json\default-221eed0ba180e81c\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-80e141d39c0af3f1.arrow and C:\Users\ce pc\.cache\huggingface\datasets\json\default-221eed0ba180e81c\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-24a78c632a3d4f3c.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'split_tokens'],
        num_rows: 43
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'split_tokens'],
        num_rows: 5
    })
})


In [8]:
# Opening the label list created in pre-processing corresponding to the ner_tag indices
with open(FEATURE_CLASS_LABELS, 'r') as f:
    label_list = json.load(f)

for n in range(len(label_list)):
    print(n, label_list[n])

0 B-Doxing
1 B-Harassment
2 B-Insult
3 B-Racism
4 B-Sexism
5 B-Trolling
6 I-Doxing
7 I-Harassment
8 I-Insult
9 I-Racism
10 I-Sexism
11 I-Trolling
12 O


In [9]:
# Checking some random samples to ensure data loaded as expected:
def show_random_elements(dataset, num_examples=1):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

show_random_elements(datasets["train"], num_examples=3)

Unnamed: 0,id,ner_tags,split_tokens
0,36,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 4, 10, 10, 10, 10, 10, 10, 10, 12, 12]","["", SO, HILARIOUS, U, WRITE, UR, OWN, MATERIAL, ?, @JesseElJefe, A, lot, of, ppl, call, me, sexist, ., But, those, ppl, are, women, ,, and, their, opinions, do, n't, matter, ., ""]"
1,61,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 2, 8, 3, 12, 12, 12, 12, 2, 12, 12, 12, 12, 12, 12, 12]","["", @D_Paid, :, @Twin_Thing_Two, :, Y'all, see, this, though, right, !, ?, RT, -------&gt, ;, "", "", @tayyoung, _, :, FUCK, OBAMA, ,, dumb, ass, nigger, "", "", "", "", bitch, Fuck, u""""EAT, A, DICK, !, !, ""]"
2,75,"[1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[Cardi, talks, like, all, the, girls, in, high, school, that, bullied, me]"


In [11]:
# starting  the tokenizer


if MODEL_CHECKPOINT == models['ROBERTA']:
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
        

In [12]:
def word_id_func(input_ids, print_labs=False):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    word_ids = []
    i=0
    spec_toks = ['[CLS]', '[SEP]', '[PAD]']
    for t in tokens:
        if t in spec_toks:
            word_ids.append(-100)
            print(t, i) if print_labs else None
        elif t.startswith('▁'):
            i += 1
            word_ids.append(i)
            print(t, i) if print_labs else None
        else:
            word_ids.append(i)
            print(t, i) if print_labs else None
        print("Total:", i) if print_labs else None
    return word_ids

def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["split_tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def tokenize_and_align_labels_deberta(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["split_tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    labels = []
    word_ids_list = []
    for input_ids in tokenized_inputs["input_ids"]:
        wids = word_id_func(input_ids, print_labs=False)
        word_ids_list.append(wids)
    
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = word_ids_list[i]
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx == -100:
                label_ids.append(-100)
            #We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx-1])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx-1] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [13]:
#appling the map method
#if MODEL_CHECKPOINT == models['DEBERTA_V2_XL'] or MODEL_CHECKPOINT == models['DEBERTA_V2_XXL']:
tokenize_and_align_labels = tokenize_and_align_labels_deberta

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, load_from_cache_file=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Build the  Model

In [14]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=len(label_list))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [15]:
#Optimizer
learning_rate = 0.0000075
lr_max = learning_rate * BATCH_SIZES
weight_decay = 0.05

optimizer = AdamW(
    model.parameters(),
    lr=lr_max,
    weight_decay=weight_decay)

print("The maximum learning rate is: ",lr_max)

# Learning Rate 
num_train_samples = len(datasets["train"])
warmup_ratio = 0.2 # Percentage of total steps to go from zero to max learning rate
num_cycles=0.8 # The cosine exponential rate

num_training_steps = num_train_samples*EPOCHS/BATCH_SIZES
num_warmup_steps = num_training_steps*warmup_ratio

lr_sched = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                           num_warmup_steps=num_warmup_steps,
                                           num_training_steps = num_training_steps,
                                           num_cycles=num_cycles)

The maximum learning rate is:  7.5e-06




In [16]:
#creating args
args = TrainingArguments(output_dir = TEMP_MODEL_OUTPUT_DIR,
                         evaluation_strategy = "epoch",
                         learning_rate=lr_max,
                         per_device_train_batch_size=BATCH_SIZES,
                         per_device_eval_batch_size=BATCH_SIZES,
                         num_train_epochs=EPOCHS,
                         weight_decay=weight_decay,
                         lr_scheduler_type = 'cosine',
                         warmup_ratio=warmup_ratio,
                         logging_strategy="epoch",
                         save_strategy="epoch",
                         seed=RANDOM_SEED,
                         report_to = 'wandb', # enable logging to W&B
                         run_name = MODEL_CHECKPOINT+"-"+log_date,
                         metric_for_best_model="f1",
                         load_best_model_at_end = True)   # name of the W&B run (optional)

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training

In [18]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)]

    # Define the metric parameters
    overall_precision = precision_score(true_labels, true_predictions, zero_division=1)
    overall_recall = recall_score(true_labels, true_predictions, zero_division=1)
    overall_f1 = f1_score(true_labels, true_predictions, zero_division=1)
    overall_accuracy = accuracy_score(true_labels, true_predictions)
    
    # Return a dictionary with the calculated metrics
    return {
        "precision": overall_precision,
        "recall": overall_recall,
        "f1": overall_f1,
        "accuracy": overall_accuracy,}

In [19]:
# Defining and sreating  the Trainer...
trainer = Trainer(
                model=model,
                args=args,
                train_dataset=tokenized_datasets["train"],
                eval_dataset=tokenized_datasets["test"],
                data_collator=data_collator,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
                optimizers=(optimizer, lr_sched)
                )

In [20]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, split_tokens, id. If ner_tags, split_tokens, id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 43
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 129
  Number of trainable parameters = 124065037
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01720000000004802, max=1.0)…

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.7893,1.895218,1.0,0.0,0.0,0.8
2,0.5965,1.048762,1.0,0.0,0.0,0.8
3,0.2568,1.365016,1.0,0.0,0.0,0.8


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, split_tokens, id. If ner_tags, split_tokens, id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 1
Saving model checkpoint to temp_model_output_dir\checkpoint-43
Configuration saved in temp_model_output_dir\checkpoint-43\config.json
Model weights saved in temp_model_output_dir\checkpoint-43\pytorch_model.bin
tokenizer config file saved in temp_model_output_dir\checkpoint-43\tokenizer_config.json
Special tokens file saved in temp_model_output_dir\checkpoint-43\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, split_tokens, id. If ner_tags, split_tokens, id are not expected by `RobertaForTokenCl

TrainOutput(global_step=129, training_loss=1.2142170824745828, metrics={'train_runtime': 930.5877, 'train_samples_per_second': 0.139, 'train_steps_per_second': 0.139, 'total_flos': 3125154597786.0, 'train_loss': 1.2142170824745828, 'epoch': 3.0})

In [24]:
# Evaluate based on the chosen epoch (usually best or last)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, split_tokens, id. If ner_tags, split_tokens, id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 1


{'eval_loss': 0.7428363561630249,
 'eval_precision': 1.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.896774193548387,
 'eval_runtime': 2.761,
 'eval_samples_per_second': 1.811,
 'eval_steps_per_second': 1.811,
 'epoch': 3.0}

In [25]:
# Finish Weighs & Biases logging for this run
wandb.finish() 

0,1
eval/accuracy,▁▁▁▁
eval/f1,▁▁▁▁
eval/loss,█▄▁█
eval/precision,▁▁▁▁
eval/recall,▁▁▁▁
eval/runtime,█▁▅▃
eval/samples_per_second,▁█▃▆
eval/steps_per_second,▁█▃▆
train/epoch,▁▁▅▅████
train/global_step,▁▁▅▅████

0,1
eval/accuracy,0.89677
eval/f1,0.0
eval/loss,0.74284
eval/precision,1.0
eval/recall,0.0
eval/runtime,2.761
eval/samples_per_second,1.811
eval/steps_per_second,1.811
train/epoch,3.0
train/global_step,129.0


In [26]:
trainer.save_model(SAVED_MODEL)

Saving model checkpoint to p2d-NER-Fine-Tune-Transformer-roberta-base
Configuration saved in p2d-NER-Fine-Tune-Transformer-roberta-base\config.json
Model weights saved in p2d-NER-Fine-Tune-Transformer-roberta-base\pytorch_model.bin
tokenizer config file saved in p2d-NER-Fine-Tune-Transformer-roberta-base\tokenizer_config.json
Special tokens file saved in p2d-NER-Fine-Tune-Transformer-roberta-base\special_tokens_map.json


In [27]:
loaded_model = AutoModelForTokenClassification.from_pretrained(SAVED_MODEL)

pred_trainer = Trainer(
    loaded_model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file p2d-NER-Fine-Tune-Transformer-roberta-base\config.json
Model config RobertaConfig {
  "_name_or_path": "p2d-NER-Fine-Tune-Transformer-roberta-base",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_

In [28]:
# Extract the predictions and produce a classification report
predictions, labels, _ = pred_trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
    ]

# Generate the metrics and display
results = classification_report(true_labels, true_predictions, zero_division=1)
print(results)

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, split_tokens, id. If ner_tags, split_tokens, id are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5
  Batch size = 1


              precision    recall  f1-score   support

      Insult       1.00      0.00      0.00        13
      Racism       1.00      0.00      0.00         2

   micro avg       1.00      0.00      0.00        15
   macro avg       1.00      0.00      0.00        15
weighted avg       1.00      0.00      0.00        15



In [51]:
check = 3

print(len(datasets["test"][check]['split_tokens']))
print(len(true_predictions[check]))
print(len(true_labels[check]))

27
27
27


In [52]:
# Have a look at the predicted extracted data
check_pred = zip(datasets["test"][check]['split_tokens'], true_predictions[check])
for tup in check_pred:
    if tup[1] != 'O':
        print(tup)

In [54]:
# Compare to the actual labels
check_true = zip(datasets["test"][check]['split_tokens'], true_labels[check])
for tup in check_true:
    if tup[1] != 'O':
        print(tup)

('Muthafuckas', 'B-Insult')
('fuck', 'B-Insult')
('dumb', 'B-Insult')
('ass', 'I-Insult')
('niggers', 'B-Racism')


In [55]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\ce pc\.huggingface\token
Login successful


In [22]:
#!git init && git remote add origin && git pull origin main


In [21]:
#trainer.push_to_hub(commit_message="Training complete")