<a href="https://colab.research.google.com/github/Chiamakac/TRAININGS/blob/main/TRAININGS/IgboBERT%202.0/FINE%20TUNING/Fine_tuned_IgboBert_2_0_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Token Classification (PyTorch)-Fine-Tunning 

Install the Transformers and Datasets libraries to run this notebook.

In [None]:
!pip install datasets transformers[sentencepiece]
#!pip install accelerate
# To run the training on TPU, you will need to uncomment the followin line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to setup git, adapt your email and name in the following cell.

In [32]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [33]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
#We use the load_dataset() method from the Datasets library to download our dataset.
from datasets import load_dataset

raw_datasets = load_dataset('masakhane/masakhaner2', 'ibo')

In [None]:
#shows us the columns present and the split between the training, validation, and test sets
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 7634
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1090
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2181
    })
})

In [5]:
# displays the first element of the training set
raw_datasets["train"][0]["tokens"]

['Ọ',
 'dị',
 'ọtụtụ',
 'ihe',
 'mere',
 'ka',
 'Ekeresimesi',
 'dị',
 'iche',
 "n'oge",
 'ugbua',
 '.']

In [6]:
#displays the features attribute of our dataset
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE'], id=None), length=-1, id=None)

In [7]:
#we can access the list of names in the ner_feature by looking at the names attribute of that feature
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE']

In [8]:
#decoding the labels we saw earlier and prints line 1 and line 2 of the training set with the labels
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Ọ dị ọtụtụ ihe mere ka Ekeresimesi dị iche n'oge ugbua . 
O O  O     O   O    O  B-DATE      O  O    O     O     O 


In [9]:
#mount the gdrive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [12]:
#model path in the gdrive
MODEL_PATH = "/content/gdrive/MyDrive/IboBERT_2.0/IgboBert_2.0" #replace with the model path you want to work with

**Processing the data**

In [13]:
#we will be using a IgboBERT pretrained model.
#Download and cache the associated tokenizer

#You can replace the model_checkpoint with any other model you prefer from the Hub, 
#or with a local folder in which you’ve saved a pretrained model and a tokenizer
from transformers import AutoTokenizer

model_checkpoint = MODEL_PATH
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)

In [14]:
#Tokenizing our pre-tokenized input with our tokenizer
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['<s>',
 'Ġá»Į',
 'Ġdá»ĭ',
 'Ġá»įtá»¥tá»¥',
 'Ġihe',
 'Ġmere',
 'Ġka',
 'ĠEkeresimesi',
 'Ġdá»ĭ',
 'Ġiche',
 'Ġn',
 "'",
 'oge',
 'Ġugbua',
 'Ġ.',
 '</s>']

In [15]:
# Expanding our label list to match the tokens and assigning a label of -100 to special tokens 
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [16]:
# trying out the above line of code on our first sentence to see if -100 was assigned to the special tokens
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, -100]


In [17]:
#we tokenize all the inputs and apply align_labels_with_tokens() on all the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [18]:
#applying all that preprocessing in one go on the other splits of our dataset
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/8 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

**Fine-tuning the model with the Trainer API**

In [19]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    7,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    7,
            8,    8,    0,    0,    0,    0,    0,    0,    0,    7,    0, -100]])


To have the Trainer compute a metric every epoch, we will need to define a compute_metrics() function that takes the arrays of predictions and    labels, and returns a dictionary with the metric names and values.To use this metric, we first need to install the seqeval library.

In [21]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=298fba9cf08f4352b9678175fc910900373fb7a6cc98072fcb76b8d8fd95662d
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [22]:
from datasets import load_metric

metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [23]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [24]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [25]:
#defining the model we want to finetune
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at /content/gdrive/MyDrive/IboBERT_2.0/IgboBert_2.0 were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /content/gdrive/MyDrive/IboBERT_2.0/IgboBert_2.0 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRA

In [26]:
#log in to Hugging Face 
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [27]:
#we define our TrainingArguments
from transformers import TrainingArguments

args = TrainingArguments(
    "IgboBert2.0-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=30,
    weight_decay=0.01,
    #push_to_hub=True, (uncomment if you want to upload your results to the Model Hub)
)

In [28]:
#we pass everything to the Trainer and start training
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 7634
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 28650
  Number of trainable parameters = 82867209


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1628,0.090082,0.646688,0.732143,0.686767,0.973372
2,0.0563,0.083761,0.664662,0.789286,0.721633,0.977168
3,0.0339,0.078584,0.75793,0.810714,0.783434,0.980784
4,0.0221,0.092547,0.721116,0.808036,0.762105,0.981146
5,0.0163,0.102827,0.748339,0.804464,0.775387,0.979622
6,0.011,0.119185,0.747826,0.767857,0.757709,0.978899
7,0.0089,0.122469,0.749789,0.794643,0.771565,0.98112
8,0.0073,0.120478,0.742149,0.801786,0.770815,0.980862
9,0.0071,0.138398,0.723281,0.779464,0.750322,0.979932
10,0.0049,0.144206,0.71406,0.807143,0.757754,0.977349


***** Running Evaluation *****
  Num examples = 1090
  Batch size = 8
Saving model checkpoint to IgboBert2.0-finetuned-ner/checkpoint-955
Configuration saved in IgboBert2.0-finetuned-ner/checkpoint-955/config.json
Model weights saved in IgboBert2.0-finetuned-ner/checkpoint-955/pytorch_model.bin
tokenizer config file saved in IgboBert2.0-finetuned-ner/checkpoint-955/tokenizer_config.json
Special tokens file saved in IgboBert2.0-finetuned-ner/checkpoint-955/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1090
  Batch size = 8
Saving model checkpoint to IgboBert2.0-finetuned-ner/checkpoint-1910
Configuration saved in IgboBert2.0-finetuned-ner/checkpoint-1910/config.json
Model weights saved in IgboBert2.0-finetuned-ner/checkpoint-1910/pytorch_model.bin
tokenizer config file saved in IgboBert2.0-finetuned-ner/checkpoint-1910/tokenizer_config.json
Special tokens file saved in IgboBert2.0-finetuned-ner/checkpoint-1910/special_tokens_map.json
***** Running Evaluation **

TrainOutput(global_step=28650, training_loss=0.010868117061165692, metrics={'train_runtime': 2401.8228, 'train_samples_per_second': 95.353, 'train_steps_per_second': 11.928, 'total_flos': 3746107070950860.0, 'train_loss': 0.010868117061165692, 'epoch': 30.0})

**Using the fine-tuned model**

In [29]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "/content/IgboBert2.0-finetuned-ner/checkpoint-28650"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Google chetara ụbọchị ọmụmụ Keshi mgbe Obasanjo chịrị Naịjirịa afọ asatọ na ọchịchị onye kwuo uche ya."
                 "Onyeisi ndị na-emenyu ọkụ na, Legọọsi steeti bụ Rasak Fadipe ekwuola na onweghi onye nwụrụ n'ime ọkụ ahụ gbara n'ehihie ụbọchị 24 Jenuwarị, 2018.")

loading configuration file /content/IgboBert2.0-finetuned-ner/checkpoint-28650/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/IgboBert2.0-finetuned-ner/checkpoint-28650",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-DATE",
    "8": "I-DATE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DATE": "7",
    "B-LOC": "5",
    "B-ORG": "3",
    "B-PER": "1",
    "I-DATE": "8",
    "I-LOC": "6",
    "I-ORG": "4",
    "I-PER": "2",
    "O": "0"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers":

[{'entity_group': 'PER',
  'score': 0.99995863,
  'word': ' Google',
  'start': 0,
  'end': 6},
 {'entity_group': 'PER',
  'score': 0.99998486,
  'word': ' Obasanjo',
  'start': 39,
  'end': 47},
 {'entity_group': 'DATE',
  'score': 0.9995365,
  'word': ' asatọ',
  'start': 67,
  'end': 72},
 {'entity_group': 'LOC',
  'score': 0.5052215,
  'word': ' Legọọsi',
  'start': 132,
  'end': 139},
 {'entity_group': 'PER',
  'score': 0.99997354,
  'word': ' Rasak Fadipe',
  'start': 150,
  'end': 162},
 {'entity_group': 'DATE',
  'score': 0.9789229,
  'word': " n'ehihie ụbọchị 24 Jenuwarị, 2018",
  'start': 213,
  'end': 246}]

In [30]:
#save the fine-tuned model in gdrive
import shutil
shutil.move('/content/IgboBert2.0-finetuned-ner/checkpoint-28650','/content/gdrive/MyDrive/IboBERT_2.0/FINE TUNNED')

'/content/gdrive/MyDrive/IboBERT_2.0/FINE TUNNED'