<a href="https://colab.research.google.com/github/Arshad221b/Named-Entity-Recognition/blob/master/NER_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Installing transformers and Datasets

In [None]:
! pip install datasets
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importing all the necessary libraries 

In [None]:
# visualization libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# pytorch libraries
import torch # the main pytorch library
import torch.nn as nn 
import torch.optim as optim 

# huggingface's transformers library
from transformers import RobertaForTokenClassification, RobertaTokenizer, pipeline, AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

# huggingface's datasets library
from datasets import load_dataset

# the tqdm library used to show the iteration progress
import tqdm
# `import tqdm` alone does not load the `notebook` submodule; import it explicitly so the
# attribute access below does not depend on another library having imported it first.
import tqdm.notebook
tqdmn = tqdm.notebook.tqdm

from seqeval.metrics import f1_score, classification_report

In [None]:
# NOTE: this roberta-base tokenizer is not used by any later cell shown in this notebook;
# `tokenizer` is rebound to the tokenizer of `checkpoint` in the "Align Tokens" section
# before its first use.
roberta_version = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(roberta_version)

## Importing data

In [None]:
# Load the "conll2003" dataset through the `datasets` library; the result holds the
# train / validation / test splits inspected in the next cells.
dataset = load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}



*   The dataset contains three splits (`train`, `validation` and `test`); each split can be indexed like a Python list.



In [None]:
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

#### NER labels

In [None]:
# Number of distinct NER classes, read from the ClassLabel feature of the `ner_tags` column;
# passed to the model constructor later so the classification head has the right size.
num_labels = dataset['train'].features['ner_tags'].feature.num_classes
num_labels

9

In [None]:
dataset['train'].features['ner_tags'].feature

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [None]:
# ClassLabel feature describing the NER tag vocabulary of the dataset.
labels = dataset['train'].features['ner_tags'].feature
# Tag name -> integer id, and the inverse id -> tag name; both are handed to the model config
# so that predictions are reported with readable tag names.
label2id = {name: labels.str2int(name) for name in labels.names}
id2label = {idx: name for name, idx in label2id.items()}

In [None]:
# ClassLabel feature of the `ner_tags` column (same expression as `labels` above);
# used by create_tag_names for the int -> str conversion.
tags = dataset['train'].features["ner_tags"].feature

In [None]:
def create_tag_names(batch):
    """Return a new `ner_tags_str` column holding the string name of every NER tag id.

    `batch` must expose a "ner_tags" sequence of integer tag ids; each id is converted
    with the notebook-level `tags` ClassLabel feature.
    """
    tag_names = []
    for tag_id in batch["ner_tags"]:
        tag_names.append(tags.int2str(tag_id))
    return {"ner_tags_str": tag_names}

In [None]:
# Add the human-readable `ner_tags_str` column to every split (for inspection only;
# the training pipeline below keeps working from `dataset`).
dataset_x = dataset.map(create_tag_names)

  0%|          | 0/14041 [00:00<?, ?ex/s]

  0%|          | 0/3250 [00:00<?, ?ex/s]

  0%|          | 0/3453 [00:00<?, ?ex/s]

In [None]:
dataset_x

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3453
    })
})

## Align Tokens

In [None]:
# Checkpoint that is actually fine-tuned in this notebook: a small pretrained BERT model.
checkpoint = 'prajjwal1/bert-small'

# Rebinds `tokenizer` (previously the roberta-base tokenizer) to the one matching `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Checkpoint id with "/" replaced by "_" so it can be embedded in the run / directory name below.
new_checkpoint = "_".join(checkpoint.split("/")[-2:])
new_checkpoint

'prajjwal1_bert-small'

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word-level NER tags to sub-tokens.

    `examples` is a batch shaped like `dataset['train'][:10]`: it provides "tokens"
    (one list of words per sentence) and "ner_tags" (one list of tag ids per sentence).
    The notebook-level `tokenizer` is used, without truncation.

    For every sentence, only the first sub-token of each word receives the word's tag;
    special tokens and the remaining sub-tokens of a word are labelled -100.

    Returns the tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=False, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        # word_ids maps every produced token back to the index of its source word
        # (None for special tokens).
        for current_word in encoded.word_ids(batch_index=row):
            starts_new_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Here `batch_size_num` is only the number of examples handed to tokenize_and_align_labels
# per call; the same name is rebound to the training batch size in the hyperparameter cell.
batch_size_num = 8
# Tokenize every split and attach the aligned `labels` column.
dataset_encoded = dataset.map(tokenize_and_align_labels, batched = True, batch_size = batch_size_num)

print(dataset_encoded["train"].column_names)

  0%|          | 0/1756 [00:00<?, ?ba/s]

  0%|          | 0/407 [00:00<?, ?ba/s]

  0%|          | 0/432 [00:00<?, ?ba/s]

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']


In [None]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

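The dataset now also has `input_ids`, `token_type_ids` and `attention_mask` columns, which are generated by the tokenizer, and a `labels` column, which is produced by the alignment step in `tokenize_and_align_labels`.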

In [None]:
# Drop the raw word and tag columns; their information now lives in `input_ids` / `labels`.
# NOTE(review): this cell rebinds `dataset_encoded` in place, so re-running it on the already
# trimmed dataset will likely fail because the columns are gone — re-run the map cell first.
dataset_encoded = dataset_encoded.remove_columns(["tokens","ner_tags"])
print(dataset_encoded.column_names)

{'train': ['id', 'pos_tags', 'chunk_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'], 'validation': ['id', 'pos_tags', 'chunk_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'], 'test': ['id', 'pos_tags', 'chunk_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']}


In [None]:
# Keep only examples with at most 512 tokens. Tokenization above ran with truncation=False,
# so this filter is the only guard against over-long sequences.
# NOTE(review): 512 is presumably the maximum sequence length of the checkpoint — confirm
# against the model config. The recorded output shows no training example is removed
# (14041 rows before and after).
dataset_encoded = dataset_encoded.filter(lambda example: len(example['input_ids']) <= 512)
print(len(dataset_encoded['train']))

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

14041


In [None]:
def align_predictions(predictions, label_ids, label_map=None):
    """Turn raw model scores and padded label ids into per-sentence lists of tag names.

    Positions whose gold label id is -100 are ignored, so both returned lists only
    contain entries for positions that carry a real label.

    Args:
        predictions: array-like of shape (batch, seq_len, num_labels) with per-token scores;
            the predicted id is the argmax over the last axis.
        label_ids: array-like of shape (batch, seq_len) with gold label ids, -100 marking
            positions to skip.
        label_map: optional mapping from integer label id to tag name. When omitted, the
            notebook-level `id2label` mapping is used, which keeps existing two-argument
            calls working unchanged.

    Returns:
        A tuple `(preds_list, labels_list)`: two lists with one list of tag names per
        sentence, predictions first and gold labels second.
    """
    if label_map is None:
        label_map = id2label

    label_ids = np.asarray(label_ids)
    preds = np.argmax(predictions, axis=-1)
    # True wherever a real label exists; -100 marks positions to skip.
    keep = label_ids != -100

    preds_list, labels_list = [], []
    for row_preds, row_labels, row_keep in zip(preds, label_ids, keep):
        # .tolist() yields plain Python ints, matching the int keys of the label mapping.
        preds_list.append([label_map[idx] for idx in row_preds[row_keep].tolist()])
        labels_list.append([label_map[idx] for idx in row_labels[row_keep].tolist()])
    return preds_list, labels_list

In [None]:
def compute_metrics(eval_pred):
    """Metric callback for the Trainer: print a per-entity report and return the F1 score.

    `eval_pred` must expose `.predictions` and `.label_ids`; both are converted to
    tag-name sequences with align_predictions before scoring.

    Returns a dict with the single key "f1". F1 is reported rather than accuracy
    because every sentence contains several predictions.
    """
    predicted_tags, gold_tags = align_predictions(eval_pred.predictions, eval_pred.label_ids)

    # Per-class precision / recall / F1 table, printed for inspection only.
    report_table = pd.DataFrame(
        classification_report(gold_tags, predicted_tags, output_dict = True)
    ).T
    print(report_table)

    overall_f1 = f1_score(gold_tags, predicted_tags)
    return {"f1": overall_f1}

# Loading the data collator
This creates the data collator, which pads the examples of each batch so that they all have the same length.

In [None]:
# Collator for token classification, built from the tokenizer of `checkpoint`; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Setting up hyperparameters

In [None]:
# Demo setting: a single epoch keeps the Colab training time short
# (the original note mentions 25 as the alternative value).
total_epochs = 1
# Per-device batch size used for both training and evaluation
# (rebinds the value 8 that was used earlier for dataset.map).
batch_size_num = 4
# Gradients are accumulated over this many batches before each optimizer step.
gradient_accumulation_steps = 2
# Number of examples contributing to one optimizer step: batch size x accumulation steps.
effective_batch_size = batch_size_num * gradient_accumulation_steps

# Run name embedding checkpoint, epoch count and effective batch size; used in the Trainer output directory.
model_name = f"{new_checkpoint}-ner_{total_epochs}_epochs_{effective_batch_size}_batch_size"
model_name

'prajjwal1_bert-small-ner_1_epochs_8_batch_size'

# Training the model

In [None]:
# Load the pretrained checkpoint with a token-classification head sized for `num_labels`;
# the id <-> tag-name mappings are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels = num_labels, id2label=id2label, label2id=label2id)

In [None]:
# Training configuration: evaluate, checkpoint and log once per epoch.
training_args = TrainingArguments(
    # load_best_model_at_end = True,
    
    # Checkpoints are written to a run-specific directory derived from `model_name`.
    output_dir = "../models/NER/" + model_name,
    
    # Effective batch size per optimizer step = batch size x accumulation steps.
    per_device_train_batch_size = batch_size_num,
    gradient_accumulation_steps = gradient_accumulation_steps,
    
    num_train_epochs = total_epochs,
    learning_rate = 1e-5,
    weight_decay = 0.01,
    
#     gradient_checkpointing = True,
    
    # Epoch-based schedules; the step-based alternatives are left commented out.
    evaluation_strategy = "epoch",
#     eval_steps = 500,
    per_device_eval_batch_size = batch_size_num,
    
    save_strategy = 'epoch',
#     save_steps = 500,
    
    logging_strategy = 'epoch',
#     logging_steps = 500,
    log_level = "error"

)

## Loading Trainer

In [None]:
# Wire model, data, collator and metric callback together.
# Training uses the train split; evaluation (and compute_metrics) runs on the validation split.
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset_encoded['train'],
#     eval_dataset = dataset_encoded['train'],
    eval_dataset = dataset_encoded['validation'],
    compute_metrics = compute_metrics,
    tokenizer = tokenizer
)

In [None]:
# Run fine-tuning; the per-epoch evaluation report is printed by compute_metrics.
result = trainer.train()



{'loss': 0.2218, 'learning_rate': 0.0, 'epoch': 1.0}
              precision    recall  f1-score  support
LOC            0.846708  0.896026  0.870669   1837.0
MISC           0.707751  0.604121  0.651843    922.0
ORG            0.643186  0.728561  0.683217   1341.0
PER            0.922513  0.956569  0.939232   1842.0
micro avg      0.802273  0.831706  0.816725   5942.0
macro avg      0.780040  0.796319  0.786240   5942.0
weighted avg   0.802715  0.831706  0.815664   5942.0
{'eval_loss': 0.1240999847650528, 'eval_f1': 0.8167245083457281, 'eval_runtime': 5.8066, 'eval_samples_per_second': 559.709, 'eval_steps_per_second': 140.013, 'epoch': 1.0}
{'train_runtime': 78.6941, 'train_samples_per_second': 178.425, 'train_steps_per_second': 22.302, 'train_loss': 0.22184872532162572, 'epoch': 1.0}


In [None]:
# Evaluate the fine-tuned model once more on the validation split configured in the Trainer.
trainer.evaluate()

              precision    recall  f1-score  support
LOC            0.846708  0.896026  0.870669   1837.0
MISC           0.707751  0.604121  0.651843    922.0
ORG            0.643186  0.728561  0.683217   1341.0
PER            0.922513  0.956569  0.939232   1842.0
micro avg      0.802273  0.831706  0.816725   5942.0
macro avg      0.780040  0.796319  0.786240   5942.0
weighted avg   0.802715  0.831706  0.815664   5942.0
{'eval_loss': 0.1240999847650528, 'eval_f1': 0.8167245083457281, 'eval_runtime': 5.812, 'eval_samples_per_second': 559.187, 'eval_steps_per_second': 139.883, 'epoch': 1.0}


{'eval_loss': 0.1240999847650528,
 'eval_f1': 0.8167245083457281,
 'eval_runtime': 5.812,
 'eval_samples_per_second': 559.187,
 'eval_steps_per_second': 139.883,
 'epoch': 1.0}

# Saving the model locally

In [None]:
# Persist the fine-tuned model to a directory relative to the current working directory.
trainer.save_model("./my_model")

In [None]:
# Directory the model was saved to by trainer.save_model("./my_model"). Kept relative so the
# notebook also works outside Colab, where the working directory is not /content.
PATH = './my_model'
# tokenizer = AutoTokenizer.from_pretrained(PATH, local_files_only=True)

# Inference using pipeline

In [None]:
# Reload the fine-tuned model from the local directory only (no lookup on the model hub).
model_ = AutoModelForTokenClassification.from_pretrained(PATH, local_files_only= True)

In [None]:
# `pipeline` is already imported in the import cell at the top of the notebook.
# Build an NER pipeline from the reloaded model and the tokenizer used for training.
nlp = pipeline("ner", model=model_, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.84195936, 'index': 4, 'word': 'wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.9583987, 'index': 9, 'word': 'berlin', 'start': 34, 'end': 40}]
