<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Token-Classification-Sample-Projects/blob/main/NLP_Transformers_NER_Conll2003.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Transformers - NER - CoNLL-2003**

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install seqeval

In [None]:
import numpy as np
import torch
from sklearn.metrics import classification_report, confusion_matrix

from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

## 1. Loading Data

In [None]:
# Fetch the CoNLL-2003 dataset; the result holds train, validation and test splits.
dataset = load_dataset('conll2003')

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Inspect one raw training example: word tokens plus integer-encoded tag sequences.
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Feature schema: each ClassLabel lists the string names behind the integer tags.
dataset['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [None]:
# NER tag names ordered by their integer id ('O', 'B-PER', 'I-PER', ...).
label_names = dataset['train'].features['ner_tags'].feature.names

## 2. Preprocessing

In [None]:
# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = 'bert-base-cased'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize an already word-split sentence; the 9 words yield 12 input ids.
tokenizer(dataset['train'][0]['tokens'], is_split_into_words=True)

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# word_ids() maps every sub-token back to its source word index; None marks
# special tokens, and a repeated index means one word was split into pieces.
tokenizer(dataset['train'][0]['tokens'], is_split_into_words=True).word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [None]:
# Tag ids follow ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'],
# so every B-* id (1, 3, 5, 7) is immediately followed by its matching I-* id.
begin2inside = {begin_id: begin_id + 1 for begin_id in (1, 3, 5, 7)}

In [None]:
def align_targets(labels, word_ids):
  """Expand word-level tag ids to one tag id per sub-token.

  Args:
    labels: tag ids for one sentence, indexed by word position.
    word_ids: for each sub-token, the index of the word it came from, or
      None for tokens that belong to no word (e.g. [CLS]).

  Returns:
    A list with one entry per element of word_ids: -100 where the word id is
    None, the word's tag for the first sub-token of a word, and for later
    sub-tokens of the same word the tag with any B-* id swapped for its I-*
    counterpart via begin2inside.
  """
  result = []
  previous = None
  for current in word_ids:
    if current is None:
      tag = -100
    else:
      tag = labels[current]
      if current == previous:
        # Continuation piece of the same word: a B-* tag becomes I-*.
        tag = begin2inside.get(tag, tag)
    result.append(tag)
    previous = current
  return result

In [None]:
def tokenize_func(example):
  """Tokenize a batch of word-split sentences and attach aligned labels.

  Args:
    example: a batch mapping with 'tokens' (one word list per sentence) and
      'ner_tags' (one tag-id list per sentence).

  Returns:
    The tokenizer output for the batch, with an extra 'labels' entry holding
    one sub-token-aligned tag list per sentence (built by align_targets).
  """
  encoded = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
  encoded['labels'] = [
      align_targets(sentence_tags, encoded.word_ids(row))
      for row, sentence_tags in enumerate(example['ner_tags'])
  ]
  return encoded

In [None]:
# Tokenize and align labels for every split in batches; the raw columns are
# dropped so only the tokenizer outputs and 'labels' remain.
tokenized_datasets = dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
tokenized_datasets

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

## 3. Train Model

In [None]:
# Two-way lookup between integer tag ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

In [None]:
# Load the pretrained checkpoint with a token-classification head; the label
# maps tell the model which tag name belongs to each output id.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    )

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator that assembles token-classification batches using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# seqeval metric object used by compute_metrics.
# NOTE(review): load_metric is deprecated and absent from datasets >= 3.0;
# evaluate.load('seqeval') is the replacement if the dependency is upgraded.
metric = load_metric('seqeval')

def compute_metrics(eval_preds):
  """Compute seqeval precision/recall/F1/accuracy for one evaluation pass.

  Args:
    eval_preds: a (logits, labels) pair. logits has one score per tag for
      every token position; labels holds the gold tag id per position, with
      -100 at positions that must be ignored.

  Returns:
    A dict with 'precision', 'recall', 'f1' and 'accuracy' taken from the
    metric's overall_* results.
  """
  logits, labels = eval_preds
  preds = np.argmax(logits, axis=-1)

  # Convert ids to tag names, skipping every position whose gold label is
  # -100 so predictions and references stay aligned token for token.
  str_labels = [
      [label_names[t] for t in label if t != -100] for label in labels
  ]
  str_preds = [
      [label_names[p] for p, t in zip(pred, targ) if t != -100]
      for pred, targ in zip(preds, labels)
  ]

  # Score predictions against the gold labels. Passing the predictions as
  # their own references would always report a perfect 1.0.
  the_metrics = metric.compute(predictions=str_preds, references=str_labels)

  return {
      'precision': the_metrics['overall_precision'],
      'recall': the_metrics['overall_recall'],
      'f1': the_metrics['overall_f1'],
      'accuracy': the_metrics['overall_accuracy'],
  }

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='trainer_dir',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=200,
)

In [None]:
# Wire the model, data, collator and metric function into a Trainer;
# the validation split is used for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; evaluation and checkpointing follow training_args.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.051,0.06872,1.0,1.0,1.0,1.0
2,0.0311,0.059719,1.0,1.0,1.0,1.0
3,0.0182,0.059587,1.0,1.0,1.0,1.0


TrainOutput(global_step=2634, training_loss=0.03681645943481269, metrics={'train_runtime': 487.9811, 'train_samples_per_second': 86.321, 'train_steps_per_second': 5.398, 'total_flos': 1054683418795902.0, 'train_loss': 0.03681645943481269, 'epoch': 3.0})

## 4. Evaluate

In [None]:
# Score the model on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.0595865435898304,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 14.2878,
 'eval_samples_per_second': 227.466,
 'eval_steps_per_second': 28.486,
 'epoch': 3.0}

## 5. Predict

In [None]:
# Write the model to disk so the inference pipeline can load it by path.
trainer.save_model('my_saved_model')

In [None]:
# Inference pipeline over the saved model. 'simple' aggregation merges
# sub-token predictions into whole entity spans.
ner = pipeline(
    'token-classification',
    model='my_saved_model',
    aggregation_strategy='simple',
    # Use the first GPU when one is present; otherwise run on CPU (-1) so the
    # cell does not fail on a CPU-only runtime.
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Run the pipeline on a raw sentence and show the extracted entity spans.
ner('Bill Gates was the CEO of Microsoft in Seattle.')

[{'entity_group': 'PER',
  'score': 0.9989754,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9986278,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9985378,
  'word': 'Seattle',
  'start': 39,
  'end': 46}]