<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Token-Classification-Sample-Projects/blob/main/NLP_Transformers_Token_Classification_POS_Tagger_Brown.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Transformers - Token Classification - POS Tagger - Brown**

In [None]:
!pip install datasets
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [None]:
import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.corpus import brown
import json

from datasets import load_dataset, load_metric, DatasetDict
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

## 1. Loading Data

In [None]:
# Download the two NLTK packages this notebook reads from:
# the Brown corpus and the universal tagset.
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
# Tagged sentences of the Brown corpus, requested with the 'universal' tagset.
# The recorded output shows each sentence as a list of (word, tag) tuples.
corpus = brown.tagged_sents(tagset='universal')
corpus

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [None]:
# Split every tagged sentence into two parallel lists: its words go to
# `inputs`, its tags go to `targets` (same sentence order, same word order).
inputs, targets = [], []
for sentence in corpus:
  inputs.append([word for word, _ in sentence])
  targets.append([pos for _, pos in sentence])

In [None]:
# Persist the corpus as JSON Lines: one {'inputs': ..., 'targets': ...}
# object per sentence, each terminated by a newline.
with open('data.json', 'w') as out_file:
  out_file.writelines(
      json.dumps({'inputs': words, 'targets': tags}) + '\n'
      for words, tags in zip(inputs, targets)
  )

In [None]:
# Read the JSON Lines file written above back in through the 'json' loader.
raw_dataset = load_dataset('json', data_files='data.json')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Inspect what was loaded; the recorded run shows a single 'train' split
# with the 'inputs' and 'targets' columns.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [None]:
# Hold out 20% of the sentences for validation. The split is seeded so that
# a fresh kernel reproduces the same train/val partition (and therefore
# comparable metrics) instead of a new random one on every run.
ds = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': ds['train'],
    'val': ds['test'],  # the held-out part is exposed under the name 'val'
})

In [None]:
# Inspect the resulting 'train' / 'val' splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 45872
    })
    val: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 11468
    })
})

In [None]:
# Look at one training record: parallel lists of words and tags.
dataset['train'][0]

{'inputs': ['His', 'best', 'mile', 'to', 'date', 'is', '2:32.2', '.'],
 'targets': ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'VERB', 'NUM', '.']}

In [None]:
# Column schema of the training split; the recorded output shows both
# columns as sequences of strings.
dataset['train'].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
# Collect every distinct tag in one pass, then sort before numbering.
# Sorting matters: the iteration order of a set of strings is not stable
# across interpreter sessions, so without it the tag -> id assignment could
# differ between runs.
target_set = {tag for target in targets for tag in target}

target_list = sorted(target_set)
id2label = dict(enumerate(target_list))
label2id = {label: idx for idx, label in id2label.items()}

## 2. Preprocessing

In [None]:
# Name of the pretrained checkpoint; used below for both the tokenizer
# and the model.
checkpoint = 'bert-base-cased'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize one already-split training sentence to see the encoded fields.
tokenizer(dataset['train'][0]['inputs'], is_split_into_words=True)

{'input_ids': [101, 1230, 1436, 2837, 1106, 2236, 1110, 123, 131, 2724, 119, 123, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# word_ids() gives, for each produced token, the index of the word it came
# from. In the recorded output the first and last positions are None and one
# index is repeated where a single word was split into several pieces.
tokenizer(dataset['train'][0]['inputs'], is_split_into_words=True).word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 7, None]

In [None]:
def align_targets(labels, word_ids, label_map=None):
  """Expand word-level tags to one integer label per tokenizer position.

  Args:
    labels: tag strings for one sentence, indexed by word position.
    word_ids: per-token word index as returned by the tokenizer's
      ``word_ids()``; ``None`` marks positions that have no source word
      (like [CLS]).
    label_map: optional tag -> id mapping. When omitted, the notebook-level
      ``label2id`` is used.

  Returns:
    A list with the same length as ``word_ids``: -100 where the entry is
    ``None``, otherwise the id of the tag of the word the token belongs to,
    so every piece of a split word carries that word's tag.

  Raises:
    KeyError: if a tag is not present in the mapping.
  """
  if label_map is None:
    label_map = label2id
  return [
      -100 if word is None else label_map[labels[word]]
      for word in word_ids
  ]

In [None]:
def tokenize_func(example):
  """Tokenize a batch of pre-split sentences and attach aligned labels.

  Args:
    example: a batch with an 'inputs' entry (one word list per sentence)
      and a 'targets' entry (one tag list per sentence).

  Returns:
    The tokenizer output for the batch, with an added 'labels' entry that
    holds, per sentence, the result of ``align_targets`` for that
    sentence's tags and word ids.
  """
  encoded = tokenizer(example['inputs'], truncation=True, is_split_into_words=True)
  encoded['labels'] = [
      align_targets(sentence_tags, encoded.word_ids(row))
      for row, sentence_tags in enumerate(example['targets'])
  ]
  return encoded

In [None]:
# Apply tokenize_func to every split in batches and drop the original
# 'inputs'/'targets' columns, leaving only what the function returns.
tokenized_datasets = dataset.map(tokenize_func, batched=True, remove_columns=dataset['train'].column_names)
tokenized_datasets

Map:   0%|          | 0/45872 [00:00<?, ? examples/s]

Map:   0%|          | 0/11468 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 45872
    })
    val: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11468
    })
})

## 3. Train Model

In [None]:
# Load the pretrained checkpoint for token classification and hand it the
# tag <-> id mappings built earlier. The recorded output notes that the
# classifier weights are newly initialized, i.e. they still need training.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    )

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator built from the tokenizer; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
def flatten(list_of_lists):
  """Concatenate the items of every inner sequence into one flat list."""
  flat = []
  for inner in list_of_lists:
    flat.extend(inner)
  return flat

In [None]:
def compute_metrics(eval_preds):
  """Compute macro F1 and accuracy over the labelled token positions.

  Args:
    eval_preds: a ``(logits, labels)`` pair. Positions whose label is -100
      are excluded from scoring.
      NOTE(review): assumes both are rectangular arrays with matching
      leading dimensions, as in the evaluation run recorded below - confirm.

  Returns:
    A dict with the keys 'f1' (macro-averaged) and 'accuracy'.
  """
  logits, labels = eval_preds
  labels = np.asarray(labels)
  preds = np.argmax(logits, axis=-1)

  # One boolean mask replaces the per-token Python loops; boolean indexing
  # keeps row-major order, so labels and predictions stay paired.
  keep = labels != -100
  labels_flat = labels[keep]
  preds_flat = preds[keep]

  acc = accuracy_score(labels_flat, preds_flat)
  f1 = f1_score(labels_flat, preds_flat, average='macro')
  return {
      'f1': f1,
      'accuracy': acc,
  }

In [None]:
# Training configuration: outputs go under 'trainer_dir', 16 examples per
# device per step, a single epoch. Evaluation and saving are both set to
# 'epoch', and the best model is requested to be loaded at the end.
training_args = TrainingArguments('trainer_dir',
                                  per_device_train_batch_size=16,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  num_train_epochs=1,
                                  load_best_model_at_end=True,
                                  )

In [None]:
# Wire everything together: the model, the training configuration, the
# tokenized train/val splits, the tokenizer, the metric function and the
# collator defined above.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [None]:
# Run the training; the recorded run took 2867 steps for the single epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0417,0.036752,0.969445,0.989036


TrainOutput(global_step=2867, training_loss=0.06804664200675334, metrics={'train_runtime': 591.381, 'train_samples_per_second': 77.568, 'train_steps_per_second': 4.848, 'total_flos': 1392323153118336.0, 'train_loss': 0.06804664200675334, 'epoch': 1.0})

## 4. Evaluate

In [None]:
# Score the model on the 'val' split given to the Trainer; the result
# carries the 'f1' and 'accuracy' values from compute_metrics.
trainer.evaluate()

{'eval_loss': 0.03675195202231407,
 'eval_f1': 0.9694453977585531,
 'eval_accuracy': 0.9890361387591629,
 'eval_runtime': 41.684,
 'eval_samples_per_second': 275.118,
 'eval_steps_per_second': 34.402,
 'epoch': 1.0}

## 5. Predict

In [None]:
# Write the trained model to 'my_saved_model'; the pipeline below loads it
# from that directory.
trainer.save_model('my_saved_model')

In [None]:
# Reload the saved model as a token-classification pipeline.
# Use the first GPU when one is available and fall back to CPU (-1)
# otherwise, so this cell also runs on a CPU-only runtime instead of
# failing on a hard-coded device index.
ner = pipeline(
    'token-classification',
    model='my_saved_model',
    aggregation_strategy='simple',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Tag an example sentence. In the recorded output each entry carries the tag
# ('entity_group'), a score and character offsets, and "Bill Gates" comes
# back as a single NOUN span.
ner('Bill Gates was the CEO of Microsoft in Seattle.')

[{'entity_group': 'NOUN',
  'score': 0.9997301,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'VERB',
  'score': 0.9998596,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity_group': 'DET',
  'score': 0.9998857,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity_group': 'NOUN',
  'score': 0.99951696,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity_group': 'ADP',
  'score': 0.9998803,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity_group': 'NOUN',
  'score': 0.9996233,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'ADP',
  'score': 0.9997682,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity_group': 'NOUN',
  'score': 0.99975437,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': '.',
  'score': 0.99988997,
  'word': '.',
  'start': 46,
  'end': 47}]