In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
# import library
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /usr/share/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [3]:
# load tree bank dataset
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of samples:", len(tagged_sentences))

Number of samples: 3914


In [4]:
tagged_sentences[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [5]:
# save sentences and tags
sentences, sentence_tags =[], []

for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append([tag for tag in tags])

In [6]:
# Create label mappings
unique_labels = sorted(set(label for sublist in sentence_tags for label in sublist))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
label2id["0"] = len(label2id)  # Padding label ID
id2label = {i: l for l, i in label2id.items()}
print(id2label)

{0: '#', 1: '$', 2: "''", 3: ',', 4: '-LRB-', 5: '-NONE-', 6: '-RRB-', 7: '.', 8: ':', 9: 'CC', 10: 'CD', 11: 'DT', 12: 'EX', 13: 'FW', 14: 'IN', 15: 'JJ', 16: 'JJR', 17: 'JJS', 18: 'LS', 19: 'MD', 20: 'NN', 21: 'NNP', 22: 'NNPS', 23: 'NNS', 24: 'PDT', 25: 'POS', 26: 'PRP', 27: 'PRP$', 28: 'RB', 29: 'RBR', 30: 'RBS', 31: 'RP', 32: 'SYM', 33: 'TO', 34: 'UH', 35: 'VB', 36: 'VBD', 37: 'VBG', 38: 'VBN', 39: 'VBP', 40: 'VBZ', 41: 'WDT', 42: 'WP', 43: 'WP$', 44: 'WRB', 45: '``', 46: '0'}


In [7]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences,
    sentence_tags,
    test_size =0.3
)

valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences,
    test_tags,
    test_size =0.5
)

In [8]:
# tokenization
from transformers import AutoTokenizer
from torch.utils.data import Dataset

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)


MAX_LEN = 256
class PosTagging_Dataset(Dataset):
    def __init__(self ,
        sentences: List[List[str]],
        tags: List[List[str]],
        tokenizer,
        label2id,
        max_len=MAX_LEN
    ):
        super().__init__ ()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self , idx):
        input_token = self.sentences[idx]
        label_token = self.tags[idx]

        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        attention_mask = [1] * len(input_token)
        labels = [self.label2id[token] for token in label_token]

        return {
            "input_ids": self.pad_and_truncate(input_token, pad_id=self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(labels, pad_id=label2id["0"]),
            "attention_mask": self.pad_and_truncate(attention_mask , pad_id =0)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len]
        return torch.as_tensor(padded_inputs)


train_dataset = PosTagging_Dataset(train_sentences, train_tags, tokenizer, label2id)
val_dataset = PosTagging_Dataset(valid_sentences, valid_tags, tokenizer, label2id)
test_dataset = PosTagging_Dataset(test_sentences, test_tags, tokenizer, label2id)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [9]:
next(iter(iter(train_dataset)))

{'input_ids': tensor([  100,   100,   115, 10155,   100, 53147,   115, 10114, 16068, 12748,
         39282, 10106,   100, 10407, 45751, 12742,   100, 42919,   117, 10319,
           100,   100, 10842, 10114, 65036,   119,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    ignore_mismatched_sizes=True
)

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([46, 768]) in the checkpoint and torch.Size([47, 768]) in the model i

In [11]:
accuracy = evaluate.load("accuracy")

ignore_label = len(label2id)

def compute_metrics(eval_pred):
    predictions , labels = eval_pred
    mask = labels != ignore_label
    predictions = np.argmax(predictions , axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [13]:
model.resize_token_embeddings(len(tokenizer))

Embedding(119547, 768, padding_idx=0)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate =1e-5,
    per_device_train_batch_size =16,
    per_device_eval_batch_size =16,
    num_train_epochs =9,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model ,
    args=training_args ,
    train_dataset=train_dataset ,
    eval_dataset=val_dataset ,
    tokenizer = tokenizer ,
    compute_metrics=compute_metrics ,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.158819,0.96865
2,No log,0.089101,0.978579
3,No log,0.067945,0.982172
4,No log,0.058019,0.984954
5,No log,0.052765,0.986285
6,0.168500,0.049531,0.987057
7,0.168500,0.047996,0.987236
8,0.168500,0.046635,0.987662
9,0.168500,0.046404,0.987729




TrainOutput(global_step=774, training_loss=0.12685396381742886, metrics={'train_runtime': 801.953, 'train_samples_per_second': 30.739, 'train_steps_per_second': 0.965, 'total_flos': 3221923456654848.0, 'train_loss': 0.12685396381742886, 'epoch': 9.0})

In [20]:
# tokenization
test_sentence = "We are exploring the topic of deep learning"
input = torch.as_tensor([ tokenizer.convert_tokens_to_ids(test_sentence.split())])
input = input.to("cuda")

# prediction
outputs = model(input)
_, preds = torch.max(outputs.logits , -1)
preds = preds.cpu().numpy()

# decode
pred_tags = ""
for pred in preds[0]:
    pred_tags += id2label[pred] + " "

pred_tags 

'DT VBP VBG DT NN IN JJ NN '