In [1]:
!pip install datasets transformers[torch] tokenizers seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
from transformers import ConvBertForTokenClassification, ConvBertTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
import torch
import numpy as np
import datasets
from datasets import load_dataset

In [3]:
# Shared objects used by every cell below: metric, dataset, tokenizer, collator and model.
CHECKPOINT = "YituTech/conv-bert-base"

# NOTE(review): `datasets.load_metric` is deprecated upstream; confirm the installed
# `datasets` release still provides it before re-running.
metric = datasets.load_metric("seqeval")
conll = load_dataset('conll2003')
# Tag names indexed by the integer ids stored in `ner_tags`.
label_list = conll["train"].features["ner_tags"].feature.names
tokenizer = ConvBertTokenizer.from_pretrained(CHECKPOINT)
data_collator = DataCollatorForTokenClassification(tokenizer)
# Size the classification head from the dataset's tag set rather than a hard-coded count.
model = ConvBertForTokenClassification.from_pretrained(CHECKPOINT, num_labels=len(label_list))

  metric = datasets.load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

vocab.txt:   0%|          | 0.00/267k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of ConvBertForTokenClassification were not initialized from the model checkpoint at YituTech/conv-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def RmvColumns(data, columns_to_remove = ['id', 'pos_tags', 'chunk_tags']):
    """Return ``data`` with the given columns dropped.

    Args:
        data: Object exposing ``remove_columns`` (here the loaded dataset).
        columns_to_remove: Column names to drop; defaults to the columns this
            notebook does not use.

    Returns:
        Whatever ``data.remove_columns(columns_to_remove)`` returns.
    """
    return data.remove_columns(columns_to_remove)

def Labler(token, label):
    """Expand one word-level tag into one tag per sub-word piece.

    Args:
        token: Sequence of sub-word ids produced for a single word.
        label: Integer tag id of that word.

    Returns:
        A list of exactly ``len(token)`` tag ids. The first piece keeps
        ``label``; every following piece gets ``label + 1`` when ``label`` is
        odd and ``label`` otherwise. (In this notebook's tag table the odd ids
        are the ``B-`` tags and the next id is the matching ``I-`` tag.)
        A word that produced no pieces yields an empty list so that labels
        stay aligned with the flattened input ids.
    """
    n_pieces = len(token)
    if n_pieces == 0:
        # No sub-word ids were emitted for this word, so emit no labels either.
        return []
    continuation = label + 1 if label % 2 == 1 else label
    return [label] + [continuation] * (n_pieces - 1)

def SplitLabeler(tokens, labels):
    """Build the flat per-piece label list for one sentence.

    Args:
        tokens: One sequence of sub-word ids per word.
        labels: One integer tag id per word, indexed in step with ``tokens``.

    Returns:
        A single flat list: the ``Labler`` expansion of every word,
        concatenated in word order.

    Raises:
        IndexError: If ``labels`` is shorter than ``tokens``.
    """
    piece_labels = []
    for i, word_pieces in enumerate(tokens):
        # extend() keeps this linear; repeated list concatenation was quadratic.
        piece_labels.extend(Labler(word_pieces, labels[i]))
    return piece_labels

def LoadData(data, part):
    """Tokenize one split and attach flattened model inputs and per-piece labels.

    Each example's words are passed to the module-level ``tokenizer`` with
    ``add_special_tokens=False``; the per-word results are flattened into one
    sequence per example, and the word tags are expanded to one tag per
    sub-word piece via ``SplitLabeler``.

    NOTE(review): no special tokens are added here, while the inference
    pipeline cell uses the tokenizer's defaults. Confirm this difference is
    intended.

    Args:
        data: Mapping from split name to a dataset whose rows have ``tokens``
            and ``ner_tags``.
        part: Name of the split to process (e.g. ``'train'``).

    Returns:
        The selected split with ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``labels`` columns added, in that order.
    """
    split = data[part]

    all_input_ids = []
    all_token_type_ids = []
    all_attention_mask = []
    all_labels = []

    for example in split:
        # A list of words is encoded word by word, so every field is a list of per-word lists.
        encoded = tokenizer(example['tokens'], add_special_tokens=False)
        pieces_per_word = encoded['input_ids']

        all_labels.append(SplitLabeler(pieces_per_word, example['ner_tags']))
        all_input_ids.append([piece for word in pieces_per_word for piece in word])
        all_token_type_ids.append([t for word in encoded['token_type_ids'] for t in word])
        all_attention_mask.append([m for word in encoded['attention_mask'] for m in word])

    split = split.add_column('input_ids', all_input_ids)
    split = split.add_column('token_type_ids', all_token_type_ids)
    split = split.add_column('attention_mask', all_attention_mask)
    split = split.add_column('labels', all_labels)
    return split

def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: Pair ``(logits, label_ids)``. The argmax is taken over
            axis 2, so logits are assumed to be (batch, sequence, classes).
            TODO confirm.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the module-level ``metric``'s overall results.
    """
    logits, gold_ids = eval_preds

    # Picking the largest logit gives the same class as the largest
    # probability, so no softmax is needed.
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions whose gold label is -100 are excluded from scoring.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)

    scores = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        scores[name] = results["overall_" + name]
    return scores

def Train(train, valid, model, tokenizer, data_collator):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and return it.

    Args:
        train: Training dataset.
        valid: Dataset evaluated at the end of every epoch.
        model: Model handed to the trainer.
        tokenizer: Tokenizer handed to the trainer.
        data_collator: Collator used to build batches.

    Returns:
        The same ``model`` object that was passed in, after ``trainer.train()``.
    """
    training_config = TrainingArguments(
        output_dir="test-ner",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    ner_trainer = Trainer(
        model=model,
        args=training_config,
        train_dataset=train,
        eval_dataset=valid,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    ner_trainer.train()

    return model

In [5]:
# Keep the raw `conll` untouched so this cell can be re-run: dropping the
# columns from an already-stripped dataset would fail on the second run.
conll_ner = RmvColumns(conll)
train = LoadData(conll_ner, 'train')
valid = LoadData(conll_ner, 'validation')
print(train)
train[0]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 14041
})


{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [6]:
# Spot-check one training example: every input id must have exactly one label.
SAMPLE_INDEX = 13068
sample = train[SAMPLE_INDEX]
X = sample['input_ids']
Y = sample['labels']
print(len(X),len(Y))
assert len(X) == len(Y), f"input_ids/labels length mismatch in train[{SAMPLE_INDEX}]"

162 162


In [7]:
# Fine-tune on the prepared splits; `Train` returns the model it was given.
model = Train(
    train=train,
    valid=valid,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.285,0.082166,0.864504,0.892292,0.878178,0.976742
2,0.0587,0.070113,0.888453,0.91417,0.901128,0.980666
3,0.0366,0.068012,0.906977,0.925446,0.916118,0.982986


Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to test-ner/checkpoint-500/vocab.txt: vocabulary indices are not consecutive. Please check that the 

In [8]:
# Persist the fine-tuned weights and the tokenizer files to their own directories.
model.save_pretrained(save_directory="ConvBertForToken")
tokenizer.save_pretrained(save_directory="ConvBertTokenizer")

Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to ConvBertTokenizer/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabu

('ConvBertTokenizer/tokenizer_config.json',
 'ConvBertTokenizer/special_tokens_map.json',
 'ConvBertTokenizer/vocab.txt',
 'ConvBertTokenizer/added_tokens.json')

In [9]:
from transformers import AutoModelForTokenClassification
from transformers import pipeline
from prettytable import PrettyTable

# Translate the generic `LABEL_<id>` names into the dataset's tag names.
# Derived from `label_list` so the mapping cannot drift from the ids used in training.
label_map = {f'LABEL_{idx}': tag for idx, tag in enumerate(label_list)}
# Reload the fine-tuned weights from disk and wrap them in a token-classification pipeline.
model = AutoModelForTokenClassification.from_pretrained("ConvBertForToken")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [10]:
def DecodeOutput(data, model = model, nlp = nlp, label_map = label_map):
    """Tag ``data`` with the pipeline, print a Word/Entity table and return the rows.

    Args:
        data: Input passed straight to ``nlp``.
        model: Unused by this function; kept in the signature.
        nlp: Callable returning items that have ``'word'`` and ``'entity'`` keys.
        label_map: Maps raw entity names to display names; names missing from
            the map are kept as they are.

    Returns:
        A list of ``{'word': ..., 'entity': ...}`` dicts, one per pipeline item,
        with the entity already translated through ``label_map``.
    """
    decoded = []
    for hit in nlp(data):
        raw_tag = hit['entity']
        decoded.append({'word': hit['word'], 'entity': label_map.get(raw_tag, raw_tag)})

    table = PrettyTable(['Word', 'Entity'])
    for row in decoded:
        table.add_row([row['word'], row['entity']])
    print(table)

    return decoded

In [11]:
# Tag a sample sentence; `example` ends up holding the decoded word/entity rows.
example = DecodeOutput("Bill Gates is the Founder of Microsoft")

+-----------+--------+
|    Word   | Entity |
+-----------+--------+
|    bill   | B-PER  |
|   gates   | I-PER  |
|     is    |   O    |
|    the    |   O    |
|  founder  |   O    |
|     of    |   O    |
| microsoft | B-ORG  |
+-----------+--------+
