In [5]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification 
# Single checkpoint name for the tokenizer. It must be the uncased variant:
# the model is loaded from "bert-base-uncased", so a cased vocabulary would
# produce token ids the model was never trained on.
model_checkpoint = "bert-base-uncased"
# No extra kwargs: the number of labels is a model setting, not a tokenizer one.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# Load pretrained BERT with a fresh token-classification head.
# num_labels=9 matches the nine CoNLL-2003 NER tags (O plus B-/I- for PER, ORG, LOC, MISC).
# The "weights not used / not initialized" messages below are expected for a new head.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [7]:
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 

# Load the CoNLL-2003 NER corpus (served from the local Hugging Face cache when present).
conll2003 = datasets.load_dataset("conll2003") 

Found cached dataset conll2003 (/Users/jasonz/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Display the DatasetDict: its splits, column names and row counts.
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [9]:
# (rows, columns) for each split.
conll2003.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [10]:
# Inspect the tag inventory: ner_tags is a sequence of ClassLabel ids, one per word.
conll2003["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [11]:
# Free-text description of the corpus shipped with the dataset.
conll2003['train'].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [12]:
# Replace the tokenizer created earlier with the fast, uncased BERT tokenizer so it
# matches the "bert-base-uncased" model; the cells below rely on its word_ids() mapping.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 

In [14]:
# Tokenize the first training sentence to see how words map to word pieces.
example_text = conll2003["train"][0]

# The sentence is already split into words, so tell the tokenizer not to re-split it.
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

# Word-piece strings, and for each piece the index of the word it came from
# (None marks the special tokens at both ends).
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

print(word_ids, tokenized_input, sep="\n")


[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]
{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
# Compare the number of word-level NER tags with the number of token ids.
len(example_text['ner_tags']), len(tokenized_input["input_ids"])
# (9, 11): nine tags but eleven ids, because the tokenizer adds a special token at
# each end. This mismatch is why the labels must be re-aligned to the tokens.

(9, 11)

In [16]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align NER tags to word pieces.

    Args:
        examples: batch with a "tokens" entry (one list of words per sentence)
            and a matching "ner_tags" entry (one tag id per word).
        label_all_tokens: if truthy, every word piece of a word carries that
            word's tag; otherwise only the first piece does and the remaining
            pieces are labelled -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one aligned label list per sentence. Special tokens such as [CLS] and
        [SEP] have no source word and are labelled -100 so that the loss
        function ignores them.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # word_ids holds, per token, the index of its source word (None for special tokens).
        aligned = []
        last_word = None
        for current_word in word_ids:
            if current_word is None:
                # Special token: no word behind it, so mask it out of the loss.
                aligned.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of the same word, and only first pieces are labelled.
                aligned.append(-100)
            else:
                # First piece of a word, or a continuation piece when labelling all pieces.
                aligned.append(word_labels[current_word])
            last_word = current_word
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=sentence_idx), word_labels)
        for sentence_idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [17]:
# Try the alignment on a single sentence. Slicing with [4:5] keeps the batch layout
# (each column is a list of per-sentence lists) that the function expects.
toks = tokenize_and_align_labels(conll2003['train'][4:5]) 
print(toks) 

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [18]:
# Print each word piece next to its aligned label; tokens are left-aligned and
# padded with "_" to a width of 10 so the labels line up.
for token, label in zip(tokenizer.convert_ids_to_tokens(toks["input_ids"][0]),toks["labels"][0]): 
    print(f"{token:_<10} {label}") 

[CLS]_____ -100
germany___ 5
'_________ 0
s_________ 0
representative 0
to________ 0
the_______ 0
european__ 3
union_____ 4
'_________ 0
s_________ 0
veterinary 0
committee_ 0
werner____ 1
z_________ 2
##wing____ 2
##mann____ 2
said______ 0
on________ 0
wednesday_ 0
consumers_ 0
should____ 0
buy_______ 0
sheep_____ 0
##me______ 0
##at______ 0
from______ 0
countries_ 0
other_____ 0
than______ 0
britain___ 5
until_____ 0
the_______ 0
scientific 0
advice____ 0
was_______ 0
clearer___ 0
._________ 0
[SEP]_____ -100


In [19]:
# Tokenize and align every split. batched=True hands the function whole batches
# (columns as lists), which is the layout tokenize_and_align_labels iterates over.
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at /Users/jasonz/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-356be150d420186b.arrow
Loading cached processed dataset at /Users/jasonz/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-1478eb4cb239280c.arrow
Loading cached processed dataset at /Users/jasonz/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-5e5bfa8107146ef2.arrow


In [21]:
from transformers import TrainingArguments, Trainer 
# Fine-tuning configuration; checkpoints are written under "test-ner".
# NOTE(review): the training log recorded further down reports 2 epochs and a batch
# size of 32, so it was presumably produced with different settings; re-run to refresh.
args = TrainingArguments(
    output_dir="test-ner",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.001,
    evaluation_strategy="epoch",  # evaluate once per epoch
    # gradient_accumulation_steps=2,
)

In [None]:
# TODO: try grid-search hyperparameter tuning later
# from transformers import Trainer, TrainingArguments
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'learning_rate': [1e-5, 2e-5, 5e-5],
#     'per_device_train_batch_size': [16, 32, 64],
#     'num_train_epochs': [3, 5, 10],
# }

# grid_search = GridSearchCV(Trainer, param_grid, cv=3)
# grid_search.fit(train_dataset, eval_dataset)

# best_params = grid_search.best_params_
# print("Best hyperparameters: ", best_params)

# args = TrainingArguments(
#     "test-ner",
#     evaluation_strategy="epoch",
#     learning_rate=best_params['learning_rate'],
#     per_device_train_batch_size=best_params['per_device_train_batch_size'],
#     per_device_eval_batch_size=16,
#     num_train_epochs=best_params['num_train_epochs'],
#     weight_decay=0.01,
# )


In [22]:
# Collator that batches the tokenized examples for token classification
# (sentences differ in length, so batches need padding; see the transformers docs).
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [23]:
# seqeval scores predictions at the entity level (precision / recall / F1 per type).
# NOTE(review): the echoed source line in this cell's output looks like a deprecation
# warning; newer `datasets` releases move metrics to the separate `evaluate` package.
metric = datasets.load_metric("seqeval") 

  metric = datasets.load_metric("seqeval")


In [24]:
# First training sentence, used below to sanity-check the metric.
example = conll2003['train'][0]

In [25]:
# Tag names in id order: label_list[i] is the string for tag id i.
label_list = conll2003["train"].features["ner_tags"].feature.names 

label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [26]:
# Sanity check: scoring a sentence against itself must give perfect scores.
# seqeval expects lists of tag-string sequences, hence the extra list wrapping.
labels = [label_list[i] for i in example["ner_tags"]] 

metric.compute(predictions=[labels], references=[labels]) 

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [27]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into entity-level seqeval scores.

    Args:
        eval_preds: pair ``(logits, label_ids)``. The logits are reduced with
            argmax over axis 2; ``label_ids`` holds the aligned gold labels,
            with -100 on positions that must be ignored.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by the seqeval metric.
    """
    logits, label_ids = eval_preds

    # argmax over logits picks the same class as argmax over softmax
    # probabilities, so no softmax is needed.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions with a real label (drop everything marked -100).
        kept_pairs = [(p, l) for p, l in zip(predicted_row, label_row) if l != -100]
        predictions.append([label_list[p] for p, _ in kept_pairs])
        true_labels.append([label_list[l] for _, l in kept_pairs])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [28]:
# Wire model, training configuration, data, batching and metrics together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [29]:
# Run fine-tuning.
# NOTE(review): the recorded output ends in KeyboardInterrupt shortly after the first
# epoch, so the model saved below comes from a partial run.
trainer.train() 

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, tokens, chunk_tags, pos_tags, ner_tags. If id, tokens, chunk_tags, pos_tags, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14041
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 878
  Number of trainable parameters = 108898569


  0%|          | 0/878 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, tokens, chunk_tags, pos_tags, ner_tags. If id, tokens, chunk_tags, pos_tags, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32


  0%|          | 0/102 [00:00<?, ?it/s]

{'eval_loss': 0.07025192677974701, 'eval_precision': 0.908154221138932, 'eval_recall': 0.9169929522317932, 'eval_f1': 0.9125521848037851, 'eval_accuracy': 0.9811110934595771, 'eval_runtime': 2354.7719, 'eval_samples_per_second': 1.38, 'eval_steps_per_second': 0.043, 'epoch': 1.0}


Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json


{'loss': 0.1836, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.14}


Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json


KeyboardInterrupt: 

In [30]:
# Write the model's config.json and weights to ./bert_finetuned_model.
model.save_pretrained("bert_finetuned_model")

Configuration saved in bert_finetuned_model/config.json
Model weights saved in bert_finetuned_model/pytorch_model.bin


In [31]:
# Write the tokenizer files (config, vocab, special tokens) to ./tokenizer.
tokenizer.save_pretrained("tokenizer")

tokenizer config file saved in tokenizer/tokenizer_config.json
Special tokens file saved in tokenizer/special_tokens_map.json


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [32]:
# Label maps for the model config, using integer ids on both sides:
# id -> tag name, and tag name -> id. json.dump turns the integer keys of
# id2label into the string keys JSON requires, while label2id values stay
# numbers instead of being stored as strings such as "5".
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [33]:
import json


In [34]:
# Read the saved model config; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open("bert_finetuned_model/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [35]:
# Attach the label maps so the reloaded model reports tag names instead of generic ids.
config["id2label"] = id2label
config["label2id"] = label2id

In [36]:
# Write the updated config back. The context manager flushes and closes the
# file, so the complete JSON is on disk before the model is reloaded from it.
with open("bert_finetuned_model/config.json", "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

In [37]:
# Reload the saved model from disk so it picks up the edited config.json (label maps).
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("bert_finetuned_model")

loading configuration file bert_finetuned_model/config.json
Model config BertConfig {
  "_name_or_path": "bert_finetuned_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": "5",
    "B-MISC": "7",
    "B-ORG": "3",
    "B-PER": "1",
    "I-LOC": "6",
    "I-MISC": "8",
    "I-ORG": "4",
    "I-PER": "2",
    "O": "0"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dt

In [38]:
from transformers import pipeline

In [39]:
# Token-level NER inference with the fine-tuned model. Each result is one
# tagged word piece with its character offsets into the input string.
nlp = pipeline(task="ner", tokenizer=tokenizer, model=model_fine_tuned)

example = "Barack Obama was born in Hawaii, and likes Google. Urbana Champaugn is a good place"
ner_results = nlp(example)

print(ner_results)

[{'entity': 'B-PER', 'score': 0.9412754, 'index': 1, 'word': 'barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9700916, 'index': 2, 'word': 'obama', 'start': 7, 'end': 12}, {'entity': 'B-LOC', 'score': 0.9931465, 'index': 6, 'word': 'hawaii', 'start': 25, 'end': 31}, {'entity': 'B-ORG', 'score': 0.9473436, 'index': 10, 'word': 'google', 'start': 43, 'end': 49}, {'entity': 'B-LOC', 'score': 0.916526, 'index': 12, 'word': 'urbana', 'start': 51, 'end': 57}, {'entity': 'I-LOC', 'score': 0.82250124, 'index': 13, 'word': 'champ', 'start': 58, 'end': 63}, {'entity': 'I-LOC', 'score': 0.77609175, 'index': 14, 'word': '##au', 'start': 63, 'end': 65}, {'entity': 'I-LOC', 'score': 0.7220281, 'index': 15, 'word': '##gn', 'start': 65, 'end': 67}]


In [42]:
from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup
from ipymarkup import show_span_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN
from ipymarkup import format_span_box_markup


# Must be the exact string that was passed to the pipeline: the span offsets
# in ner_results index into it.
text = 'Barack Obama was born in Hawaii, and likes Google. Urbana Champaugn is a good place'

# Build (start, end, type) spans. The pipeline tags are BIO-prefixed ("B-PER",
# "I-LOC", ...) while the palette below is keyed by the bare entity type, so the
# prefix is stripped; otherwise none of the requested colours would match.
spans = []
for entity in ner_results:
    entity_type = entity["entity"].split("-", 1)[-1]
    spans.append((entity["start"], entity["end"], entity_type))

#show_span_ascii_markup(text, spans)

show_span_box_markup(text, spans, palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))

#list(format_span_box_markup(text, spans))
