In [1]:
# we can use hugging face website for models and data https://huggingface.co/

# install
!pip install transformers datasets tokenizers seqeval -q


In [2]:
!pip install -U ipywidgets


Defaulting to user installation because normal site-packages is not writeable


In [3]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

In [4]:
conll2003= datasets.load_dataset("lhoestq/conll2003")

In [5]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [7]:
conll2003["train"].features["ner_tags"].feature

Value('int64')

In [8]:
conll2003["train"].description

''

In [9]:
# init tokenizer

tokenizer=BertTokenizerFast.from_pretrained("bert-base-uncased")

In [10]:
# test the tokenizer

tokenized_input=tokenizer(conll2003["train"][0]["tokens"],is_split_into_words=True)
tokens=tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(conll2003["train"][0]["tokens"],"\n",tokenized_input,"\n",tokens) # the tokens converted back has cls and sep tags

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'] 
 {'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 
 ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']


In [11]:
print(len(tokens))
print(len(conll2003["train"][0]["ner_tags"]))

11
9


In [12]:
def tokenize_and_align_labels(examples,label_all_tokens=True):
    tokenized_inputs=tokenizer(examples["tokens"],truncation=True,is_split_into_words=True)
    labels=[]
    for i,label in enumerate(examples["ner_tags"]):
        word_ids=tokenized_inputs.word_ids(batch_index=i)
        # word_ids() returns a list mapping the tokens
        # to their actual word in the init sentence
        # it returns a list indicating the word corresponding to each token
        previous_word_idx=None
        label_ids=[]
        # special tokens like `` and `<\s>` are orig mapped to None
        # we need to set the label to -100 so that they are automatically ignored in the loss function
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100) # set -100 for special tokens
            elif word_idx!=previous_word_idx:
                # if the currect word_idx != prv then it is the ost regular case and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word idx; set -100 as well for them but only if label_all_tokens==False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx=word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"]=labels
    return tokenized_inputs
        
        

In [13]:
print(conll2003["train"][4:5],type(conll2003["train"][4:5])) # we get only the 5th training sample
q=tokenize_and_align_labels(conll2003["train"][4:5])
print(q)

{'id': ['4'], 'tokens': [['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']], 'pos_tags': [[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]], 'chunk_tags': [[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1, 13, 11, 17, 11, 12, 12, 21, 1, 0]], 'ner_tags': [[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]]} <class 'dict'>
{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_ty

In [14]:
for token,label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40}{label}")
# to see the correspondance of labels vs input tokens

[CLS]___________________________________-100
germany_________________________________5
'_______________________________________0
s_______________________________________0
representative__________________________0
to______________________________________0
the_____________________________________0
european________________________________3
union___________________________________4
'_______________________________________0
s_______________________________________0
veterinary______________________________0
committee_______________________________0
werner__________________________________1
z_______________________________________2
##wing__________________________________2
##mann__________________________________2
said____________________________________0
on______________________________________0
wednesday_______________________________0
consumers_______________________________0
should__________________________________0
buy_____________________________________0
sheep__________________________

In [15]:
# let's apply on the entire data
tokenized_dataset=conll2003.map(tokenize_and_align_labels,batched=True)

In [16]:
tokenized_dataset["train"][0]
# preprocessing of data is done

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

In [17]:
# define the model
model=AutoModelForTokenClassification.from_pretrained("bert-base-uncased",num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
!pip install accelerate>=0.26.0

In [20]:
# configuration on the model to prepare it for the fine-tuning
from transformers import TrainingArguments,Trainer # define training args
args=TrainingArguments(
    "test-ner", # any name!
    eval_strategy="epoch",
    learning_rate=2e-5,per_device_train_batch_size=16,
    per_device_eval_batch_size=16,num_train_epochs=1, # 3 would be good!
    weight_decay=0.01)

data_collator=DataCollatorForTokenClassification(tokenizer)


In [24]:
#!pip install evaluate

Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [33]:
import evaluate
metric=evaluate.load("seqeval")

example=conll2003["train"][0]
# label_list=conll2003["train"].features["ner_tags"].feature.names  does not work
# define the label list manually
label_list = [
    "O", "B-MISC", "I-MISC",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC"
]
print("label list",label_list)

for i in example["ner_tags"]:
    print(i)

labels=[label_list[i] for i in example["ner_tags"]]
labels

# calculate the metric
metric.compute(predictions=[labels],references=[labels])

label list ['O', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
3
0
7
0
0
0
7
0
0


{'LOC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [34]:
def compute_metrics(eval_preds):
    pred_logits,labels=eval_preds # [batch_size, seq_len, num_labels] ,batch_size x seq_len
    pred_logits=np.argmax(pred_logits,axis=2)
    # the logits and the probs are in the same order; so we don't need to apply the softmax

    # we remove all the values where the label is -100
    predictions=[
    [label_list[eval_preds] for (eval_preds,l) in zip(prediction,label) if l!=-100]
    for prediction,label in zip(pred_logits,labels)
    ] # list of lists; per batch then per token
    results=metric.compute(predictions=predictions,references=true_labels)
    return {
    "precision":results["overall_precision"],
    "recall":results["overall_recall"],
    "f1": results["overall_f1"],
    "accuracy":results["overall_accuracy"],
    }

In [36]:
trainer=Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

  trainer=Trainer(


In [None]:
trainer.train()

  return FileStore(store_uri, store_uri)


Epoch,Training Loss,Validation Loss


In [None]:
# save model
model.save_pretrained("ner_model")

# save the tokenizer
tokenizer.save_pretrained("tokenizer")

# conversion for label list
id2label={
    str(i): label for i,label in enumerate(label_list)
}
label2id={
    label: str(i)  for i,label in enumerate(label_list)
}

# save config
import json
config=json.load(open("ner_model/config.json"))
config["id2label"]=id2label
config["label2id"]=label2id
json.dump(config,open("ner_model/config.json","w"))

# load model
model_fine_tuned=AutoModelForTokenClassification.from_pretrained("ner_model")

from transformers import pipeline
nlp=pipeline("ner",model=model_fine_tuned,tokenizer=tokenizer)
example="Bill Gates is the founder of Microsoft"
ner_results=nlp(example)
print(ner_results) # inference
