In [1]:
import sys

# Make the project's helper modules importable. '../utils' is a relative path, so it is resolved
# against the kernel's working directory; run the notebook from its own folder.
# NOTE(review): presumably this is where `cross_validation_mBERT` (imported further down) lives — confirm.
sys.path.insert(1, '../utils')

## Training / Fine-tuning Process

In [2]:
# Notebook-wide configuration read by the cells below.
task = "ner"  # selects the "<task>_tags" dataset column and is embedded in output directory names
model_checkpoint = "bert-base-multilingual-cased" # mBERT pre-trained from HuggingFace Hub
batch_size = 16  # used as both the per-device training and evaluation batch size

### Loading the dataset

In [3]:
from datasets import load_dataset, load_metric, concatenate_datasets

# CoNLL-2003 with its 'train', 'validation' and 'test' splits.
# NOTE: `datasets` here is a variable holding the loaded splits, not the `datasets` package.
datasets = load_dataset("conll2003")

# Merge the three splits into a single dataset; the K-fold loop further down draws its own
# train/eval partitions from it instead of using the official splits.
all_data = concatenate_datasets([datasets['train'], datasets['validation'], datasets['test']])

Reusing dataset conll2003 (C:\Users\Bernard\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [4]:
# Tag names of the "<task>_tags" column. Later cells index this list with integer label ids,
# so a tag's position in the list is its id.
label_list = all_data.features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Processing the data

In [5]:
from transformers import AutoTokenizer
    
# Tokenizer that matches the pre-trained checkpoint.
# NOTE(review): the label-alignment function calls `word_ids()` on the tokenizer output, which
# presumably requires a "fast" tokenizer — confirm that AutoTokenizer returns one here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# When True, every sub-token of a word carries the word's tag; when False only the first
# sub-token does and the remaining ones are masked with -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and spread word-level tags over sub-tokens.

    Reads the notebook globals ``tokenizer``, ``task`` and ``label_all_tokens``.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per sentence)
            and a "<task>_tags" entry (one list of tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding one list
        of label ids per sentence. Positions whose word id is None get -100, and so do
        non-initial sub-tokens when ``label_all_tokens`` is False.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Expand one sentence's word-level tags to one label per produced token.
        aligned = []
        last_word = None
        for current_word in word_ids:
            if current_word is None:
                # No source word (special token): -100 so the loss function ignores it.
                aligned.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of the previous word, and only first pieces are labelled.
                aligned.append(-100)
            else:
                # First piece of a word, or a continuation piece that inherits the word's tag.
                aligned.append(word_tags[current_word])
            last_word = current_word
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return encoded

In [7]:
# Tokenize the merged dataset batch by batch; the mapped function adds the aligned "labels" column.
tokenized_dataset = all_data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/21 [00:00<?, ?ba/s]

In [8]:
from sklearn.model_selection import KFold

In [9]:
# K-fold splitter used by the cross-validation loop below.
n = 3  # number of folds
seed = 40  # random_state of the shuffled split, so fold membership is repeatable
kf = KFold(n_splits=n, random_state=seed, shuffle=True)

In [10]:
import numpy as np

def compute_metrics(p):
    """Convert a (predictions, labels) evaluation pair into overall and LOC-specific scores.

    Reads the notebook globals ``label_list`` (label id -> tag name) and ``metric``
    (an object whose ``compute(predictions=..., references=...)`` returns the scores).

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is reduced with argmax over
            axis 2 to one label id per position; ``labels`` holds the reference ids,
            with -100 marking positions to skip.
            (assumes predictions are shaped (batch, seq_len, num_labels) — TODO confirm)

    Returns:
        dict with "precision", "recall", "f1", "accuracy" (overall scores) and
        "LOC-f1", "LOC-precision", "LOC-recall". The LOC values are 0.0 when the
        metric result contains no "LOC" entry.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Keep only positions with a real reference label (-100 marks ignored positions),
    # and translate both sides from label ids to tag names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # NOTE(review): the per-type "LOC" entry is presumably missing when no LOC entity occurs in
    # the evaluated data; report 0.0 in that case instead of failing the run with a KeyError.
    loc_scores = results.get("LOC", {})
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        "LOC-f1": loc_scores.get("f1", 0.0),
        "LOC-precision": loc_scores.get("precision", 0.0),
        "LOC-recall": loc_scores.get("recall", 0.0),
    }

In [11]:
from cross_validation_mBERT import cross_validation_mBERT

In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback

# Shared across folds: the seqeval scorer read by compute_metrics and the padding collator.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

# 1-based fold counter: names the checkpoint directory and is handed to cross_validation_mBERT.
fold = 1

for train_index, eval_index in kf.split(tokenized_dataset):

    # Build the arguments inside the loop. The f-string is evaluated when TrainingArguments is
    # constructed, so creating it once before the loop would send every fold's checkpoints to
    # the same "fold-1-<task>" directory.
    args = TrainingArguments(
        output_dir=f"fold-{fold}-{task}",
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    train_data = tokenized_dataset.select(train_index)
    eval_data = tokenized_dataset.select(eval_index)

    # Reload the pre-trained checkpoint so each fold starts from the same weights rather than
    # from the previous fold's fine-tuned model.
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

    trainer = Trainer(model,
                      args,
                      train_dataset=train_data,
                      eval_dataset=eval_data,
                      data_collator=data_collator,
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    # NOTE(review): with num_train_epochs=1 and one evaluation per epoch, a patience of 3
    # presumably never triggers; it only matters if the epoch count is raised.
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

    trainer.train()

    # Project helper from the utils folder; evaluates the model trained on this fold.
    cross_validation_mBERT(model, tokenizer, label_list, fold)

    fold += 1


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.1876,0.081899,0.905605,0.913827,0.909697,0.978133,0.940904,0.94415,0.937679,10.2617,673.867


  0%|          | 0/2 [00:00<?, ?ba/s]

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 1668 | tp: 3519 | fn: 2024
precision: 0.678 | recall: 0.635 | f-score: 0.656 | accuracy: 0.635
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 219 | tp: 4968 | fn: 1069
precision: 0.958 | recall: 0.823 | f-score: 0.885 | accuracy: 0.896
------------------------------------------------------------------------

Yay


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.1945,0.068335,0.918261,0.924684,0.921461,0.981445,0.944656,0.945709,0.943606,10.5736,653.988


  0%|          | 0/2 [00:00<?, ?ba/s]

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 1755 | tp: 3619 | fn: 1924
precision: 0.673 | recall: 0.653 | f-score: 0.663 | accuracy: 0.653
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 247 | tp: 5127 | fn: 978
precision: 0.954 | recall: 0.840 | f-score: 0.893 | accuracy: 0.925
------------------------------------------------------------------------

Yay


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.1972,0.077795,0.927106,0.919427,0.923251,0.980528,0.951858,0.956748,0.947017,10.2896,671.943


  0%|          | 0/2 [00:00<?, ?ba/s]

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 1746 | tp: 3741 | fn: 1802
precision: 0.682 | recall: 0.675 | f-score: 0.678 | accuracy: 0.675
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 244 | tp: 5243 | fn: 828
precision: 0.956 | recall: 0.864 | f-score: 0.907 | accuracy: 0.946
------------------------------------------------------------------------

Yay


In [37]:
# Number of training examples in the last fold (`train_index` still holds the final split).
# The index array has one entry per selected row, so counting it gives the same number without
# materialising every column of those rows first.
len(train_index)

16595

### Fine-tuning

In [7]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Pre-trained checkpoint with a token-classification head sized to the number of tags.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [9]:
# Arguments for the single full fine-tuning run (up to 10 epochs, evaluated after every epoch,
# best checkpoint by F1 restored at the end).
args = TrainingArguments(
    # Built only from names set in the configuration cell at the top of the notebook.
    # Interpolating a fold index `i` here raised NameError on a fresh kernel, because no cell
    # defines `i` at notebook scope.
    output_dir=f"{model_checkpoint}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [10]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [13]:
# seqeval scorer; read as a global by compute_metrics.
metric = load_metric("seqeval")

In [14]:
import numpy as np

def compute_metrics(p):
    """Convert a (predictions, labels) evaluation pair into overall and LOC-specific scores.

    NOTE(review): a function with this name is also defined earlier in the notebook; running
    this cell rebinds the name to this definition.

    Reads the notebook globals ``label_list`` (label id -> tag name) and ``metric``
    (an object whose ``compute(predictions=..., references=...)`` returns the scores).

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is reduced with argmax over
            axis 2 to one label id per position; ``labels`` holds the reference ids,
            with -100 marking positions to skip.
            (assumes predictions are shaped (batch, seq_len, num_labels) — TODO confirm)

    Returns:
        dict with "precision", "recall", "f1", "accuracy" (overall scores) and
        "LOC-f1", "LOC-precision", "LOC-recall". The LOC values are 0.0 when the
        metric result contains no "LOC" entry.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Keep only positions with a real reference label (-100 marks ignored positions),
    # and translate both sides from label ids to tag names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # NOTE(review): the per-type "LOC" entry is presumably missing when no LOC entity occurs in
    # the evaluated data; report 0.0 in that case instead of failing the run with a KeyError.
    loc_scores = results.get("LOC", {})
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        "LOC-f1": loc_scores.get("f1", 0.0),
        "LOC-precision": loc_scores.get("precision", 0.0),
        "LOC-recall": loc_scores.get("recall", 0.0),
    }

In [15]:
# Tokenize the original CoNLL-2003 splits separately. `datasets` still holds the loaded
# 'train' / 'validation' / 'test' splits; the cross-validation section only tokenized the merged
# `tokenized_dataset`, so the per-split object has to be built here.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

# Full fine-tuning run: train on the official training split, evaluate on the validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [16]:
# Add early stopping to trainer, with a patience of 3.
# NOTE(review): presumably this stops after 3 consecutive evaluations without improvement in the
# configured metric_for_best_model ("f1" in the training arguments) — confirm.

from transformers import EarlyStoppingCallback

trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

In [17]:
# Run fine-tuning; the captured output below shows the run ending after epoch 7 of the 10 configured.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Loc-f1,Loc-precision,Loc-recall,Runtime,Samples Per Second
1,0.1954,0.071772,0.93076,0.929266,0.930013,0.982589,0.95621,0.966338,0.946292,4.954,656.036
2,0.0536,0.061716,0.942976,0.940127,0.941549,0.986185,0.961155,0.977099,0.945723,4.911,661.78
3,0.0291,0.059609,0.948968,0.946548,0.947757,0.986697,0.969828,0.98082,0.959079,4.9144,661.317
4,0.0188,0.065065,0.952422,0.949004,0.95071,0.98734,0.973489,0.976551,0.970446,4.9377,658.206
5,0.0127,0.069545,0.94606,0.952403,0.949221,0.987355,0.970258,0.971779,0.968741,5.021,647.281
6,0.0074,0.075646,0.945748,0.951553,0.948642,0.987164,0.970147,0.975302,0.965047,4.944,657.362
7,0.0061,0.079545,0.950359,0.949193,0.949776,0.987296,0.968781,0.971968,0.965615,5.0436,644.384


TrainOutput(global_step=6146, training_loss=0.038375676109819505, metrics={'train_runtime': 644.1141, 'train_samples_per_second': 13.631, 'total_flos': 5198621045563836.0, 'epoch': 7.0})

### Save best model to disk

```python
model_dir = 'ner-multilingual-bert-fine-tuned'

model.save_pretrained(model_dir)

tokenizer.save_pretrained(model_dir)
```

In [None]:
# Evaluate using trainer.evaluate method, on the held-out test split.
# Relies on `tokenized_datasets` (the tokenized per-split dataset with a 'test' entry) being
# present in the kernel.
trainer.evaluate(tokenized_datasets['test'])

In [None]:
# Evaluate using trainer.predict method, on the validation split: same scoring steps as
# compute_metrics, but the complete metric result is shown instead of a selection of it.
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
# Reduce the per-label scores to the highest-scoring label id at each position.
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens) and map the remaining ids to tag names
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

### Predict single example

In [None]:
import torch

In [None]:
# Example sentence used for the single-example prediction in the cells below.
testie_text = 'Hi, I am Francesca and I like to eat tacos in Mexico'

In [None]:
# Forward pass on the example sentence. The encoded inputs are moved to whatever device the model
# lives on, so the cell also works without a GPU.
model.eval()  # inference only: make sure dropout is inactive
with torch.no_grad():  # no gradients needed, so skip building the autograd graph
    outputs = model(**tokenizer(testie_text, truncation=True, return_tensors='pt').to(model.device))

In [None]:
# Token ids of the example sentence as a batch of one, on the model's device. The ids are encoded
# here from `testie_text` because no cell defines a `tokenized_sentence` variable.
input_ids = torch.tensor([tokenizer.encode(testie_text, truncation=True)]).to(model.device)

In [None]:
# outputs = model(input_ids)

In [None]:
import numpy as np

In [None]:
# Highest-scoring label id per token: the first model output is detached, copied to host memory
# and reduced over its last (third) axis.
label_indices = outputs[0].detach().cpu().numpy().argmax(axis=2)

In [None]:
# Glue "##"-prefixed continuation pieces back onto the preceding token. A rebuilt word keeps the
# label predicted for its first piece; labels of continuation pieces are dropped.
tokens = tokenizer.convert_ids_to_tokens(input_ids.detach().cpu().numpy()[0])
new_tokens = []
new_labels = []
for token, label_idx in zip(tokens, label_indices[0]):
    if not token.startswith("##"):
        # Start of a new word: record it together with its predicted tag name.
        new_tokens.append(token)
        new_labels.append(label_list[label_idx])
        continue
    # Continuation piece: strip the "##" marker and extend the last word.
    new_tokens[-1] += token[2:]

In [None]:
# Print one line per rebuilt token: predicted tag, a tab, then the token text.
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))