<a href="https://colab.research.google.com/github/Calcifer777/learn-nlp/blob/main/learn-transformers/ner_ft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from typing import List, Dict

import numpy as np
import pandas as pd

import transformers
from datasets import load_dataset

from transformers import (
    RobertaForTokenClassification,
    RobertaConfig,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    RobertaTokenizer,
    RobertaTokenizerFast,

    TrainingArguments,

    DataCollatorForTokenClassification,
)

import torch

from seqeval.metrics import f1_score

In [3]:
# Load the "PAN-X.en" configuration of the "xtreme" dataset (all splits).
ds = load_dataset("xtreme", name="PAN-X.en")



  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Switch the dataset's output format to pandas for the preview that follows.
ds.set_format("pd")

In [5]:
# Preview the first five training examples.
pd.DataFrame(ds["train"][:5])

Unnamed: 0,tokens,ner_tags,langs
0,"[R.H., Saunders, (, St., Lawrence, River, ), (...","[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]","[en, en, en, en, en, en, en, en, en, en, en]"
1,"[;, ', '', Anders, Lindström, '', ']","[0, 0, 0, 1, 2, 0, 0]","[en, en, en, en, en, en, en]"
2,"[Karl, Ove, Knausgård, (, born, 1968, )]","[1, 2, 2, 0, 0, 0, 0]","[en, en, en, en, en, en, en]"
3,"[Atlantic, City, ,, New, Jersey]","[5, 6, 6, 6, 6]","[en, en, en, en, en]"
4,"[Her, daughter, from, the, second, marriage, w...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."


In [6]:
# Restore the dataset's default output format after the pandas preview.
ds.reset_format()

In [7]:
# Per-token label feature of `ner_tags`; used below for `int2str`, `names` and `num_classes`.
class_labels = ds["train"].features["ner_tags"].feature

In [8]:
def _add_tag_names(batch):
    """Return a ``ner_tags_str`` column holding the string names of each example's tag ids."""
    return {"ner_tags_str": [class_labels.int2str(tags) for tags in batch["ner_tags"]]}


# Add the human-readable tag names to every split, 64 examples at a time.
ds = ds.map(_add_tag_names, batched=True, batch_size=64)



In [9]:
def _tag_distribution(split_ds):
    """Return the relative frequency of each NER tag string within one split."""
    frame = pd.DataFrame(split_ds[:])
    tag_counts = frame.explode("ner_tags_str").groupby("ner_tags_str").size()
    return tag_counts / tag_counts.sum()


# One relative-frequency Series per split, keyed by split name.
tags_freqs = {split: _tag_distribution(ds_split) for split, ds_split in ds.items()}

pd.DataFrame.from_dict(tags_freqs, orient="columns")

Unnamed: 0_level_0,train,validation,test
ner_tags_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B-LOC,0.058263,0.060023,0.057976
B-ORG,0.058743,0.058073,0.059072
B-PER,0.057134,0.057552,0.056719
I-LOC,0.082154,0.078934,0.08026
I-ORG,0.144806,0.144507,0.144499
I-PER,0.091637,0.093374,0.093121
O,0.507263,0.507537,0.508353


In [10]:
# Pretrained checkpoint used for the config, model and tokenizer below.
model_name = "roberta-base"

In [11]:
# Bidirectional lookup between integer label ids and tag names.
idx2tag = dict(enumerate(class_labels.names))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

In [12]:
# Model configuration for the checkpoint, sized to the NER tag set and carrying
# the id <-> tag-name mappings built above.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=class_labels.num_classes,
    id2label=idx2tag, 
    label2id=tag2idx,
)

In [13]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the Trainer created later in this notebook is given `model_init`
# rather than this `model` object, so this instance appears to be used only for
# inspection while still occupying device memory — confirm whether it is needed.
model = (
    RobertaForTokenClassification
    .from_pretrained(model_name, config=config)
    .to(device)
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [14]:
# `add_prefix_space=True` is set because the inputs below are pre-split words
# (`is_split_into_words=True`); per the transformers docs the RoBERTa fast
# tokenizer needs it for pretokenized input. Background:
# https://stackoverflow.com/questions/61134275/difficulty-in-understanding-the-tokenizer-used-in-roberta-model
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

In [15]:
# Sanity check: tokenize two small pre-split sentences.
tokens = tokenizer([["hi", "my"], ["ciao", "sono"]], is_split_into_words=True)

In [16]:
# Source-word index of every token in the first sentence; the output below shows
# `None` at the first and last positions (presumably the special tokens).
tokens.word_ids(batch_index=0)

[None, 0, 1, None]

In [17]:
def tokenize_and_align_labels(batch):
  """Tokenize a batch of pre-split words and build token-level labels.

  For every example, the first sub-token of each word receives that word's
  tag id from ``batch["ner_tags"]``; tokens with no source word (word id
  ``None``) and any further sub-tokens of the same word receive ``-100``.

  Returns the tokenizer output with an added ``"labels"`` entry.
  """
  encoded = tokenizer(
      batch["tokens"],
      is_split_into_words=True,
      truncation=True,
  )

  def _labels_for(sample_idx, word_tags):
    token_word_ids = encoded.word_ids(batch_index=sample_idx)
    # Pair each token's word id with the word id of the token before it.
    preceding = [None] + token_word_ids[:-1]
    return [
        -100 if (wid is None or wid == prev) else word_tags[wid]
        for wid, prev in zip(token_word_ids, preceding)
    ]

  encoded["labels"] = [
      _labels_for(sample_idx, word_tags)
      for sample_idx, word_tags in enumerate(batch["ner_tags"])
  ]
  return encoded

In [18]:
# Tokenize every split and attach aligned labels, dropping the raw columns.
# `ner_tags_str` is not listed in `remove_columns`, so it stays in the result
# (see the features printed below).
ds_tkn = ds.map(
    tokenize_and_align_labels, 
    batched=True, 
    batch_size=32,
    remove_columns=['langs', 'ner_tags', 'tokens']
)
ds_tkn



  0%|          | 0/313 [00:00<?, ?ba/s]



DatasetDict({
    train: Dataset({
        features: ['ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})

In [19]:
def align_predictions(predictions, label_ids, index2tag=None):
    """Convert raw model outputs and label ids into per-example tag-name lists.

    Positions whose label id is ``-100`` are dropped from both the predictions
    and the labels.

    Args:
        predictions: array-like of scores with the class dimension on axis 2.
        label_ids: array-like of integer label ids, one row per example, with
            the same leading two dimensions as ``predictions``.
        index2tag: optional mapping from label id to tag name. When omitted,
            the notebook-level ``idx2tag`` mapping is used, which matches the
            previous behaviour.

    Returns:
        A ``(preds_list, labels_list)`` tuple; each element is a list holding
        one list of tag names per example.
    """
    if index2tag is None:
        index2tag = idx2tag

    label_ids = np.asarray(label_ids)
    preds = np.argmax(predictions, axis=2)
    # Boolean mask of the positions that carry a real label.
    keep = label_ids != -100

    labels_list, preds_list = [], []
    for row_labels, row_preds, row_keep in zip(label_ids, preds, keep):
        labels_list.append([index2tag[i] for i in row_labels[row_keep].tolist()])
        preds_list.append([index2tag[i] for i in row_preds[row_keep].tolist()])

    return preds_list, labels_list

In [20]:
num_epochs = 3
batch_size = 16
# Number of full batches in the training split, i.e. roughly one log entry per
# epoch when training on a single device.
logging_steps = len(ds_tkn["train"]) // batch_size
# The dataset loaded in this notebook is PAN-X.en, so the output directory is
# suffixed "en" (it previously said "it").
model_name_ft = f"{model_name}-finetuned-panx-en"

training_args = TrainingArguments(
    output_dir=model_name_ft,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # Integer step count chosen far above the length of this run, so no
    # intermediate checkpoint should be written.
    save_steps=1_000_000,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
)

In [21]:
def compute_metrics(eval_pred):
  """Return ``{"f1": ...}`` for an evaluation result.

  ``eval_pred`` must expose ``predictions`` and ``label_ids``; both are turned
  into tag-name sequences by ``align_predictions`` before scoring.
  """
  aligned = align_predictions(eval_pred.predictions, eval_pred.label_ids)
  predicted_tags, gold_tags = aligned
  return dict(f1=f1_score(gold_tags, predicted_tags))

In [22]:
# Batch collator for token classification, built from the tokenizer above
# (per the transformers docs it pads inputs and labels per batch).
data_collator = DataCollatorForTokenClassification(tokenizer)

In [23]:
def model_init():
    """Build a new token-classification model from `model_name` and move it to `device`."""
    fresh_model = RobertaForTokenClassification.from_pretrained(model_name, config=config)
    return fresh_model.to(device)

In [24]:
# Assemble the Trainer. It is given `model_init` (a factory) rather than a model
# instance, and evaluates on the validation split with the seqeval F1 metric.
trainer = transformers.Trainer(
  model_init=model_init,
  args=training_args,
  data_collator=data_collator, 
  compute_metrics=compute_metrics,
  train_dataset=ds_tkn["train"],
  eval_dataset=ds_tkn["validation"],
  tokenizer=tokenizer
)

In [25]:
# Run fine-tuning; per-epoch loss and F1 are reported in the table below.
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.3685,0.26399,0.798139
2,0.2209,0.244988,0.827408
3,0.1524,0.249918,0.839982


TrainOutput(global_step=3750, training_loss=0.2472626953125, metrics={'train_runtime': 598.2258, 'train_samples_per_second': 100.297, 'train_steps_per_second': 6.269, 'total_flos': 940335215806272.0, 'train_loss': 0.2472626953125, 'epoch': 3.0})

In [26]:
# NOTE(review): scratch cell that looks unrelated to the NER fine-tuning above;
# this import also sits outside the notebook's main import cell.
from collections import defaultdict

# With no default_factory, missing keys still raise KeyError, as with a plain dict.
d = defaultdict()

In [27]:
# Scratch check of the object's type (shown below as collections.defaultdict).
type(d)

collections.defaultdict