# Multibert, fine-tune on the union of ru and uk datasets

## Data

In [1]:
import os
# DATA_PATH = "../data/" 
DATA_PATH = "/kaggle/input/" 
SAVE_PATH = "/kaggle/working/"

In [2]:
import pandas as pd

In [3]:
ru_geo = pd.read_csv("/kaggle/input/ner-geo-dataset-light/ru_geo_dataset_light.csv", converters={"loc_markers": eval})
uk_geo = pd.read_csv("/kaggle/input/ner-geo-dataset-light/ru_geo_dataset_light.csv", converters={"loc_markers": eval})
df_locations = pd.concat([ru_geo, uk_geo])

### Preprocess

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

Here we partially fixed skipping entites, which labeling doesn't align with separation to tokens. There are sill skipped entites. As we can see from the output, this happens when there is missing space before or after the entity. 

In [5]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df_locations.loc_markers = df_locations.loc_markers.apply(lambda x: [[y[0], y[1], 'LOC']  for y in x])

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['text'],
        "label": row['loc_markers']
    }
    doc = nlp(data["text"])
    ents = []
    for start, end, label in data["label"]:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            span_extended = doc.char_span(start, end, label=label, alignment_mode='expand')
            print(f"Skipping span {span_extended} expanded from {doc.text[start:end]}")
        else:
            ents.append(span)
    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df_locations['conll'] = df_locations.progress_apply(convert_to_conll, axis=1)



  0%|          | 0/220000 [00:00<?, ?it/s]

Skipping span США.Чиновники expanded from США
Skipping span Москвыразработает expanded from Москвы
Skipping span РФпришли expanded from РФ
Skipping span вМоскве expanded from Москве
Skipping span РФпоМоскве expanded from РФ
Skipping span РФпоМоскве expanded from Москве
Skipping span Скалиcтых expanded from Скали
Skipping span РФВладимир expanded from РФ
Skipping span вЛондоне expanded from Лондоне
Skipping span ВМоскве expanded from Москве
Skipping span заРоссией expanded from Россией
Skipping span вСирию expanded from Сирию
Skipping span ВСША expanded from США
Skipping span вНайроби expanded from Найроби
Skipping span ВБангкоке expanded from Бангкоке
Skipping span ВРиге expanded from Риге
Skipping span комитетРФ expanded from РФ
Skipping span ВМоскве expanded from Москве
Skipping span ВСША expanded from США
Skipping span РФАлександр expanded from РФ
Skipping span РФ" expanded from РФ
Skipping span Wombwell)в expanded from Wombwell
Skipping span воФранции expanded from Франции
Skipping

In [6]:
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}

df_locations['tokens'] = df_locations.conll.str['tokens']
df_locations['ner_tags'] = df_locations.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_train = df_locations[df_locations.is_valid == 0]
df_valid = df_locations[df_locations.is_valid == 1]

In [7]:
df_train[['tokens', 'ner_tags']].to_json(
    os.path.join(SAVE_PATH, 'train_light.json'), orient='records', lines=True)
df_valid[['tokens', 'ner_tags']].to_json(
    os.path.join(SAVE_PATH, 'valid_light.json'), orient='records', lines=True)

In [8]:
import datasets

In [9]:
from datasets import load_dataset

raw_datasets = load_dataset(
    "json",
    data_files={
        'train': os.path.join(SAVE_PATH, 'train_light.json'),
        'val': os.path.join(SAVE_PATH, 'valid_light.json'),
    },
)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-b0ef5f7488cbb67d/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-b0ef5f7488cbb67d/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [11]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            # Start of a new word!
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [12]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/200 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [13]:
from transformers import TrainingArguments

args = TrainingArguments(
    "multibert-ner-combined",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5
)

In [14]:
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 1e-5},
    {'params': list(model.classifier.parameters()), 'lr': 1e-3}
])

# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0.1*3*(tokenized_datasets['train'].num_rows/16),
#     num_training_steps=3*(tokenized_datasets['train'].num_rows/16)
# )
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*5*(tokenized_datasets['train'].num_rows/16),
    num_training_steps=5*(tokenized_datasets['train'].num_rows/16)
)



In [15]:
!pip install evaluate
!pip install seqeval


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/

In [16]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

label_names = list(label2id.keys())

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [17]:
import wandb

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [18]:
from transformers import Trainer

train_sample = [tokenized_datasets['train'][i] for i in range(10)]
val_sample = [tokenized_datasets['val'][i] for i in range(10)]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
#     train_dataset=train_sample,
    eval_dataset=tokenized_datasets["val"],
#     eval_dataset=val_sample,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler)
)

In [19]:
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33myevhenii-azarov[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231029_024118-k2dnxsjo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33meager-eon-7[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yevhenii-azarov/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yevhenii-azarov/huggingface/runs/k2dnxsjo[0m
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.012,0.012222,0.961317,0.978011,0.969592,0.996642
2,0.0072,0.013121,0.963325,0.982429,0.972783,0.996803
3,0.0036,0.018655,0.960954,0.981196,0.97097,0.996626
4,0.002,0.021431,0.963682,0.978833,0.971198,0.996835
5,0.0012,0.02639,0.963802,0.979449,0.971563,0.996798


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                  eval/accuracy ▂▇▁█▇
[34m[1mwandb[0m:                        eval/f1 ▁█▄▅▅
[34m[1mwandb[0m:                      eval/loss ▁▁▄▆█
[34m[1mwandb[0m:                 eval/precision ▂▇▁██
[34m[1mwandb[0m:                    eval/recall ▁█▆▂▃
[34m[1mwandb[0m:                   eval/runtime ██▁▇▁
[34m[1mwandb[0m:        eval/samples_per_second ▁▁█▂█
[34m[1mwandb[0m:          eval/steps_per_second ▁▂█▂█
[34m[1mwandb[0m:                    train/epoch ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:              train/global_step ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:            train/learning_rate ▂▃▅▇██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
[34m[1mwandb[0m:                     train/loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:               train/total