# Multibert, fine-tune on the ru dataset

In [1]:
import os
# DATA_PATH = "../data/" 
DATA_PATH = "/kaggle/input/" 
SAVE_PATH = "/kaggle/working/"

In [2]:
DATASETS_PATH = os.path.join(DATA_PATH, "nlp-ua-locations-extractions")

## Data

We will use approximately the same amount of data, as for the uk multiebert fine-tuning, to reduce training time

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split



### Load and sample data

In [4]:
ru_geo_dataset = pd.read_csv(os.path.join(DATASETS_PATH, 'ru_geo_dataset.csv'), converters={"loc_markers": eval})
ru_geo_dataset.shape

(8028840, 6)

In ru dataset samples are grouped by doc, from which they were taken. Since we will use only small part of dataset, we will take sample from different docs, to increase diversity in the training data. This also means that we don't need to worry about grouping samples by `doc_id` for train-val split

In [5]:
N_train = 100000
N_val = 10000

In [6]:
ru_geo_dataset_light = ru_geo_dataset.groupby('doc_id').first()\
    .sample(N_train + N_val, random_state=42).reset_index()

In [7]:
ru_geo_dataset_light['num_loc'] = ru_geo_dataset_light['loc_markers'].apply(len)

In [8]:
ru_geo_dataset_light['num_loc'].value_counts()

num_loc
1     44226
0     39109
2     19361
3      5614
4      1261
5       304
6        76
7        26
8        10
9         7
12        2
10        1
14        1
13        1
11        1
Name: count, dtype: int64

In [9]:
ru_geo_dataset_light.loc[ru_geo_dataset_light['num_loc'].isin([10, 11, 12, 13, 14]), 'num_loc'] = -1

In [10]:
# to avoid copying the dataset, mark valid inplace (may be cruacial when using full dataset)
train_idx, val_idx = train_test_split(ru_geo_dataset_light.index, stratify=ru_geo_dataset_light['num_loc'], test_size=N_val)

In [11]:
ru_geo_dataset_light['is_valid'] = 0
ru_geo_dataset_light.loc[val_idx, 'is_valid'] = 1

In [12]:
ru_geo_dataset_light.loc[ru_geo_dataset_light['is_valid'] == 1]

Unnamed: 0,doc_id,text,loc_markers,org_markers,per_markers,sent_id,num_loc,is_valid
26,656797,"20 ноября в московском кинотеатре ""Пушкинский ...",[],"[(35, 45)]","[(77, 83), (86, 98)]",0,0,1
39,355447,"Российская теннисистка Мария Шарапова, занимаю...","[(127, 136)]",[],"[(23, 37)]",0,1,1
48,43590,На форме олимпийской сборной России по хоккею ...,"[(29, 35)]",[],[],0,1,1
50,545412,Более 20 тысяч израильтян собрались вечером в ...,"[(63, 73), (131, 138)]","[(161, 169)]","[(139, 150)]",0,2,1
52,215758,Актер Иван Охлобыстин назвал своего коллегу Ми...,[],[],"[(6, 21), (44, 63)]",0,0,1
...,...,...,...,...,...,...,...,...
109902,126580,МИД Турции вызвал посла Швеции в Анкаре после ...,"[(4, 10), (24, 30), (33, 39)]","[(0, 3), (54, 61)]",[],0,3,1
109933,100571,Турецкая полиция опубликовала фотографию мужчи...,"[(88, 96)]",[],[],0,1,1
109958,655197,Американский институт кино (American Film Inst...,[],"[(0, 57), (156, 163)]",[],0,0,1
109971,517365,"Конфликт вокруг анимационного телеканала ""2х2""...",[],"[(42, 45)]",[],0,0,1


In [13]:
ru_geo_dataset_light[['text', 'loc_markers', 'is_valid']].to_csv(os.path.join(SAVE_PATH, "ru_geo_dataset_light.csv"), index=None)

In [14]:
df_locations = pd.read_csv(os.path.join(SAVE_PATH, "ru_geo_dataset_light.csv"), converters={"loc_markers": eval})

### Preprocess

In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

Here we partially fixed skipping entites, which labeling doesn't align with separation to tokens. There are sill skipped entites. As we can see from the output, this happens when there is missing space before or after the entity. 

In [16]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df_locations.loc_markers = df_locations.loc_markers.apply(lambda x: [[y[0], y[1], 'LOC']  for y in x])

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['text'],
        "label": row['loc_markers']
    }
    doc = nlp(data["text"])
    ents = []
    for start, end, label in data["label"]:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            span_extended = doc.char_span(start, end, label=label, alignment_mode='expand')
            print(f"Skipping span {span_extended} expanded from {doc.text[start:end]}")
        else:
            ents.append(span)
    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df_locations['conll'] = df_locations.progress_apply(convert_to_conll, axis=1)

  0%|          | 0/110000 [00:00<?, ?it/s]

Skipping span США.Чиновники expanded from США
Skipping span Москвыразработает expanded from Москвы
Skipping span РФпришли expanded from РФ
Skipping span вМоскве expanded from Москве
Skipping span РФпоМоскве expanded from РФ
Skipping span РФпоМоскве expanded from Москве
Skipping span Скалиcтых expanded from Скали
Skipping span РФВладимир expanded from РФ
Skipping span вЛондоне expanded from Лондоне
Skipping span ВМоскве expanded from Москве
Skipping span заРоссией expanded from Россией
Skipping span вСирию expanded from Сирию
Skipping span ВСША expanded from США
Skipping span вНайроби expanded from Найроби
Skipping span ВБангкоке expanded from Бангкоке
Skipping span ВРиге expanded from Риге
Skipping span комитетРФ expanded from РФ
Skipping span ВМоскве expanded from Москве
Skipping span ВСША expanded from США
Skipping span РФАлександр expanded from РФ
Skipping span РФ" expanded from РФ
Skipping span Wombwell)в expanded from Wombwell
Skipping span воФранции expanded from Франции
Skipping

In [17]:
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}

df_locations['tokens'] = df_locations.conll.str['tokens']
df_locations['ner_tags'] = df_locations.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_train = df_locations[df_locations.is_valid == 0]
df_valid = df_locations[df_locations.is_valid == 1]

In [18]:
df_train[['tokens', 'ner_tags']].to_json(
    os.path.join(SAVE_PATH, 'train_light.json'), orient='records', lines=True)
df_valid[['tokens', 'ner_tags']].to_json(
    os.path.join(SAVE_PATH, 'valid_light.json'), orient='records', lines=True)

In [19]:
import datasets

In [20]:
from datasets import load_dataset

raw_datasets = load_dataset(
    "json",
    data_files={
        'train': os.path.join(SAVE_PATH, 'train_light.json'),
        'val': os.path.join(SAVE_PATH, 'valid_light.json'),
    },
)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-eb3e2ad572a9611c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-eb3e2ad572a9611c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            # Start of a new word!
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [23]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [24]:
from transformers import TrainingArguments

args = TrainingArguments(
    "multibert-ner-ru",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3
)

In [25]:
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 1e-5},
    {'params': list(model.classifier.parameters()), 'lr': 1e-3}
])

# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0.1*3*(tokenized_datasets['train'].num_rows/16),
#     num_training_steps=3*(tokenized_datasets['train'].num_rows/16)
# )
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*3*(tokenized_datasets['train'].num_rows/16),
    num_training_steps=3*(tokenized_datasets['train'].num_rows/16)
)



In [26]:
from typing import List

def metrics(y_true: List[List[str]], y_pred: List[List[str]]):
    assert len(y_true) == len(y_pred)
    tp, fp, fn = 0.0, 0.0, 0.0

    for y_true_sample, y_pred_sample in zip(y_true, y_pred):
        tp += len(set(y_true_sample) & set(y_pred_sample))
        fp += len(set(y_pred_sample) - set(y_true_sample))
        fn += len(set(y_true_sample) - set(y_pred_sample))
    
    precision = tp / (tp + fp) if tp + fp != 0 else 0.0 if tp + fn != 0.0 else 1.0
    recall = tp / (tp + fn) if tp + fn != 0 else 1.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [27]:
import numpy as np

label_names = list(label2id.keys())

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    return metrics(y_true=true_predictions, y_pred=true_labels)

In [28]:
import wandb

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [29]:
from transformers import Trainer

train_sample = [tokenized_datasets['train'][i] for i in range(10)]
val_sample = [tokenized_datasets['val'][i] for i in range(10)]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
#     train_dataset=train_sample,
    eval_dataset=tokenized_datasets["val"],
#     eval_dataset=val_sample,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler)
)

In [30]:
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33myevhenii-azarov[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231028_212539-aotgpzq9[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfirm-salad-6[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yevhenii-azarov/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yevhenii-azarov/huggingface/runs/aotgpzq9[0m
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0145,0.01233,0.993711,0.988603,0.99115
2,0.0101,0.010352,0.995076,0.989145,0.992102
3,0.0071,0.011573,0.995369,0.988909,0.992128


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                        eval/f1 ▁██
[34m[1mwandb[0m:                      eval/loss █▁▅
[34m[1mwandb[0m:                 eval/precision ▁▇█
[34m[1mwandb[0m:                    eval/recall ▁█▅
[34m[1mwandb[0m:                   eval/runtime █▆▁
[34m[1mwandb[0m:        eval/samples_per_second ▁▃█
[34m[1mwandb[0m:          eval/steps_per_second ▁▃█
[34m[1mwandb[0m:                    train/epoch ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
[34m[1mwandb[0m:              train/global_step ▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
[34m[1mwandb[0m:            train/learning_rate ▃▅▇██████▇▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
[34m[1mwandb[0m:                     train/loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:               train/total_flos ▁
[34m[1mwandb[0m:               train/train_loss ▁
[34m[1mwandb[