# Multibert combined 

In [1]:
import os
# DATA_PATH = "../data/" 
DATA_PATH = "/kaggle/input/" 
SAVE_PATH = "/kaggle/working/"

In [2]:
DATASETS_PATH = os.path.join(DATA_PATH, "nlp-ua-locations-extractions")

In [3]:
from sklearn.model_selection import train_test_split



## Data

### Investigate languages ratio in the test set

In [4]:
!pip install langid



In [5]:
import pandas as pd
import langid
langid.set_languages(['ru', 'uk'])

In [6]:
df_test = pd.read_csv(os.path.join(DATASETS_PATH, "test.csv"))
df_test['lang'] = df_test['text'].apply(lambda x: langid.classify(x)[0])

In [7]:
df_test['lang'].value_counts()

lang
uk    423
ru     54
Name: count, dtype: int64

In [8]:
lang_ratio = (df_test['lang'] == 'uk').sum() / (df_test['lang'] == 'ru').sum()
lang_ratio

7.833333333333333

### Load and sample data

In [9]:
uk_geo_dataset = pd.read_csv(os.path.join(DATASETS_PATH, 'uk_geo_dataset.csv'), converters={"loc_markers": eval})
ru_geo_dataset = pd.read_csv(os.path.join(DATASETS_PATH, 'ru_geo_dataset.csv'), converters={"loc_markers": eval})
uk_geo_dataset.shape, ru_geo_dataset.shape

((1010000, 5), (8028840, 6))

In [10]:
_, uk_geo_dataset_light = train_test_split(uk_geo_dataset, stratify=uk_geo_dataset["is_valid"], test_size=0.1, random_state=42)
val_ratio = (uk_geo_dataset_light['is_valid'] == 1).sum() / uk_geo_dataset_light.shape[0]

In [11]:
# sample from different docs to increase diversity of the sub-dataset
ru_geo_dataset_light = ru_geo_dataset.groupby('doc_id').first().sample(int(uk_geo_dataset_light.shape[0] / lang_ratio)).reset_index()

In [12]:
ru_geo_dataset_light['is_valid'] = ru_geo_dataset_light['text'].apply(lambda x: 1 if hash(x) % int(1/val_ratio) == 0 else 0)

In [13]:
uk_geo_dataset_light.sample(5)

Unnamed: 0,text,loc_markers,org_markers,per_markers,is_valid
576618,Барбадоська співачка та дизайнерка Ріанна поді...,[],[],"[(35, 41)]",0
191299,«Підозрюваний використовував орендовані кварти...,"[(52, 65)]",[],[],0
576296,Всесвітня організація охорони здоров'я заявила...,[],"[(108, 116)]",[],0
697127,"Американський глава заявив, що на минулих вибо...",[],[],[],0
587999,Щоб дістати з деформованої кабіни водія автобу...,[],[],[],0


In [14]:
uk_geo_dataset_light[['text', 'loc_markers', 'is_valid']].to_csv(os.path.join(SAVE_PATH, "uk_geo_dataset_light.csv"), index=None)

In [15]:
ru_geo_dataset_light[['text', 'loc_markers', 'is_valid']].to_csv(os.path.join(SAVE_PATH, "ru_geo_dataset_light.csv"), index=None)

In [16]:
uk_geo_dataset = pd.read_csv(os.path.join(SAVE_PATH, "uk_geo_dataset_light.csv"), converters={'loc_markers': eval})
ru_geo_dataset = pd.read_csv(os.path.join(SAVE_PATH, "ru_geo_dataset_light.csv"), converters={'loc_markers': eval})

uk_geo_dataset['lang'] = 'uk'
ru_geo_dataset['lang'] = 'ru'
df_locations = pd.concat([uk_geo_dataset, ru_geo_dataset])

### Preprocess

In [17]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [18]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df_locations.loc_markers = df_locations.loc_markers.apply(lambda x: [[y[0], y[1], 'LOC']  for y in x])

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['text'],
        "label": row['loc_markers']
    }
    doc = nlp(data["text"])
    ents = []
    for start, end, label in data["label"]:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            span_extended = doc.char_span(start, end, label=label, alignment_mode='expand')
            print(f"Skipping span {span_extended} expanded from {doc.text[start:end]}")
        else:
            ents.append(span)
    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df_locations['conll'] = df_locations.progress_apply(convert_to_conll, axis=1)

  0%|          | 0/113893 [00:00<?, ?it/s]

Skipping span ВЛувре expanded from Лувре
Skipping span изСамофракии expanded from Самофракии
Skipping span вРоссии expanded from России
Skipping span вТоронто expanded from Торонто
Skipping span изТехаса expanded from Техаса
Skipping span Лакокраска"(Лида expanded from Лида
Skipping span Стекловолокно"(Полоцк expanded from Полоцк
Skipping span дом"(Минск expanded from Минск
Skipping span вСКФО expanded from СКФО
Skipping span РФВладимир expanded from РФ
Skipping span изНоттингема expanded from Ноттингема
Skipping span силРФ expanded from РФ
Skipping span отРоссии expanded from России
Skipping span изГермании expanded from Германии
Skipping span поМоскве expanded from Москве
Skipping span вГааге expanded from Гааге


In [19]:
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}

df_locations['tokens'] = df_locations.conll.str['tokens']
df_locations['ner_tags'] = df_locations.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_train = df_locations[df_locations.is_valid == 0]
df_valid = df_locations[df_locations.is_valid == 1]

In [20]:
df_train[['tokens', 'ner_tags']].to_json(
    os.path.join(SAVE_PATH, 'train_light.json'), orient='records', lines=True)
df_valid[['tokens', 'ner_tags']].to_json(
    os.path.join(SAVE_PATH, 'valid_light.json'), orient='records', lines=True)

In [21]:
import datasets

In [22]:
from datasets import load_dataset

raw_datasets = load_dataset(
    "json",
    data_files={
        'train': os.path.join(SAVE_PATH, 'train_light.json'),
        'val': os.path.join(SAVE_PATH, 'valid_light.json'),
    },
)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-85a7927c0922684e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-85a7927c0922684e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    # 'youscan/ukr-roberta-base',
    'bert-base-multilingual-cased',
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [24]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            # Start of a new word!
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [25]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/113 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [26]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-ua-loc-ner",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3
)

In [27]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 1e-5},
    {'params': list(model.classifier.parameters()), 'lr': 1e-3}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*3*(tokenized_datasets['train'].num_rows/16),
    num_training_steps=3*(tokenized_datasets['train'].num_rows/16)
)



In [28]:
from typing import List

def metrics(y_true: List[List[str]], y_pred: List[List[str]]):
    assert len(y_true) == len(y_pred)
    tp, fp, fn = 0.0, 0.0, 0.0

    for y_true_sample, y_pred_sample in zip(y_true, y_pred):
        tp += len(set(y_true_sample) & set(y_pred_sample))
        fp += len(set(y_pred_sample) - set(y_true_sample))
        fn += len(set(y_true_sample) - set(y_pred_sample))
    
    precision = tp / (tp + fp) if tp + fp != 0 else 0.0 if tp + fn != 0.0 else 1.0
    recall = tp / (tp + fn) if tp + fn != 0 else 1.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [29]:
# import evaluate
import numpy as np

# metric = evaluate.load("seqeval")

label_names = list(label2id.keys())

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    return metrics(y_true=true_predictions, y_pred=true_labels)

    
#     all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": all_metrics["overall_precision"],
#         "recall": all_metrics["overall_recall"],
#         "f1": all_metrics["overall_f1"],
#         "accuracy": all_metrics["overall_accuracy"],
#     }

In [30]:
import wandb

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except:
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [31]:
from transformers import Trainer

train_sample = [tokenized_datasets['train'][i] for i in range(10)]
val_sample = [tokenized_datasets['val'][i] for i in range(10)]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
#     train_dataset=train_sample,
    eval_dataset=tokenized_datasets["val"],
#     eval_dataset=val_sample,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler)
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myevhenii-azarov[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231026_011938-eepkgtp8[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mearthy-sound-3[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yevhenii-azarov/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yevhenii-azarov/huggingface/runs/eepkgtp8[0m
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0163,0.014714,0.977976,0.986787,0.982362
2,0.0096,0.016584,0.98869,0.977634,0.983131
3,0.0067,0.017385,0.983333,0.982164,0.982748


TrainOutput(global_step=21144, training_loss=0.01744256693781031, metrics={'train_runtime': 3063.5724, 'train_samples_per_second': 110.42, 'train_steps_per_second': 6.902, 'total_flos': 1.2080078747309664e+16, 'train_loss': 0.01744256693781031, 'epoch': 3.0})

In [32]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "bert-ua-loc-ner/checkpoint-3/"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

OSError: Can't load the configuration of 'bert-ua-loc-ner/checkpoint-3/'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-ua-loc-ner/checkpoint-3/' is the correct path to a directory containing a config.json file

In [None]:
uk_geo_dataset_light.loc[501309].text

In [None]:
token_classifier(uk_geo_dataset_light.loc[501309].text)