In [1]:
import os
# Root directory that holds the competition data. Resolution order:
#   1. the NLP_UA_DATA_PATH environment variable, when it is set and non-empty;
#   2. "/kaggle/input/" when that directory exists (running inside a Kaggle kernel);
#   3. the local "../data/" directory otherwise.
# This replaces switching between two assignments by (un)commenting them.
DATA_PATH = os.environ.get("NLP_UA_DATA_PATH") or (
    "/kaggle/input/" if os.path.isdir("/kaggle/input/") else "../data/"
)

In [2]:
# Directory of the "nlp-ua-locations-extractions" dataset, located under DATA_PATH.
DATASETS_PATH = os.path.join(DATA_PATH, "nlp-ua-locations-extractions")

In [3]:
from sklearn.model_selection import train_test_split

## Data

I see two possible approaches to dealing with the bilingual nature of the test data:
- Train two separate NER models, one for uk and one for ru; at inference time, first detect the language of the sample and then apply the corresponding model.
- Train a single multilingual model on a dataset that includes samples from both languages, in the same proportion as in the test data.

I will try the second approach first here.

### Investigate the language ratio in the test set

In [4]:
!pip install langid

You should consider upgrading via the '/home/evgeniy/.local/bin/miniconda3/envs/iasa_nlp_env/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
import pandas as pd
import langid
# Restrict langid to the two candidate languages: Russian and Ukrainian.
langid.set_languages(['ru', 'uk'])

In [5]:
# Load the test split and tag every text with the language code langid assigns to it.
test_csv_path = os.path.join(DATASETS_PATH, "test.csv")
df_test = pd.read_csv(test_csv_path)


def _detect_lang(text):
    """Return the first element (the language code) of langid's classification of `text`."""
    return langid.classify(text)[0]


df_test['lang'] = df_test['text'].apply(_detect_lang)

In [6]:
df_test['lang'].value_counts()

lang
uk    423
ru     54
Name: count, dtype: int64

In [7]:
# Number of test samples tagged 'uk' for every sample tagged 'ru'.
n_uk_samples = df_test['lang'].eq('uk').sum()
n_ru_samples = df_test['lang'].eq('ru').sum()
lang_ratio = n_uk_samples / n_ru_samples
lang_ratio

7.833333333333333

### Load and sample data

In [8]:
import ast

# loc_markers is stored as the text of a Python list literal, e.g. "[(72, 90)]".
# ast.literal_eval parses such literals without executing code, whereas eval()
# would run any expression that happens to be in the CSV.
uk_geo_dataset = pd.read_csv(
    os.path.join(DATASETS_PATH, 'uk_geo_dataset.csv'),
    converters={"loc_markers": ast.literal_eval},
)
ru_geo_dataset = pd.read_csv(
    os.path.join(DATASETS_PATH, 'ru_geo_dataset.csv'),
    converters={"loc_markers": ast.literal_eval},
)
uk_geo_dataset.shape, ru_geo_dataset.shape

((1010000, 5), (8028840, 6))

In [9]:
# Keep the 10% "test" part of a split stratified on is_valid as the light
# Ukrainian subset, then measure which fraction of it is flagged as validation.
_, uk_geo_dataset_light = train_test_split(
    uk_geo_dataset,
    test_size=0.1,
    random_state=42,
    stratify=uk_geo_dataset["is_valid"],
)
n_valid_rows = (uk_geo_dataset_light['is_valid'] == 1).sum()
val_ratio = n_valid_rows / len(uk_geo_dataset_light)

In [10]:
# Take one row per document to increase the diversity of the sub-dataset, then
# draw as many rows as keeps the uk:ru proportion equal to lang_ratio.
# random_state makes the draw repeatable across kernel restarts.
n_ru_rows = int(uk_geo_dataset_light.shape[0] / lang_ratio)
ru_geo_dataset_light = (
    ru_geo_dataset
    .groupby('doc_id')
    .first()
    .sample(n_ru_rows, random_state=42)
    .reset_index()
)

In [11]:
import zlib

# Flag roughly one row in `valid_bucket_count` as validation, so the Russian
# subset gets about the same validation share as the Ukrainian one.
# The bucket comes from a CRC32 of the UTF-8 text rather than the built-in
# hash(): str hashes are salted per interpreter process, so hash() would put
# different rows into the validation split after every kernel restart.
valid_bucket_count = int(1 / val_ratio)
ru_geo_dataset_light['is_valid'] = ru_geo_dataset_light['text'].apply(
    lambda x: 1 if zlib.crc32(x.encode('utf-8')) % valid_bucket_count == 0 else 0
)

In [12]:
uk_geo_dataset_light.sample(5)

Unnamed: 0,text,loc_markers,org_markers,per_markers,is_valid
494552,Прокуратура не бере участі у виборчому процесі...,[],[],[],0
419794,”Особливо хочу відзначити наше недавнє досягне...,"[(72, 90)]",[],[],0
104648,Сергій Стаховський є лідером чоловічої збірної...,"[(47, 54)]",[],"[(0, 18)]",0
115911,Невже цифрові концтабори будують не лише на ум...,"[(53, 58), (81, 87)]",[],[],0
532374,"Все лікування хлопчику проводили в Україні, ал...","[(35, 42), (88, 94)]",[],[],0


In [13]:
# Persist only the columns used downstream; index=False keeps the row index out of the file.
uk_geo_dataset_light[['text', 'loc_markers', 'is_valid']].to_csv(
    os.path.join(DATASETS_PATH, "uk_geo_dataset_light.csv"),
    index=False,
)

In [14]:
# Persist only the columns used downstream; index=False keeps the row index out of the file.
ru_geo_dataset_light[['text', 'loc_markers', 'is_valid']].to_csv(
    os.path.join(DATASETS_PATH, "ru_geo_dataset_light.csv"),
    index=False,
)

In [15]:
import ast

# loc_markers is written to the *_light files as the text of a list literal;
# ast.literal_eval parses it without the arbitrary code execution eval() allows.
uk_geo_dataset = pd.read_csv(
    os.path.join(DATASETS_PATH, "uk_geo_dataset_light.csv"),
    converters={'loc_markers': ast.literal_eval},
)
ru_geo_dataset = pd.read_csv(
    os.path.join(DATASETS_PATH, "ru_geo_dataset_light.csv"),
    converters={'loc_markers': ast.literal_eval},
)

uk_geo_dataset['lang'] = 'uk'
ru_geo_dataset['lang'] = 'ru'
# ignore_index gives the combined frame a unique 0..n-1 index instead of two
# overlapping ranges coming from the two source frames.
df_locations = pd.concat([uk_geo_dataset, ru_geo_dataset], ignore_index=True)

### Preprocess

In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [17]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

# Turn every location marker into a [start, end, 'LOC'] triple.
df_locations['loc_markers'] = df_locations['loc_markers'].apply(
    lambda markers: [[marker[0], marker[1], 'LOC'] for marker in markers]
)

# Blank spaCy pipeline for the "xx" language code; convert_to_conll calls it on each text.
nlp = spacy.blank("xx")

def convert_to_conll(row):
    """Turn one row into parallel token / IOB-tag lists.

    Reads ``row['text']`` and ``row['loc_markers']`` (an iterable of
    ``(start, end, label)`` character spans), runs the module-level ``nlp``
    pipeline on the text and registers as an entity every span that
    ``doc.char_span`` can build with ``alignment_mode='contract'``. Spans for
    which that returns ``None`` are reported with ``print`` and left out.

    Returns:
        dict with ``'tokens'`` (the token texts) and ``'labels'`` (the IOB tags
        converted from the document's BILUO tags).
    """
    text = row['text']
    markers = row['loc_markers']
    doc = nlp(text)

    aligned_spans = []
    for start, end, label in markers:
        contracted = doc.char_span(start, end, label=label, alignment_mode='contract')
        if contracted is not None:
            aligned_spans.append(contracted)
            continue
        # Computed only for the log message below; the span itself is dropped.
        span_extended = doc.char_span(start, end, label=label, alignment_mode='expand')
        print(f"Skipping span {span_extended} expanded from {doc.text[start:end]}")

    doc.ents = aligned_spans
    tokens = [token.text for token in doc]
    iob_tags = list(biluo_to_iob(doc_to_biluo_tags(doc)))
    return {'tokens': tokens, 'labels': iob_tags}

# Convert every row to tokens + IOB labels; progress_apply is the progress-bar
# variant of apply made available by the tqdm.pandas() call above.
df_locations['conll'] = df_locations.progress_apply(convert_to_conll, axis=1)

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


  0%|          | 0/113893 [00:00<?, ?it/s]

Skipping span /Ставропольский край/ expanded from Ставропольский край
Skipping span вВолгограде expanded from Волгограде
Skipping span вРоссии expanded from России
Skipping span РФВладимир expanded from РФ
Skipping span РФВладислав expanded from РФ
Skipping span РФВладимира expanded from РФ
Skipping span вМоскве expanded from Москве
Skipping span ВЛондоне expanded from Лондоне
Skipping span Спартак"(Москва expanded from Москва
Skipping span ВСША expanded from США
Skipping span вБостоне expanded from Бостоне
Skipping span Глостер(Gloucester expanded from Глостер
Skipping span комитетРФ expanded from РФ
Skipping span комитетРФ expanded from РФ
Skipping span вРоссии expanded from России
Skipping span ВКирове expanded from Кирове
Skipping span вМоскву expanded from Москву
Skipping span Израильcкие expanded from Израиль
Skipping span РФЮлия expanded from РФ


In [23]:
# Integer ids of the three IOB tags used in this notebook.
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}


def _encode_tags(tags):
    """Map a list of IOB tag strings to their integer ids via label2id."""
    return [label2id[tag] for tag in tags]


# Unpack the dicts produced by convert_to_conll into two flat columns.
df_locations['tokens'] = df_locations['conll'].str['tokens']
df_locations['ner_tags'] = df_locations['conll'].str['labels'].apply(_encode_tags)

# Split on the is_valid flag: 0 -> training rows, 1 -> validation rows.
df_train = df_locations[df_locations['is_valid'] == 0]
df_valid = df_locations[df_locations['is_valid'] == 1]

In [24]:
# Write both splits as JSON Lines: one record with 'tokens' and 'ner_tags' per line.
for split_frame, file_name in (
    (df_train, 'train_light.json'),
    (df_valid, 'valid_light.json'),
):
    split_frame[['tokens', 'ner_tags']].to_json(
        os.path.join(DATASETS_PATH, file_name),
        orient='records',
        lines=True,
    )

In [25]:
import datasets

In [26]:
from datasets import load_dataset

# Read the two JSON Lines files back as the 'train' and 'val' splits.
split_files = {
    split_name: os.path.join(DATASETS_PATH, file_name)
    for split_name, file_name in (
        ('train', 'train_light.json'),
        ('val', 'valid_light.json'),
    )
}
raw_datasets = load_dataset("json", data_files=split_files)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [27]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Single source of truth for the pretrained checkpoint, so the model and the
# tokenizer are always loaded from the same one.
# Alternative candidate: 'youscan/ukr-roberta-base'.
PRETRAINED_MODEL_NAME = 'bert-base-multilingual-cased'

# Tag <-> id mappings stored in the model config.
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [28]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below; built around the same tokenizer as the model inputs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per sub-token.

    Args:
        labels: label ids, indexable by word id.
        word_ids: for every sub-token, the id of the word it belongs to, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list as long as ``word_ids``: ``-100`` where the word id is ``None``,
        the word's label for the first sub-token of a word, and for further
        sub-tokens of the same word the word's label with odd ids bumped by one
        (with the label map used in this notebook, B-LOC=1 becomes I-LOC=2).
    """
    aligned = []
    previous_word = None
    for word_idx in word_ids:
        if word_idx is None:
            # previous_word is deliberately left untouched here.
            aligned.append(-100)
            continue
        tag = labels[word_idx]
        # Continuation of the previous word: an odd (B-XXX) id becomes the following (I-XXX) id.
        if word_idx == previous_word and tag % 2 == 1:
            tag += 1
        previous_word = word_idx
        aligned.append(tag)
    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach sub-token labels.

    ``examples`` must provide ``"tokens"`` (one list of words per example) and
    ``"ner_tags"`` (one label id per word). The words are passed to the
    module-level ``tokenizer`` with truncation enabled, and the word-level tags
    of every example are expanded with ``align_labels_with_tokens``.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(batch_idx))
        for batch_idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [29]:
# Tokenize all splits in batches. The columns of the raw train split are
# removed, so only what tokenize_and_align_labels returns is kept.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/112770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1123 [00:00<?, ? examples/s]

In [30]:
from transformers import TrainingArguments

# Training configuration: 3 epochs, batches of 16 per device, and both
# evaluation and checkpointing once per epoch. "bert-ua-loc-ner" is the
# first positional argument (the checkpoints in the log are saved under it).
training_options = dict(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
args = TrainingArguments("bert-ua-loc-ner", **training_options)

In [31]:
from transformers import AdamW, get_linear_schedule_with_warmup

import math

# Two parameter groups with different learning rates: a small one for the
# pretrained BERT encoder and a larger one for the classification head.
optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 1e-5},
    {'params': list(model.classifier.parameters()), 'lr': 1e-3}
])

# Derive the schedule length from `args` instead of repeating the epoch count
# and batch size as literals, and use whole step counts: the number of batches
# per epoch is rounded up because the last, partial batch is a step too.
# NOTE(review): the length is computed for tokenized_datasets['train']; if the
# Trainer is given a different dataset (e.g. a small debug sample) the schedule
# will not match the actual number of optimisation steps.
batches_per_epoch = math.ceil(
    tokenized_datasets['train'].num_rows / args.train_batch_size
)
updates_per_epoch = max(batches_per_epoch // args.gradient_accumulation_steps, 1)
num_training_steps = math.ceil(args.num_train_epochs * updates_per_epoch)
num_warmup_steps = int(0.1 * num_training_steps)  # warm up during the first 10% of steps

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)



In [32]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

# compute_metrics looks label names up by label id, so position i must hold the
# name of id i. Sort by id rather than relying on the insertion order of label2id.
label_names = [name for name, _ in sorted(label2id.items(), key=lambda item: item[1])]

def compute_metrics(eval_preds):
    """Compute overall precision / recall / F1 / accuracy for one evaluation pass.

    Args:
        eval_preds: a ``(logits, labels)`` pair. Predicted ids are the argmax
            of ``logits`` over the last axis; positions whose label is ``-100``
            are dropped from both the references and the predictions.

    Returns:
        dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``
        taken from the ``overall_*`` entries returned by the module-level
        ``metric``.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    true_labels = []
    true_predictions = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept_labels = []
        kept_predictions = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            if label_id == -100:
                # Ignored position (special token): skip it on both sides.
                continue
            kept_labels.append(label_names[label_id])
            kept_predictions.append(label_names[predicted_id])
        true_labels.append(kept_labels)
        true_predictions.append(kept_predictions)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    precision = all_metrics["overall_precision"]
    recall = all_metrics["overall_recall"]
    f1 = all_metrics["overall_f1"]
    accuracy = all_metrics["overall_accuracy"]
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

In [33]:
from typing import List

def comp_metrics(y_true: List[List[str]], y_pred: List[List[str]]):
    """Micro-averaged precision, recall and F1 over per-sample sets of strings.

    Each sample's expected and predicted strings are compared as sets, so
    duplicates inside a sample count once. True positives, false positives and
    false negatives are summed over all samples before the ratios are taken.

    Conventions for empty denominators:
        * nothing predicted and nothing expected -> precision 1.0, recall 1.0;
        * nothing predicted but something expected -> precision 0.0;
        * nothing expected -> recall 1.0;
        * precision + recall == 0 -> f1 0.0.

    Raises:
        AssertionError: if ``y_true`` and ``y_pred`` differ in length.
    """
    assert len(y_true) == len(y_pred)

    tp = fp = fn = 0.0
    for expected, predicted in zip(y_true, y_pred):
        expected_set, predicted_set = set(expected), set(predicted)
        tp += len(expected_set & predicted_set)
        fp += len(predicted_set - expected_set)
        fn += len(expected_set - predicted_set)

    if tp + fp != 0:
        precision = tp / (tp + fp)
    elif tp + fn != 0.0:
        precision = 0.0
    else:
        precision = 1.0

    if tp + fn != 0:
        recall = tp / (tp + fn)
    else:
        recall = 1.0

    if precision + recall != 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {"precision": precision, "recall": recall, "f1": f1}

In [34]:
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 112770
})

In [35]:
from transformers import Trainer

# Smoke-test run on the first 10 rows of each split. To train on everything,
# pass tokenized_datasets["train"] / tokenized_datasets["val"] instead.
# NOTE(review): `scheduler` is built from tokenized_datasets['train'].num_rows,
# while this run performs only 3 optimisation steps, all of them inside the
# warm-up phase - which would explain the nearly constant eval_loss in the log
# below. Confirm and rebuild the scheduler for whichever dataset is trained on.
train_sample = [tokenized_datasets['train'][row] for row in range(10)]
val_sample = [tokenized_datasets['val'][row] for row in range(10)]

trainer_options = dict(
    model=model,
    args=args,
    train_dataset=train_sample,
    eval_dataset=val_sample,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)
trainer = Trainer(**trainer_options)
trainer.train()

***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3


  0%|          | 0/3 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to bert-ua-loc-ner/checkpoint-1
Configuration saved in bert-ua-loc-ner/checkpoint-1/config.json


{'eval_loss': 1.0088510513305664, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.5663265306122449, 'eval_runtime': 0.9954, 'eval_samples_per_second': 10.046, 'eval_steps_per_second': 1.005, 'epoch': 1.0}


Model weights saved in bert-ua-loc-ner/checkpoint-1/pytorch_model.bin
tokenizer config file saved in bert-ua-loc-ner/checkpoint-1/tokenizer_config.json
Special tokens file saved in bert-ua-loc-ner/checkpoint-1/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to bert-ua-loc-ner/checkpoint-2
Configuration saved in bert-ua-loc-ner/checkpoint-2/config.json


{'eval_loss': 1.0086519718170166, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.5663265306122449, 'eval_runtime': 1.1357, 'eval_samples_per_second': 8.805, 'eval_steps_per_second': 0.881, 'epoch': 2.0}


Model weights saved in bert-ua-loc-ner/checkpoint-2/pytorch_model.bin
tokenizer config file saved in bert-ua-loc-ner/checkpoint-2/tokenizer_config.json
Special tokens file saved in bert-ua-loc-ner/checkpoint-2/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to bert-ua-loc-ner/checkpoint-3
Configuration saved in bert-ua-loc-ner/checkpoint-3/config.json


{'eval_loss': 1.008270502090454, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.5663265306122449, 'eval_runtime': 1.2271, 'eval_samples_per_second': 8.15, 'eval_steps_per_second': 0.815, 'epoch': 3.0}


Model weights saved in bert-ua-loc-ner/checkpoint-3/pytorch_model.bin
tokenizer config file saved in bert-ua-loc-ner/checkpoint-3/tokenizer_config.json
Special tokens file saved in bert-ua-loc-ner/checkpoint-3/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 39.5886, 'train_samples_per_second': 0.758, 'train_steps_per_second': 0.076, 'train_loss': 1.0679607391357422, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=1.0679607391357422, metrics={'train_runtime': 39.5886, 'train_samples_per_second': 0.758, 'train_steps_per_second': 0.076, 'train_loss': 1.0679607391357422, 'epoch': 3.0})

In [36]:
from transformers import pipeline

# Inference pipeline loaded from a checkpoint directory saved by the training run.
# Replace this with your own checkpoint
model_checkpoint = "bert-ua-loc-ner/checkpoint-3/"
pipeline_options = {
    "model": model_checkpoint,
    "aggregation_strategy": "simple",
}
token_classifier = pipeline("token-classification", **pipeline_options)

loading configuration file bert-ua-loc-ner/checkpoint-3/config.json
Model config BertConfig {
  "_name_or_path": "bert-ua-loc-ner/checkpoint-3/",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-LOC",
    "2": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 1,
    "I-LOC": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",


In [47]:
uk_geo_dataset_light.loc[501309].text

'Всього таких полонених на підконтрольній угрупованню ДНР території кілька десятків.'

In [48]:
token_classifier(uk_geo_dataset_light.loc[501309].text)

[{'entity_group': 'LOC',
  'score': 0.4564191,
  'word': 'пол',
  'start': 13,
  'end': 16},
 {'entity_group': 'LOC',
  'score': 0.33921558,
  'word': 'на',
  'start': 23,
  'end': 25},
 {'entity_group': 'LOC',
  'score': 0.39106214,
  'word': 'у',
  'start': 41,
  'end': 42},
 {'entity_group': 'LOC',
  'score': 0.3782667,
  'word': '##НР',
  'start': 54,
  'end': 56},
 {'entity_group': 'LOC',
  'score': 0.4021538,
  'word': 'території',
  'start': 57,
  'end': 66},
 {'entity_group': 'LOC',
  'score': 0.43995804,
  'word': '.',
  'start': 82,
  'end': 83}]