### Importing train and test data

In [1]:
import spacy
from spacy.util import filter_spans
from spacy.tokens import DocBin
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.read_json('../data/train.jsonl', lines=True)
test = pd.read_json('../data/test.jsonl', lines=True)
train, val = train_test_split(train, test_size=0.2)

### Preprocessing

In [3]:
nlp = spacy.blank("ru")
doc_bin = DocBin()

In [12]:
def process_row(row, doc_bin, name):
    """
    process a single row of the dataset and add it to the DocBin
    """
    skipped = errors = 0  # errors and skipped entities
    tokens = row['sentences']
    doc = nlp.make_doc(tokens)
    ents = []

    for start, end, label in row['ners']:
        start, end = int(start), int(end)
        span = doc.char_span(start, end + 1, label=label, alignment_mode="contract")

        if span is None: # skip if the entity span is not aligned
            skipped += 1
        elif span.text.strip() != span.text: # skip if the entity span has leading or trailing spaces
            errors += 1
        else: # add the entity to the doc.ents if it's accepted
            ents.append(span)

    filtered_ents = filter_spans(ents) 
    doc.ents = filtered_ents
    doc_bin.add(doc)

    return skipped, errors

def create_spacy_dataset(dataset, name):
    """
    create a spacy training dataset from the pandas dataframe
    """
    skipped_total = errors_total = 0
    for _, row in tqdm(dataset.iterrows()):
        skipped, errors = process_row(row, doc_bin, name)
        skipped_total += skipped
        errors_total += errors

    print(f"{name} skipped: {skipped_total}")
    print(f"{name} errors: {errors_total}")
    doc_bin.to_disk(f"{name}_data.spacy")

doc_bin = spacy.tokens.DocBin()
create_spacy_dataset(train, "training")
create_spacy_dataset(val, "validating")

415it [00:01, 397.20it/s]


training skipped: 125
training errors: 8


104it [00:00, 391.72it/s]


validating skipped: 28
validating errors: 14


### Training

The base_config was created by https://spacy.io/usage/training, after which I created config using the following command:
`python -m spacy init fill-config base_config.cfg config.cfg`
The `training_data.spacy` and `validating_data.spacy` files that were created above are used for training and validation. Next, I manually stopped the training process as the score on validation was high.

In [13]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validating_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    108.34    0.00    0.00    0.00    0.00
  0     200       1295.13  16367.27   31.22   46.11   23.60    0.31
  0     400        608.65  11789.07   47.64   62.44   38.51    0.48
  1     600       3134.64  10015.29   63.15   69.92   57.57    0.63
  1     800        482.23   8494.28   64.41   71.99   58.28    0.64
  2    1000       2504.82   7765.15   69.49   77.01   63.32    0.69
  2    1200       1159.97   7413.42   74.04   78.39   70.15    0.74
  3    1400        873.28   6544.46   76.14   75.92   76.35    0.76
  3    1600        694.81   6071.32   

### Making predicitions

In [26]:
nlp_ner = spacy.load("model-best") # my best pretrained model

def predict_named_entities(text):
    doc = nlp_ner(text) # predict entities
    labels = [[ent.start_char, ent.end_char - 1, ent.label_] for ent in doc.ents] # save indices with a class in a format acceptable for codalab
    return labels

test['ners'] = test["senences"].apply(predict_named_entities) # new column created

# file for loading on codalab is test.jsonl
test.drop(columns=["senences"]).to_json('test.jsonl', orient='records', lines=True, force_ascii=False)

In [27]:
test["senences"][0]

'Владелец «Бирмингема» получил шесть лет тюрьмы\nмини|слева|«Сент-Эндрюс» — домашний стадион футбольного клуба «Бирмингем Сити»\nВ пятницу, 7 марта суд Гонконга приговорил владельца футбольного клуба «Бирмингем Сити» Карсона Ёнга (Carson Yeung, также в некоторых источниках — Карсон Юнг; Карсон Ён) к шести годам тюремного заключения за мошенничество.\n\n54-летний бизнесмен был признан виновным в отмывании 55 миллионов фунтов стерлингов через его банковские счета в период с 2001 по 2007 годы.\n\nКарсон Ёнг стал владельцем «Бирмингема» в 2009 году, приобретя его за 81,5 миллионов фунтов стерлингов.\n'

In [28]:
test['ners'][0]

[[0, 19, 'PROFESSION'],
 [30, 34, 'NUMBER'],
 [110, 123, 'FACILITY'],
 [126, 135, 'DATE'],
 [137, 143, 'DATE'],
 [158, 167, 'EVENT'],
 [198, 211, 'FACILITY'],
 [214, 225, 'PERSON'],
 [228, 239, 'PERSON'],
 [273, 282, 'PERSON'],
 [285, 293, 'PERSON'],
 [298, 329, 'PENALTY'],
 [350, 358, 'AGE'],
 [360, 368, 'PROFESSION'],
 [374, 389, 'EVENT'],
 [393, 453, 'CRIME'],
 [470, 488, 'DATE'],
 [492, 501, 'PERSON'],
 [532, 542, 'DATE'],
 [562, 582, 'MONEY']]