# Imports & reading the data

In [None]:
import json
import torch
import random
from tqdm import tqdm
from prettytable import PrettyTable
import spacy
from spacy.training import Example
!python -m spacy download ru_core_news_lg

import warnings
warnings.filterwarnings("ignore")

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.7.0)
  Downloading pymorphy3-2.0.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m761.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pymorphy

In [None]:
# Read the training file explicitly as UTF-8: the entity offsets are character
# indices, so decoding must not depend on the platform's default encoding.
with open("../train.jsonl", encoding="utf-8") as json_file:
    json_list = list(json_file)

# sentences and ners from `train.jsonl`
train_sent = list()
train_ners = list()

for json_str in json_list:
    result = json.loads(json_str)
    sentence_text = result["sentences"]
    train_sent.append(sentence_text)

    # Each entity becomes (<surface text>, <start idx>, <end idx inclusive>, <label>).
    sentence_ners = [
        (sentence_text[start:end + 1], start, end, label)
        for start, end, label in result["ners"]
    ]
    # Order by start ascending; for equal starts the longer (outer) span comes
    # first. The sort is stable, so this matches sorting by end descending and
    # then by start ascending.
    sentence_ners.sort(key=lambda ner: (ner[1], -ner[2]))
    train_ners.append(sentence_ners)

# Read the test file explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open("../test.jsonl", encoding="utf-8") as json_file:
    json_list = list(json_file)

# sentences and corresponding ids from `test.jsonl`
test_sent = list()
test_idx = list()

for json_str in json_list:
    result = json.loads(json_str)
    # NOTE: the key really is spelled "senences" here (sEnENceS); the spelling
    # is kept on purpose and differs from the "sentences" key of the train file.
    test_sent.append(result["senences"])
    test_idx.append(result["id"])

# Model code

In [None]:
def get_entities_spacy(sent, recursion_depth=6):
    """
    Finds entities in a sentence and, recursively, inside each found entity.

    The module-level `nlp` pipeline is run on `sent`. Every multi-word entity
    that is not the whole input is fed back into this function so that nested
    entities can be discovered; their offsets are shifted back into the
    coordinates of `sent`.

    Arguments:
        sent (str): a string where we need to find the entities
        recursion_depth (int): how many more levels of nesting may be explored;
                               0 means no recursive calls are made

    Returns:
        List(Tuple(str, int, int, str)): all found entities without duplicates,
                                         in a format (<word>, <start idx>, <end idx inclusive>, <label>).
                                         The list is built from a set, so its order is arbitrary.
    """
    parsed = nlp(sent)

    # entities found directly in `sent`
    found = []
    # entities found inside those entities by recursive calls
    nested = []

    for span in parsed.ents:
        first, past_end = span.start_char, span.end_char
        surface = parsed.text[first:past_end]
        found.append((surface, first, past_end - 1, span.label_))

        # no recursion budget left
        if not recursion_depth:
            continue
        # recursing on the whole input or on a single word cannot find anything new
        if surface == sent or len(surface.split()) <= 1:
            continue

        # offsets from the sub-call are relative to `surface`; shift them by `first`
        for word, lo, hi, label in get_entities_spacy(surface, recursion_depth - 1):
            nested.append((word, lo + first, hi + first, label))

    return list(set(found + nested))

# Dataset splitting

In [None]:
def fun(text, ners):
    """
    Unfolds all nested entities recursively.

    For example, an input:

    "Ilya works in Moscow University" -> ["Ilya", "Person"], ["Moscow University", "Organization"], ["Moscow", "City"]

    would be split into two separate training examples:

    "Ilya works in Moscow University" -> ["Ilya", "Person"], ["Moscow University", "Organization"]
    "Moscow University" -> ["Moscow", "City"]

    The input list `ners` is not modified.

    Parameters:
        text (str): a full sentence to be split
        ners (List(Tuple(str, int, int, str))): a list of entities to be split,
            each in a format (<word>, <start idx>, <end idx inclusive>, <label>)

    Returns:
        List(List(str, List(Tuple(str, int, int, str)))): a list of unfolded entities in a format [<sentence>, <list of ners>].
            The first element always describes `text` itself with its top-level
            entities; the following ones describe entity texts that contain
            nested entities, with offsets relative to that entity text.
    """
    # Work on a sorted copy so the caller's list is left untouched.
    # Order: start ascending, and for equal starts the longer span first,
    # so a parent entity is always visited before anything nested in it.
    ordered = sorted(ners, key=lambda ent: (ent[1], -ent[2]))

    # entities that are not nested in a previously accepted entity
    top_level = []
    # parent entity text -> nested entities with offsets relative to the parent
    nested_by_parent = dict()

    # text and start offset of the most recent top-level entity
    parent_text = text
    parent_start = 0

    for ent in ordered:
        # An entity starting after the end of the last top-level entity
        # opens a new top-level span.
        if not top_level or ent[1] > top_level[-1][2]:
            top_level.append(ent)
            parent_text = ent[0]
            parent_start = ent[1]
            continue

        # Otherwise it belongs to the current parent: re-base its offsets.
        shifted = (ent[0], ent[1] - parent_start, ent[2] - parent_start, ent[3])
        siblings = nested_by_parent.setdefault(parent_text, [])
        # Skip exact duplicates while keeping a deterministic order.
        if shifted not in siblings:
            siblings.append(shifted)

    answer = [[text, top_level]]

    # recursively unfold the entities nested inside each parent
    for parent, children in nested_by_parent.items():
        answer += fun(parent, children)

    return answer

# Training

In [None]:
# Hold out the first 30 sentences (and their entities) for evaluation;
# everything after them is used for training.
X_eval, X_train = train_sent[:30], train_sent[30:]
y_eval, y_train = train_ners[:30], train_ners[30:]

# Unfold nested entities: every training sentence contributes one example for
# itself plus one example per entity that contains nested entities.
training_set = list()
for sent, ner in zip(X_train, y_train):
    training_set.extend(fun(sent, ner))

In [None]:
nlp = spacy.load("ru_core_news_lg")

# disabling all pipes except for `ner`, remembering which ones to restore later
disabled_pipes = list()
for pipe_name in list(nlp.pipe_names):
    if pipe_name != "ner":
        nlp.disable_pipe(pipe_name)
        disabled_pipes.append(pipe_name)

optimizer = nlp.create_optimizer()

n_epochs = 20
batch = 16
# number of randomly sampled batches drawn per epoch
steps_per_epoch = len(training_set)//batch+1
# random.sample raises ValueError when asked for more items than available
sample_size = min(batch, len(training_set))

try:
    for i in range(n_epochs):
        losses = {}
        # batches whose update failed in this epoch
        skipped = 0
        last_error = None

        # the context manager closes the bar at the end of every epoch
        with tqdm(total=steps_per_epoch, position=0, leave=True) as pbar:
            pbar.set_description_str(f"Epoch [{i+1}/{n_epochs}]")

            for j in range(steps_per_epoch):
                # creating a list of randomly selected examples from training set
                example = list()
                for item in random.sample(training_set, sample_size):
                    # spaCy expects exclusive end offsets; ours are inclusive
                    entities = [(x[1], x[2]+1, x[3]) for x in item[1]]
                    doc = nlp.make_doc(item[0])
                    example.append(Example.from_dict(doc, {"entities": entities}))
                # training nlp ner on these examples; a failing batch is skipped
                # (best effort) but counted, and KeyboardInterrupt still propagates
                try:
                    nlp.update(example, drop=0.3, losses=losses, sgd=optimizer)
                except Exception as err:
                    skipped += 1
                    last_error = err
                pbar.update(1)
                # `losses` has no 'ner' key until the first successful update
                pbar.set_postfix_str(f"Loss: {round(losses.get('ner', 0.0)/(j+1), 2)}")

        if skipped:
            tqdm.write(f"Epoch {i+1}: skipped {skipped} batch(es); last error: {last_error!r}")
finally:
    # enabling the pipes back, even if training was interrupted
    for pipe_name in disabled_pipes:
        nlp.enable_pipe(pipe_name)

100%|██████████| 65/65 [48:15<00:00, 44.55s/it]
Epoch [1/20]: 100%|██████████| 303/303 [00:58<00:00,  5.14it/s, Loss: 131.28]
Epoch [2/20]: 100%|██████████| 303/303 [00:57<00:00,  5.24it/s, Loss: 95.43]
Epoch [3/20]: 100%|██████████| 303/303 [00:56<00:00,  5.40it/s, Loss: 81.54]
Epoch [4/20]: 100%|██████████| 303/303 [00:57<00:00,  5.24it/s, Loss: 76.22]
Epoch [5/20]: 100%|██████████| 303/303 [00:55<00:00,  5.46it/s, Loss: 69.49]
Epoch [6/20]: 100%|██████████| 303/303 [01:01<00:00,  4.89it/s, Loss: 75.62]
Epoch [7/20]: 100%|██████████| 303/303 [00:55<00:00,  5.45it/s, Loss: 62.62]
Epoch [8/20]: 100%|██████████| 303/303 [00:58<00:00,  5.15it/s, Loss: 63.36]
Epoch [9/20]: 100%|██████████| 303/303 [00:57<00:00,  5.27it/s, Loss: 59.25]
Epoch [10/20]: 100%|██████████| 303/303 [00:59<00:00,  5.08it/s, Loss: 61.21]
Epoch [11/20]: 100%|██████████| 303/303 [00:59<00:00,  5.10it/s, Loss: 58.03]
Epoch [12/20]: 100%|██████████| 303/303 [01:02<00:00,  4.86it/s, Loss: 54.31]
Epoch [13/20]: 100%|████

# Fine-tuning

In [None]:
def eval_f1(pred, true):
    """
    Evaluation function for NER. Calculates the f1-score like this:
    true positives are the entities correctly classified
    false positives are predicted entities that aren't in ground truth
    false negatives are true entities that weren't predicted.
    The f1-score is then calculated as usual:

        precision = TP / (TP + FP)
        recall    = TP / (TP + FN)
        f1        = 2 * precision * recall / (precision + recall)

    An entity counts as correct only if the whole tuple matches exactly.
    Duplicates in either input are ignored because both are turned into sets.

    Format of input variables:
    List of tuples (<word>, <start char idx>, <end char idx (inclusive)>, <type>)

    Returns:
        float: the f1-score in [0, 1]; 0.0 when there is no true positive
               (including the case when both inputs are empty)
    """
    true = set(true)
    pred = set(pred)

    true_positive = len(true & pred)

    # without any true positive both precision and recall are zero (or undefined)
    if not true_positive:
        return 0.0

    false_positive = len(pred - true)
    false_negative = len(true - pred)

    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)

    return 2*precision*recall / (precision+recall)

In [None]:
# Measures the mean F1-score on the first `items_to_test` sentences for every
# recursion depth from 0 to `max_depth - 1` and prints the results as a table.
items_to_test = 30
max_depth = 7

table = PrettyTable()
table.field_names = [""] + [f"depth={x}" for x in range(max_depth)]

row = ["F1-score"]

# the context manager closes the progress bar once the sweep is finished
with tqdm(total=max_depth*items_to_test, position=0) as pbar:
    for depth in range(max_depth):
        total_f1 = 0

        for idx in range(items_to_test):
            pred = get_entities_spacy(train_sent[idx], recursion_depth=depth)
            true = train_ners[idx]

            f1_score = eval_f1(pred, true)
            total_f1 += f1_score

            pbar.update(1)
            # Report the running mean only after the current item has been
            # added, so the sum and the divisor (idx+1) cover the same items.
            pbar.set_postfix_str(f"Depth={depth}, item={idx+1}, f1-score: {round(100*total_f1/(idx+1), 3)}%")

        row.append(f"{round(100*total_f1/items_to_test, 2)}%")
table.add_row(row)

print()
print(table)

  0%|          | 0/210 [00:03<?, ?it/s, Depth=0, item=1, f1-score: 0.0%]
100%|██████████| 210/210 [01:23<00:00,  2.86it/s, Depth=6, item=30, f1-score: 68.58%]


+----------+---------+---------+---------+---------+---------+---------+---------+
|          | depth=0 | depth=1 | depth=2 | depth=3 | depth=4 | depth=5 | depth=6 |
+----------+---------+---------+---------+---------+---------+---------+---------+
| F1-score |  66.36% |  70.73% |  71.14% |  71.21% |  71.19% |  71.19% |  71.19% |
+----------+---------+---------+---------+---------+---------+---------+---------+


# Predictions

In [None]:
pbar = tqdm(
    total=len(test_sent), position=0
)

depth = 5

with open("test.jsonl", "w") as f:
    for s, idx in zip(test_sent, test_idx):
        ners = get_entities_spacy(s, depth)
        ners = [ [x[1], x[2], x[3]] for x in ners]
        ners = {"ners": ners, "id": idx}
        ners = json.dumps(ners)
        f.write(f"{ners}\n")
        pbar.update(1)

print()
!zip test test.jsonl

100%|██████████| 65/65 [01:15<00:00,  1.16s/it]
100%|██████████| 65/65 [00:31<00:00,  1.04it/s]


updating: test.jsonl (deflated 76%)
