In [2]:
from __future__ import unicode_literals, print_function

import json
import random
from pathlib import Path

import pandas as pd
import spacy
from tqdm import tqdm

In [2]:

def _read_labeled_examples(labeled_data_file_path):
    """Load annotated examples from a JSON-lines file.

    Each non-blank line must be a JSON object with a ``"text"`` string and a
    ``"labels"`` list. Every label is expected to be indexable with the label
    name at position 2 (presumably doccano-style ``[start, end, label]``
    triples -- confirm against the data source). Records with a missing or
    empty ``"labels"`` value are skipped.

    Returns:
        list of ``(text, {'entities': labels})`` tuples.
    """
    examples = []
    # Explicit UTF-8: the corpus is Russian text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(labeled_data_file_path, mode='r', encoding='utf-8') as labeled_data_file:
        for raw_line in labeled_data_file:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(raw_line)
            entities = record.get("labels")
            if entities:
                examples.append((record["text"], {'entities': entities}))
    return examples


def train_spacy_ner(labeled_data_file_path, labels_file_path, output_dir, n_iter=20):
    """Train a blank Russian spaCy NER pipeline and optionally save it to disk.

    Written against the spaCy v2 training API (``create_pipe``,
    ``begin_training`` and ``nlp.update`` with raw texts and annotation dicts).

    Args:
        labeled_data_file_path: Path to the JSON-lines file with annotated
            texts (see ``_read_labeled_examples`` for the expected records).
        labels_file_path: Unused. Accepted only for backward compatibility;
            the entity labels are taken from the annotations themselves.
        output_dir: Directory to save the trained pipeline to, or ``None`` to
            skip saving. Missing parent directories are created.
        n_iter: Number of passes over the training data.

    Returns:
        The trained ``nlp`` pipeline object.

    Raises:
        ValueError: If the labeled data file contains no annotated examples.
    """
    train_data = _read_labeled_examples(labeled_data_file_path)
    if not train_data:
        # Fail loudly rather than "training" on nothing and saving an empty model.
        raise ValueError(
            "No labeled examples found in {}".format(labeled_data_file_path))

    nlp = spacy.blank('ru')
    print("Created blank 'ru' model")

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the annotations (third item of each entity).
    for _, annotations in train_data:
        for ent in annotations['entities']:
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            # Reshuffle every epoch; updates are done one example at a time.
            random.shuffle(train_data)
            losses = {}
            for text, annotations in tqdm(train_data):
                nlp.update(
                    [text],
                    [annotations],
                    drop=0.5,
                    sgd=optimizer,
                    losses=losses)
            print(losses)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: create nested directories and avoid the
        # check-then-create race of a separate exists() test.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp

In [3]:
# Directory the trained NER pipeline is written to and later loaded from.
output_dir = Path("/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/ner_custom_model")

In [4]:
# Train on the labelled JSONL file and save the resulting pipeline to `output_dir`.
train_spacy_ner(
    labeled_data_file_path="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/final-labels-data.jsonl",
    labels_file_path="/home/droman/Documents/diploma/spacy/data/doccano/label_config.json",
    output_dir=output_dir,
)


  **kwargs
  **kwargs
  0%|          | 0/53 [00:00<?, ?it/s]

Created blank 'ru' model


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
100%|██████████| 53/53 [00:02<00:00, 18.55it/s]
  6%|▌         | 3/53 [00:00<00:02, 21.57it/s]

{'ner': 1105.038847311974}


100%|██████████| 53/53 [00:02<00:00, 17.76it/s]
  4%|▍         | 2/53 [00:00<00:03, 13.22it/s]

{'ner': 966.0063099229894}


100%|██████████| 53/53 [00:03<00:00, 17.26it/s]
  4%|▍         | 2/53 [00:00<00:02, 18.09it/s]

{'ner': 927.2100930228634}


100%|██████████| 53/53 [00:02<00:00, 20.29it/s]
  6%|▌         | 3/53 [00:00<00:02, 21.70it/s]

{'ner': 857.4947894851734}


100%|██████████| 53/53 [00:02<00:00, 22.70it/s]
  6%|▌         | 3/53 [00:00<00:02, 23.56it/s]

{'ner': 797.0642671697914}


100%|██████████| 53/53 [00:02<00:00, 19.77it/s]
  6%|▌         | 3/53 [00:00<00:02, 21.14it/s]

{'ner': 767.8733931650263}


100%|██████████| 53/53 [00:02<00:00, 18.45it/s]
  4%|▍         | 2/53 [00:00<00:03, 16.09it/s]

{'ner': 679.2416832750015}


100%|██████████| 53/53 [00:02<00:00, 18.42it/s]
  6%|▌         | 3/53 [00:00<00:02, 20.00it/s]

{'ner': 652.758650344663}


100%|██████████| 53/53 [00:02<00:00, 18.55it/s]
  4%|▍         | 2/53 [00:00<00:02, 17.53it/s]

{'ner': 628.4494414727195}


100%|██████████| 53/53 [00:02<00:00, 18.45it/s]
  4%|▍         | 2/53 [00:00<00:02, 19.60it/s]

{'ner': 612.6937061951494}


100%|██████████| 53/53 [00:02<00:00, 18.02it/s]
  4%|▍         | 2/53 [00:00<00:03, 15.75it/s]

{'ner': 607.7889616984537}


100%|██████████| 53/53 [00:03<00:00, 17.11it/s]
  4%|▍         | 2/53 [00:00<00:02, 17.57it/s]

{'ner': 555.6685246870911}


100%|██████████| 53/53 [00:02<00:00, 18.65it/s]
  4%|▍         | 2/53 [00:00<00:02, 19.93it/s]

{'ner': 547.9339882040872}


100%|██████████| 53/53 [00:02<00:00, 18.74it/s]
  4%|▍         | 2/53 [00:00<00:03, 15.29it/s]

{'ner': 527.4184262259121}


100%|██████████| 53/53 [00:03<00:00, 17.63it/s]
  4%|▍         | 2/53 [00:00<00:02, 17.41it/s]

{'ner': 494.5945876408693}


100%|██████████| 53/53 [00:02<00:00, 18.74it/s]
  4%|▍         | 2/53 [00:00<00:02, 19.33it/s]

{'ner': 499.38125469780755}


100%|██████████| 53/53 [00:02<00:00, 18.51it/s]
  4%|▍         | 2/53 [00:00<00:02, 18.75it/s]

{'ner': 487.1954148670363}


100%|██████████| 53/53 [00:02<00:00, 18.58it/s]
  4%|▍         | 2/53 [00:00<00:02, 17.92it/s]

{'ner': 457.06029487405266}


100%|██████████| 53/53 [00:02<00:00, 18.40it/s]
  4%|▍         | 2/53 [00:00<00:02, 17.68it/s]

{'ner': 452.55344491791857}


100%|██████████| 53/53 [00:02<00:00, 18.39it/s]

{'ner': 440.7101006253131}
Saved model to /home/droman/Documents/diploma/deeppavlov_ner_3.6/data/ner_custom_model





In [4]:
# Load the pipeline back from `output_dir`, where the training step saved it.
model = spacy.load(output_dir)


In [24]:
# Load the preprocessed corpus; its "original" column supplies the sample text
# used to try the model below. (pandas is imported in the top import cell.)
data = pd.read_csv("./data/input/data_with_semi_preproc_without_stopwords.csv")

In [25]:
# First row of the "original" column (index label 0 of the default RangeIndex).
txt = data.loc[0, "original"]

In [26]:
# Show the sample text that is passed to the model in the next cell.
print(txt)

великий княжество владимирский 1157года   —   суздальский княжество  —   русский княжество xii  —  xivвеков становиться ядро современный российский государствавый узкий смысл  —   территория который лично владеть великий князь владимирский широкий смысл  —   территория весь княжество выделяться владимирский зависимый владимирский князь середина xiii век сюзеренитет великий князь владимирский признавать также новгородский небольшой перерыв псковский республика


In [27]:
# Run the trained pipeline on the sample and list each recognised entity span
# with its character offsets and label.
doc = model(txt)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

великий княжество владимирский 0 30 LOC
суздальский 46 57 LOC
русский княжество 73 90 LOC
российский государствавый 137 162 LOC
великий князь 213 226 TITLE
владимирский 227 239 LOC
княжество 275 284 LOC
владимирский 296 308 PERSON
владимирский князь 319 337 TITLE
великий князь 368 381 TITLE
владимирский 382 394 LOC
новгородский 412 424 LOC
псковский 443 452 LOC


In [None]:
# Load the result table; its "original" column is exported for labeling below.
# (pandas is imported in the top import cell.)
origin_result_data = pd.read_csv("./data/result.csv")
origin_result_data.head()

In [8]:
# Export the distinct source texts, one per line, for manual labeling.
# drop_duplicates() keeps first-occurrence order, so the file is identical on
# every run; iterating a set of str gives a different order per interpreter
# session. dropna() skips missing values, which would otherwise break str.join.
unique_originals = origin_result_data["original"].dropna().drop_duplicates().tolist()
with open('/home/droman/Documents/diploma/spacy/data/data_for_labeling.txt', mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(unique_originals))

In [6]:
# Total number of rows in the "original" column (duplicates included).
len(origin_result_data["original"])

162

In [7]:
# Number of distinct texts in the "original" column.
len(set(origin_result_data["original"]))


54