In [3]:
!python --version # should be 3.6

Python 3.6.3


In [4]:
import sys, traceback
from deeppavlov import configs, build_model
import nltk
import pandas as pd


import tqdm

from fuzzywuzzy import fuzz

from natasha import (
    DatesExtractor,
    MoneyExtractor,
    AddrExtractor,
    MorphVocab,
)

from natasha.obj import Money
from natasha.obj import Addr
from natasha.obj import Date
from natasha.obj import AddrPart

from natasha import (
    NewsNERTagger,
    NewsEmbedding,
)
import spacy
from pathlib import Path

output_dir=Path("/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/ner_custom_model")
spacy_custom_ner = spacy.load(output_dir)

emb = NewsEmbedding()
ner = NewsNERTagger(emb)

morph_vocab = MorphVocab()
extractors = [
    AddrExtractor(morph_vocab),
    DatesExtractor(morph_vocab),
    MoneyExtractor(morph_vocab)
]

# !python3 -m deeppavlov install ner_ontonotes_bert_mult

# config_path = configs.ner.ner_rus_bert
# ner = build_model(config_path)
ner_multi = build_model(configs.ner.ner_ontonotes_bert_mult, download=False)


[nltk_data] Downloading package punkt to /home/droman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/droman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/droman/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/droman/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!





2021-04-18 21:31:31.409 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /home/droman/.deeppavlov/models/ner_ontonotes_bert_mult/tag.dict]











The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API




Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use sta

2021-04-18 21:31:49.831 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /home/droman/.deeppavlov/models/ner_ontonotes_bert_mult/model]



INFO:tensorflow:Restoring parameters from /home/droman/.deeppavlov/models/ner_ontonotes_bert_mult/model


In [22]:
def print_tokens(tokens, tags):
    for tok, tag in zip(tokens, tags):
        print(f'{tok}\t{tag}')

def filter_after_ner(tokens, tags):
    stop_words = ['́']
    excluded_indexes = []
    for i in range(len(tokens)):
        if tokens[i] in stop_words:
            excluded_indexes.append(i)
    def filter_by_index(l):
        return [l[i] for i in range(len(l)) if i not in excluded_indexes]
    return filter_by_index(tokens), filter_by_index(tags)

def get_deeppavlov_entities(tokens, tags):
    entities = []
    entity = ''
    for tok, tag in zip(tokens, tags):
        if tag == 'O' and entity != '':
            entities.append(entity)
            entity = ''
        if 'B-' in tag:
            entity = tok
        if 'I-' in tag:
            entity += ' ' + tok
    return entities

def nearest(sent1, sent2):
    for i in sent2:
        distance = fuzz.token_sort_ratio(sent1.lower(), i.lower()) #nltk.edit_distance(sent1, i)
        if distance >= 80:
            return True
        # if (distance/len(sent1)) * 100 <= 30:
        #     return True
    return False

def get_percent(part_entities, full_entities):
    max_count = len(full_entities)
    real_count = 0
    for i in part_entities:
        if nearest(i, full_entities):
            real_count += 1
    if max_count == 0:
        return 0.
    return real_count / max_count

def get_entities_from_natasha_extractors(text):
    if type(text) is list:
        text = text[0]
    spans = []
    entities = []
    for extractor in extractors:
        matches = extractor(text)
        if matches is not None:
            try:
                spans.extend(_ for _ in matches)
            except Exception:
                print("NATASHA MATCHES:")
                print(text)
                print(list(matches))
                print("============================")
    for span in spans:
        fact = span.fact
        if type(fact) is Money:
            entities.append(str(fact.amount))
        else:
            if type(fact) is Date:
                entities.append(str(fact.year) + " " + str(fact.month) + " " + str(fact.day))
            else:
                if type(fact) is AddrPart and fact.type is not None:
                    entities.append(str(fact.value))
    return entities

def get_natasha_ner_entities(text):
    if type(text) is list:
        text = text[0]
    entities = []
    try:
        markup = ner(text)
        spans = markup.spans
        for span in spans:
            entities.append(text[span.start: span.stop])
    except Exception:
        print("Exception in user code:")
        print(text)
        print("="*60)
        traceback.print_exc(file=sys.stdout)
        print("="*60)

    return entities

def get_natasha_entities(text):
    return list(set(get_entities_from_natasha_extractors(text) + get_natasha_ner_entities(text)))

def get_overlap_entities_percent(sent1, sent2, model_ents_funcs):
    entities1 = set()
    entities2 = set()

    for model in model_ents_funcs:
        entities1.update(model(sent1))
        entities2.update(model(sent2))

    return get_percent(entities1, entities2)

def get_deeppavlov_ents(sent):
    tokens1, tags1 = ner_multi(sent)
    tokens1, tags1 = tokens1[0], tags1[0]

    filtered_tokens1, filtered_tags1 = filter_after_ner(tokens1, tags1)

    return get_deeppavlov_entities(filtered_tokens1, filtered_tags1)

def get_spacy_custom_ents(sent):
    if type(sent) is list:
        sent = sent[0]
    return spacy_custom_ner(sent).entsv

def get_ner_overlap_feature(data):
    # ner_ents_extractors = [get_deeppavlov_ents, get_natasha_entities, get_spacy_ents]
    ner_ents_extractors = [get_deeppavlov_ents, get_natasha_entities]

    return get_abstract_ner_overlap_feature(data, ner_ents_extractors)

def get_ner_custom_overlap_feature(data):
    ner_ents_extractors = [get_spacy_custom_ents]

    return get_abstract_ner_overlap_feature(data, ner_ents_extractors)

def get_abstract_ner_overlap_feature(data, ner_ents_extractors):
    original = data["original"].tolist()
    targets = data["scored_text"].tolist()
    feature_values = []

    for i in tqdm.tqdm(range(len(original))):
        origin = original[i]
        target = targets[i]
        try:
            if target == " " or target == "" or target is None:
                feature_values.append(0.)
                continue
            feature_value = get_overlap_entities_percent([target], [origin], ner_ents_extractors)
            feature_values.append(feature_value)
        except Exception:
            print("Exception in user code:")
            print("-"*60)
            traceback.print_exc(file=sys.stdout)
            print("-"*60)
            feature_values.append(0.)
            print("index: ", i)
            print("raget text: ", target)
            print("origin text: ", origin)
    return feature_values

def run_entities_overlap(input="/home/droman/Documents/diploma/spacy/data/result.csv",
                         input_for_custom_ner="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/data_with_preprocess.csv",
                         output="./data/ner_overlap_result.csv"):
    data = pd.read_csv(input, index_col=False)
    ner_overlap_feature = get_ner_overlap_feature(data)
    data["ner_overlap"] = ner_overlap_feature

    data_for_custom_ner = pd.read_csv(input_for_custom_ner, index_col=False)
    assert len(data) == len(data_for_custom_ner)
    ner_custom_overlap_feature = get_ner_custom_overlap_feature(data_for_custom_ner)
    data["ner_custom_overlap"] = ner_custom_overlap_feature
    data.to_csv(output, index=False)
    print(pd.read_csv(output).head())

In [23]:
run_entities_overlap(input="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/data_with_preprocess.csv",
                     input_for_custom_ner="/home/droman/Documents/diploma/deeppavlov_ner_3.6/data/data_with_preprocess.csv",
                     output="./data/ner_overlap_result_with_preprocess.csv")

100%|██████████| 161/161 [02:07<00:00,  1.26it/s]

   Unnamed: 0                                           original  \
0           0  великий княжество владимирский 1157год      — ...   
1           1  новгородский республика      —      севернорус...   
2           2  великий княжество литовский     —      восточн...   
3           3  великий княжество московский     —      средне...   
4           4  олег      —      князь новгородский 879 год ве...   

                                         scored_text  distance  score  \
0  середина xiii век сюзеренитет великий князь вл...  0.923924      5   
1  находиться 1245 год сюзеренитет великий князь ...  0.908120      5   
2  1385 год находиться личный уния королевство по...  0.861883      5   
3  возвышение москва укрепление авторитет русь сп...  0.901416      5   
4  получать власть новгородский земля смерть рюри...  0.916509      5   

   ner_overlap  
0     1.000000  
1     0.909091  
2     0.777778  
3     0.625000  
4     1.000000  





In [56]:
# !pip install spacy==2.3.5
!python -m pip show spacy # should be 2.3.5
# !pip install nltk
# !pip install pyresparser



Name: spacy
Version: 2.3.5
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /home/droman/Documents/diploma/deeppavlov_ner_3.6/venv/lib/python3.6/site-packages
Requires: catalogue, setuptools, tqdm, wasabi, numpy, blis, plac, thinc, preshed, requests, srsly, cymem, murmurhash
Required-by: pyresparser


In [None]:
# local: bert_score, USE
# colab: fasttext