In [1]:
!python --version # should be 3.6

Python 3.6.3


In [2]:
from deeppavlov import configs, build_model
import nltk
import pandas as pd


import tqdm

from fuzzywuzzy import fuzz

from natasha import (
    DatesExtractor,
    MoneyExtractor,
    AddrExtractor,
    MorphVocab,
)

from natasha.obj import Money
from natasha.obj import Addr
from natasha.obj import Date
from natasha.obj import AddrPart

from natasha import (
    NewsNERTagger,
    NewsEmbedding,
)
# Natasha NER tagger built on top of the news embedding; used later as `ner(text)`.
emb = NewsEmbedding()
ner = NewsNERTagger(emb)

# Natasha extractors for addresses, dates and money amounts. They all share one
# MorphVocab instance and are iterated by get_entities_from_natasha_extractors.
morph_vocab = MorphVocab()
extractors = [
    AddrExtractor(morph_vocab),
    DatesExtractor(morph_vocab),
    MoneyExtractor(morph_vocab)
]

# One-time install command for the DeepPavlov model used below (kept for reference):
# !python3 -m deeppavlov install ner_ontonotes_bert_mult

# Alternative Russian-only BERT NER config, currently disabled:
# config_path = configs.ner.ner_rus_bert
# ner = build_model(config_path)
# DeepPavlov multilingual OntoNotes NER model; used later as `ner_multi(sent)`.
# NOTE(review): download=False presumably requires the model files to be present
# locally already (see the install command above) - confirm.
ner_multi = build_model(configs.ner.ner_ontonotes_bert_mult, download=False)




In [3]:
def print_tokens(tokens, tags):
    """Print every (token, tag) pair on its own line, separated by a tab.

    Pairs are formed with ``zip``, so output stops at the shorter of the two
    sequences.
    """
    for pair in zip(tokens, tags):
        print('{}\t{}'.format(*pair))

def filter_after_ner(tokens, tags):
    """Drop stop-word tokens and the tags that sit at the same positions.

    Args:
        tokens: sequence of token strings.
        tags: sequence of tags, positionally aligned with ``tokens``.

    Returns:
        A ``(tokens, tags)`` tuple of new lists with every position whose
        token is a stop word removed from both.
    """
    stop_words = ['́']
    # Positions are computed from the tokens only and then applied to both sequences.
    dropped = {pos for pos, token in enumerate(tokens) if token in stop_words}

    def keep_allowed(items):
        return [item for pos, item in enumerate(items) if pos not in dropped]

    return keep_allowed(tokens), keep_allowed(tags)

def get_deeppavlov_entities(tokens, tags):
    """Assemble entity strings from BIO-tagged tokens.

    A tag starting with ``B-`` opens a new entity, a tag starting with ``I-``
    extends the current one, and any other tag (normally ``O``) closes it.
    Tokens of one entity are joined with single spaces.

    Compared with a naive "flush on O" loop this also:
      * emits an entity that runs up to the very end of the sequence,
      * emits the previous entity when a new ``B-`` tag follows immediately,
      * treats an ``I-`` tag without a preceding ``B-`` as the start of an
        entity instead of producing a string with a leading space.

    Args:
        tokens: sequence of token strings.
        tags: sequence of BIO tag strings aligned with ``tokens``.

    Returns:
        List of entity strings in order of appearance.
    """
    entities = []
    current = []

    def flush():
        # Close the entity being built, if any.
        if current:
            entities.append(' '.join(current))
            del current[:]

    for tok, tag in zip(tokens, tags):
        if tag.startswith('B-'):
            flush()
            current.append(tok)
        elif tag.startswith('I-'):
            current.append(tok)
        else:
            flush()

    # The sequence may end while an entity is still open.
    flush()
    return entities

def nearest(sent1, sent2, threshold=80):
    """Tell whether ``sent1`` fuzzily matches at least one string in ``sent2``.

    Both sides are lower-cased and compared with ``fuzz.token_sort_ratio``;
    a score greater than or equal to ``threshold`` counts as a match.

    Args:
        sent1: the string to look for.
        sent2: iterable of candidate strings.
        threshold: minimal ``token_sort_ratio`` score accepted as a match.
            Defaults to 80, the value that used to be hard-coded.

    Returns:
        True on the first candidate that reaches the threshold, otherwise
        False (including when ``sent2`` is empty).
    """
    return any(
        fuzz.token_sort_ratio(sent1.lower(), candidate.lower()) >= threshold
        for candidate in sent2
    )

def get_percent(part_entities, full_entities):
    """Share of ``full_entities`` covered by fuzzy matches from ``part_entities``.

    Counts the items of ``part_entities`` for which ``nearest`` finds a match
    in ``full_entities`` and divides that count by ``len(full_entities)``.

    Returns:
        The ratio as a float, or ``0.`` when ``full_entities`` is empty.
    """
    total = len(full_entities)
    matched = sum(1 for entity in part_entities if nearest(entity, full_entities))
    return matched / total if total else 0.

def get_entities_from_natasha_extractors(text):
    """Run every extractor from ``extractors`` on ``text`` and stringify the facts.

    All matches are collected first, then converted by exact fact type:
      * ``Money``    -> ``str(fact.amount)``
      * ``Date``     -> year, month and day joined by spaces (each via ``str``)
      * ``AddrPart`` -> ``str(fact.value)``, only when ``fact.type`` is not None
    Facts of any other type are skipped.

    Returns:
        List of entity strings, in extractor order and then match order.
    """
    matches = [match for extract in extractors for match in extract(text)]

    entities = []
    for match in matches:
        fact = match.fact
        kind = type(fact)
        if kind is Money:
            entities.append(str(fact.amount))
        elif kind is Date:
            entities.append(" ".join(str(part) for part in (fact.year, fact.month, fact.day)))
        elif kind is AddrPart and fact.type is not None:
            entities.append(str(fact.value))
    return entities

def get_natasha_ner_entities(text):
    """Return the substrings of ``text`` covered by the spans that ``ner`` reports.

    Each span contributes ``text[span.start:span.stop]``; the order of the
    spans is preserved.
    """
    return [text[span.start:span.stop] for span in ner(text).spans]

def get_natasha_entities(text):
    """Collect the unique entities natasha finds in one text or in several texts.

    Combines the results of ``get_entities_from_natasha_extractors`` and
    ``get_natasha_ner_entities`` and removes duplicates.

    ``get_ner_overlap_feature`` passes one-element lists (``[target]`` /
    ``[origin]``) to every entity function, while both natasha helpers work on
    a single string. To make that call path work, a non-string argument is
    treated as an iterable of strings and each item is processed separately.

    Args:
        text: a string, or an iterable of strings.

    Returns:
        List of unique entity strings (order is not defined).
    """
    texts = [text] if isinstance(text, str) else list(text)

    found = set()
    for single_text in texts:
        found.update(get_entities_from_natasha_extractors(single_text))
        found.update(get_natasha_ner_entities(single_text))
    return list(found)

def get_overlap_entities_percent(sent1, sent2, model_ents_funcs):
    """Score how well the entities of ``sent1`` match the entities of ``sent2``.

    Every function in ``model_ents_funcs`` is applied to ``sent1`` and then to
    ``sent2``; the results are pooled into one set per sentence and the two
    sets are handed to ``get_percent`` (``sent1`` entities first).

    Returns:
        Whatever ``get_percent`` returns for the two pooled entity sets.
    """
    first_found, second_found = set(), set()

    for extract in model_ents_funcs:
        for bucket, sentence in ((first_found, sent1), (second_found, sent2)):
            bucket.update(extract(sentence))

    return get_percent(first_found, second_found)

def get_deeppavlov_ents(sent):
    """Extract entity strings from ``sent`` with the ``ner_multi`` model.

    ``ner_multi`` returns a pair (tokens, tags); only the first element of
    each is used (presumably the first sentence of a batch - confirm against
    the model's output format). Stop-word tokens are removed with
    ``filter_after_ner`` before the tagged tokens are merged into entities by
    ``get_deeppavlov_entities``.
    """
    token_batches, tag_batches = ner_multi(sent)
    kept_tokens, kept_tags = filter_after_ner(token_batches[0], tag_batches[0])
    return get_deeppavlov_entities(kept_tokens, kept_tags)

def _is_blank_text(value):
    """Return True for None, NaN or a string that is empty / whitespace only."""
    if value is None:
        return True
    if isinstance(value, float):
        # NaN (how pandas represents an empty CSV cell) is the only float
        # that is not equal to itself.
        return value != value
    return isinstance(value, str) and value.strip() == ""


def get_ner_overlap_feature(data):
    """Compute the entity-overlap feature for every row of ``data``.

    For each row the entities of ``scored_text`` are compared with those of
    ``original`` through ``get_overlap_entities_percent``, using both the
    DeepPavlov and the natasha entity functions. Each text is passed wrapped
    in a one-element list.

    Rows whose ``scored_text`` is missing (None / NaN) or blank get ``0.``
    without running the models. A row whose processing raises an exception
    also gets ``0.``; the row index, both texts and the error are printed so
    the failure is visible. Only ``Exception`` subclasses are caught, so
    Ctrl-C (``KeyboardInterrupt``) still stops a long run.

    Args:
        data: DataFrame with ``original`` and ``scored_text`` columns.

    Returns:
        List of floats, one per row, in row order.
    """
    original = data["original"].tolist()
    targets = data["scored_text"].tolist()
    feature_values = []
    ner_ents_extractors = [get_deeppavlov_ents, get_natasha_entities]

    for i in tqdm.tqdm(range(len(original))):
        origin = original[i]
        target = targets[i]

        if _is_blank_text(target):
            feature_values.append(0.)
            continue

        try:
            feature_value = get_overlap_entities_percent([target], [origin], ner_ents_extractors)
        except Exception as exc:
            # Best effort: one bad row must not abort the whole run.
            feature_value = 0.
            print("index: ", i)
            print("target text: ", target)
            print("origin text: ", origin)
            print("error: ", repr(exc))
        feature_values.append(feature_value)
    return feature_values

def run_entities_overlap(input="/home/droman/Documents/diploma/spacy/data/result.csv",
                         output="./data/ner_overlap_result.csv"):
    """Read a CSV, add the ``ner_overlap`` column and write the result to disk.

    The input file is loaded with pandas, ``get_ner_overlap_feature`` is run
    on it, its values are stored in a new ``ner_overlap`` column and the frame
    is saved to ``output`` without the index. The first rows of the written
    file are printed as a quick check.

    The statements used to end with stray commas, which made every result a
    one-element tuple (``data`` was ``(DataFrame,)``) and caused the function
    to fail on the first column access.

    Args:
        input: path of the source CSV; it must provide the columns that
            ``get_ner_overlap_feature`` reads (``original``, ``scored_text``).
        output: path of the CSV to write. Its parent directory is created
            when it does not exist yet.
    """
    import os  # local import keeps this block self-contained

    data = pd.read_csv(input, index_col=False)
    data["ner_overlap"] = get_ner_overlap_feature(data)

    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    data.to_csv(output, index=False)

    print(pd.read_csv(output).head())

In [None]:
# Run the overlap pipeline on result.csv; the result is written to a separate
# "*_test.csv" file instead of the function's default output path.
run_entities_overlap(input="/home/droman/Documents/diploma/spacy/data/result.csv",
                     output="./data/ner_overlap_result_test.csv")