## Лекция 3  NER

### __Задача 1__:

Реализуйте 2 функции препроцессинга:

- Удалить именованные сущности с помощью natasha (https://github.com/natasha/yargy)
- Удалить именованные сущности с помощью deepmipt (https://github.com/deepmipt/ner)

In [29]:
from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsNERTagger,
    PER,
    NamesExtractor,
    DatesExtractor,

    Doc
)


In [30]:
from natasha import DatesExtractor
import numpy as np
from deeppavlov import configs, build_model

ner_model = build_model(configs.ner.ner_rus_bert, download=True)

emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)
segmenter = Segmenter()

2020-10-01 22:29:19.382 INFO in 'deeppavlov.download'['download'] at line 132: Skipped http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v1.tar.gz download because of matching hashes
2020-10-01 22:29:24.606 INFO in 'deeppavlov.download'['download'] at line 132: Skipped http://files.deeppavlov.ai/deeppavlov_data/ner_rus_bert_v1.tar.gz download because of matching hashes
2020-10-01 22:29:25.112 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /Users/Anna/.deeppavlov/models/ner_rus_bert/tag.dict]
2020-10-01 22:29:52.145 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /Users/Anna/.deeppavlov/models/ner_rus_bert/model]


INFO:tensorflow:Restoring parameters from /Users/Anna/.deeppavlov/models/ner_rus_bert/model


## Preprocessing

In [37]:
from pymorphy2 import MorphAnalyzer
from razdel import tokenize, sentenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_with_natasha(text: str) -> str:
    text = str(text)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    for ner in doc.spans:
        text = text.replace(ner.text, "")
    return " ".join(text.split())
#     return my_preprocess(text)

def preprocess_with_deepmipt(text: str) -> str:
    text = str(text)
    text_splitted = sentenize(text)
    for sent in text_splitted:
        sent = str(sent.text)
        if len(sent) == 0:
            break
        sent_buf = list(tokenize(sent))
        if len(sent_buf) >= 300:
            sent = " ".join([str(str_buf) for str_buf in sent_buf[:300]])
#         print(len(sent))
#         print(sent)
        res = ner_model([sent])
        for m in zip(res[0][0], res[1][0]):
            if m[1][0] == "B" or m[1][0] == "I":
                text = text.replace(m[0], "")
            
    return " ".join(text.split())
#     return my_preprocess(text)

### __Задача 2__:    
На предыдущем занятии вы реализовывали функции поиска ближайших ответов на запросы через TF-IDF и BM25. 
Сравните качество нахождения верного ответа для обоих методов в трех случаях:
- с функцией ```preprocess_with_natasha```
- с функцией ```preprocess_with_deepmipt```
- без препроцессинга

Для измерения качества используйте метрику accuracy. Считаем, что ответ верный, если он входит в топ-1.

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math
import pandas as pd


In [39]:
answers_df = pd.read_excel("answers_base.xlsx")
questions_df = pd.read_excel("queries_base.xlsx")

# Natasha + TF_IDF

In [40]:
answers_df.rename(columns={'Текст вопросов': 'text', 'Номер связки': 'join_num'}, inplace=True)
questions_df.rename(columns={'Текст вопроса': 'text', 'Номер связки\n': 'join_num'}, inplace=True)

train, test_quest = train_test_split(questions_df, test_size=0.3)

train_quest = pd.concat([answers_df, train])

train_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(preprocess_with_natasha(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(preprocess_with_natasha(quest))

vectorizer.fit(train_quest_processed + test_quest_processed)
# train matrix
X_train = vectorizer.transform(train_quest_processed)
# test natrix
X_test = vectorizer.transform(test_quest_processed)

print("X_train.shape: " + str(X_train.shape))
print("X_test.shape: " + str(X_test.shape))


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
100%|██████████| 1652/1652 [00:15<00:00, 108.55it/s]
100%|██████████| 690/690 [00:05<00:00, 125.94it/s]


X_train.shape: (1652, 13065)
X_test.shape: (690, 13065)


In [41]:
rating = X_train.dot(X_test.T).argmax(axis=0)
rating = np.array(rating)[0]
count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy with natasha and TF-IDF: " + str(count / len(rating))

'Accuracy with natasha and TF-IDF: 0.5130434782608696'

# Deepmipt + TF_IDF

In [44]:
train_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(preprocess_with_deepmipt(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(preprocess_with_deepmipt(quest))

vectorizer.fit(train_quest_processed + test_quest_processed)
# train matrix
X_train = vectorizer.transform(train_quest_processed)
# test natrix
X_test = vectorizer.transform(test_quest_processed)

print("X_train.shape: " + str(X_train.shape))
print("X_test.shape: " + str(X_test.shape))

rating = X_train.dot(X_test.T).argmax(axis=0)
rating = np.array(rating)[0]
count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy with deepmipt and TF-IDF: " + str(count / len(rating))

100%|██████████| 1652/1652 [17:43<00:00,  1.55it/s] 
100%|██████████| 690/690 [07:32<00:00,  1.52it/s]


X_train.shape: (1652, 13934)
X_test.shape: (690, 13934)


'Accuracy with deepmipt and TF-IDF: 0.4927536231884058'

# Without preprocessing + TF_IDF

In [45]:
train_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(str(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(str(quest))

vectorizer.fit(train_quest_processed + test_quest_processed)
# train matrix
X_train = vectorizer.transform(train_quest_processed)
# test natrix
X_test = vectorizer.transform(test_quest_processed)

print("X_train.shape: " + str(X_train.shape))
print("X_test.shape: " + str(X_test.shape))

100%|██████████| 1652/1652 [00:00<00:00, 15825.10it/s]
100%|██████████| 690/690 [00:00<00:00, 773608.60it/s]


X_train.shape: (1652, 14972)
X_test.shape: (690, 14972)


In [46]:
rating = X_train.dot(X_test.T).argmax(axis=0)
rating = np.array(rating)[0]
count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy without any preprocessing and TF-IDF: " + str(count / len(rating))

'Accuracy without any preprocessing and TF-IDF: 0.5217391304347826'

## BM25 векторизация

In [47]:
import collections
import string
import math
import pandas as pd
import numpy as np

from pymorphy2 import MorphAnalyzer
from razdel import tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


def get_inverse_dict(mat):
    d = dict()
    mat = mat.toarray()
    for ind, word in enumerate(vectorizer.get_feature_names()):
        d[word] = [int(sum(mat[:, ind]))]
        for ind_j, doc_ind in enumerate(mat[:, ind].tolist()):
            if doc_ind != 0:
                d[word].append(doc_ind)
        d[word].append(ind)
    return d


def vec_bm25(doc, dict_words):
    vec = np.zeros((1, len(dict_words)))
    doc = preprocess_with_natasha(doc)
    for word in doc.split(" "):
        if word in dict_words.keys():
            vec[0, dict_words[word][-1]] = 1
    return vec


def bm25_vectorizer(tf_val, len_d, corpus_len, nq):
    k = 2.0
    b = 0.75
    IDF = np.log((corpus_len-nq+0.5) / (nq+0.5))
    TF = (tf_val * (k+1)) / (tf_val + k * (1-b+b*(len_d / avrdl)))
    return TF * IDF



# Without preprocessing + BM25

In [48]:
train_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(str(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(str(quest))

corpus_len = len(train_quest_processed)
avrdl = sum([len(i.split(" ")) for i in train_quest_processed]) / corpus_len

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_quest_processed)

inverse_dict = get_inverse_dict(X_train)

mat = np.zeros((corpus_len, len(inverse_dict)))

for ind, doc in enumerate(train_quest_processed):
    tokens = doc.split(" ")
    tf_values = collections.Counter(tokens)
    len_d = len(tokens)
    for word in tokens:
        if word not in inverse_dict.keys():
            continue
#         print("Ffff")
        mat[ind, inverse_dict[word][-1]] = bm25_vectorizer(tf_values[word],
                                                         len_d,
                                                         corpus_len,
                                                         len(inverse_dict[word]) - 1)

100%|██████████| 1652/1652 [00:00<00:00, 640801.83it/s]
100%|██████████| 690/690 [00:00<00:00, 547186.57it/s]


In [49]:
test_vecs = []
for i in test_quest.text.values:
    test_vecs.append(vec_bm25(i, inverse_dict)[0])
test_vecs = np.array(test_vecs)

rating = mat.dot(test_vecs.T).argmax(axis=0)
rating = np.array(rating)

count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy without any preprocessing and BM25: " + str(count / len(rating))

'Accuracy without any preprocessing and BM25: 0.5028985507246376'

# Natasha + BM25

In [50]:
train_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(preprocess_with_natasha(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(preprocess_with_natasha(quest))

corpus_len = len(train_quest_processed)
avrdl = sum([len(i.split(" ")) for i in train_quest_processed]) / corpus_len

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_quest_processed)

inverse_dict = get_inverse_dict(X_train)

mat = np.zeros((corpus_len, len(inverse_dict)))

for ind, doc in enumerate(train_quest_processed):
    tokens = doc.split(" ")
    tf_values = collections.Counter(tokens)
    len_d = len(tokens)
    for word in tokens:
        if word not in inverse_dict.keys():
            continue
#         print("Ffff")
        mat[ind, inverse_dict[word][-1]] = bm25_vectorizer(tf_values[word],
                                                         len_d,
                                                         corpus_len,
                                                         len(inverse_dict[word]) - 1)

100%|██████████| 1652/1652 [00:11<00:00, 144.94it/s]
100%|██████████| 690/690 [00:16<00:00, 41.98it/s] 


In [51]:
test_vecs = []
for i in test_quest.text.values:
    test_vecs.append(vec_bm25(i, inverse_dict)[0])
test_vecs = np.array(test_vecs)

rating = mat.dot(test_vecs.T).argmax(axis=0)
rating = np.array(rating)

count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy with natasha and BM25: " + str(count / len(rating))

'Accuracy with natasha and BM25: 0.5028985507246376'

# Deepmipt + BM25

In [52]:
strtrain_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(preprocess_with_deepmipt(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(preprocess_with_deepmipt(quest))

corpus_len = len(train_quest_processed)
avrdl = sum([len(i.split(" ")) for i in train_quest_processed]) / corpus_len

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_quest_processed)

inverse_dict = get_inverse_dict(X_train)

mat = np.zeros((corpus_len, len(inverse_dict)))

for ind, doc in enumerate(train_quest_processed):
    tokens = doc.split(" ")
    tf_values = collections.Counter(tokens)
    len_d = len(tokens)
    for word in tokens:
        if word not in inverse_dict.keys():
            continue
#         print("Ffff")
        mat[ind, inverse_dict[word][-1]] = bm25_vectorizer(tf_values[word],
                                                         len_d,
                                                         corpus_len,
                                                         len(inverse_dict[word]) - 1)
    
test_vecs = []
for i in test_quest.text.values:
    test_vecs.append(vec_bm25(i, inverse_dict)[0])
test_vecs = np.array(test_vecs)

rating = mat.dot(test_vecs.T).argmax(axis=0)
rating = np.array(rating)

count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy with deepmipt and BM25: " + str(count / len(rating))

100%|██████████| 1652/1652 [13:25<00:00,  2.05it/s] 
100%|██████████| 690/690 [05:17<00:00,  2.17it/s]


IndexError: single positional indexer is out-of-bounds

### __Задача 3__:    
Улучшить правила в natasha. Написать правила, которые ловят даты в следующих примерах и пересчитать статистику из Задачи 2:
- Уехал 8-9 ноября в Сочи
- Уезжаю 5 числа                           
- 20го сентября заболел

Пример можно посмотреть тут: https://github.com/natasha/yargy

In [53]:
from yargy import or_
from yargy.predicates import caseless, normalized, dictionary
from yargy import rule, and_, Parser
from yargy.predicates import gte, lte


DAY = and_(
    gte(1),
    lte(31)
)
MONTH = and_(
    gte(1),
    lte(12)
)
YEAR = and_(
    gte(1),
    lte(2018)
)
DATE = rule(
    YEAR,
    '-',
    MONTH,
    '-',
    DAY
)


MONTHS = {
    'январь',
    'февраль',
    'март',
    'апрель',
    'мая',
    'июнь',
    'июль',
    'август',
    'сентябрь',
    'октябрь',
    'ноябрь',
    'декабрь'
}
MONTH_NAME = dictionary(MONTHS)
YEAR_WORDS = or_(
    rule(caseless('г'), '.'),
    rule(normalized('год')),
    rule(normalized('число')),
    rule(caseless('числа')),
    rule(caseless('го'))
)

DATE = or_(
    rule(
        YEAR,
        '-',
        MONTH,
        '-',
        DAY
    ),
    rule(
        DAY,
        MONTH_NAME,
        YEAR,
        YEAR_WORDS.optional()
    ),
    rule(
        DAY,
        "-",
        DAY,
        MONTH_NAME,
    ),
    rule(
        DAY,
        YEAR_WORDS.optional(),
        MONTH_NAME.optional()
    )
)

parser = Parser(DATE)
text = '''
8 января 2014 года, 15 июня 2001 г.,
31 февраля 2018
Уехал 8-9 ноября в Сочи
Уезжаю 5 числа
20го сентября заболел'''
for match in parser.findall(text):
    print(match.span, [_.value for _ in match.tokens])

[1, 19) ['8', 'января', '2014', 'года']
[21, 36) ['15', 'июня', '2001', 'г', '.']
[38, 53) ['31', 'февраля', '2018']
[60, 70) ['8', '-', '9', 'ноября']
[85, 92) ['5', 'числа']
[93, 106) ['20', 'го', 'сентября']


In [54]:
parser = Parser(DATE)

def preprocess_with_natasha_date(text: str) -> str:
    text = str(text)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    for ner in doc.spans:
        text = text.replace(ner.text, "")
    for match in parser.findall(text):
        for tok in match.tokens:
            text = text.replace(tok.value, "")
    return " ".join(text.split())
#     return my_preprocess(text)

In [55]:
answers_df.rename(columns={'Текст вопросов': 'text', 'Номер связки': 'join_num'}, inplace=True)
questions_df.rename(columns={'Текст вопроса': 'text', 'Номер связки\n': 'join_num'}, inplace=True)

train, test_quest = train_test_split(questions_df, test_size=0.3)

train_quest = pd.concat([answers_df, train])

train_quest_processed = []
test_quest_processed = []
vectorizer = TfidfVectorizer()

# preprocess train / test
for quest in tqdm(train_quest['text']):
    train_quest_processed.append(preprocess_with_natasha_date(quest))
for quest in tqdm(test_quest['text']):
    test_quest_processed.append(preprocess_with_natasha_date(quest))

vectorizer.fit(train_quest_processed + test_quest_processed)
# train matrix
X_train = vectorizer.transform(train_quest_processed)
# test natrix
X_test = vectorizer.transform(test_quest_processed)

print("X_train.shape: " + str(X_train.shape))
print("X_test.shape: " + str(X_test.shape))


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
100%|██████████| 1652/1652 [00:26<00:00, 61.41it/s] 
100%|██████████| 690/690 [00:10<00:00, 68.12it/s] 


X_train.shape: (1652, 12945)
X_test.shape: (690, 12945)


In [56]:
rating = X_train.dot(X_test.T).argmax(axis=0)
rating = np.array(rating)[0]
count = 0
for ind_test, pred in enumerate(rating):
    if math.isnan(test_quest.iloc[ind_test].join_num) or math.isnan(train_quest.iloc[pred].join_num):
        continue

    if int(test_quest.iloc[ind_test].join_num) == int(train_quest.iloc[pred].join_num):
        count += 1

"Accuracy with natasha and TF-IDF: " + str(count / len(rating))

'Accuracy with natasha and TF-IDF: 0.5362318840579711'