# __Задача 2__:    
На предыдущем занятии вы реализовывали функции поиска ближайших ответов на запросы через TF-IDF и BM25. 
Сравните качество нахождения верного ответа для обоих методов в трех случаях:
- с функцией ```preprocess_with_natasha```
- с функцией ```preprocess_with_deepmipt```
- без препроцессинга

Для измерения качества используйте метрику accuracy. Считаем, что ответ верный, если он входит в топ-1.

In [None]:
!pip install razdel
!pip install pymorphy2

import pandas as pd
import numpy as np
from razdel import tokenize
from razdel import sentenize
import nltk
from nltk.corpus import stopwords
from string import punctuation
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
nltk.download('stopwords')  
nltk.download('punkt')
russian_stopwords = stopwords.words('russian')

queries = pd.read_excel('/content/queries.xlsx')
answers = pd.read_excel('/content/answers.xlsx')

In [None]:
# queries.columns = ['Текст вопросов','Номер связки','Тематика', 'Текст вопросов без NER NATASHA','Текст вопросов без NER DEEPMINT']
# answers.columns = ['Номер связки', 'Текст вопросов', 'Текст ответов', 'Тематика', 'Текст вопросов без NER DEEPMINT', 'Текст вопросов без NER NATASHA']

In [None]:
def preprocessing(text: str) -> str:
  tokens = list(tokenize(text))
  tokens = [_.text for _ in tokens]
  tokens = [word.lower() for word in tokens if word.lower() not in russian_stopwords]
  tokens = [(token.translate(str.maketrans('', '', punctuation))) for token in tokens]
  lemmas = [morph.parse(token)[0].normal_form for token in tokens]

  return " ".join(lemmas)

In [None]:
def make_column(df):
  deepmint = list(df['Текст вопросов без NER DEEPMINT'])
  natasha = list(df['Текст вопросов без NER NATASHA'])
  deepmint_lemms = [preprocessing(str(text)) for text in deepmint]
  natasha_lemms = [preprocessing(str(text)) for text in natasha]
  df['deepmint_preprocessed'] = deepmint_lemms
  df['natasha_preprocessed'] = natasha_lemms

  return df

answers_new = make_column(answers)
queries_new = make_column(queries)

In [None]:
queries.to_excel('queries.xlsx')
answers.to_excel('answers.xlsx')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np

columns = ['Номер связки', 'Текст вопросов', 'deepmint_preprocessed', 'natasha_preprocessed']

answers_train = pd.DataFrame(answers, columns=columns)
queries_train = pd.DataFrame(queries, columns=columns)

vectorizer = TfidfVectorizer()

In [None]:
train_quer, test = train_test_split(queries_train, test_size=0.3, random_state=7654)
train = pd.concat([answers_train, train_quer])
print(train.shape, test.shape)

(1652, 4) (690, 4)


In [None]:
def vectorize(text, X_array):
  vec = vectorizer.transform(text)
  X = vec.toarray()[0]
  scalar = X_array.dot(X)

  return scalar

links = list(train['Номер связки'])

In [None]:
def raking(question_array):
  vec_link ={}
  for x,y in enumerate(np.nditer(question_array)):
    vec_link[float(y)] = links[x]

  zip_links = [(el, vec_link[el]) for el in sorted(vec_link.keys(), reverse=True)]

  # result = 'В ранжировании выиграл документ номер: ' + str(zip_links[0][1]) + '\n'
  # print(result)

  # for x,y in zip_links:
  #   print('Коэффициент близости: ', x,'\t Номер документа: ',y)

  return zip_links[0][1]

# NATASHA

In [None]:
X_natasha = vectorizer.fit_transform(train['natasha_preprocessed'].values.astype('U'))
X_natasha_array = X_natasha.toarray()
X_natasha_array.shape

(1652, 5854)

In [None]:
tf_natasha = []
for i in range(690):
  text = list(test['natasha_preprocessed'])[i]
  array = vectorize([str(text)],  X_natasha_array)
  tf_natasha.append(raking(array))

In [None]:
test['tf_natasha'] = tf_natasha

In [None]:
test = test.fillna(0)
test.astype({'Номер связки': 'int32'}).dtypes

In [None]:
test.astype({'tf_natasha': 'int32'}).dtypes

In [None]:
accuracy_score(test['Номер связки'], test['tf_natasha'])

0.5608695652173913

# DEEPMINT

In [None]:
X_deepmint = vectorizer.fit_transform(train['deepmint_preprocessed'].values.astype('U'))
X_deepmint_array = X_deepmint.toarray()
X_deepmint_array.shape

(1652, 6537)

In [None]:
tf_deepmint = []
for i in range(690):
  text = list(test['deepmint_preprocessed'])[i]
  array = vectorize([str(text)],  X_deepmint_array)
  tf_deepmint.append(raking(array))

In [None]:
test['tf_deepmint'] = tf_deepmint

In [None]:
test.astype({'tf_deepmint': 'int32'}).dtypes

In [None]:
accuracy_score(test['Номер связки'], test['tf_deepmint'])

0.5666666666666667

# WITHOUT PREPROCESSING

In [None]:
X_plain = vectorizer.fit_transform(train['Текст вопросов'].values.astype('U'))
X_plain_array = X_plain.toarray()
X_plain_array.shape

(1652, 12477)

In [None]:
tf_plain = []
for i in range(690):
  text = list(test['Текст вопросов'])[i]
  array = vectorize([str(text)],  X_plain_array)
  tf_plain .append(raking(array))

In [None]:
test['tf_plain'] = tf_plain

In [None]:
test.astype({'tf_plain': 'int32'}).dtypes

In [None]:
accuracy_score(test['Номер связки'], test['tf_plain'])

0.5608695652173913

# BM25

In [None]:
!pip install rank_bm25
from rank_bm25 import BM25Okapi

In [108]:
natasha = list(train['natasha_preprocessed'])
deepmint = list(train['deepmint_preprocessed'])
plain = list(train['Текст вопросов'])

In [109]:
tokenized_natasha = [str(text).split(' ') for text in natasha]
tokenized_deepmint = [str(text).split(' ') for text in deepmint]
tokenized_plain = [str(text).split(' ') for text in plain]

In [110]:
bm25_natasha = BM25Okapi(tokenized_natasha)
bm25_deepmint = BM25Okapi(tokenized_deepmint)
bm25_plain = BM25Okapi(tokenized_plain)

In [142]:
natasha_bm25 = []
for i in range(690):
  text = list(test['natasha_preprocessed'])[i]
  array = bm25_natasha.get_scores(str(text).split(' '))
  natasha_bm25.append(raking(array))

test['natasha_bm25'] = natasha_bm25

deepmint_bm25 = []
for i in range(690):
  text = list(test['deepmint_preprocessed'])[i]
  array = bm25_deepmint.get_scores(str(text).split(' '))
  deepmint_bm25.append(raking(array))

test['deepmint_bm25'] = deepmint_bm25

plain_bm25 = []
for i in range(690):
  text = list(test['Текст вопросов'])[i]
  array = bm25_plain.get_scores(str(text).split(' '))
  plain_bm25.append(raking(array))

test['plain_bm25'] = plain_bm25

In [143]:
test.head(3)

Unnamed: 0,Номер связки,Текст вопросов,deepmint_preprocessed,natasha_preprocessed,tf_natasha,tf_deepmint,tf_plain,natasha_bm25,deepmint_bm25,plain_bm25
39,1.0,Добрый день.\nЕсли тест на ковид окажется поло...,добрый день тест ковид оказаться положительны...,добрый день тест ковид оказаться положительны...,1.0,1.0,6.0,1.0,1.0,308.0
1096,308.0,"\nДобрый день! \n Подскажите, пожалуйста, как ...",добрый день подсказать пожалуйста ситуация...,добрый день подсказать пожалуйста ситуация...,308.0,308.0,308.0,308.0,308.0,308.0
787,6.0,Добрый день. Для уточнения результата теста на...,добрый день уточнение результат тест ковид ма...,добрый день уточнение результат тест ковид ма...,6.0,6.0,1.0,6.0,1.0,1.0


In [None]:
test.astype({'Номер связки': 'int32'}).dtypes

In [153]:
test = test.fillna(0)

In [None]:
test.astype({'plain_bm25': 'int32'}).dtypes

In [155]:
accuracy_score(test['Номер связки'], test['plain_bm25'])

0.5173913043478261

In [157]:
test.astype({'deepmint_bm25': 'int32'}).dtypes
accuracy_score(test['Номер связки'], test['deepmint_bm25'])

0.5449275362318841

In [158]:
test.astype({'natasha_bm25': 'int32'}).dtypes
accuracy_score(test['Номер связки'], test['natasha_bm25'])

0.5391304347826087