## ДЗ по поиску

Привет! Вам надо реализивать поисковик на базе вопросов-ответов с сайта [pravoved.ru](https://pravoved.ru/questions-archive/).        
Поиск должен работать на трех технологиях:       
1. обратном индексе     
2. word2vec         
3. doc2vec      

Вы должны понять, какой метод и при каких условиях эксперимента на этом корпусе работает лучше.          
Для измерения качества поиска найдите точность (accuracy) выпадания правильного ответа на конкретный вопрос (в этой базе у каждого вопроса есть только один правильный ответ). Точность нужно измерить для всей базы.    
При этом давайте считать, что выпал правильный ответ, если он попал в **топ-5** поисковой выдачи.

> Сделайте ваш поиск максимально качественным, чтобы значение точности стремилось к 1.     
Для этого можно поэкспериментировать со следующим:       
- модель word2vec (можно брать любую из опен сорса или обучить свою)
- способ получения вектора документа через word2vec: простое среднее арифметическое или взвешивать каждый вектор в соответствии с его tf-idf      
- количество эпох у doc2vec (начинайте от 100)
- предобработка документов для обучения doc2vec (удалять / не удалять стоп-слова)
- блендинг методов поиска: соединить результаты обратного индекса и w2v, или (что проще) w2v и d2v

На это задание отведем 10 дней. Дэдлайн сдачи до полуночи 12.10.

In [19]:
import pickle
import os
import json
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3.mystem import Mystem

mystem = Mystem()

import os
import json
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
from string import punctuation, digits
from math import log

punctuation = set(punctuation + '«»—–…“”\n\t' + digits)

import pickle
import json
import numpy as np
from gensim import matutils
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.fasttext import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm_notebook
from judicial_splitter import splitter

import warnings
warnings.filterwarnings('ignore')

In [2]:
def preprocessing(input_text, del_stopwords=True, del_digit=False):
    """
    :input: raw text
        1. lowercase, del punctuation, tokenize
        2. normal form
        3. del stopwords
        4. del digits
    :return: lemmas
    """
    russian_stopwords = set(stopwords.words('russian'))
    words = [x.lower().strip(string.punctuation+'»«–…') for x in word_tokenize(input_text)]
    lemmas = [mystem.lemmatize(x)[0] for x in words if x]

    lemmas_arr = []
    for lemma in lemmas:
        if del_stopwords:
            if lemma in russian_stopwords:
                continue
        if del_digit:
            if lemma.isdigit():
                continue
        lemmas_arr.append(lemma)
    return lemmas_arr

In [3]:
def preprocess_files(mystem, file, files_list):
    if file.endswith('.txt'):
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        text = file
    
    table = str.maketrans({ch: ' ' for ch in punctuation})
    tokenized = word_tokenize(text.replace('\ufeff', '').lower().translate(table))
    words_list = [mystem.lemmatize(word)[0] for word in tokenized]
    text_length = len(tokenized)
    
    document_length[files_list.index(file)] = text_length
    
    return words_list

In [4]:
def get_inverted_index(mystem, files_list):
    """
    Create inverted index by input doc collection and count the length of each document 
    :return: inverted index
    """
    inverted_index = defaultdict(list)
    global document_length
    document_length = [None] * len(files_list)

    for file in files_list:
        for word in preprocess_files(mystem, file, files_list):
            inverted_index[word].append(files_list.index(file))
            
    with open('inverted_index.json', 'w', encoding='utf-8') as fw:
        json.dump(inverted_index, fw, ensure_ascii=False)

    with open('document_length.json', 'w', encoding='utf-8') as fw:
        json.dump(document_length, fw, ensure_ascii=False)
    
    return inverted_index, document_length

In [5]:
def score_BM25(qf, dl, avgdl, k1, b, N, n):
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    score = log((N - n + 0.5) / (n + 0.5)) * (k1 + 1) * qf / (qf + k1 * (1 - b + b * dl / avgdl))
    return score

In [6]:
def compute_sim(lemma, inverted_index, document_length):
    """
    Compute similarity score between word in search query and all document from collection
    :return: score
    """
    if inverted_index.get(lemma):
        doc_list = inverted_index[lemma]
        relevance_score = {}
        avgdl = sum(document_length) / len(document_length)
        N = len(document_length)
    
        for doc in range(N):    
            qf = Counter(inverted_index[lemma])[doc]
            relevance_score[doc] = score_BM25(qf, document_length[doc], avgdl,
                                          2.0, 0.75, N, len(set(inverted_index[lemma])))
        return relevance_score
    return

In [7]:
def get_search_result(query, inverted_index, files_list, document_length, num_res):
    """
    Compute sim score between search query and all documents in collection
    Collect as pair (doc_id, score)
    :param query: input text
    :return: list of lists with (doc_id, score)
    """
    relevance_dict = defaultdict(float)
    
    for lemma in query:
        score = compute_sim(lemma, inverted_index, document_length)
        if score:
            for elem in score:
                relevance_dict[elem] += score[elem]    
    result = sorted(relevance_dict, key=relevance_dict.get, reverse=True)[:num_res]
    
    return result

In [8]:
def similarity(v1, v2):
    v1_norm = matutils.unitvec(np.array(v1))
    v2_norm = matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

In [9]:
def get_w2v_vectors(model, lemmas):
    """Получает вектор документа"""
    vec_list = []
    
    for word in lemmas:
        try:
            vec = model.wv[word]
            vec_list.append(vec)
        except:
            continue
    
    return (sum(vec_list) / len(vec_list))


def save_w2v_base(files_list, model, mystem, save=True, title='w2v_base'):
    """Индексирует всю базу для поиска через word2vec"""
    documents_info = []    
    
    for i, file in tqdm_notebook(enumerate(files_list)):
        if file.endswith('.txt'):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            text = file
            file = i

        lemmas = preprocessing(text)
        vec = get_w2v_vectors(model, lemmas)
            
        file_info = {'file': file, 'word2vec': vec}
        documents_info.append(file_info)
    
    if save:
        with open(title + '.pkl', 'wb') as fw:
            pickle.dump(documents_info, fw)
    
    return documents_info


def search_w2v(query, w2v_model, data_word2vec, n_results):
    vec1 = get_w2v_vectors(w2v_model, query)
    similarity_dict = {}
    
    for elem in data_word2vec:
        sim = similarity(vec1, elem['word2vec'])
        similarity_dict[sim] = elem['file']
        
    relevant = [similarity_dict[sim] for sim in sorted(similarity_dict, reverse=True)[:n_results]]
    return relevant

In [10]:
def get_paragraphs(files_list, mystem, del_stopwords=False):
    file_text = {}
    data = []
    
    for i, file in enumerate(files_list):
        if file.endswith('.txt'):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
                file_text[file] = text
        else:
            text = file
            file = i
        
        paragraphs = splitter(text, 1)
            
        for paragraph in paragraphs:
            paragraph_lemmatized = preprocessing(paragraph, del_stopwords)
            data.append({'file': file, 'paragraph': paragraph_lemmatized})

    if file_text:
        with open('file_text', 'w', encoding='utf-8') as fw:
            json.dump(file_text, fw)
        return data, file_text
    
    else:
        return data

In [11]:
def train_doc2vec(data, epochs, save=True, title='d2v_model'):
    tagged_data = [TaggedDocument(words=elem['paragraph'],tags=[str(i)]) for i, elem in enumerate(data)]
    model = Doc2Vec(vector_size=100, min_count=5, alpha=0.025, min_alpha=0.025, epochs=epochs, workers=4, dm=1)
    
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    if save:
        with open(title + '.pkl', 'wb') as fw:
            pickle.dump(model, fw)
    
    return model

def get_d2v_vectors(model, lemmas):
    """Получает вектор документа"""
    vec = model.infer_vector(lemmas)
    return vec
    

def save_d2v_base(model, paragraphs, save=True, title='d2v_base'):
    """Индексирует всю базу для поиска через doc2vec"""
    documents_info = []    
    
    for paragraph in paragraphs:
        vec = get_d2v_vectors(model, paragraph['paragraph'])
            
        file_info = {'file': paragraph['file'], 'doc2vec': vec}
        documents_info.append(file_info)
    
    if save:
        with open(title + '.pkl', 'wb') as fw:
            pickle.dump(documents_info, fw)
    
    return documents_info 


def search_d2v(query, d2v_model, data_doc2vec, n_results):
    vec1 = get_d2v_vectors(d2v_model, query)
    similarity_dict = {}
    
    for elem in data_doc2vec:
        sim = similarity(vec1, elem['doc2vec'])
        similarity_dict[sim] = elem['file']
        
    relevant = [similarity_dict[sim] for sim in sorted(similarity_dict, reverse=True)[:n_results]]
    return relevant

In [12]:
with open('qa_corpus.pkl', 'rb') as file:
    qa_corpus = pickle.load(file)

Всего в корпусе 1384 пары вопрос-ответ

In [13]:
len(qa_corpus)

1384

Первый элемент блока это вопрос, второй - ответ на него

In [14]:
qa_corpus[0]

['\nДобрый день.Мой сын гражданин Украины (ДНР),имеет вид на жительство в Р.Ф., кот.получил проживая с 2014 г. в Нижегородской области.В 2017г. переехал на постоянное место жительство в г.Ростов.Официально трудоустроился на одно из промышл.предприятий г.Ростова.Оформил временную регистрацию в Ростове.В УФМС предупредили,что по истечении 90 дней он должен либо постоянно прописаться либо покинуть территорию России.Прошу проконсультировать как быть дальше.(Вернуться домой в Донецк,но здесь идет война,работы нет.В Ростове он работает по специальности.Он инженер машиностроитель.)Временная прописка до 15 марта.  Если он сможет приобрести какую либо недвижимость,как долго будет решаться вопрос о его постоянной прописке в Ростове.Как в этом случае будет решаться вопрос с видом на жительство в Ростове? Не получится ли ,что приобретя квартиру,он не успеет в ней прописаться до окончании срока временной регистрации. С уважением Людмила Евгеньевна.\n',
 'Добрый вечер!Из Вашего вопроса вообще ничего

In [15]:
questions = [elem[0] for elem in qa_corpus]
answers = [elem[1] for elem in qa_corpus]

with open('questions.json', 'w', encoding='utf-8') as fw:
    json.dump(questions, fw)

with open('answers.json', 'w', encoding='utf-8') as fw:
    json.dump(answers, fw)

In [17]:
with open('questions.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

with open('answers.json', 'r', encoding='utf-8') as f:
    answers = json.load(f)

In [18]:
inverted_index, document_length = get_inverted_index(mystem, answers)

CPU times: user 16.7 s, sys: 3.48 s, total: 20.2 s
Wall time: 45.6 s


In [19]:
with open('inverted_index.json', 'r', encoding='utf-8') as f:
    inverted_index = json.load(f)

with open('document_length.json', 'r', encoding='utf-8') as f:
    document_length = json.load(f)

In [41]:
def search(query, search_method, n_results=5, return_answer_text=False):
    
    query = preprocessing(query, del_stopwords=False)
    
    try:
        if search_method == 'inverted_index':
            search_result = get_search_result(query, inverted_index, answers, document_length, n_results)

        elif search_method == 'word2vec':
            search_result = search_w2v(query, w2v_model, data_word2vec, n_results)

        elif search_method == 'doc2vec':
            search_result = search_d2v(query, d2v_model, data_doc2vec, n_results)

        else:
            raise TypeError('unsupported search method')

    except:
        search_result = ['Не найдено результатов по заданному запросу']
    
    if not return_answer_text:
        return search_result
    
    results = [(index, answers[index]) for index in search_result]
    
    return results

## word2vec

In [21]:
w2v_model = FastText.load('/Users/alinashaymardanova/Downloads/araneum_none_fasttextskipgram_300_5_2018/araneum_none_fasttextskipgram_300_5_2018.model')
w2v_model.init_sims(replace=True)

In [22]:
%%time

data_word2vec = save_w2v_base(answers, w2v_model, mystem)

A Jupyter Widget


CPU times: user 16.6 s, sys: 3.69 s, total: 20.3 s
Wall time: 44.6 s


In [25]:
accuracy_score = 0
answers_index = []

for i, question in enumerate(tqdm_notebook(questions)):

    search_result = search(question, 'word2vec')

    if i in search_result:
        accuracy_score += 1
        answers_index.append(i)

final_accuracy = accuracy_score / len(questions)        

final_accuracy

A Jupyter Widget




0.0050578034682080926

## word2vec cbow

In [26]:
w2v_model = FastText.load('/Users/alinashaymardanova/Downloads/araneum_none_fasttextskipgram_300_5_2018/araneum_none_fasttextskipgram_300_5_2018.model')
w2v_model.init_sims(replace=True)

In [27]:
%%time

data_word2vec = save_w2v_base(answers, w2v_model, mystem, title='w2v_base_cbow')

A Jupyter Widget


CPU times: user 16.7 s, sys: 3.7 s, total: 20.4 s
Wall time: 43.5 s


In [28]:
accuracy_score = 0
answers_index = []

for i, question in enumerate(tqdm_notebook(questions)):

    search_result = search(question, 'word2vec')

    if i in search_result:
        accuracy_score += 1
        answers_index.append(i)

final_accuracy = accuracy_score / len(questions)        

final_accuracy

A Jupyter Widget




0.0050578034682080926

## doc2vec со стоп-словами

In [29]:
paragraphs = get_paragraphs(answers, mystem)

In [30]:
d2v_model = train_doc2vec(paragraphs, 1000)

CPU times: user 11min 7s, sys: 3min 26s, total: 14min 33s
Wall time: 8min 42s


In [34]:
data_doc2vec = save_d2v_base(d2v_model, paragraphs)

In [36]:
accuracy_score = 0
answers_index = []

for i, question in enumerate(tqdm_notebook(questions)):

    search_result = search(question, 'doc2vec')

    if i in search_result:
        accuracy_score += 1
        answers_index.append(i)

final_accuracy = accuracy_score / len(questions)        

final_accuracy

A Jupyter Widget




0.002890173410404624

## doc2vec без стоп-слов

In [38]:
without_stopwords = get_paragraphs(answers, mystem, del_stopwords=False)

In [39]:
data_doc2vec = save_d2v_base(d2v_model, without_stopwords, title='d2v_base_without_stopwords')

In [40]:
accuracy_score = 0
answers_index = []

for i, question in enumerate(tqdm_notebook(questions)):

    search_result = search(question, 'doc2vec')

    if i in search_result:
        accuracy_score += 1
        answers_index.append(i)

final_accuracy = accuracy_score / len(questions)        

final_accuracy

A Jupyter Widget




0.001445086705202312

# Соединим

In [22]:
def merging(w2v, d2v, all_): 
    ans = {}
    for item in all_:
        if item in w2v: 
            it_w = w2v[item]
        else: it_w = 0
            
        if item in d2v: 
            it_d = d2v[item]
        else: it_d = 0
        
        ans[item] = (it_w * 0.7 + it_d * 0.3) / 2
    
    return ans

In [60]:
def search_w2v_(query, w2v_model, data_word2vec, n_results):
    vec1 = get_w2v_vectors(w2v_model, query)
    similarity_dict = {}
    
    for elem in data_word2vec:
        sim = similarity(vec1, elem['word2vec'])
        similarity_dict[sim] = elem['file']
        
    relevant = [similarity_dict[sim] for sim in sorted(similarity_dict, reverse=True)[:n_results]]
   
    return relevant_w2v

def search_d2v_(query, d2v_model, data_doc2vec, n_results):
    vec2 = get_d2v_vectors(d2v_model, query)
    similarity_dict = {}
    
    for elem in data_doc2vec:
        sim = similarity(vec1, elem['doc2vec'])
        similarity_dict[sim] = elem['file']
        
    relevant = [similarity_dict[sim] for sim in sorted(similarity_dict, reverse=True)[:n_results]]
    
    return relevant_d2v

In [61]:
def search_comb(query, w2v_model, d2v_model, n_results=5):
    relevant_w2v = search_w2v_(query, w2v_model, data_word2vec, n_results)
    relevant_d2v = search_d2v_(query, d2v_model, data_doc2vec, n_results)
    all_ = set(relevant_w2v) | set(relevant_d2v)
    ans = merging(relevant_w2v, relevant_d2v, all_)

    return sorted(ans.items(), reverse=True, key=lambda x: x[1])

In [70]:
accuracy_score = 0
answers_index = []

for i, question in enumerate(tqdm_notebook(questions)):
    
    search_result = search_comb(question, 'word2vec', 'doc2vec')

    if i in search_result:
        accuracy_score += 1
        answers_index.append(i)

final_accuracy = accuracy_score / len(questions)        

final_accuracy

A Jupyter Widget

0



ZeroDivisionError: division by zero

## inverted index

In [20]:
%%time

inv_index_score, inv_index_res = get_accuracy(questions, 'inverted_index')

A Jupyter Widget


CPU times: user 3h 28min 39s, sys: 1min 24s, total: 3h 30min 3s
Wall time: 3h 39min 31s


In [21]:
inv_index_score

0.4877167630057804