In [None]:
import linecache as lc
import os
import tkinter
from glob import glob
from operator import itemgetter
from tkinter import *

import numpy as np
from pymystem3 import Mystem

In [None]:
# Root directory of the TF-IDF inverse index. The code below reads
# 'lemmas/<lemma>.txt' (one file per lemma) and 'index.txt' from it.
TFIDF_INDEX_PATH = 'data/inverse-index-tfidf/'
# Text file with one link per line; line (doc number + 1) is looked up
# for every document returned by the search.
LINK_STORAGE_PATH = 'data/crawl/index.txt'

In [None]:
def get_lemmas(query):
    """Lemmatize a query string with Mystem.

    The Mystem analyzer is created on the first call and reused afterwards,
    instead of being re-instantiated for every query.

    Args:
        query: Raw query text.

    Returns:
        Whatever ``Mystem.lemmatize`` returns for the query (a list of tokens;
        callers in this file filter whitespace tokens out of it).
    """
    analyzer = getattr(get_lemmas, '_analyzer', None)
    if analyzer is None:
        # Cache on the function object so no extra module-level name is needed.
        analyzer = get_lemmas._analyzer = Mystem()
    return analyzer.lemmatize(query)

In [None]:
def get_lemmas_tfidfs(lemmas, index_path=None):
    """Load the stored index values for every lemma that has an index file.

    Each lemma is looked up as the literal file ``<index_path>/lemmas/<lemma>.txt``.
    The lemma is never interpreted as a glob pattern, so tokens such as ``*``,
    ``?`` or ``[a]`` cannot match unrelated files, and lemmas containing a path
    separator are ignored so a lookup cannot leave the ``lemmas`` directory.

    Args:
        lemmas: Iterable of lemma strings.
        index_path: Root directory of the index. Defaults to ``TFIDF_INDEX_PATH``.

    Returns:
        Dict mapping each found lemma to the list of floats read from its file,
        one float per line, in file order. Lemmas without a file are omitted.

    Raises:
        ValueError: If a line of an index file is not a valid float.
    """
    if index_path is None:
        index_path = TFIDF_INDEX_PATH

    lemmas_tfidfs = {}
    for lemma in lemmas:
        if lemma in lemmas_tfidfs:
            # Repeated query lemma: the file has already been read.
            continue
        if os.path.basename(lemma) != lemma:
            # Contains a path separator; not a plain file name.
            continue

        path = os.path.join(index_path, 'lemmas', lemma + '.txt')
        if not os.path.isfile(path):
            continue

        with open(path, 'r') as file:
            lemmas_tfidfs[lemma] = [float(tfidf) for tfidf in file]

    return lemmas_tfidfs

In [None]:
def count_lemmas(lemmas):
    """Count how many times each lemma occurs.

    Args:
        lemmas: Iterable of hashable lemmas.

    Returns:
        Dict mapping lemma -> number of occurrences, keyed in order of
        first appearance.
    """
    occurrences = {}
    for token in lemmas:
        occurrences[token] = occurrences.get(token, 0) + 1

    return occurrences

In [None]:
def compute_lemmas_tfs(lemmas):
    """Compute the term frequency of every lemma in the list.

    Args:
        lemmas: Sequence of lemmas (repetitions allowed).

    Returns:
        Dict mapping lemma -> occurrences of the lemma divided by the total
        number of lemmas. Empty input yields an empty dict.
    """
    occurrences = count_lemmas(lemmas)

    return {
        token: amount / len(lemmas)
        for token, amount in occurrences.items()
    }

In [None]:
def compute_q_vec_len(lemmas_tfs, lemmas_idfs):
    """Compute the Euclidean length of the query's TF-IDF vector.

    Args:
        lemmas_tfs: Dict lemma -> term frequency in the query.
        lemmas_idfs: Dict lemma -> IDF. A lemma missing from this dict is
            treated as having IDF 0, i.e. it contributes nothing, instead of
            raising ``KeyError``.

    Returns:
        sqrt(sum((tf * idf) ** 2)) over all lemmas in ``lemmas_tfs``.
    """
    squared_sum = .0
    for lemma, tf in lemmas_tfs.items():
        weight = tf * lemmas_idfs.get(lemma, .0)
        squared_sum += weight ** 2

    return np.sqrt(squared_sum)

In [None]:
def get_docs_vecs_lens():
    """Read the per-document vector lengths from the index.

    Returns:
        List of floats, one per line of ``TFIDF_INDEX_PATH + 'index.txt'``,
        in file order.
    """
    lengths = []
    with open(TFIDF_INDEX_PATH + 'index.txt', 'r') as index_file:
        for row in index_file:
            lengths.append(float(row))
    return lengths

In [None]:
def compute_similarity(q_lemmas, q_v_len, q_lemmas_tfs, lemmas_idfs, lemmas_tfidfs, d_v_len, d_num):
    """Compute the cosine similarity between the query and one document.

    Args:
        q_lemmas: Query lemmas (repetitions allowed).
        q_v_len: Length of the query vector.
        q_lemmas_tfs: Dict lemma -> term frequency in the query.
        lemmas_idfs: Dict lemma -> IDF.
        lemmas_tfidfs: Dict lemma -> sequence of per-document weights,
            indexed by document number.
        d_v_len: Length of the document vector.
        d_num: Document number used to index ``lemmas_tfidfs`` values.

    Returns:
        Dot product of the query and document weights divided by the product
        of the two vector lengths; ``0.0`` when either length is zero.
    """
    divisor = q_v_len * d_v_len
    if not divisor:
        # A zero-length query or document vector has no defined direction.
        return .0

    divisible = .0
    # Each distinct lemma is one vector component: its repetition in the query
    # is already reflected in the term frequency, so it must be summed once.
    for lemma in dict.fromkeys(q_lemmas):
        if lemma not in lemmas_idfs or lemma not in lemmas_tfidfs:
            # Lemma is absent from the index and contributes nothing.
            continue
        q_weight = q_lemmas_tfs.get(lemma, .0) * lemmas_idfs[lemma]
        divisible += q_weight * lemmas_tfidfs[lemma][d_num]

    return divisible / divisor

In [None]:
def compute_docs_similarities(q_lemmas, q_v_len, q_lemmas_tfs, lemmas_idfs, lemmas_tfidfs, d_v_lens):
    """Compute the query similarity for every document in the index.

    The number of documents is taken from ``d_v_lens`` rather than a
    hard-coded count, so indexes of any size are handled.

    Args:
        q_lemmas: Query lemmas.
        q_v_len: Length of the query vector.
        q_lemmas_tfs: Dict lemma -> term frequency in the query.
        lemmas_idfs: Dict lemma -> IDF.
        lemmas_tfidfs: Dict lemma -> per-document weights indexed by document
            number; each sequence must cover every document in ``d_v_lens``.
        d_v_lens: Sequence of document vector lengths, indexed by document number.

    Returns:
        Dict mapping document number -> similarity as returned by
        ``compute_similarity``.
    """
    return {
        d_num: compute_similarity(q_lemmas, q_v_len, q_lemmas_tfs,
                                  lemmas_idfs, lemmas_tfidfs, d_v_len, d_num)
        for d_num, d_v_len in enumerate(d_v_lens)
    }

In [None]:
def get_sorted_docs(d_similarities):
    """Rank documents by similarity, best first, dropping zero scores.

    Args:
        d_similarities: Dict mapping document number -> similarity.

    Returns:
        List of document numbers ordered by descending similarity; documents
        whose similarity equals zero are left out.
    """
    ranked = sorted(d_similarities.items(), key=lambda pair: pair[1], reverse=True)

    return [doc for doc, score in ranked if score != .0]

In [None]:
def find_in_index(query_lemmas):
    """Rank indexed documents by similarity to the query lemmas.

    Only query lemmas that are present in the index take part in the ranking.
    Lemmas without index data would otherwise raise ``KeyError`` in the
    similarity computation; dropping them only rescales the query vector,
    which does not affect cosine similarity.

    Args:
        query_lemmas: List of query lemmas (repetitions allowed).

    Returns:
        List of document numbers ordered by descending similarity, or an
        empty list when none of the lemmas is in the index.
    """
    index_entries = get_lemmas_tfidfs(query_lemmas)

    # The first value of each entry is used as the lemma's IDF, the rest as
    # its per-document weights.
    lemmas_idfs = {}
    lemmas_tfidfs = {}
    for lemma, values in index_entries.items():
        if not values:
            # Empty index file: nothing usable for this lemma.
            continue
        lemmas_idfs[lemma] = values[0]
        lemmas_tfidfs[lemma] = values[1:]

    known_lemmas = [lemma for lemma in query_lemmas if lemma in lemmas_idfs]
    if not known_lemmas:
        return []

    q_lemmas_tfs = compute_lemmas_tfs(known_lemmas)
    q_v_len = compute_q_vec_len(q_lemmas_tfs, lemmas_idfs)
    d_v_lens = get_docs_vecs_lens()

    d_similarities = compute_docs_similarities(known_lemmas, q_v_len, q_lemmas_tfs,
                                               lemmas_idfs, lemmas_tfidfs, d_v_lens)

    return get_sorted_docs(d_similarities)

In [None]:
def search(query):
    """Run a query against the index and return the matching links.

    Args:
        query: Raw query text.

    Returns:
        List of link strings, one per matching document, best match first.
        Each link is line ``doc + 1`` of ``LINK_STORAGE_PATH`` with the
        trailing newline removed.
    """
    # Drop every whitespace-only token (e.g. '  ' or ' \n'), not just the
    # exact strings ' ' and '\n', so separators never act as query lemmas.
    query_lemmas = [lemma for lemma in get_lemmas(query) if lemma.strip()]
    print('Query lemmas:', query_lemmas)
    docs = find_in_index(query_lemmas)

    # Document numbers are 0-based while linecache line numbers are 1-based.
    return [lc.getline(LINK_STORAGE_PATH, doc + 1).rstrip('\n') for doc in docs]

In [None]:
def button_click():
    """Handle a click on the search button.

    Clears the previous results and the error label, then either shows the
    error label (empty query), a "nothing found" row, or one listbox row per
    link returned by ``search``.
    """
    last_row = listbox.size() - 1
    listbox.delete(0, last_row)
    err_label.grid_remove()

    query = text_field.get()
    if query:
        links = search(query)
        if links:
            for position, link in enumerate(links):
                listbox.insert(position, link)
        else:
            listbox.insert(0, 'Ничего не найдено')
    else:
        # Empty input: reveal the error label and do not search.
        err_label.grid()

In [None]:
# Build the search window: prompt, hidden error label, query entry with a
# search button, and a listbox for the results. Then hand control to Tk.
_UI_FONT = 'timesnewroman 10'

root = tkinter.Tk()
root.title('Vector search')

# Widgets are created in a fixed order before any of them is laid out.
label = Label(root, text='Введите запрос', bd=5, font=_UI_FONT)
err_label = Label(root, text='Вы ничего не ввели!', bd=5, fg='red', font=_UI_FONT)
text_field = Entry(root, bd=2, width=37, font=_UI_FONT)
button = Button(root, text='Искать', bd=2, command=button_click, font=_UI_FONT)

# Layout: prompt on the first row, error label on row 1, entry + button on row 2.
label.grid(columnspan=3, padx=3, pady=3, sticky='W')
err_label.grid(row=1, columnspan=3, padx=3, pady=3, sticky='W')
# Hidden until button_click shows it for an empty query.
err_label.grid_remove()
text_field.grid(row=2, columnspan=2, padx=3, pady=3)
button.grid(row=2, column=2, padx=3, pady=3, sticky='E')

# Results list on row 3.
listbox = Listbox(root, bd=2, width=55, height=15, font=_UI_FONT)
# NOTE(review): the result of yview() is discarded here — confirm whether
# a scrollbar hookup was intended.
listbox.yview()
listbox.grid(row=3, columnspan=3, padx=3, pady=3)

# Blocks until the window is closed.
root.mainloop()