# Механизм поиска по сайту

### Используя полученные наработки двух предыдущих модулей, реализуем поисковый движок

In [1]:
# Применим библиотеку машинного обучения sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# функционал метода главных компонент 
from sklearn.decomposition import PCA
# метод ближайших соседей 
from sklearn.neighbors import NearestNeighbors

import pandas as pd

import nltk
from nltk.corpus import stopwords

import re
import pymorphy2

from string import punctuation

# стопслова - те, которые не будем использовать при обучении:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zlatt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# читаем датафрейм из файла с токенизированными текстами
pages_data = pd.read_csv('data/link_page_sentences.csv')

# удаляем пустые значения из строк
pages_data.dropna(inplace=True)
# pages_data.drop('index', axis=1, inplace=True)
pages_data.set_index(['link', 'sentence_order'], inplace=True)

pages_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence_text
link,sentence_order,Unnamed: 2_level_1
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,0,--
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,1,чистовой выражение
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,2,в златоуст огласить срок сдача 10-этажка для в...
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,4,1em
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,5,1em


In [3]:
# посмотрим, что модуль nltk по умолчанию относит к стопсловам (первые 20)
stopwords.words('russian')[:20]

['и',
 'в',
 'во',
 'не',
 'что',
 'он',
 'на',
 'я',
 'с',
 'со',
 'как',
 'а',
 'то',
 'все',
 'она',
 'так',
 'его',
 'но',
 'да',
 'ты']

In [4]:
# посмотрим размер нашего датасета 
pages_data.shape

(8034, 1)

In [5]:
# Обучим векторизатор по принципу tf-idf 
# если в общих чертах, то данный метод придает бОльший вес словам, которые часто встречаются 
# в одном доументе из множества нашего корпуса, а в других текстах их мало или нет вовсе
# vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'), min_df=0.01)

vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))
page_indices_array = vectorizer.fit_transform(pages_data['sentence_text']).toarray()
columns = list(vectorizer.vocabulary_.keys())

page_indices = pd.DataFrame(page_indices_array, columns=columns)
page_indices.index = pages_data.index
page_indices.shape

(8034, 4231)

In [6]:
page_indices.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,чистовой,выражение,златоуст,огласить,срок,сдача,10,этажка,ветхоаварийщик,1em,...,jnews_6190be0ec09ea,jnews_module_26959_0_6190be0ec1764,jnews_module_26959_1_6190be0ec1b8e,jnews_module_26959_2_6190be0ec2c57,jnews_6190be0ecadb9,jnews_module_26959_3_6190be0ee4e1b,jnews_module_26959_4_6190be0ee5d2c,jnews_module_26959_5_6190be0ee6b62,jnews_module_26959_6_6190be0ee79b3,jnews_module_26959_7_6190be0ee9d14
link,sentence_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# теперь обучим векторизованный корпус методом ближайших соседей, использовав метрику косинусной близости
ranker = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker.fit(page_indices)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)

In [8]:
# смотрим в корпусе вторую строку столбца с токенами
pages_data.values[1][0]

'чистовой выражение'

In [9]:
# реализуем морфологический разбор слова, который мы писали в другом модуле
MORPH = pymorphy2.MorphAnalyzer()

def preprocess_query(query):
    """Метод, который будет предобрабатывать запрос:
    переводим в нижний регистр, удаляем двойные пробелы и делаем морф. преобразование"""
    query = query.lower()
    query = re.sub('\s\s+', ' ', query)
    
    wrds = []
    for wrd in nltk.word_tokenize(query):
        if wrd in punctuation:
            continue

        wrd = MORPH.parse(wrd)[0].normal_form
        wrds.append(wrd)

    clear_query = ' '.join(wrds)
    
    return clear_query

In [10]:
query = 'Новостройка новоселье'
query

'Новостройка новоселье'

In [11]:
preprocess_query(query)

'новостройка новоселье'

In [12]:
# реализованные ранее методы:
query = preprocess_query(query)
query_vect = vectorizer.transform([query]).toarray()

scores, indices = ranker.kneighbors(query_vect)
scores = scores[0]
indices = indices[0]

In [13]:
scores, indices

(array([0.57156048, 0.94467594, 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.  

In [14]:
# выберем строки с наиболее близкими новостями, соответствующими нашему запросу
page_indices.iloc[indices]

Unnamed: 0_level_0,Unnamed: 1_level_0,чистовой,выражение,златоуст,огласить,срок,сдача,10,этажка,ветхоаварийщик,1em,...,jnews_6190be0ec09ea,jnews_module_26959_0_6190be0ec1764,jnews_module_26959_1_6190be0ec1b8e,jnews_module_26959_2_6190be0ec2c57,jnews_6190be0ecadb9,jnews_module_26959_3_6190be0ee4e1b,jnews_module_26959_4_6190be0ee5d2c,jnews_module_26959_5_6190be0ee6b62,jnews_module_26959_6_6190be0ee79b3,jnews_module_26959_7_6190be0ee9d14
link,sentence_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html,284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html,129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
candidate_links = page_indices.iloc[indices]['link'] # .reset_index(drop=True)
score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})

# сбрасываем индекс
score_df.reset_index()

Unnamed: 0,link,sentence_order,candidate_link,score
0,data/raw_pages/zrg74.ru/obshhestvo/item/26920-...,329,0.0,0.571560
1,26920-chistovoe-vyrazhenie-v-zlatouste-oglasil...,284,0.0,0.944676
2,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,105,0.0,1.000000
3,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,50,0.0,1.000000
4,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,49,0.0,1.000000
...,...,...,...,...
95,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,137,0.0,1.000000
96,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,138,0.0,1.000000
97,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,139,0.0,1.000000
98,26932-kanikuljarnyj-podschjot-bolee-6-tysjach-...,129,0.0,1.000000


In [16]:
# группируем по столбцу link, суммируем по оценкам, затем сортируем по возрастанию
group_score_df = score_df.groupby('link').sum()['score']
group_score_df = group_score_df.sort_values()

# наиболее релевантная ссылка нашему запросу:
best_link = group_score_df.index[0]
best_link

'data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html'

In [17]:
# список подобранных релевантных ссылок
group_score_df[:10].index.tolist()

['data/raw_pages/zrg74.ru/obshhestvo/item/26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html',
 '26920-chistovoe-vyrazhenie-v-zlatouste-oglasili-sroki-sdachi-10-jetazhki-dlja-vethoavarijshhikov.html',
 '26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html']

In [18]:
# проведем обучение методом главных компонент (снижаем размерность)
pca = PCA(n_components=60)
pca_indices_array = pca.fit_transform(page_indices_array)

In [19]:
pca_indices_array[0]

array([-1.53314500e-01, -3.31280018e-02, -4.13072141e-02, -2.72379687e-02,
        1.49590068e-02,  1.78771198e-02, -6.86591511e-03, -1.34256688e-02,
       -8.30930630e-03, -9.84024470e-03,  1.04300438e-03,  3.08536920e-03,
       -4.24361818e-04,  8.06829238e-04, -2.39459496e-03, -4.51036934e-03,
        3.37681596e-04,  1.03079895e-03, -6.82783655e-03,  1.62004031e-03,
        2.57762663e-03,  1.12312408e-03,  7.48623866e-04,  8.93534864e-03,
       -1.87960465e-03,  2.24920648e-04, -1.07540463e-02,  1.25154806e-02,
        9.33076934e-03,  4.45860498e-03, -8.97011420e-03, -4.73475444e-03,
       -7.70517016e-03, -5.90825641e-03, -1.69009473e-03,  3.97590521e-04,
        1.11169197e-03,  2.88122236e-03, -3.59986673e-03, -2.91199465e-04,
        2.18997723e-03, -1.18753799e-03,  2.99293076e-03,  3.47538112e-03,
        3.77985432e-04,  9.78610788e-04,  1.13120952e-04,  3.36953830e-03,
        3.75102761e-04, -5.02523426e-04,  1.88025336e-03,  2.17994043e-03,
        1.91042504e-03, -

In [20]:
ranker_pca = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker_pca.fit(pca_indices_array)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)

In [21]:
pca.transform(vectorizer.transform([query]).toarray())

array([[-1.53442003e-01, -3.32257394e-02, -4.13070525e-02,
        -2.73123647e-02,  1.49813399e-02,  1.77601090e-02,
        -6.81642830e-03, -1.34602861e-02, -8.41517599e-03,
        -1.01105696e-02,  8.59026953e-04,  2.96935645e-03,
        -6.13285406e-04,  1.05501890e-03, -2.46175645e-03,
        -4.56745161e-03,  1.65021633e-05,  7.96943998e-04,
        -7.27769085e-03,  2.10448744e-03,  2.15539459e-03,
         1.06910748e-03,  7.74789337e-04,  8.60346372e-03,
        -1.66064449e-03,  4.52730722e-04, -1.07537882e-02,
         1.27782059e-02,  9.11323920e-03,  4.61484011e-03,
        -9.00331845e-03, -4.81030862e-03, -7.88968624e-03,
        -5.55976104e-03, -2.07398047e-03,  6.39724740e-04,
         1.53229814e-03,  3.06220554e-03, -3.65149143e-03,
        -1.37807274e-04,  2.26553271e-03, -1.09853072e-03,
         2.85028038e-03,  2.66431893e-03,  1.17605022e-04,
         9.20858429e-04, -3.29281274e-04,  3.49202764e-03,
         1.88891847e-04, -4.34137000e-04,  1.99696419e-0

In [22]:
def search_page(query, vectorizer, pca, ranker, page_count=10, score_type='sum'):
    """Функция поиска; page_count=10 - количество страниц с ответами, 
    score_type='sum' - как будем подсчитывать оценку (можно взять среднее)"""
    query = preprocess_query(query)
    query_vect = vectorizer.transform([query]).toarray()
    query_vect = pca.transform(query_vect)
    
    scores, indices = ranker.kneighbors(query_vect)
    scores = scores[0]
    indices = indices[0]
    

    candidate_links = page_indices.iloc[indices]['link'] # .reset_index(drop=True)
    score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})

    # сбрасываем индекс
    score_df.reset_index()
    
    if score_type == 'mean':
        group_score_df = group_score_df.groupby('link').mean()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    elif score_type == 'min':
        group_score_df = score_df.groupby('link').min()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    else:
        score_df['score'] = 1 - score_df['score']
        group_score_df = score_df.groupby('link').sum()['score']
        group_score_df = group_score_df.sort_values(ascending=False)
    
    return group_score_df[:page_count].index.tolist()

In [23]:
search_page('новостройка новоселье', vectorizer, pca, ranker_pca, page_count=2, score_type='sum')

['data/raw_pages/zrg74.ru/obshhestvo/item/26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html',
 'data/raw_pages/zrg74.ru/obshhestvo/item/26934-takoj-variant-zhiteli-zlatousta-mogut-projti-perepis-v-mfc.html']

In [24]:
import pickle
# «Пиклинг» - это процесс, посредством которого иерархия объекта Python 
# преобразуется в поток байтов, а «анпиклинг» - обратная операция, 
# посредством которой поток байтов 
# (из двоичного файла или байтоподобного объекта) 
# преобразуется обратно в иерархию объектов.

with open('search_model', 'wb') as f:
    pickle.dump({'vectorizer': vectorizer, 'pca': pca, 'ranker': ranker_pca}, f)

In [25]:
with open('search_model', 'rb') as f:
    search_model = pickle.load(f)
    
search_model

{'vectorizer': TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 'pca': PCA(n_components=60),
 'ranker': NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)}

In [26]:
vectorizer = search_model['vectorizer']
pca = search_model['pca']
ranker_pca = search_model['ranker']

vectorizer, pca, ranker_pca

(TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 PCA(n_components=60),
 NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100))

In [27]:
search_page('Туманов', vectorizer, pca, ranker_pca, page_count=3, score_type='sum')

['data/raw_pages/zrg74.ru/obshhestvo/item/26948-dlja-vseh-ljuboznatelnyh-v-zlatouste-opredelilis-s-ploshhadkoj-dlja-tehnoparka-kvantorium.html',
 'data/raw_pages/zrg74.ru/obshhestvo/item/26943-osennie-shtrihi-v-zlatouste-namechennye-sezonnye-preobrazovanija-podveli-k-finishnoj-cherte.html',
 'data/raw_pages/zrg74.ru/obshhestvo/item/26932-kanikuljarnyj-podschjot-bolee-6-tysjach-junyh-zlatoustovcev-otdohnuli-jetim-letom-v-lagerjah-i-zdravnicah.html']