In [234]:
import os, sys
import re
import math
from pymystem3 import Mystem
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
import multiprocessing as mp
import urllib
import pickle
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from collections import Counter
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import networkx as nx

In [235]:
def read_docs(path):
    df = pd.read_csv(path, skiprows=0)
    ids = df['id'].to_numpy()
    docs = df['text'].to_numpy()
    return ids, docs

ids, docs = read_docs('texts.csv')

In [295]:
MYANDEX = 'myandex'
settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text'
             }
         }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'white_lover': {
                    'tokenizer': 'letter',
                    'filter': [
                        'lowercase', 
                    ]
                }
            }
        }
    }
}

es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])
es.indices.delete(MYANDEX)
es.indices.create(index=MYANDEX, body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'myandex'}

In [296]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

def create_index(all_docs, index):
    def next_document():
        for i, doc in tqdm_notebook(list(zip(ids, all_docs))):
            if doc is not None:
                yield create_es_action(index, int(i), {'content': str(doc)})
    
    for ok, result in parallel_bulk(es, next_document(), queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            print(result)

In [297]:
create_index(docs, MYANDEX)

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))

In [313]:
def search(query, index, *args, K=20):
    res = es.search(index=index, body=query, size=K)["hits"]
    pretty_res = []
    for hit in res['hits']:
        pretty_res.append({'id': hit["_id"],
                           'score': hit["_score"]})
    return pretty_res

def search_and_print(query, index, *args):
    pretty_print_result(es.search(index=index, body=query, size=20), args)
    # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20
                        
def pretty_print_result(search_result, fields=[]):
    # fields is a list of fields names which we want to be printed
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(doc_id):
    return es.get(index=MYANDEX, id=doc_id)['_source']

def build_query(query):
    return {
        'query': {
            'bool': {
                'must': [
                    {
                        'match': {
                            'content': query
                        }
                    }
                ]
            }
        }
    }

search(build_query('Андрей'), MYANDEX, 'content')

[{'id': '332329', 'score': 6.322534},
 {'id': '677707', 'score': 6.3214607},
 {'id': '1322832', 'score': 6.2616105},
 {'id': '308211', 'score': 6.216829},
 {'id': '563489', 'score': 6.2017894},
 {'id': '633870', 'score': 6.200276},
 {'id': '967868', 'score': 6.1842537},
 {'id': '720665', 'score': 6.136043},
 {'id': '1091727', 'score': 6.1353703},
 {'id': '528882', 'score': 6.1119347},
 {'id': '271267', 'score': 6.1099944},
 {'id': '1196434', 'score': 6.1097217},
 {'id': '1488521', 'score': 6.104635},
 {'id': '1308244', 'score': 6.0981293},
 {'id': '795213', 'score': 6.0903134},
 {'id': '1002619', 'score': 6.0827484},
 {'id': '1203731', 'score': 6.077414},
 {'id': '924753', 'score': 6.0756383},
 {'id': '64833', 'score': 6.0557446},
 {'id': '9098', 'score': 6.0557446}]

In [241]:
def print_index_size(index): 
    print(f"{(es.indices.stats(index)['_all']['primaries']['store']['size_in_bytes'] / 2 ** 30):.2f} GB")

In [242]:
print_index_size(MYANDEX)

3.17 GB


In [243]:
m = Mystem()

def lemmatize_doc(doc):
    return ''.join(m.lemmatize(str(doc)))
    
def lemmatize_collection(docs):
    result = []
    for doc in tqdm_notebook(docs):
        result.append(lemmatize_doc(doc))
    return result



In [244]:
def save_lemmas(lemmas):
    with open('lemmas.pickle', 'wb') as f:
        pickle.dump(lemmatized_docs, f)

def load_lemmas():
    with open('lemmas.pickle', 'rb') as f:
        return pickle.load(f)

In [245]:
# lemmatized_docs = lemmatize_collection(docs)
lemmatized_docs = load_lemmas()

In [246]:
def decode(s):
    s = base64.b64decode(s)
    s = codecs.decode(s, 'cp1251', errors='ignore')
    return s

def get_quieries(relevance):
    queries = {}
    with open('web2008_adhoc.xml','r', encoding="cp1251") as src:
        raw_xml = src.read()
        soup = BeautifulSoup(raw_xml)
        for task in soup.find_all('task'):
            if task['id'] in relevance:
                queries[task['id']] = task.querytext.string
    return queries

def get_relevance():
    relevance = {}
    with open('or_relevant-minus_table.xml','r', encoding="cp1251") as src:
        raw_xml = src.read()
        soup = BeautifulSoup(raw_xml)
        
        for task in soup.find_all('task'):
            documents = task.find_all('document') 
            vital = set()
            for doc in documents:
                if doc['relevance'] == 'vital':
                    vital.add(doc['id'])
            if vital:
                relevance[task['id']] = vital
    return relevance

In [247]:
relevance = get_relevance()
queries = get_quieries(relevance)
print(len(relevance), len(queries))

495 495


In [273]:
K = 20

def get_relevant_for_k(res, relevant, k):
    return sum([1 if res['id'] in relevant else 0 for res in res[:k]])        

def analyze_results(index, lemmatize_query=False):
    Q = len(queries)
    qpK, qrK, qR_average, qmapK = 0, 0, 0, 0
    qR = []
    for task, q in tqdm_notebook(queries.items()):
        if lemmatize_query:
            q = lemmatize_doc(q)
        results = search(build_query(q), index, K)
        cur_relevant = len(relevance[task])
        qpK += get_relevant_for_k(results, relevance[task], K) / K 
        qrK += get_relevant_for_k(results, relevance[task], K) / cur_relevant
        qR.append(get_relevant_for_k(results, relevance[task], cur_relevant) / cur_relevant)
        qR_average += qR[-1]
        mapK = 0
        for k in range(1, K + 1):
            mapK += get_relevant_for_k(results, relevance[task], k) / k
        mapK /= K
        qmapK += mapK
    print(f"p@20 {qpK / Q}")
    print(f"r@20 {qrK / Q}")
    print(f"R-precision {qR_average / Q}")
    print(f"Average MAP@20 {qmapK / Q}")
    return np.array(qR)
        
qR_pure = analyze_results(MYANDEX)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))

p@20 0.2682828282828283
r@20 0.18055493176592302
R-precision 0.1529419202313428
Average MAP@20 0.3051976683724968


In [249]:
MYANDEX_LEMMAS = 'myandex_lemmas'
es.indices.delete(MYANDEX_LEMMAS)
es.indices.create(index=MYANDEX_LEMMAS, body=settings)
create_index(lemmatized_docs, MYANDEX_LEMMAS)
print_index_size(MYANDEX_LEMMAS)

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))

2.14 GB


In [274]:
qR_lemmas = analyze_results(MYANDEX_LEMMAS, lemmatize_query=True)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))

p@20 0.401010101010101
r@20 0.26760448426425604
R-precision 0.22521276526825904
Average MAP@20 0.45474138485518534


In [251]:
MYANDEX_TITLE = 'myandex_title'
settings = {
    'mappings': {
        'properties': {
            'title': {
                'type': 'text'
             },            
            'content': {
                'type': 'text'
             }
         }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'white_lover': {
                    'tokenizer': 'letter',
                    'filter': [
                        'lowercase', 
                    ]
                }
            }
        }
    }
}

es.indices.delete(MYANDEX_TITLE)
es.indices.create(index=MYANDEX_TITLE, body=settings)

def create_index_titled(all_docs, index):
    def next_document():
        for i, doc in tqdm_notebook(list(zip(ids, all_docs))):
            if doc is not None:
                doc = str(doc).split('\n', 1)
                if len(doc) == 2:
                    title, body = doc
                    yield create_es_action(index, int(i), {'content': body, 'title': title})
    
    for ok, result in parallel_bulk(es, next_document(), queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            print(result)
            

create_index_titled(lemmatized_docs, MYANDEX_TITLE)
print_index_size(MYANDEX_TITLE)

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))

1.92 GB


In [275]:
def build_query(query):
    return {
        'query': {
            'bool': {
                'should': [
                    {
                        'match': {
                            'title': {
                                'query': query,
                                'boost': 0.15
                            }
                        }
                    },
                    {
                        'match': {
                            'content': query
                        }
                    }
                ]
            }
        }
    }

qR_titles = analyze_results(MYANDEX, lemmatize_query=True)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))

p@20 0.40959595959595946
r@20 0.26869876829820327
R-precision 0.22932912345173131
Average MAP@20 0.46620991140582557


In [333]:
def max_diff(k=10):
    diffs = np.abs(qR_lemmas - qR_pure)
#     print(diffs)
    ascending = np.argsort(diffs)
#     print(ascending)
    for i in range(k):
        idx = ascending[-i - 1]
        q = list(queries.values())[idx]
        print(f'{q} {lemmatize_doc(q)} lemmas {qR_lemmas[idx]} vs pure {qR_pure[idx]}')
#         search_and_print(build_query(lemmatize_doc(q)), MYANDEX_LEMMAS, 'content')

In [334]:
max_diff()

УРАЛЬСКАЯ ПЛИТКА уральский плитка
 lemmas 1.0 vs pure 0.0
аугментин - состав аугментина - состав
 lemmas 1.0 vs pure 0.0
контакт контакт
 lemmas 1.0 vs pure 0.0
гда находится занзибар гда находиться занзибар
 lemmas 0.6 vs pure 0.0
уральские авиалинии уральский авиалиния
 lemmas 0.5862068965517241 vs pure 0.0
иониты в катализе ионит в катализ
 lemmas 0.5714285714285714 vs pure 0.0
допуски и посадки допуск и посадка
 lemmas 0.5652173913043478 vs pure 0.0
МАГАЗИН СУПИНАТОРЫ магазин супинатор
 lemmas 0.5625 vs pure 0.0
византийские источники о руси византийский источник о русь
 lemmas 0.5384615384615384 vs pure 0.0
карта турции карта турция
 lemmas 0.5 vs pure 0.0
