In [12]:
import os, sys
import re
from pymystem3 import Mystem
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
import multiprocessing as mp
import urllib
import pickle
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from collections import Counter
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import networkx as nx

In [92]:
def read_docs(path):
    df = pd.read_csv(path, skiprows=0)
    ids = df['id'].to_numpy()
    docs = df['text'].to_numpy()
    return ids, docs

ids, docs = read_docs('texts.csv')

In [89]:
settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text'
             }
         }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'white_lover': {
                    'tokenizer': 'letter',
                    'filter': [
                        'lowercase', 
                        'my_snow'
                    ]
                }
            },
            'filter': { 
                'my_snow': {
                    "type" : "snowball",
                    "language" : ["Russian", "English"]
                }
            }
        }
    }
}

es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])
es.indices.delete('myandex')
es.indices.create(index='myandex', body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'myandex'}

In [182]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

def create_index(all_docs, index):
    def next_document():
        for i, doc in tqdm_notebook(list(zip(ids, all_docs))):
            if doc is not None:
                yield create_es_action(index, int(i), {'content': str(doc)})
    
    for ok, result in parallel_bulk(es, next_document(), queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            print(result)

In [184]:
create_index(docs, 'yandex')

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))

In [173]:
def search(query, index, *args, K=20):
    res = es.search(index=index, body=query, size=K)["hits"]
    pretty_res = []
    for hit in res['hits']:
        pretty_res.append({'id': hit["_id"],
                           'score': hit["_score"]})
    return pretty_res

def search_and_print(query, *args):
    pretty_print_result(es.search(index='myandex', body=query, size=20), args)
    # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20
                        
def pretty_print_result(search_result, fields=[]):
    # fields is a list of fields names which we want to be printed
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(doc_id):
    return es.get(index='myandex', id=doc_id)['_source']

def build_query(query):
    return {
        'query': {
            'bool': {
                'must': [
                    {
                        'match': {
                            'content': query
                        }
                    }
                ]
            }
        }
    }

search(build_query('Андрей'), 'myandex', 'content')

[{'id': '332329', 'score': 6.322534},
 {'id': '677707', 'score': 6.3214607},
 {'id': '1322832', 'score': 6.2616105},
 {'id': '308211', 'score': 6.216829},
 {'id': '563489', 'score': 6.2017894},
 {'id': '633870', 'score': 6.200276},
 {'id': '967868', 'score': 6.1842537},
 {'id': '720665', 'score': 6.136043},
 {'id': '1091727', 'score': 6.1353703},
 {'id': '528882', 'score': 6.1119347},
 {'id': '271267', 'score': 6.1099944},
 {'id': '1196434', 'score': 6.1097217},
 {'id': '1488521', 'score': 6.104635},
 {'id': '1308244', 'score': 6.0981293},
 {'id': '795213', 'score': 6.0903134},
 {'id': '1002619', 'score': 6.0827484},
 {'id': '1203731', 'score': 6.077414},
 {'id': '924753', 'score': 6.0756383},
 {'id': '64833', 'score': 6.0557446},
 {'id': '9098', 'score': 6.0557446}]

In [98]:
def print_index_size(index): 
    print(f"{(es.indices.stats(index)['_all']['primaries']['store']['size_in_bytes'] / 2 ** 30):.2f} GB")

In [99]:
print_index_size('myandex')

2.06 GB


In [139]:
m = Mystem()

def lemmatize_doc(doc):
    return ''.join(m.lemmatize(str(doc)))
    
def lemmatize_collection(docs):
    result = []
    for doc in tqdm_notebook(docs):
        result.append(lemmatize_doc(doc))
    return result



HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))




In [161]:
def save_lemmas(lemmas):
    with open('lemmas.pickle', 'wb') as f:
        pickle.dump(lemmatized_docs, f)

def load_lemmas():
    with open('lemmas.pickle', 'rb') as f:
        return pickle.load(f)

In [162]:
# lemmatized_docs = lemmatize_collection(docs)
lemmatized_docs = load_lemmas()

In [150]:
def decode(s):
    s = base64.b64decode(s)
    s = codecs.decode(s, 'cp1251', errors='ignore')
    return s

def get_quieries(relevance):
    queries = {}
    with open('web2008_adhoc.xml','r', encoding="cp1251") as src:
        raw_xml = src.read()
        soup = BeautifulSoup(raw_xml)
        for task in soup.find_all('task'):
            if task['id'] in relevance:
                queries[task['id']] = task.querytext.string
    return queries

def get_relevance():
    relevance = {}
    with open('or_relevant-minus_table.xml','r', encoding="cp1251") as src:
        raw_xml = src.read()
        soup = BeautifulSoup(raw_xml)
        
        for task in soup.find_all('task'):
            documents = task.find_all('document') 
            vital = set()
            for doc in documents:
                if doc['relevance'] == 'vital':
                    vital.add(doc['id'])
            if vital:
                relevance[task['id']] = vital
    return relevance

In [151]:
relevance = get_relevance()
queries = get_quieries(relevance)
print(len(relevance), len(queries))

495 495


In [160]:
K = 20

def get_relevant_for_k(res, relevant, k):
    return sum([1 if res['id'] in relevant else 0 for res in res[:k]])        

def analyze_results(index):
    Q = len(queries)
    qpK, qrK, qR_average, qmapK = 0, 0, 0, 0
    for task, q in tqdm_notebook(queries.items()):
        results = search(build_query(q), index, K)
        cur_relevant = len(relevance[task])
        qpK += get_relevant_for_k(results, relevance[task], K) / K 
        qrK += get_relevant_for_k(results, relevance[task], K) / cur_relevant
        qR_average += get_relevant_for_k(results, relevance[task], cur_relevant) / cur_relevant
        mapK = 0
        for k in range(1, K + 1):
            mapK += get_relevant_for_k(results, relevance[task], k) / k
        mapK /= K
        qmapK += mapK
    print(f"p@20 {qpK / Q}")
    print(f"r@20 {qrK / Q}")
    print(f"R-precision {qR_average / Q}")
    print(f"Average MAP@20 {qmapK / Q}")
        
analyze_results('myandex')

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))

p@20 0.20010101010101009
r@20 0.1416571660545857
R-precision 0.1302237846212043
Average MAP@20 0.3563415944051676


In [186]:
MYANDEX_LEMMAS = 'myandex_lemmas'
es.indices.delete(MYANDEX_LEMMAS)
es.indices.create(index=MYANDEX_LEMMAS, body=settings)
create_index(lemmatized_docs, MYANDEX_LEMMAS)

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))

In [187]:
analyze_results(MYANDEX_LEMMAS)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))

p@20 0.25797979797979764
r@20 0.17678170065715523
R-precision 0.14856475601864197
Average MAP@20 0.2956632644661422
