In [2]:
import os, sys
import re
import math
from pymystem3 import Mystem
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
import multiprocessing as mp
import urllib
import pickle
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from collections import Counter
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import networkx as nx

In [3]:
# Single shared Elasticsearch client; the indexing and search helpers in this
# notebook read it as the global `es`.
es = Elasticsearch(
    [
        dict(host='localhost', port=9200, timeout=360, maxsize=25),
    ]
)

In [8]:
def read_docs():
    """Load 'texts.csv' and return its `id` and `text` columns.

    Returns:
        A 2-tuple ``(ids, docs)`` of NumPy arrays taken from the `id` and
        `text` columns, in file order.
    """
    frame = pd.read_csv('texts.csv', skiprows=0)
    return tuple(frame[column].to_numpy() for column in ('id', 'text'))


def read_graph(nodes):
    """Build a directed graph from 'url_graph.csv'.

    Args:
        nodes: iterable of node ids that are added to the graph up front, so
            they are present even when no edge mentions them.

    Returns:
        An ``nx.DiGraph`` holding `nodes` plus one edge per CSV row, going
        from the row's `src` value to its `dst` value.
    """
    link_table = pd.read_csv('url_graph.csv')
    # Two-column array: column 0 is the edge source, column 1 the destination.
    edge_rows = np.column_stack(
        (link_table['src'].to_numpy(), link_table['dst'].to_numpy())
    )

    link_graph = nx.DiGraph()
    link_graph.add_nodes_from(nodes)
    link_graph.add_edges_from(edge_rows)
    return link_graph


def read_urls():
    """Load 'urls.csv' into a dict keyed by integer document id.

    Returns:
        ``{int(html_id): html_url}`` built from the `html_id` and `html_url`
        columns; when an id repeats, the last row wins.
    """
    url_table = pd.read_csv('urls.csv')
    id_url_pairs = zip(url_table['html_id'], url_table['html_url'])
    return {int(doc_id): url for doc_id, url in id_url_pairs}


def load_lemmas():
    """Return the object stored in 'lemmas.pickle'.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    a file produced by a trusted source.
    """
    with open('lemmas.pickle', 'rb') as pickled:
        lemmas = pickle.load(pickled)
    return lemmas
    
    
def decode(s):
    """Decode a base64-encoded cp1251 payload into text.

    Args:
        s: base64 data (bytes or ASCII str).

    Returns:
        The decoded string; bytes that have no cp1251 mapping are dropped
        (``errors='ignore'``).
    """
    # Imported here because the notebook's import cell provides neither
    # module; without these the function raised NameError on first call.
    import base64
    import codecs

    raw = base64.b64decode(s)
    return codecs.decode(raw, 'cp1251', errors='ignore')


def get_quieries(relevance):
    """Collect the query text of every task that has relevance judgements.

    Args:
        relevance: container of task ids; only tasks whose `id` attribute is
            in it are kept.

    Returns:
        ``{task id: query text}`` read from 'web2008_adhoc.xml' (cp1251).
    """
    with open('web2008_adhoc.xml','r', encoding="cp1251") as xml_file:
        markup = xml_file.read()

    # NOTE(review): no parser is named here, so BeautifulSoup uses whichever
    # parser it selects by default in the running environment.
    soup = BeautifulSoup(markup)
    return {
        task['id']: task.querytext.string
        for task in soup.find_all('task')
        if task['id'] in relevance
    }


def get_relevance():
    """Read the 'vital' relevance judgements per task.

    Returns:
        ``{task id: set of document ids}`` from 'or_relevant-minus_table.xml'
        (cp1251), keeping only documents whose `relevance` attribute equals
        'vital'. Tasks without any such document are left out.
    """
    with open('or_relevant-minus_table.xml','r', encoding="cp1251") as xml_file:
        markup = xml_file.read()

    # NOTE(review): no parser is named here, so BeautifulSoup uses whichever
    # parser it selects by default in the running environment.
    soup = BeautifulSoup(markup)

    relevance = {}
    for task in soup.find_all('task'):
        vital_ids = {
            doc['id']
            for doc in task.find_all('document')
            if doc['relevance'] == 'vital'
        }
        if vital_ids:
            relevance[task['id']] = vital_ids
    return relevance

In [5]:
def create_settings():
    """Return the index-creation body (mappings + analysis settings).

    Mappings: `title` and `content` are `text` fields; `pagerank` and
    `url_len` are `rank_feature` fields.

    NOTE(review): the `white_lover` analyzer is defined under settings, but no
    field in the mappings names it via an `analyzer` key — confirm whether it
    was meant to be attached to `title`/`content`.
    """
    field_types = (
        ('title', 'text'),
        ('content', 'text'),
        ('pagerank', 'rank_feature'),
        ('url_len', 'rank_feature'),
    )
    # A fresh {'type': ...} dict per field, so no two fields share an object.
    mappings = {
        'properties': {name: {'type': kind} for name, kind in field_types}
    }

    white_lover = {'tokenizer': 'letter', 'filter': ['lowercase']}
    analysis = {'analyzer': {'white_lover': white_lover}}

    return {'mappings': mappings, 'settings': {'analysis': analysis}}


def create_es_action(index, doc_id, document):
    """Wrap `document` as one bulk action dict.

    Returns:
        A dict with keys `_index`, `_id` and `_source` set to `index`,
        `doc_id` and `document` respectively.
    """
    return dict(_index=index, _id=doc_id, _source=document)


def create_index_with_pagerank(index, ids, all_docs, pagerank, urls):
    """Bulk-index documents together with their PageRank and URL length.

    Each entry of `all_docs` is converted with ``str`` and split on its first
    newline into a title and a body. Entries that are None, or that contain
    no newline, are skipped.

    Args:
        index: name of the target index.
        ids: document ids, paired positionally with `all_docs`.
        all_docs: document texts in the form "title\\nbody".
        pagerank: mapping from document id to its PageRank score.
        urls: mapping from document id to its URL string.

    Failed bulk items reported by ``parallel_bulk`` are printed.
    """
    def next_document():
        # The pairs are materialised into a list so the progress bar knows
        # the total count.
        for i, doc in tqdm_notebook(list(zip(ids, all_docs))):
            if doc is None:
                continue
            parts = str(doc).split('\n', 1)
            if len(parts) != 2:
                continue
            title, body = parts
            desc = {
                'title': title,
                'content': body,
            }
            # Elasticsearch `rank_feature` fields accept only strictly
            # positive values. A document missing from `pagerank`/`urls`
            # used to be sent with 0, which makes the bulk item fail, so the
            # feature is now simply left out for such a document.
            rank = pagerank.get(i, 0)
            if rank > 0:
                desc['pagerank'] = rank
            url_len = len(urls.get(i, ''))
            if url_len > 0:
                desc['url_len'] = url_len
            yield create_es_action(index, int(i), desc)

    for ok, result in parallel_bulk(es, next_document(), queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            print(result)

def print_index_size(index):
    """Print the primary-shard store size reported for `index`, in GB (2**30 bytes)."""
    stats = es.indices.stats(index)
    size_in_bytes = stats['_all']['primaries']['store']['size_in_bytes']
    size_in_gb = size_in_bytes / 2 ** 30
    print(f"{size_in_gb:.2f} GB")

In [25]:
def get_query_builder(title_boost, pagerank_boost, url_length_boost, title_operator='AND'):
    """Create a function that turns a query string into a search request body.

    The body is a `bool` query with four `should` clauses: a `match` on
    `content`, a boosted `match` on `title`, and `rank_feature` clauses on
    `pagerank` and `url_len`.

    Args:
        title_boost: boost applied to the title match clause.
        pagerank_boost: boost applied to the `pagerank` rank feature.
        url_length_boost: boost applied to the `url_len` rank feature.
        title_operator: operator of the title match clause. Defaults to
            'AND' (every query term must occur in the title), which is what
            this function always used before; pass 'OR' to accept any term.

    Returns:
        A one-argument callable ``build_query(query) -> dict``.
    """
    def build_query(query):
        content_clause = {
            'match': {
                'content': query
            }
        }
        title_clause = {
            'match': {
                'title': {
                    'query': query,
                    'operator': title_operator,
                    'boost': title_boost
                }
            }
        }
        pagerank_clause = {
            'rank_feature': {
                'field': 'pagerank',
                'boost': pagerank_boost
            }
        }
        url_length_clause = {
            'rank_feature': {
                'field': 'url_len',
                'boost': url_length_boost
            }
        }
        return {
            'query': {
                'bool': {
                    'should': [
                        content_clause,
                        title_clause,
                        pagerank_clause,
                        url_length_clause,
                    ]
                }
            }
        }

    return build_query


def search(query, index, *args, K=20):
    """Run `query` against `index` and return the top hits as id/score dicts.

    Args:
        query: request body passed to ``es.search``.
        index: index name.
        *args: accepted and ignored. A result count passed positionally lands
            here and has no effect, so pass it as ``K=...``.
        K: number of hits requested (``size``).

    Returns:
        A list of ``{'id': hit id, 'score': hit score}`` in the order the
        hits were returned.
    """
    response = es.search(index=index, body=query, size=K)
    return [
        {'id': hit['_id'], 'score': hit['_score']}
        for hit in response['hits']['hits']
    ]


def get_relevant_for_k(res, relevant, K=20):
    """Count how many of the first `K` entries of `res` have an 'id' in `relevant`."""
    top_hits = res[:K]
    return sum(1 for hit in top_hits if hit['id'] in relevant)


def analyze_results(index, query_builder, lemmatize_query=False, K=20, params=None):
    """Run every evaluation query against `index` and print averaged metrics.

    Reads the notebook-level globals `queries` (task id -> query text) and
    `relevance` (task id -> relevant document ids). For each task the query
    is optionally lemmatized with Mystem, turned into a request body by
    `query_builder`, and searched. Printed values are averages over all
    tasks.

    Args:
        index: index to search.
        query_builder: callable mapping a query string to a request body.
        lemmatize_query: when true, lemmatize each query before searching.
        K: cutoff used for p@K, r@K and the MAP-style score.
        params: label printed with the results; treated as ``{}`` when None.
    """
    if params is None:
        params = {}

    # Mystem is only needed for lemmatization, so it is created on demand.
    m = Mystem() if lemmatize_query else None

    def lemmatize_doc(doc):
        return ''.join(m.lemmatize(str(doc)))

    Q = len(queries)
    qpK, qrK, qR_average, qmapK = 0, 0, 0, 0
    for task, q in tqdm_notebook(queries.items()):
        if lemmatize_query:
            q = lemmatize_doc(q)
        relevant = relevance[task]
        cur_relevant = len(relevant)

        # K must be passed by keyword: `search` swallows extra positional
        # arguments, so a positional K never reached Elasticsearch and 20
        # hits were always requested. R-precision looks at the top
        # `cur_relevant` hits, so at least that many are fetched; the
        # K-based metrics below slice to K and are unaffected by the extras.
        results = search(query_builder(q), index, K=max(K, cur_relevant))

        relevant_at_k = get_relevant_for_k(results, relevant, K)
        qpK += relevant_at_k / K
        qrK += relevant_at_k / cur_relevant
        qR_average += get_relevant_for_k(results, relevant, cur_relevant) / cur_relevant

        # NOTE(review): this averages precision@k over every k in 1..K. The
        # textbook average precision sums precision only at ranks that hold
        # a relevant document — confirm which metric is intended.
        mapK = 0
        for k in range(1, K + 1):
            mapK += get_relevant_for_k(results, relevant, k) / k
        mapK /= K
        qmapK += mapK
    print('=========')
    print(f'params={params}')
    print(f'p@{K} {qpK / Q}')
    print(f'r@{K} {qrK / Q}')
    print(f'R-precision@{K} {qR_average / Q}')
    print(f'Average MAP@{K} {qmapK / Q}')
    print()

In [9]:
# Load everything the index build needs. Only the ids are kept from
# texts.csv; its text column is bound to `_` and not used here.
ids, _ = read_docs()
ids = ids.astype(np.int32)
# NOTE(review): lemmatized_docs is later zipped with `ids` when indexing, so it
# is assumed to be in the same order as texts.csv — TODO confirm.
lemmatized_docs = load_lemmas()

# Link graph over the document ids, scored with PageRank (alpha=0.9).
graph = read_graph(ids)
pagerank = nx.pagerank(graph, alpha=0.9)

# Document id -> URL.
urls = read_urls()

In [10]:
# Relevance judgements first: get_quieries keeps only the tasks that appear in
# `relevance`. Both names are read as globals by analyze_results.
relevance = get_relevance()
queries = get_quieries(relevance)

In [11]:
# Rebuild the index from scratch: delete it, recreate it with the mappings from
# create_settings(), bulk-index the lemmatized documents, then report its size.
MYANDEX_PAGERANK = 'myandex_pagerank'
# ignore=[400, 404] — presumably so that deleting a not-yet-existing index does
# not raise and the cell can be re-run; confirm against the client version.
es.indices.delete(index=MYANDEX_PAGERANK, ignore=[400, 404])
es.indices.create(index=MYANDEX_PAGERANK, body=create_settings())
create_index_with_pagerank(MYANDEX_PAGERANK, ids, lemmatized_docs, pagerank, urls)
print_index_size(MYANDEX_PAGERANK)

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))


2.00 GB


In [24]:
# matching any word from query in title (OR operator)
# NOTE(review): as called here (three boosts only), get_query_builder in this
# notebook builds the title clause with operator 'AND'. This cell's execution
# count (24) is lower than that definition's (25), so the outputs recorded
# below were presumably produced by an earlier OR variant of the builder and
# will not be reproduced by Restart & Run All — confirm.
# Sweep title_boost with the pagerank and URL-length boosts held fixed.
for title_boost in [0.05, 0.1, 0.15, 0.2, 0.5, 0.6, 1, 1.5, 2, 2.5, 3, 5]:
    params = {
        'title_boost': title_boost, 
        'pagerank_boost': 0.3, 
        'url_length_boost': 0.1
    }
    query_builder = get_query_builder(**params)
    analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=True, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.40787878787878773
r@20 0.2679791848725738
R-precision@20 0.2278855390858517
Average MAP@20 0.4644576682599245



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.1, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4105050505050504
r@20 0.2690059372442559
R-precision@20 0.22845189524847453
Average MAP@20 0.4652662340773443



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.15, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4092929292929289
r@20 0.26871641700140114
R-precision@20 0.22778263702010262
Average MAP@20 0.4659289970591688



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.2, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.40818181818181776
r@20 0.26729345444453456
R-precision@20 0.22656571326881517
Average MAP@20 0.4647113159666634



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.38989898989898975
r@20 0.25647465647105466
R-precision@20 0.21227514918213955
Average MAP@20 0.446129721950519



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.6, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.38333333333333314
r@20 0.2520706249118833
R-precision@20 0.20905985956037307
Average MAP@20 0.43994655111864756



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 1, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.35858585858585806
r@20 0.22253605431629506
R-precision@20 0.1857521863268731
Average MAP@20 0.41056178874141386



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 1.5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.3356565656565655
r@20 0.19653478135389413
R-precision@20 0.17066042720179336
Average MAP@20 0.38910022299629693



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 2, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.3189898989898989
r@20 0.18539902829004395
R-precision@20 0.15951728534267562
Average MAP@20 0.37568961163179654



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 2.5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.3097979797979797
r@20 0.1792325817001651
R-precision@20 0.153429574798556
Average MAP@20 0.36854795759761033



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 3, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.3039393939393938
r@20 0.17399137899509745
R-precision@20 0.14910640467818762
Average MAP@20 0.3629080069354012



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.29505050505050495
r@20 0.16712922176581793
R-precision@20 0.14376694951082578
Average MAP@20 0.34963506564198443



In [26]:
# matching all words from query in title (AND operator)
# Same title_boost sweep as the earlier cell, with the pagerank and URL-length
# boosts held fixed; the loop body is identical to that cell's.
for title_boost in [0.05, 0.1, 0.15, 0.2, 0.5, 0.6, 1, 1.5, 2, 2.5, 3, 5]:
    params = {
        'title_boost': title_boost, 
        'pagerank_boost': 0.3, 
        'url_length_boost': 0.1
    }
    query_builder = get_query_builder(**params)
    analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=True, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4045454545454544
r@20 0.2663234146769705
R-precision@20 0.2248158977214754
Average MAP@20 0.45993655384507004



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.1, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4065656565656566
r@20 0.2676514241503794
R-precision@20 0.2253021563531334
Average MAP@20 0.459350301973707



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.15, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4075757575757574
r@20 0.2709839831457147
R-precision@20 0.225941112654866
Average MAP@20 0.4593703718109384



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.2, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4077777777777775
r@20 0.27218007657812626
R-precision@20 0.22612710507717654
Average MAP@20 0.4593632655570689



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.40676767676767667
r@20 0.2718985568163435
R-precision@20 0.22584558531539384
Average MAP@20 0.45781439464145374



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.6, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4059595959595957
r@20 0.27152600009669897
R-precision@20 0.2254730285957493
Average MAP@20 0.4573537973191082



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 1, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.40575757575757554
r@20 0.271318208632365
R-precision@20 0.22459183645801453
Average MAP@20 0.45618237102455833



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 1.5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.40515151515151493
r@20 0.2712063064360848
R-precision@20 0.22447993426173438
Average MAP@20 0.4553200454433802



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 2, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.40434343434343417
r@20 0.27101543670351486
R-precision@20 0.22428906452916442
Average MAP@20 0.45464551708849943



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 2.5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4044444444444442
r@20 0.2709698224883364
R-precision@20 0.22424345031398604
Average MAP@20 0.45421035579895447



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 3, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4046464646464644
r@20 0.271029285038365
R-precision@20 0.22430291286401463
Average MAP@20 0.4541160580905161



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 5, 'pagerank_boost': 0.3, 'url_length_boost': 0.1}
p@20 0.4043434343434341
r@20 0.27092748570663805
R-precision@20 0.22420111353228778
Average MAP@20 0.45379181901064813

