In [37]:
import base64
import math
import multiprocessing as mp
import os, sys
import pickle
import re
import time
import urllib
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
from bs4 import BeautifulSoup, Comment
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from tqdm import tqdm_notebook

# Enable inline plotly rendering for this notebook (`py` is plotly.offline).
py.init_notebook_mode(connected=True)

In [2]:
# Shared Elasticsearch client used by the indexing and search helpers below.
# Targets localhost:9200; 'timeout' and 'maxsize' are passed along as per-host options
# (presumably request timeout in seconds and connection-pool size -- confirm against the client version).
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [3]:
def read_docs():
    """Load the document collection from ``texts.csv``.

    Returns:
        tuple: ``(ids, docs)`` -- NumPy arrays taken from the ``id`` and ``text``
        columns respectively, in file order.
    """
    frame = pd.read_csv('texts.csv', skiprows=0)
    return frame['id'].to_numpy(), frame['text'].to_numpy()


def read_graph(nodes):
    """Build the directed link graph described by ``url_graph.csv``.

    Args:
        nodes: iterable of node ids added up front, so that nodes without any
            edge in the CSV are still part of the graph.

    Returns:
        networkx.DiGraph: graph with one edge per CSV row, from ``src`` to ``dst``.
    """
    links = pd.read_csv('url_graph.csv')
    # Two-column array: each row is one (src, dst) pair.
    edge_pairs = np.column_stack((links['src'].to_numpy(), links['dst'].to_numpy()))

    link_graph = nx.DiGraph()
    link_graph.add_nodes_from(nodes)
    link_graph.add_edges_from(edge_pairs)
    return link_graph


def read_urls():
    """Read ``urls.csv`` into a mapping from document id to URL.

    Returns:
        dict: ``int(html_id) -> html_url`` for every row of the file.
    """
    table = pd.read_csv('urls.csv')
    url_by_id = {}
    for _, record in table.iterrows():
        url_by_id[int(record['html_id'])] = record['html_url']
    return url_by_id


def load_lemmas():
    """Return the object stored in ``lemmas.pickle``.

    NOTE(review): unpickling executes arbitrary code from the file; only load
    a ``lemmas.pickle`` that was produced locally / comes from a trusted source.
    """
    with open('lemmas.pickle', 'rb') as handle:
        lemmas = pickle.load(handle)
    return lemmas
    
    
def decode(s):
    """Decode a base64 payload holding cp1251-encoded text.

    Args:
        s: base64 data (bytes-like, or an ASCII ``str``) as accepted by
            ``base64.b64decode``.

    Returns:
        str: the decoded text; bytes that are not valid cp1251 are dropped.
    """
    raw = base64.b64decode(s)
    # bytes.decode replaces codecs.decode, so no `codecs` import is required.
    return raw.decode('cp1251', errors='ignore')


def get_quieries(relevance):
    """Collect the query text of every task that has relevance judgements.

    Args:
        relevance: container of task ids (only membership tests are performed).

    Returns:
        dict: task id -> contents of the task's ``querytext`` element, for each
        ``task`` in ``web2008_adhoc.xml`` whose id is in ``relevance``.
    """
    with open('web2008_adhoc.xml','r', encoding="cp1251") as src:
        soup = BeautifulSoup(src.read())
    return {
        task['id']: task.querytext.string
        for task in soup.find_all('task')
        if task['id'] in relevance
    }


def get_relevance():
    """Read the relevance table and keep the documents judged ``vital``.

    Returns:
        dict: task id -> set of document ids whose ``relevance`` attribute is
        ``'vital'``. Tasks without any vital document are left out.
    """
    with open('or_relevant-minus_table.xml','r', encoding="cp1251") as src:
        soup = BeautifulSoup(src.read())

    vital_by_task = {}
    for task in soup.find_all('task'):
        vital_ids = {
            doc['id']
            for doc in task.find_all('document')
            if doc['relevance'] == 'vital'
        }
        if vital_ids:
            vital_by_task[task['id']] = vital_ids
    return vital_by_task

In [4]:
def create_settings():
    """Return the index-creation body: field mappings plus analysis settings.

    ``title``, ``content`` and ``raw_content`` are mapped as ``text``;
    ``pagerank`` and ``url_len`` are mapped as ``rank_feature``.
    A custom analyzer named ``white_lover`` (letter tokenizer + lowercase
    filter) is declared; none of the mapped fields references it here.
    """
    text_fields = ('title', 'content', 'raw_content')
    feature_fields = ('pagerank', 'url_len')

    properties = {}
    for field in text_fields:
        properties[field] = {'type': 'text'}
    for field in feature_fields:
        properties[field] = {'type': 'rank_feature'}

    analyzer = {
        'tokenizer': 'letter',
        'filter': ['lowercase'],
    }

    return {
        'mappings': {'properties': properties},
        'settings': {'analysis': {'analyzer': {'white_lover': analyzer}}},
    }


def create_es_action(index, doc_id, document):
    """Wrap a document into a bulk-helper action dict.

    Returns:
        dict: with keys ``_index``, ``_id`` and ``_source`` set from the arguments.
    """
    return dict(_index=index, _id=doc_id, _source=document)


def create_index_with_pagerank(index, ids, all_docs, all_raw_docs, pagerank, urls):
    """Bulk-index the collection together with its static ranking features.

    Args:
        index: name of the target index.
        ids: document ids; zipped position-by-position with the two text sequences.
        all_docs: texts whose first line becomes ``title`` and whose remainder
            becomes ``content``. ``None`` entries and texts without a newline
            are skipped.
        all_raw_docs: texts stored as ``raw_content``.
        pagerank: mapping id -> PageRank score (0 when the id is missing).
        urls: mapping id -> URL; its length is stored as ``url_len``
            (0 when the id is missing).

    NOTE(review): Elasticsearch documents ``rank_feature`` fields as accepting
    only strictly positive values, so the 0 fallbacks may be rejected at index
    time -- confirm that every id is present in ``pagerank`` and ``urls``.
    """
    def generate_actions():
        # Materialised as a list so the progress bar knows the total count.
        rows = list(zip(ids, all_docs, all_raw_docs))
        for doc_id, lemmatized, raw in tqdm_notebook(rows):
            if lemmatized is None:
                continue
            parts = str(lemmatized).split('\n', 1)
            if len(parts) != 2:
                continue
            title, body = parts
            source = {
                'title': title,
                'content': str(body),
                'raw_content': str(raw),
                'pagerank': pagerank.get(doc_id, 0),
                'url_len': len(urls.get(doc_id, ''))
            }
            yield create_es_action(index, int(doc_id), source)

    bulk_results = parallel_bulk(es, generate_actions(), queue_size=4, thread_count=4, chunk_size=1000)
    for ok, result in bulk_results:
        if not ok:
            print(result)

def print_index_size(index):
    """Print the primary-shard store size reported by the index stats API, in GB (bytes / 2**30)."""
    stats = es.indices.stats(index)
    size_in_bytes = stats['_all']['primaries']['store']['size_in_bytes']
    print(f"{(size_in_bytes / 2 ** 30):.2f} GB")

In [58]:
def get_query_builder(title_boost, pagerank_boost, url_length_boost, content_boost=1, raw_content_boost=0):
    """Create a function that turns a query string into a search request body.

    The body is a ``bool`` query whose ``should`` list holds, in order:
    ``match`` clauses on ``content``, ``raw_content`` and ``title`` (the latter
    with operator ``OR``), followed by ``rank_feature`` clauses on ``pagerank``
    and ``url_len``. Each clause carries the corresponding boost argument.

    Returns:
        callable: ``query -> dict``.
    """
    def build(query):
        text_clauses = [
            {'match': {'content': {'query': query, 'boost': content_boost}}},
            {'match': {'raw_content': {'query': query, 'boost': raw_content_boost}}},
            {'match': {'title': {'query': query, 'operator': 'OR', 'boost': title_boost}}},
        ]
        static_clauses = [
            {'rank_feature': {'field': 'pagerank', 'boost': pagerank_boost}},
            {'rank_feature': {'field': 'url_len', 'boost': url_length_boost}},
        ]
        return {'query': {'bool': {'should': text_clauses + static_clauses}}}

    return build


def search(query, index, *args, K=20):
    """Run ``query`` against ``index`` and return up to ``K`` hits.

    Extra positional arguments are accepted and ignored; ``K`` is keyword-only.

    Returns:
        list[dict]: one ``{'id': <_id>, 'score': <_score>}`` dict per hit, in
        the order returned by Elasticsearch.
    """
    hits = es.search(index=index, body=query, size=K)['hits']['hits']
    return [{'id': hit['_id'], 'score': hit['_score']} for hit in hits]


def get_relevant_for_k(res, relevant, K=20):
    """Count how many of the first ``K`` hits in ``res`` have an ``'id'`` contained in ``relevant``."""
    return sum(1 for hit in res[:K] if hit['id'] in relevant)


def analyze_results(index, query_builder, lemmatize_query=False, K=20, params=None):
    """Run every evaluation query against ``index`` and report averaged metrics.

    Relies on the notebook-level ``queries`` (task id -> query text) and
    ``relevance`` (task id -> set of relevant document ids) mappings and on
    ``search`` / ``get_relevant_for_k``.

    Args:
        index: name of the Elasticsearch index to query.
        query_builder: callable mapping a query string to a request body.
        lemmatize_query: when true, the query text is lemmatized with Mystem
            before being passed to ``query_builder``.
        K: rank cutoff used for p@K, r@K and MAP@K.
        params: optional dict echoed in the printed report; not used otherwise.

    Returns:
        tuple: ``(p@K, r@K, R-precision, MAP@K)``, each averaged over all queries.
        MAP here averages precision over the relevant hits found in the top K.
    """
    if params is None:
        params = {}

    # The lemmatizer is only needed when queries are lemmatized.
    m = Mystem() if lemmatize_query else None

    def lemmatize_doc(doc):
        return ''.join(m.lemmatize(str(doc)))

    Q = len(queries)
    qpK, qrK, qR_average, qmapK = 0, 0, 0, 0
    for task, q in tqdm_notebook(queries.items()):
        if lemmatize_query:
            q = lemmatize_doc(q)
        relevant = relevance[task]
        cur_relevant = len(relevant)

        # R-precision inspects the top `cur_relevant` hits, which can exceed K,
        # so fetch enough hits for both cutoffs. K must be passed by keyword:
        # search() silently ignores extra positional arguments.
        # NOTE(review): very large relevance sets could exceed the index's
        # result-window limit -- confirm this cannot happen for this collection.
        results = search(query_builder(q), index, K=max(K, cur_relevant))

        found_at_k = get_relevant_for_k(results, relevant, K)
        qpK += found_at_k / K
        qrK += found_at_k / cur_relevant
        qR_average += get_relevant_for_k(results, relevant, cur_relevant) / cur_relevant

        mapK, cur = 0, 0
        for k in range(min(K, len(results))):
            if results[k]['id'] in relevant:
                cur += 1
                mapK += cur / (k + 1)
        if cur != 0:
            mapK /= cur
        qmapK += mapK
    print('=========')
    print(f'params={params}')
    print(f'p@{K} {qpK / Q}')
    print(f'r@{K} {qrK / Q}')
    print(f'R-precision@{K} {qR_average / Q}')
    print(f'MAP@{K} {qmapK / Q}')
    print()

    return qpK / Q, qrK / Q, qR_average / Q, qmapK / Q

In [6]:
# Raw documents and their ids; ids are cast to int32 before being used as graph nodes.
ids, docs = read_docs()
ids = ids.astype(np.int32)
# Lemmatized texts loaded from the pickle file.
# NOTE(review): assumed to be aligned element-for-element with `ids`/`docs`
# (the three are zipped together at indexing time) -- confirm.
lemmatized_docs = load_lemmas()

# Link graph over the document ids and its PageRank scores (alpha=0.9).
graph = read_graph(ids)
pagerank = nx.pagerank(graph, alpha=0.9)

# id -> url mapping; the URL length is indexed as a feature.
urls = read_urls()

In [7]:
# Relevance judgements first: only tasks that have judged documents get a query.
relevance = get_relevance()
queries = get_quieries(relevance)

In [8]:
# Recreate the index from scratch: drop it if present (400/404 responses are ignored), then create it.
MYANDEX_PAGERANK = 'myandex_pagerank'
es.indices.delete(index=MYANDEX_PAGERANK, ignore=[400, 404])
es.indices.create(index=MYANDEX_PAGERANK, body=create_settings())
# tin/tout bracket the bulk indexing only; the elapsed time is printed in the next cell.
tin = time.time()
create_index_with_pagerank(MYANDEX_PAGERANK, ids, lemmatized_docs, docs, pagerank, urls)
tout = time.time()
print_index_size(MYANDEX_PAGERANK)

HBox(children=(IntProgress(value=0, max=199368), HTML(value='')))


3.32 GB


In [9]:
# Wall-clock duration of the bulk indexing measured in the previous cell.
print(f'total_time={tout-tin:.5f} sec')

total_time=245.07406 sec


In [51]:
# Full configuration: lemmatized content and query, title match,
# plus the static rank features (PageRank and URL length).
params = {
    'title_boost': 0.05, 
    'pagerank_boost': 0.3, 
    'url_length_boost': 0.1,
    'content_boost': 1,
    'raw_content_boost': 0
}
query_builder = get_query_builder(**params)
_ = analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=True, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@20 0.40797979797979783
r@20 0.26800214171371245
R-precision@20 0.2279084959269904
MAP@20 0.5563371968330636



In [52]:
# Same as the full configuration, but with the static rank-feature boosts
# (PageRank, URL length) set to 0.
params = {
    'title_boost': 0.05, 
    'pagerank_boost': 0, 
    'url_length_boost': 0,
    'content_boost': 1,
    'raw_content_boost': 0
}
query_builder = get_query_builder(**params)
_ = analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=True, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0, 'url_length_boost': 0, 'content_boost': 1, 'raw_content_boost': 0}
p@20 0.4073737373737372
r@20 0.26724703458598775
R-precision@20 0.22796908420732437
MAP@20 0.55738684362248



In [53]:
# Lemmatized content only: title and static-feature boosts are all 0.
params = {
    'title_boost': 0, 
    'pagerank_boost': 0, 
    'url_length_boost': 0,
    'content_boost': 1,
    'raw_content_boost': 0
}
query_builder = get_query_builder(**params)
_ = analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=True, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0, 'pagerank_boost': 0, 'url_length_boost': 0, 'content_boost': 1, 'raw_content_boost': 0}
p@20 0.39999999999999997
r@20 0.2673210090932032
R-precision@20 0.22566073878320037
MAP@20 0.5451749977972244



In [54]:
def get_raw_should_query_builder():
    """Return a builder for a query that matches only ``raw_content`` via a ``should`` clause.

    Deliberately NOT named ``get_query_builder``: redefining that name here
    shadowed the boosted builder defined earlier, so later cells calling
    ``get_query_builder(**params)`` failed on a fresh Restart & Run All.

    Returns:
        callable: ``query -> dict`` request body.
    """
    return lambda query : {
        'query': {
            'bool': {
                'should': [
                    {
                        'match': {
                            'raw_content': query,
                        }
                    }
                ]
            }
        }
    }


# Baseline on raw (non-lemmatized) text. `params` is only echoed in the
# report; this builder takes no boosts.
params = {
    'title_boost': 0,
    'pagerank_boost': 0,
    'url_length_boost': 0,
    'content_boost': 0,
    'raw_content_boost': 1
}

query_builder = get_raw_should_query_builder()
_ = analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=False, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0, 'pagerank_boost': 0, 'url_length_boost': 0, 'content_boost': 0, 'raw_content_boost': 1}
p@20 0.3434343434343436
r@20 0.22120657518953626
R-precision@20 0.1907489345702951
MAP@20 0.5136545704304819



In [55]:
def get_raw_must_query_builder():
    """Return a builder for a query that matches only ``raw_content`` via a ``must`` clause.

    Deliberately NOT named ``get_query_builder``: redefining that name here
    shadowed the boosted builder defined earlier, so later cells calling
    ``get_query_builder(**params)`` failed on a fresh Restart & Run All.

    Returns:
        callable: ``query -> dict`` request body.
    """
    return lambda query : {
        'query': {
            'bool': {
                'must': [
                    {
                        'match': {
                            'raw_content': query,
                        }
                    }
                ]
            }
        }
    }


# Raw (non-lemmatized) text again, this time as a required ('must') match.
# `params` is only echoed in the report; this builder takes no boosts.
params = {
    'title_boost': 0,
    'pagerank_boost': 0,
    'url_length_boost': 0,
    'content_boost': 0,
    'raw_content_boost': 1
}

query_builder = get_raw_must_query_builder()
_ = analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=False, K=20, params=params)

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0, 'pagerank_boost': 0, 'url_length_boost': 0, 'content_boost': 0, 'raw_content_boost': 1}
p@20 0.3434343434343436
r@20 0.22120657518953626
R-precision@20 0.1907489345702951
MAP@20 0.5136545704304819



In [56]:
def build_plot(results):
    """Draw the four averaged metrics as line charts over the cutoff K.

    Args:
        results: sequence of ``(p, r, R, MAP)`` 4-tuples; the i-th entry is
            plotted at x = i + 1.
    """
    cutoffs = list(range(1, len(results) + 1))

    precision, recall, r_precision, mean_ap = [], [], [], []
    for p, r, R, MAP in results:
        precision.append(p)
        recall.append(r)
        r_precision.append(R)
        mean_ap.append(MAP)

    series = (
        ('p@K', precision),
        ('r@K', recall),
        ('R-precision@K', r_precision),
        ('MAP@K', mean_ap),
    )
    traces = [
        go.Scatter(x=cutoffs, y=values, name=label, mode='lines')
        for label, values in series
    ]

    layout = go.Layout(xaxis={'title': 'K'}, yaxis={'title': 'score'})
    py.iplot(go.Figure(traces, layout=layout))

In [59]:
# Sweep the cutoff K from 1 to 20 with the full parameter set; each element of
# `results` is the (p, r, R-precision, MAP) tuple returned for one K.
params = {
    'title_boost': 0.05, 
    'pagerank_boost': 0.3, 
    'url_length_boost': 0.1,
    'content_boost': 1,
    'raw_content_boost': 0
}
query_builder = get_query_builder(**params)
results = [analyze_results(MYANDEX_PAGERANK, query_builder, lemmatize_query=True, K=k, params=params) for k in range(1, 21)]

HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@1 0.5494949494949495
r@1 0.02538380936741067
R-precision@1 0.2279084959269904
MAP@1 0.5494949494949495



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@2 0.5393939393939394
r@2 0.04713920237217049
R-precision@2 0.2279084959269904
MAP@2 0.5969696969696969



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@3 0.5205387205387202
r@3 0.06587442099027123
R-precision@3 0.2279084959269904
MAP@3 0.6055555555555554



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@4 0.5126262626262627
r@4 0.08378599066203048
R-precision@4 0.2279084959269904
MAP@4 0.6037037037037034



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@5 0.4993939393939397
r@5 0.09727631005677798
R-precision@5 0.2279084959269904
MAP@5 0.6038355780022441



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@6 0.4902356902356904
r@6 0.11274943631301382
R-precision@6 0.2279084959269904
MAP@6 0.6029315375982041



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@7 0.4802308802308802
r@7 0.12685400171226757
R-precision@7 0.2279084959269904
MAP@7 0.6008644380311048



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@8 0.47424242424242424
r@8 0.1408066347964722
R-precision@8 0.2279084959269904
MAP@8 0.5956181199752629



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@9 0.46868686868686876
r@9 0.1544530232182102
R-precision@9 0.2279084959269904
MAP@9 0.5905953583453583



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@10 0.4604040404040404
r@10 0.16641377271002872
R-precision@10 0.2279084959269904
MAP@10 0.586557092721114



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@11 0.4534435261707989
r@11 0.17693847780105393
R-precision@11 0.2279084959269904
MAP@11 0.584451454870286



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@12 0.44595959595959594
r@12 0.18744514433086906
R-precision@12 0.2279084959269904
MAP@12 0.5812712115869778



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@13 0.44024864024864024
r@13 0.19782655186536863
R-precision@13 0.2279084959269904
MAP@13 0.576783069120808



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@14 0.43593073593073633
r@14 0.20964175999099346
R-precision@14 0.2279084959269904
MAP@14 0.5723492712513084



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@15 0.43084175084175047
r@15 0.22145637294133885
R-precision@15 0.2279084959269904
MAP@15 0.5706535982533995



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@16 0.4268939393939394
r@16 0.2321318246512367
R-precision@16 0.2279084959269904
MAP@16 0.5683896600524405



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@17 0.4211527035056449
r@17 0.24099908317536192
R-precision@17 0.2279084959269904
MAP@17 0.5650714531664506



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@18 0.4163860830527495
r@18 0.24928805925769193
R-precision@18 0.2279084959269904
MAP@18 0.562283249547074



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@19 0.4123338649654441
r@19 0.2588011428649659
R-precision@19 0.2279084959269904
MAP@19 0.5600075911557244



HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


params={'title_boost': 0.05, 'pagerank_boost': 0.3, 'url_length_boost': 0.1, 'content_boost': 1, 'raw_content_boost': 0}
p@20 0.40797979797979783
r@20 0.26800214171371245
R-precision@20 0.2279084959269904
MAP@20 0.5563371968330636



In [60]:
# Plot the averaged metrics from the sweep against K.
build_plot(results)