In [46]:
import os, sys
import re
import math
from pymystem3 import Mystem
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
import multiprocessing as mp
import urllib
import pickle
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from collections import Counter
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import networkx as nx
import time

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from catboost import CatBoost, Pool, MetricVisualizer

import plotly.offline as py
import plotly.graph_objs as go

py.init_notebook_mode(connected=True)

In [4]:
def read_data(file):
    X, y = load_svmlight_file(file)
    print(X.shape, y.shape)
    qid = np.zeros(y.shape, dtype=int)
    with open(file, 'r') as f:
        for i, line in enumerate(f):
            qid[i] = int(line.split()[-1])
    return X.toarray(), y / np.max(y), qid
      
    
def order_by_qid(X, y, qid):
    idx = np.argsort(qid)
    return X[idx], y[idx], qid[idx]

# IMAT 2009 ranking

In [24]:
X, y, qid = read_data('imat2009-datasets/imat2009_learning.txt')
X, y, qid = order_by_qid(X_train, y_train, qid_train)

(97290, 245) (97290,)


In [25]:
X_train, X_test, y_train, y_test, qid_train, qid_test = train_test_split(X, y, qid, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val, qid_train, qid_val = train_test_split(X_train, y_train, qid_train, test_size=0.125, shuffle=False)

In [26]:
train = Pool(
    data=X_train,
    label=y_train,
    group_id=qid_train
)

test = Pool(
    data=X_test,
    label=y_test,
    group_id=qid_test
)

val = Pool(
    data=X_val,
    label=y_val,
    group_id=qid_val
)

In [45]:
parameters = {
    'loss_function': 'QueryRMSE',
    'eval_metric': 'NDCG:top=20',
    'verbose': True,
    'random_seed': 0,
    'iterations': 2000,
    'learning_rate': 0.045,
    'depth': 6,
    'max_bin': 64
}

model = CatBoost(parameters)
model.fit(train, eval_set=val, use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7065538	best: 0.7065538 (0)	total: 145ms	remaining: 4m 50s
1:	test: 0.7352348	best: 0.7352348 (1)	total: 349ms	remaining: 5m 49s
2:	test: 0.7431379	best: 0.7431379 (2)	total: 497ms	remaining: 5m 30s
3:	test: 0.7520812	best: 0.7520812 (3)	total: 663ms	remaining: 5m 30s
4:	test: 0.7563591	best: 0.7563591 (4)	total: 819ms	remaining: 5m 26s
5:	test: 0.7593573	best: 0.7593573 (5)	total: 1.22s	remaining: 6m 44s
6:	test: 0.7607983	best: 0.7607983 (6)	total: 1.54s	remaining: 7m 19s
7:	test: 0.7617563	best: 0.7617563 (7)	total: 1.7s	remaining: 7m 3s
8:	test: 0.7626214	best: 0.7626214 (8)	total: 1.85s	remaining: 6m 49s
9:	test: 0.7640904	best: 0.7640904 (9)	total: 2s	remaining: 6m 38s
10:	test: 0.7638685	best: 0.7640904 (9)	total: 2.16s	remaining: 6m 31s
11:	test: 0.7665732	best: 0.7665732 (11)	total: 2.65s	remaining: 7m 18s
12:	test: 0.7672498	best: 0.7672498 (12)	total: 2.92s	remaining: 7m 25s
13:	test: 0.7678267	best: 0.7678267 (13)	total: 3.08s	remaining: 7m 17s
14:	test: 0.769226

116:	test: 0.7792377	best: 0.7792377 (116)	total: 21.5s	remaining: 5m 45s
117:	test: 0.7795513	best: 0.7795513 (117)	total: 21.7s	remaining: 5m 45s
118:	test: 0.7795872	best: 0.7795872 (118)	total: 21.8s	remaining: 5m 44s
119:	test: 0.7796327	best: 0.7796327 (119)	total: 22s	remaining: 5m 45s
120:	test: 0.7803405	best: 0.7803405 (120)	total: 22.3s	remaining: 5m 46s
121:	test: 0.7819019	best: 0.7819019 (121)	total: 22.5s	remaining: 5m 46s
122:	test: 0.7812515	best: 0.7819019 (121)	total: 22.7s	remaining: 5m 46s
123:	test: 0.7815523	best: 0.7819019 (121)	total: 22.9s	remaining: 5m 45s
124:	test: 0.7813905	best: 0.7819019 (121)	total: 23.1s	remaining: 5m 45s
125:	test: 0.7818254	best: 0.7819019 (121)	total: 23.2s	remaining: 5m 45s
126:	test: 0.7820208	best: 0.7820208 (126)	total: 23.4s	remaining: 5m 44s
127:	test: 0.7819391	best: 0.7820208 (126)	total: 23.7s	remaining: 5m 46s
128:	test: 0.7813344	best: 0.7820208 (126)	total: 23.8s	remaining: 5m 45s
129:	test: 0.7814284	best: 0.7820208 (12

228:	test: 0.7844584	best: 0.7844584 (228)	total: 42.3s	remaining: 5m 27s
229:	test: 0.7844053	best: 0.7844584 (228)	total: 42.5s	remaining: 5m 27s
230:	test: 0.7842538	best: 0.7844584 (228)	total: 42.7s	remaining: 5m 26s
231:	test: 0.7843140	best: 0.7844584 (228)	total: 42.9s	remaining: 5m 26s
232:	test: 0.7843384	best: 0.7844584 (228)	total: 43s	remaining: 5m 25s
233:	test: 0.7843384	best: 0.7844584 (228)	total: 43.1s	remaining: 5m 25s
234:	test: 0.7846896	best: 0.7846896 (234)	total: 43.2s	remaining: 5m 24s
235:	test: 0.7846614	best: 0.7846896 (234)	total: 43.4s	remaining: 5m 24s
236:	test: 0.7845753	best: 0.7846896 (234)	total: 43.5s	remaining: 5m 23s
237:	test: 0.7845729	best: 0.7846896 (234)	total: 43.7s	remaining: 5m 23s
238:	test: 0.7845187	best: 0.7846896 (234)	total: 43.8s	remaining: 5m 23s
239:	test: 0.7846126	best: 0.7846896 (234)	total: 44s	remaining: 5m 22s
240:	test: 0.7849430	best: 0.7849430 (240)	total: 44.1s	remaining: 5m 21s
241:	test: 0.7854889	best: 0.7854889 (241)

340:	test: 0.7864046	best: 0.7868867 (329)	total: 1m	remaining: 4m 52s
341:	test: 0.7861908	best: 0.7868867 (329)	total: 1m	remaining: 4m 52s
342:	test: 0.7863624	best: 0.7868867 (329)	total: 1m	remaining: 4m 52s
343:	test: 0.7868426	best: 0.7868867 (329)	total: 1m	remaining: 4m 51s
344:	test: 0.7861769	best: 0.7868867 (329)	total: 1m	remaining: 4m 51s
345:	test: 0.7863236	best: 0.7868867 (329)	total: 1m	remaining: 4m 51s
346:	test: 0.7864578	best: 0.7868867 (329)	total: 1m 1s	remaining: 4m 51s
347:	test: 0.7862596	best: 0.7868867 (329)	total: 1m 1s	remaining: 4m 51s
348:	test: 0.7862254	best: 0.7868867 (329)	total: 1m 1s	remaining: 4m 51s
349:	test: 0.7862150	best: 0.7868867 (329)	total: 1m 1s	remaining: 4m 51s
350:	test: 0.7864919	best: 0.7868867 (329)	total: 1m 1s	remaining: 4m 50s
351:	test: 0.7863747	best: 0.7868867 (329)	total: 1m 2s	remaining: 4m 50s
352:	test: 0.7863487	best: 0.7868867 (329)	total: 1m 2s	remaining: 4m 50s
353:	test: 0.7863432	best: 0.7868867 (329)	total: 1m 2s	

452:	test: 0.7872406	best: 0.7873907 (450)	total: 1m 18s	remaining: 4m 27s
453:	test: 0.7873204	best: 0.7873907 (450)	total: 1m 18s	remaining: 4m 27s
454:	test: 0.7872074	best: 0.7873907 (450)	total: 1m 18s	remaining: 4m 26s
455:	test: 0.7872248	best: 0.7873907 (450)	total: 1m 18s	remaining: 4m 26s
456:	test: 0.7872951	best: 0.7873907 (450)	total: 1m 19s	remaining: 4m 27s
457:	test: 0.7873131	best: 0.7873907 (450)	total: 1m 19s	remaining: 4m 27s
458:	test: 0.7870463	best: 0.7873907 (450)	total: 1m 19s	remaining: 4m 27s
459:	test: 0.7871596	best: 0.7873907 (450)	total: 1m 19s	remaining: 4m 27s
460:	test: 0.7871237	best: 0.7873907 (450)	total: 1m 19s	remaining: 4m 26s
461:	test: 0.7870998	best: 0.7873907 (450)	total: 1m 20s	remaining: 4m 26s
462:	test: 0.7871122	best: 0.7873907 (450)	total: 1m 20s	remaining: 4m 26s
463:	test: 0.7872105	best: 0.7873907 (450)	total: 1m 20s	remaining: 4m 26s
464:	test: 0.7870262	best: 0.7873907 (450)	total: 1m 20s	remaining: 4m 25s
465:	test: 0.7867452	best

563:	test: 0.7878938	best: 0.7882689 (544)	total: 1m 36s	remaining: 4m 5s
564:	test: 0.7885261	best: 0.7885261 (564)	total: 1m 36s	remaining: 4m 5s
565:	test: 0.7884679	best: 0.7885261 (564)	total: 1m 36s	remaining: 4m 5s
566:	test: 0.7885007	best: 0.7885261 (564)	total: 1m 37s	remaining: 4m 5s
567:	test: 0.7879237	best: 0.7885261 (564)	total: 1m 37s	remaining: 4m 5s
568:	test: 0.7876118	best: 0.7885261 (564)	total: 1m 37s	remaining: 4m 5s
569:	test: 0.7880751	best: 0.7885261 (564)	total: 1m 37s	remaining: 4m 5s
570:	test: 0.7880445	best: 0.7885261 (564)	total: 1m 37s	remaining: 4m 4s
571:	test: 0.7880310	best: 0.7885261 (564)	total: 1m 38s	remaining: 4m 4s
572:	test: 0.7880310	best: 0.7885261 (564)	total: 1m 38s	remaining: 4m 4s
573:	test: 0.7882968	best: 0.7885261 (564)	total: 1m 38s	remaining: 4m 4s
574:	test: 0.7881801	best: 0.7885261 (564)	total: 1m 38s	remaining: 4m 4s
575:	test: 0.7882193	best: 0.7885261 (564)	total: 1m 38s	remaining: 4m 3s
576:	test: 0.7881304	best: 0.7885261 (

673:	test: 0.7894591	best: 0.7894591 (673)	total: 1m 53s	remaining: 3m 43s
674:	test: 0.7893219	best: 0.7894591 (673)	total: 1m 53s	remaining: 3m 43s
675:	test: 0.7892588	best: 0.7894591 (673)	total: 1m 53s	remaining: 3m 42s
676:	test: 0.7889742	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 42s
677:	test: 0.7889873	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 42s
678:	test: 0.7889873	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 42s
679:	test: 0.7888968	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 42s
680:	test: 0.7888269	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 41s
681:	test: 0.7887881	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 41s
682:	test: 0.7889844	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 41s
683:	test: 0.7890167	best: 0.7894591 (673)	total: 1m 54s	remaining: 3m 41s
684:	test: 0.7890101	best: 0.7894591 (673)	total: 1m 55s	remaining: 3m 40s
685:	test: 0.7890245	best: 0.7894591 (673)	total: 1m 55s	remaining: 3m 40s
686:	test: 0.7889697	best

784:	test: 0.7890920	best: 0.7894591 (673)	total: 2m 9s	remaining: 3m 20s
785:	test: 0.7890747	best: 0.7894591 (673)	total: 2m 9s	remaining: 3m 20s
786:	test: 0.7889094	best: 0.7894591 (673)	total: 2m 9s	remaining: 3m 20s
787:	test: 0.7893547	best: 0.7894591 (673)	total: 2m 10s	remaining: 3m 19s
788:	test: 0.7893547	best: 0.7894591 (673)	total: 2m 10s	remaining: 3m 19s
789:	test: 0.7893607	best: 0.7894591 (673)	total: 2m 10s	remaining: 3m 19s
790:	test: 0.7893702	best: 0.7894591 (673)	total: 2m 10s	remaining: 3m 19s
791:	test: 0.7893494	best: 0.7894591 (673)	total: 2m 10s	remaining: 3m 19s
792:	test: 0.7893746	best: 0.7894591 (673)	total: 2m 11s	remaining: 3m 19s
793:	test: 0.7893732	best: 0.7894591 (673)	total: 2m 11s	remaining: 3m 19s
794:	test: 0.7895177	best: 0.7895177 (794)	total: 2m 11s	remaining: 3m 19s
795:	test: 0.7894630	best: 0.7895177 (794)	total: 2m 11s	remaining: 3m 19s
796:	test: 0.7894670	best: 0.7895177 (794)	total: 2m 11s	remaining: 3m 18s
797:	test: 0.7894132	best: 0

894:	test: 0.7903726	best: 0.7903726 (894)	total: 2m 28s	remaining: 3m 3s
895:	test: 0.7902520	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 3s
896:	test: 0.7903408	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 3s
897:	test: 0.7900672	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 3s
898:	test: 0.7900568	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 3s
899:	test: 0.7900765	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 2s
900:	test: 0.7901698	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 2s
901:	test: 0.7902063	best: 0.7903726 (894)	total: 2m 29s	remaining: 3m 2s
902:	test: 0.7903309	best: 0.7903726 (894)	total: 2m 30s	remaining: 3m 2s
903:	test: 0.7905826	best: 0.7905826 (903)	total: 2m 30s	remaining: 3m 2s
904:	test: 0.7905886	best: 0.7905886 (904)	total: 2m 30s	remaining: 3m 2s
905:	test: 0.7905921	best: 0.7905921 (905)	total: 2m 30s	remaining: 3m 2s
906:	test: 0.7905921	best: 0.7905921 (905)	total: 2m 31s	remaining: 3m 2s
907:	test: 0.7905484	best: 0.7905921 (

1005:	test: 0.7911345	best: 0.7915192 (955)	total: 2m 47s	remaining: 2m 45s
1006:	test: 0.7910414	best: 0.7915192 (955)	total: 2m 47s	remaining: 2m 45s
1007:	test: 0.7910286	best: 0.7915192 (955)	total: 2m 47s	remaining: 2m 45s
1008:	test: 0.7910286	best: 0.7915192 (955)	total: 2m 47s	remaining: 2m 44s
1009:	test: 0.7909544	best: 0.7915192 (955)	total: 2m 47s	remaining: 2m 44s
1010:	test: 0.7909445	best: 0.7915192 (955)	total: 2m 48s	remaining: 2m 44s
1011:	test: 0.7910312	best: 0.7915192 (955)	total: 2m 48s	remaining: 2m 44s
1012:	test: 0.7910722	best: 0.7915192 (955)	total: 2m 48s	remaining: 2m 44s
1013:	test: 0.7910722	best: 0.7915192 (955)	total: 2m 48s	remaining: 2m 43s
1014:	test: 0.7911654	best: 0.7915192 (955)	total: 2m 48s	remaining: 2m 43s
1015:	test: 0.7911371	best: 0.7915192 (955)	total: 2m 48s	remaining: 2m 43s
1016:	test: 0.7911293	best: 0.7915192 (955)	total: 2m 49s	remaining: 2m 43s
1017:	test: 0.7911394	best: 0.7915192 (955)	total: 2m 49s	remaining: 2m 43s
1018:	test: 

1113:	test: 0.7920944	best: 0.7921928 (1111)	total: 3m 6s	remaining: 2m 27s
1114:	test: 0.7920944	best: 0.7921928 (1111)	total: 3m 6s	remaining: 2m 27s
1115:	test: 0.7921844	best: 0.7921928 (1111)	total: 3m 6s	remaining: 2m 27s
1116:	test: 0.7919327	best: 0.7921928 (1111)	total: 3m 6s	remaining: 2m 27s
1117:	test: 0.7918390	best: 0.7921928 (1111)	total: 3m 6s	remaining: 2m 27s
1118:	test: 0.7919295	best: 0.7921928 (1111)	total: 3m 6s	remaining: 2m 27s
1119:	test: 0.7918719	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 26s
1120:	test: 0.7918578	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 26s
1121:	test: 0.7918578	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 26s
1122:	test: 0.7918578	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 26s
1123:	test: 0.7918549	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 26s
1124:	test: 0.7918909	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 26s
1125:	test: 0.7919009	best: 0.7921928 (1111)	total: 3m 7s	remaining: 2m 25s
1126:	test: 

1220:	test: 0.7921969	best: 0.7923606 (1180)	total: 3m 23s	remaining: 2m 10s
1221:	test: 0.7921637	best: 0.7923606 (1180)	total: 3m 23s	remaining: 2m 9s
1222:	test: 0.7921637	best: 0.7923606 (1180)	total: 3m 24s	remaining: 2m 9s
1223:	test: 0.7922159	best: 0.7923606 (1180)	total: 3m 24s	remaining: 2m 9s
1224:	test: 0.7916637	best: 0.7923606 (1180)	total: 3m 24s	remaining: 2m 9s
1225:	test: 0.7916678	best: 0.7923606 (1180)	total: 3m 24s	remaining: 2m 9s
1226:	test: 0.7916941	best: 0.7923606 (1180)	total: 3m 24s	remaining: 2m 9s
1227:	test: 0.7916941	best: 0.7923606 (1180)	total: 3m 25s	remaining: 2m 8s
1228:	test: 0.7917029	best: 0.7923606 (1180)	total: 3m 25s	remaining: 2m 8s
1229:	test: 0.7917201	best: 0.7923606 (1180)	total: 3m 25s	remaining: 2m 8s
1230:	test: 0.7917847	best: 0.7923606 (1180)	total: 3m 25s	remaining: 2m 8s
1231:	test: 0.7918024	best: 0.7923606 (1180)	total: 3m 25s	remaining: 2m 8s
1232:	test: 0.7918024	best: 0.7923606 (1180)	total: 3m 25s	remaining: 2m 8s
1233:	test:

1328:	test: 0.7920604	best: 0.7923945 (1321)	total: 3m 41s	remaining: 1m 51s
1329:	test: 0.7920346	best: 0.7923945 (1321)	total: 3m 41s	remaining: 1m 51s
1330:	test: 0.7920282	best: 0.7923945 (1321)	total: 3m 41s	remaining: 1m 51s
1331:	test: 0.7919893	best: 0.7923945 (1321)	total: 3m 41s	remaining: 1m 51s
1332:	test: 0.7919560	best: 0.7923945 (1321)	total: 3m 42s	remaining: 1m 51s
1333:	test: 0.7920440	best: 0.7923945 (1321)	total: 3m 42s	remaining: 1m 50s
1334:	test: 0.7920796	best: 0.7923945 (1321)	total: 3m 42s	remaining: 1m 50s
1335:	test: 0.7920668	best: 0.7923945 (1321)	total: 3m 42s	remaining: 1m 50s
1336:	test: 0.7924778	best: 0.7924778 (1336)	total: 3m 42s	remaining: 1m 50s
1337:	test: 0.7920940	best: 0.7924778 (1336)	total: 3m 42s	remaining: 1m 50s
1338:	test: 0.7920143	best: 0.7924778 (1336)	total: 3m 43s	remaining: 1m 50s
1339:	test: 0.7920398	best: 0.7924778 (1336)	total: 3m 43s	remaining: 1m 49s
1340:	test: 0.7920147	best: 0.7924778 (1336)	total: 3m 43s	remaining: 1m 49s

1436:	test: 0.7911152	best: 0.7925096 (1350)	total: 3m 59s	remaining: 1m 33s
1437:	test: 0.7913445	best: 0.7925096 (1350)	total: 3m 59s	remaining: 1m 33s
1438:	test: 0.7913425	best: 0.7925096 (1350)	total: 3m 59s	remaining: 1m 33s
1439:	test: 0.7913006	best: 0.7925096 (1350)	total: 3m 59s	remaining: 1m 33s
1440:	test: 0.7912012	best: 0.7925096 (1350)	total: 3m 59s	remaining: 1m 33s
1441:	test: 0.7912413	best: 0.7925096 (1350)	total: 4m	remaining: 1m 32s
1442:	test: 0.7912440	best: 0.7925096 (1350)	total: 4m	remaining: 1m 32s
1443:	test: 0.7912363	best: 0.7925096 (1350)	total: 4m	remaining: 1m 32s
1444:	test: 0.7912377	best: 0.7925096 (1350)	total: 4m	remaining: 1m 32s
1445:	test: 0.7911783	best: 0.7925096 (1350)	total: 4m	remaining: 1m 32s
1446:	test: 0.7911939	best: 0.7925096 (1350)	total: 4m	remaining: 1m 32s
1447:	test: 0.7911108	best: 0.7925096 (1350)	total: 4m	remaining: 1m 31s
1448:	test: 0.7911144	best: 0.7925096 (1350)	total: 4m 1s	remaining: 1m 31s
1449:	test: 0.7911177	best: 

1544:	test: 0.7917110	best: 0.7925096 (1350)	total: 4m 32s	remaining: 1m 20s
1545:	test: 0.7915010	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 20s
1546:	test: 0.7915392	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 20s
1547:	test: 0.7915434	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 19s
1548:	test: 0.7915398	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 19s
1549:	test: 0.7915371	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 19s
1550:	test: 0.7915475	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 19s
1551:	test: 0.7915442	best: 0.7925096 (1350)	total: 4m 33s	remaining: 1m 19s
1552:	test: 0.7915479	best: 0.7925096 (1350)	total: 4m 34s	remaining: 1m 18s
1553:	test: 0.7916851	best: 0.7925096 (1350)	total: 4m 34s	remaining: 1m 18s
1554:	test: 0.7917287	best: 0.7925096 (1350)	total: 4m 34s	remaining: 1m 18s
1555:	test: 0.7917287	best: 0.7925096 (1350)	total: 4m 34s	remaining: 1m 18s
1556:	test: 0.7917366	best: 0.7925096 (1350)	total: 4m 34s	remaining: 1m 18s

1652:	test: 0.7915858	best: 0.7925096 (1350)	total: 4m 53s	remaining: 1m 1s
1653:	test: 0.7915858	best: 0.7925096 (1350)	total: 4m 53s	remaining: 1m 1s
1654:	test: 0.7914673	best: 0.7925096 (1350)	total: 4m 54s	remaining: 1m 1s
1655:	test: 0.7914673	best: 0.7925096 (1350)	total: 4m 54s	remaining: 1m 1s
1656:	test: 0.7914683	best: 0.7925096 (1350)	total: 4m 54s	remaining: 1m
1657:	test: 0.7914502	best: 0.7925096 (1350)	total: 4m 54s	remaining: 1m
1658:	test: 0.7914308	best: 0.7925096 (1350)	total: 4m 54s	remaining: 1m
1659:	test: 0.7914200	best: 0.7925096 (1350)	total: 4m 55s	remaining: 1m
1660:	test: 0.7914042	best: 0.7925096 (1350)	total: 4m 55s	remaining: 1m
1661:	test: 0.7914942	best: 0.7925096 (1350)	total: 4m 55s	remaining: 1m
1662:	test: 0.7914942	best: 0.7925096 (1350)	total: 4m 55s	remaining: 59.9s
1663:	test: 0.7915912	best: 0.7925096 (1350)	total: 4m 55s	remaining: 59.7s
1664:	test: 0.7915912	best: 0.7925096 (1350)	total: 4m 55s	remaining: 59.6s
1665:	test: 0.7915496	best: 0.

1763:	test: 0.7910686	best: 0.7925096 (1350)	total: 5m 13s	remaining: 42s
1764:	test: 0.7911482	best: 0.7925096 (1350)	total: 5m 13s	remaining: 41.8s
1765:	test: 0.7911086	best: 0.7925096 (1350)	total: 5m 13s	remaining: 41.6s
1766:	test: 0.7911086	best: 0.7925096 (1350)	total: 5m 14s	remaining: 41.4s
1767:	test: 0.7911086	best: 0.7925096 (1350)	total: 5m 14s	remaining: 41.2s
1768:	test: 0.7911621	best: 0.7925096 (1350)	total: 5m 14s	remaining: 41.1s
1769:	test: 0.7911553	best: 0.7925096 (1350)	total: 5m 14s	remaining: 40.9s
1770:	test: 0.7911493	best: 0.7925096 (1350)	total: 5m 14s	remaining: 40.7s
1771:	test: 0.7911493	best: 0.7925096 (1350)	total: 5m 14s	remaining: 40.5s
1772:	test: 0.7910668	best: 0.7925096 (1350)	total: 5m 14s	remaining: 40.3s
1773:	test: 0.7910298	best: 0.7925096 (1350)	total: 5m 15s	remaining: 40.1s
1774:	test: 0.7909616	best: 0.7925096 (1350)	total: 5m 15s	remaining: 40s
1775:	test: 0.7909616	best: 0.7925096 (1350)	total: 5m 15s	remaining: 39.8s
1776:	test: 0.79

1873:	test: 0.7909362	best: 0.7925096 (1350)	total: 5m 32s	remaining: 22.4s
1874:	test: 0.7909325	best: 0.7925096 (1350)	total: 5m 33s	remaining: 22.2s
1875:	test: 0.7908938	best: 0.7925096 (1350)	total: 5m 33s	remaining: 22s
1876:	test: 0.7908929	best: 0.7925096 (1350)	total: 5m 33s	remaining: 21.9s
1877:	test: 0.7908743	best: 0.7925096 (1350)	total: 5m 33s	remaining: 21.7s
1878:	test: 0.7908733	best: 0.7925096 (1350)	total: 5m 33s	remaining: 21.5s
1879:	test: 0.7908938	best: 0.7925096 (1350)	total: 5m 34s	remaining: 21.3s
1880:	test: 0.7908943	best: 0.7925096 (1350)	total: 5m 34s	remaining: 21.1s
1881:	test: 0.7908719	best: 0.7925096 (1350)	total: 5m 34s	remaining: 21s
1882:	test: 0.7908265	best: 0.7925096 (1350)	total: 5m 34s	remaining: 20.8s
1883:	test: 0.7908265	best: 0.7925096 (1350)	total: 5m 34s	remaining: 20.6s
1884:	test: 0.7908265	best: 0.7925096 (1350)	total: 5m 35s	remaining: 20.4s
1885:	test: 0.7908244	best: 0.7925096 (1350)	total: 5m 35s	remaining: 20.3s
1886:	test: 0.79

1983:	test: 0.7912148	best: 0.7925096 (1350)	total: 5m 55s	remaining: 2.87s
1984:	test: 0.7912537	best: 0.7925096 (1350)	total: 5m 55s	remaining: 2.69s
1985:	test: 0.7912537	best: 0.7925096 (1350)	total: 5m 55s	remaining: 2.51s
1986:	test: 0.7912537	best: 0.7925096 (1350)	total: 5m 55s	remaining: 2.33s
1987:	test: 0.7912033	best: 0.7925096 (1350)	total: 5m 56s	remaining: 2.15s
1988:	test: 0.7911792	best: 0.7925096 (1350)	total: 5m 56s	remaining: 1.97s
1989:	test: 0.7911454	best: 0.7925096 (1350)	total: 5m 56s	remaining: 1.79s
1990:	test: 0.7911454	best: 0.7925096 (1350)	total: 5m 56s	remaining: 1.61s
1991:	test: 0.7911438	best: 0.7925096 (1350)	total: 5m 57s	remaining: 1.43s
1992:	test: 0.7911372	best: 0.7925096 (1350)	total: 5m 57s	remaining: 1.25s
1993:	test: 0.7911372	best: 0.7925096 (1350)	total: 5m 57s	remaining: 1.07s
1994:	test: 0.7911254	best: 0.7925096 (1350)	total: 5m 57s	remaining: 897ms
1995:	test: 0.7911306	best: 0.7925096 (1350)	total: 5m 57s	remaining: 717ms
1996:	test: 

<catboost.core.CatBoost at 0x12b57b5f8>

# BY.WEB

In [47]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [122]:
def read_docs():
    df = pd.read_csv('texts.csv', skiprows=0)
    ids = df['id'].to_numpy()
    docs = df['text'].to_numpy()
    return ids, docs


def read_graph(nodes):
    df = pd.read_csv('url_graph.csv')
    src = df['src'].to_numpy()
    dst = df['dst'].to_numpy()
    edges = np.hstack((src.reshape(-1,1), dst.reshape(-1,1)))
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph


def read_urls(inv=True):
    df = pd.read_csv('urls.csv')
    if inv:
        urls = {row['html_url']: int(row['html_id']) for _, row in df.iterrows()}
    else:
        urls = {int(row['html_id']): row['html_url'] for _, row in df.iterrows()}
    return urls


def load_lemmas():
    with open('lemmas.pickle', 'rb') as f:
        return pickle.load(f)
    
    
def decode(s):
    s = base64.b64decode(s)
    s = codecs.decode(s, 'cp1251', errors='ignore')
    return s


def get_quieries(relevance):
    queries = {}
    with open('web2008_adhoc.xml','r', encoding="cp1251") as src:
        raw_xml = src.read()
        soup = BeautifulSoup(raw_xml)
        for task in soup.find_all('task'):
            if task['id'] in relevance:
                queries[task['id']] = task.querytext.string
    return queries


def get_relevance(year=2009, urls=None):
    relevance = {}
    with open(f'or_relevant-minus_table_{year}.xml', 'r', encoding="cp1251") as src:
        raw_xml = src.read()
        soup = BeautifulSoup(raw_xml)
        
        for task in soup.find_all('task'):
            documents = task.find_all('document') 
            vital = {}
            for doc in documents:
                if urls:
                    doc['id'] = urls.get(doc['id'], None)
                if doc['relevance'] == 'vital':
                    vital[doc['id']] = 1
                else:
                    vital[doc['id']] = 0
            relevance[task['id']] = vital
            
    return relevance

In [49]:
def create_settings():
    settings = {
        'mappings': {
            'properties': {
                'title': {
                    'type': 'text'
                }, 
                'content': {
                    'type': 'text'
                },
                'raw_content': {
                    'type': 'text'
                },
                'pagerank': {
                    'type': 'rank_feature'
                },
                'url_len': {
                    'type': 'rank_feature'
                }
             }
        },
        'settings': {
            'analysis': {
                'analyzer': {
                    'white_lover': {
                        'tokenizer': 'letter',
                        'filter': [
                            'lowercase', 
                        ]
                    }
                }
            }
        }
    }
    return settings


def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }


def create_index_with_pagerank(index, ids, all_docs, all_raw_docs, pagerank, urls):
    def next_document():
        for i, doc, raw_doc in tqdm_notebook(list(zip(ids, all_docs, all_raw_docs))):
            if doc is not None:
                doc = str(doc).split('\n', 1)
                if len(doc) == 2:
                    title, body = doc
                    desc = {
                        'title': title,
                        'content': str(body),
                        'raw_content': str(raw_doc),
                        'pagerank': pagerank.get(i, 0), 
                        'url_len': len(urls.get(i, ''))
                    }
                    yield create_es_action(index, int(i), desc)
    
    for ok, result in parallel_bulk(es, next_document(), queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            print(result)

def print_index_size(index): 
    print(f"{(es.indices.stats(index)['_all']['primaries']['store']['size_in_bytes'] / 2 ** 30):.2f} GB")

In [116]:
def get_query_builder(title_boost, pagerank_boost, url_length_boost, content_boost=1, raw_content_boost=0):
    return lambda query : {
            'query': {
                'bool': {
                    'should': [
                        {
                            'match': {
                                'content': {
                                    'query': query,
                                    'boost': content_boost
                                }
                            }
                        },
                        {
                            'match': {
                                'raw_content': {
                                    'query': query,
                                    'boost': raw_content_boost
                                }
                            }
                        },
                        {
                            'match': {
                                'title': {
                                    'query': query,
                                    'operator': 'OR',
                                    'boost': title_boost
                                }
                            }
                        },
                        {
                            'rank_feature': {
                                'field': 'pagerank',
                                'boost': pagerank_boost,
                            },
                        },
                        {
                            'rank_feature': {
                                'field': 'url_len',
                                'boost': url_length_boost
                            },
                        }
                    ]
                }
            }
        }


def search(query, index, *args, K=20):
    res = es.search(index=index, body=query, size=K)['hits']
    pretty_result = []
    for hit in res['hits']:
        pretty_result.append({
            'id': hit['_id'],
            'score': hit['_score'],
            'src': hit['_source']
        })
    return pretty_result


def get_relevant_for_k(res, relevant, K=20):
    return sum([relevant[res['id']] if res['id'] in relevant else 0 for res in res[:K]])        


def analyze_results(index, query_builder, lemmatize_query=False, K=20, params={}):
    m = Mystem()
    
    def lemmatize_doc(doc):
        return ''.join(m.lemmatize(str(doc)))

    def lemmatize_collection(docs):
        result = []
        for doc in tqdm_notebook(docs):
            result.append(lemmatize_doc(doc))
        return result

    Q = len(queries)
    qpK, qrK, qR_average, qmapK = 0, 0, 0, 0
    qR = []
    for task, q in tqdm_notebook(queries.items()):
        if lemmatize_query:
            q = lemmatize_doc(q)
        results = search(query_builder(q), index, K)
        cur_relevant = len(relevance[task])
        qpK += get_relevant_for_k(results, relevance[task], K) / K 
        qrK += get_relevant_for_k(results, relevance[task], K) / cur_relevant
        qR.append(get_relevant_for_k(results, relevance[task], cur_relevant) / cur_relevant)
        qR_average += qR[-1]
        
        mapK, cur = 0, 0
        for k in range(min(K, len(results))):
            if results[k]['id'] in relevance[task]:
                cur += 1
                mapK += cur / (k + 1)
        if cur != 0:
            mapK /= cur
        qmapK += mapK
    print('=========')
    print(f'params={params}')
    print(f'p@{K} {qpK / Q}')
    print(f'r@{K} {qrK / Q}')
    print(f'R-precision@{K} {qR_average / Q}')
    print(f'MAP@{K} {qmapK / Q}')
    print()
    
    return qpK / Q, qrK / Q, qR_average / Q, qmapK / Q


def get_documents_for_queries(index, query_builder, lemmatize_query=False, K=100):
    m = Mystem()
    
    def lemmatize_doc(doc):
        return ''.join(m.lemmatize(str(doc)))

    def lemmatize_collection(docs):
        result = []
        for doc in tqdm_notebook(docs):
            result.append(lemmatize_doc(doc))
        return result

    results = []
    for task, q in tqdm_notebook(queries.items()):
        if lemmatize_query:
            q = lemmatize_doc(q)
        results.append(search(query_builder(q), index, K))
    return results


def get_documents_for_query_by_id(index, query_builder, query, doc_ids, lemmatize_query=False):
    m = Mystem()
    
    def lemmatize_doc(doc):
        return ''.join(m.lemmatize(str(doc)))

    def lemmatize_collection(docs):
        result = []
        for doc in tqdm_notebook(docs):
            result.append(lemmatize_doc(doc))
        return result
    
    if lemmatize_query:
        query = lemmatize_doc(query)
    result = search(query_builder(query), index, 100000)
    doc_ids = set(doc_ids)
    return {int(doc['id']): doc for doc in result if int(doc['id']) in doc_ids}


def get_document_by_id(index, doc_id):
    query = {'query': {'terms': {'_id': [str(doc_id)]}}}
    result = search(query, index, 1)
    if len(result) == 0:
        return None
    result = result[0]
    result['score'] = 0
    return result

In [125]:
def count_matches(query, doc, lemmatize_query=True, lemmatize_doc=False):
    m = Mystem()
    
    query = str(query).lower()
    doc = str(doc).lower()
    
    if lemmatize_query:
        query = set(''.join(m.lemmatize(query)).split())
    else:
        query = set(query.split())
        
    if lemmatize_doc:
        doc = ''.join(m.lemmatize(doc)).split()
    else:
        doc = doc.split()
    
    cnt = 0
    for word in query:
        cnt += doc.count(word)
    return cnt, max(1, len(doc)), max(1, len(query))

def count_unique_matches(query, doc, lemmatize_query=True, lemmatize_doc=False):
    m = Mystem()

    query = str(query).lower()
    doc = str(doc).lower()

    if lemmatize_query:
        query = set(''.join(m.lemmatize(query)).split())
    else:
        query = set(query.split())

    if lemmatize_doc:
        doc = set(''.join(m.lemmatize(doc)).split())
    else:
        doc = set(doc.split())

    cnt = 0
    for word in query:
        if word in doc:
            cnt += 1
    return cnt, max(1, len(doc)), max(1, len(query))
    

def get_features(query, doc, urls):
    f1 = doc['score']
    f2 = doc['src']['url_len']
    f3 = doc['src']['pagerank']
    f8, f4, f29 = count_matches(query, doc['src']['content'], lemmatize_query=True, lemmatize_doc=False)
    f9, f5, _ = count_matches(query, doc['src']['title'], lemmatize_query=False, lemmatize_doc=False)
    f10, f6, _ = count_matches(query, doc['src']['title'], lemmatize_query=True, lemmatize_doc=True)
    f11, f7, _ = count_matches(query, doc['src']['raw_content'], lemmatize_query=False, lemmatize_doc=False)
    f12 = f8 / f4
    f13 = f9 / f5
    f14 = f10 / f6
    f15 = f11 / f7
    f16 = urls[int(doc['id'])].count('/')
    f17, f18, f30 = count_unique_matches(query, doc['src']['content'], lemmatize_query=True, lemmatize_doc=False)
    f19, f20, _ = count_unique_matches(query, doc['src']['title'], lemmatize_query=False, lemmatize_doc=False)
    f21, f22, _ = count_unique_matches(query, doc['src']['title'], lemmatize_query=True, lemmatize_doc=True)
    f23, f24, _ = count_unique_matches(query, doc['src']['raw_content'], lemmatize_query=False, lemmatize_doc=False)
    f25 = f17 / f30
    f26 = f21 / f30
    f27 = f23 / f30
    f28 = f19 / f30
    
    
    return np.array([
        f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15,
        f16, f17, f18, f19, f20, f21, f22, f23, f24, f25, f26, f27, f28, f29, f30
    ])

# 'title': title,
#                         'content': str(body),
#                         'raw_content': str(raw_doc),
#                         'pagerank': pagerank.get(i, 0), 
#                         'url_len': len(urls.get(i, ''))


In [51]:
MYANDEX_PAGERANK = 'myandex_pagerank'
# es.indices.delete(index=MYANDEX_PAGERANK, ignore=[400, 404])
# es.indices.create(index=MYANDEX_PAGERANK, body=create_settings())
# tin = time.time()
# create_index_with_pagerank(MYANDEX_PAGERANK, ids, lemmatized_docs, docs, pagerank, urls)
# tout = time.time()

es.indices.open(index=MYANDEX_PAGERANK, ignore=[400, 404])
print_index_size(MYANDEX_PAGERANK)

2.75 GB


In [74]:
relevance = get_relevance(2008, read_urls(inv=True))
queries = get_quieries(relevance)

In [78]:
params = {
    'title_boost': 0.05, 
    'pagerank_boost': 0.3, 
    'url_length_boost': 0.1,
    'content_boost': 1,
    'raw_content_boost': 0
}

In [123]:
urls = read_urls(inv=False)

In [128]:
size = 0
features = 1

X_2008 = []
y_2008 = []
qid_2008 = []

for ind, (qid, docs) in enumerate(relevance.items()):
    es_docs = get_documents_for_query_by_id(MYANDEX_PAGERANK, get_query_builder(**params), 
                                            queries[qid], docs.keys(), lemmatize_query=True)
    
    for task, (doc_id, rel in tqdm_notebook(docs.items()):
        qid_2008.append(qid)
        y_2008.append(int(rel))
        
        tin = time.time()
        
        if doc_id not in es_docs:
            doc = get_document_by_id(MYANDEX_PAGERANK, doc_id)
            if not doc:
                continue
            es_docs[doc_id] = doc
        
        X_2008.append(get_features(queries[qid], doc, urls))
        size += 1
        
        tout = time.time()
        
        print(f'iteration={ind}, qid {qid} processed, size={size}, time={tout-tin:.5f} sec.')
    
            
    
    print(f'iteration={ind}, qid {qid} processed, size={size}, time={tout-tin:.5f} sec.')
        
X_2008 = np.array(X_2008)
y_2008 = np.array(y_2008)
qid_2008 = np.array(qid_2008)

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




TypeError: cannot unpack non-iterable int object

In [102]:
size

7251

In [103]:
sum(len(doc) for _, doc in relevance.items())

38212

In [75]:
relevance

{'arw53946': {625098: 0,
  861886: 0,
  1460063: 0,
  721539: 0,
  395587: 0,
  692869: 0,
  None: 0,
  867180: 1,
  1401222: 1,
  1100005: 1,
  1256899: 0,
  1478847: 0,
  723243: 0,
  1401464: 0,
  1202610: 0,
  1206322: 0,
  1197964: 0,
  363342: 0,
  1245410: 0,
  1422405: 0,
  478217: 0,
  1294309: 0,
  1316661: 0,
  1283818: 0,
  1330783: 0,
  918889: 0,
  1050868: 0,
  1149284: 0,
  841753: 0,
  1221878: 0,
  1033761: 0,
  374156: 1,
  1423152: 0,
  671978: 0,
  1175887: 0,
  1184434: 0,
  976773: 0,
  1276569: 1,
  1104012: 0,
  637167: 0,
  855672: 1,
  354574: 0,
  1373972: 0,
  8344: 0,
  842914: 0},
 'arw53945': {386633: 0,
  303909: 0,
  None: 0,
  1243140: 0,
  865854: 0,
  1489835: 1,
  1508370: 0,
  1136919: 1,
  1141176: 0,
  836158: 0,
  1142519: 0,
  26416: 0,
  1031180: 0,
  1469175: 0,
  1417670: 0,
  1307322: 0,
  1121616: 1,
  1241941: 1,
  48492: 1,
  848086: 0,
  1409326: 0,
  1399776: 0,
  487628: 1,
  430076: 0,
  1139614: 0,
  463474: 0,
  1514505: 0,
  1296

In [66]:
ids[0]

709987

In [119]:
queries

{'arw49648': 'Настя Задорожная',
 'arw49650': 'кудымкар',
 'arw49663': 'тополинный пух жара',
 'arw49731': 'Battlestar Galactica',
 'arw49752': 'новая поисковая система',
 'arw49762': 'Дэниел Колт ubs',
 'arw49763': 'ванга',
 'arw49814': 'все для туризма',
 'arw49858': 'смартс',
 'arw49874': 'расчет параметров асинхронного двигателя',
 'arw49881': 'миронов вячеслав иванович',
 'arw49918': 'коробка управления',
 'arw49933': 'Color Pilot',
 'arw49949': 'инвестор',
 'arw49960': 'Яцуба Виктор Васильевич',
 'arw49969': 'Мойки Кухонные Franke',
 'arw49973': 'часы работы овир ювао',
 'arw49979': 'велком',
 'arw49988': 'Юбилей Отари Важевича Гурского',
 'arw50032': 'таблица пар трения',
 'arw50051': 'Бензогенератор инверторного типа KIPOR IG1000 (кожух)',
 'arw50093': 'лицей 1535',
 'arw50100': 'чистая ссудная задолженность',
 'arw50104': 'строительство склада типовой проект',
 'arw50108': 'Аватар аниме',
 'arw50125': 'коды к играм',
 'arw50136': 'Мастер производственного обучения.ru Методичес