# Introduction
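This notebook trains learning-to-rank models on the data in `l2r/`: an XGBoost model with a custom NDCG-weighted pairwise (LambdaRank-style) objective and a CatBoost `YetiRankPairwise` model. It then retrains XGBoost on the training set augmented with pseudo-labelled test documents and writes ranking submissions.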

### Imports
Import libraries and write settings here.

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
from sklearn.datasets import load_svmlight_file

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import xgboost as xgb
from scipy.special import expit
from collections import defaultdict
from sklearn.tree import DecisionTreeRegressor
from numba import jit
from multiprocessing import Pool
from catboost import CatBoost, Pool, MetricVisualizer
from copy import deepcopy
from scipy import sparse

In [2]:
def load_data(path):
    """Read an SVMlight ranking file and return its rows ordered by query id.

    Parameters
    ----------
    path : str
        Location of the SVMlight file; it is read with ``query_id=True``.

    Returns
    -------
    tuple
        ``(X, y, qid, group_sizes)``. The first three are reordered so that
        ``qid`` is ascending; ``group_sizes`` holds the number of rows of each
        distinct query id, in ascending qid order.
    """
    features, labels, query_ids = load_svmlight_file(path, query_id=True)

    # Stable sort: rows that share a qid keep their original relative order.
    order = np.argsort(query_ids, kind='mergesort')
    features, labels, query_ids = features[order], labels[order], query_ids[order]

    _, group_sizes = np.unique(query_ids, return_counts=True)
    return features, labels, query_ids, group_sizes

In [3]:
# Directory prefix of the dataset; the load cells append 'train.txt' / 'test.txt' to it.
data_folder='l2r/'

In [4]:
# Training split: features, labels, per-row query ids and per-query row counts.
X_train, y_train, qid_train, group_train = load_data(data_folder +'train.txt')

In [5]:
# Test split, loaded and ordered by query id the same way as the training split.
X_test, y_test, qid_test, group_test = load_data(data_folder +'test.txt')

In [16]:
# One query id per training row.
qid_train.shape

(473134,)

In [13]:
# Peek at the first 15 training labels.
y_train[0:15]

array([0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1.])

In [15]:
# One entry per distinct training query id (its row count).
group_train.shape

(19944,)

# Analysis/Modeling
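Ranking-metric helpers (DCG / NDCG), the custom pairwise objective for XGBoost, model training (XGBoost and CatBoost), and submission export.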

In [6]:
def dcg(y, k=5):
    """Discounted cumulative gain of the first ``k`` entries of ``y``.

    Uses the exponential gain ``2**rel - 1`` and the standard positional
    discount ``log2(rank + 2)`` for 0-based ranks. This is the same discount
    that ``delta_ndcg`` applies, so the ideal DCG used for normalisation and
    the swap deltas are on the same scale. (The previous ``log2(rank + 2) + 1``
    discount was not the standard DCG discount and disagreed with
    ``delta_ndcg``.)

    Parameters
    ----------
    y : sequence of numbers
        Relevance labels in ranked order (best-ranked document first).
    k : int, default 5
        Cut-off; only the first ``k`` labels contribute. Non-positive values
        give 0.

    Returns
    -------
    float
        The DCG@k value; ``0.0`` for an empty input.
    """
    gains = np.asarray(y, dtype=float)[:max(k, 0)]
    discounts = np.log2(np.arange(gains.size) + 2)
    return np.sum((np.power(2.0, gains) - 1) / discounts)

In [7]:
def ideal_dcg(scores):
    """Return the DCG of ``scores`` after arranging them in descending order.

    The cut-off is whatever ``dcg`` uses by default, since no ``k`` is passed.
    """
    descending = np.sort(scores, kind='mergesort')[::-1]
    return dcg(list(descending))

In [8]:
def get_pairs(true_scores):
    """Yield every ordered index pair ``(i, j)`` with ``true_scores[i] > true_scores[j]``.

    Both orders of positions are considered, so ``i`` may be larger than
    ``j``. That case (a more relevant item sitting at a later index) used to be
    skipped because the inner loop started at ``i``; when the input is in
    ranked order those are exactly the mis-ranked pairs.

    Each unordered pair with different labels is produced exactly once, with
    the higher-labelled index first. Consumers that need a score margin for the
    pair should use the signed difference ``score[i] - score[j]``.

    Parameters
    ----------
    true_scores : sequence
        Labels, indexable by position.

    Yields
    ------
    tuple of (int, int)
        ``(i, j)`` in increasing ``i``, then increasing ``j``.
    """
    n_items = len(true_scores)
    for i in range(n_items):
        for j in range(n_items):
            if true_scores[i] > true_scores[j]:
                yield (i, j)

def delta_ndcg(i, j, idcg, y_true):
    """Absolute change in DCG, divided by ``idcg``, from swapping positions ``i`` and ``j``.

    Gains are ``2**label - 1`` and the discount of 0-based position ``p`` is
    ``log2(p + 2)``. No rank cut-off is applied here.

    Parameters
    ----------
    i, j : int
        0-based positions in the ranked list.
    idcg : float
        Normaliser the delta is divided by; it is not checked for zero.
    y_true : sequence
        Labels in ranked order, indexable by ``i`` and ``j``.
    """
    gain_i, gain_j = (np.power(2, y_true[pos]) - 1 for pos in (i, j))
    disc_i, disc_j = (np.log2(pos + 2) for pos in (i, j))
    change = gain_i / disc_j - gain_i / disc_i + gain_j / disc_i - gain_j / disc_j
    return abs(change) / idcg

In [9]:
def lambdas_and_w(args):
    """Compute NDCG-weighted pairwise gradient and hessian terms for one query.

    For every pair of documents ``(i, j)`` where ``i`` has the higher label,
    the pair contributes the logistic-loss derivative on the score margin
    ``s_i - s_j``, weighted by the |delta NDCG| of swapping the two ranked
    positions. The gradient of ``i`` is decreased and that of ``j`` increased,
    so a gradient-descent step raises ``i`` relative to ``j``.

    All label-ordered pairs are visited, including the mis-ranked ones where
    the more relevant document currently has the lower score, and the margin
    is signed. Previously only pairs already in the right order were visited
    (and the margin was taken in absolute value), so a wrongly ordered pair
    produced no gradient at all.

    Parameters
    ----------
    args : tuple
        ``(y_true, y_pred, qid)``: labels and predicted scores of the query's
        documents (equal-length 1-D arrays) and the query id, which is passed
        through unchanged. A single tuple is taken so the function can be used
        with ``Pool.map``.

    Returns
    -------
    tuple
        ``(lambdas, w, qid)`` where ``lambdas`` and ``w`` are arrays in the
        original document order.
    """
    y_true, y_pred, qid = args

    # Work in ranked order (highest score first); `restore` undoes the reordering.
    ranked = np.argsort(y_pred, kind='mergesort')[::-1]
    restore = np.argsort(ranked)
    y_true = np.asarray(y_true)[ranked]
    y_pred = np.asarray(y_pred)[ranked]

    idcg = ideal_dcg(y_true)
    lambdas = np.zeros(len(y_true))
    w = np.zeros(len(y_true))

    # All ordered pairs whose first element carries the strictly higher label.
    better, worse = np.nonzero(y_true[:, None] > y_true[None, :])
    for i, j in zip(better, worse):
        d_ndcg = delta_ndcg(i, j, idcg, y_true)
        # Signed margin, clipped to +/-20 so sig * (1 - sig) never underflows to 0.
        margin = min(max(y_pred[i] - y_pred[j], -20), 20)
        sig = expit(-margin)
        lambda_val = -d_ndcg * sig
        w_val = sig * (1 - sig) * d_ndcg
        lambdas[i] += lambda_val
        lambdas[j] -= lambda_val
        w[i] += w_val
        w[j] += w_val
    return lambdas[restore], w[restore], qid

In [68]:
def gradient(pred, dtrain):
    """Custom XGBoost objective: per-row gradient and hessian from ``lambdas_and_w``.

    Rows are grouped by the module-level ``qid_train`` array, each group is
    handed to ``lambdas_and_w`` in a worker process, and the per-query results
    are scattered back to row order.

    Parameters
    ----------
    pred : array-like
        Current predictions, one per row of ``dtrain``.
    dtrain : object with ``get_label()``
        Training matrix; passed as ``obj=gradient`` to ``xgb.train`` in this
        notebook. Its rows must line up with ``qid_train``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(grad, hess)``, each shaped like the label vector.

    Raises
    ------
    ValueError
        If the number of labels differs from ``len(qid_train)``.
    """
    # Imported here and used by its qualified name: the notebook also imports
    # catboost's ``Pool``, so the bare name ``Pool`` is not reliably the
    # multiprocessing one.
    import multiprocessing

    Y = np.array(dtrain.get_label())
    pred = np.array(pred)
    if len(Y) != len(qid_train):
        raise ValueError('dtrain has {0} rows but qid_train has {1}'.format(len(Y), len(qid_train)))

    # Row indices of every query, computed once instead of scanning qid_train
    # several times per query. The stable sort keeps rows ascending inside each group.
    qids, inverse = np.unique(qid_train, return_inverse=True)
    rows_by_query = np.argsort(inverse, kind='mergesort')
    boundaries = np.cumsum(np.bincount(inverse))[:-1]
    row_groups = np.split(rows_by_query, boundaries)

    tasks = [(Y[rows], pred[rows], qid) for rows, qid in zip(row_groups, qids)]

    grad = np.zeros(Y.shape)
    hess = np.zeros(Y.shape)
    # The context manager shuts the workers down; this function runs once per
    # boosting round, so an unclosed pool would leak processes every round.
    with multiprocessing.Pool(6) as pool:
        results = pool.map(lambdas_and_w, tasks, chunksize=1)

    # pool.map preserves task order, so results line up with row_groups.
    for rows, (lambda_val, w_val, _) in zip(row_groups, results):
        grad[rows] = lambda_val
        hess[rows] = w_val
    return grad, hess

In [None]:
# Wrap both splits for XGBoost and attach the per-query row counts from load_data.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

dtest = xgb.DMatrix(X_test)
dtest.set_group(group_test)

Обучал поэтапно, по 100 итераций: на каждом этапе уменьшал `eta` и увеличивал глубину дерева.

In [None]:
# Re-bind `Pool` to multiprocessing's implementation (the imports cell leaves
# catboost's Pool under this name).
from multiprocessing import Pool

params = {
    'eta': 0.5,
    'max_depth': 10,
    'eval_metric': 'ndcg@5',
    'subsample': 0.5,
    'num_parallel_tree': 6,
    # 'tree_method': 'gpu_hist',
}

# Continue boosting from the checkpoint file for another 100 rounds, using the
# custom objective defined above.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train')],
    obj=gradient,
    xgb_model="my_xgboost_0.6_8_0.5_500.xgb",
    verbose_eval=True,
)

In [None]:
# Write the trained booster to disk.
xgb_model.save_model("my_xgboost_0.5_10_600.xgb")

In [None]:
# Score the test matrix with the trained booster.
pred = xgb_model.predict(dtest)

In [None]:
# Cache the test scores; a later cell reloads this file with np.load.
np.save('pred_my_xgboost_600.npy', pred)

In [20]:
def save_submission(pred_qids, preds, filename):
    """Write a ranking submission as CSV with header ``QueryId,DocumentId``.

    For every distinct query id (ascending), the rows belonging to it are
    written in descending score order. A document id is the row's position in
    ``pred_qids`` / ``preds`` plus one.

    Parameters
    ----------
    pred_qids : numpy.ndarray
        Query id of every row.
    preds : numpy.ndarray
        Predicted score of every row, aligned with ``pred_qids``.
    filename : str
        Output path; an existing file is overwritten.
    """
    # `with` guarantees the file is flushed and closed, also on errors. The old
    # code ended with `fout.close` (no call), so it never closed the file itself.
    with open(filename, 'w') as fout:
        fout.write('QueryId,DocumentId\n')
        for qid in np.unique(pred_qids):
            q_doc_idxs = np.argwhere(pred_qids == qid).ravel()
            q_doc_scores = preds[q_doc_idxs]
            # Highest score first; +1 turns 0-based row positions into 1-based ids.
            sorted_doc_ids = 1 + q_doc_idxs[np.argsort(q_doc_scores)[::-1]]
            for did in sorted_doc_ids:
                fout.write('{0},{1}\n'.format(qid, did))

In [None]:
# Export the XGBoost test ranking in submission format.
save_submission(qid_test,pred, 'eta-0.5_depth-10_600.txt')

In [None]:
from catboost import CatBoost, Pool

In [None]:
# CatBoost pools: rows are tied to their query through the per-row query id.
train = Pool(data=X_train, label=y_train, group_id=qid_train)
test = Pool(data=X_test, group_id=qid_test)

In [None]:
# Shared settings; the ranking loss and output directory are added on a copy so
# default_parameters itself stays untouched.
default_parameters = {
    'iterations': 1000,
    'custom_metric': ['NDCG:top=5'],
    'verbose': False,
    'subsample': 0.7,
}

parameters = deepcopy(default_parameters)
parameters.update(loss_function='YetiRankPairwise', train_dir='YetiRankPairwise')

model_catboost = CatBoost(parameters)

In [None]:
# NOTE: the eval set is the training pool itself, so the plotted metrics are training metrics.
model_catboost.fit(train, eval_set=train, plot=True)

In [None]:
# Write the fitted CatBoost model to disk.
model_catboost.save_model("catboost1000.ctb")

In [None]:
# Score the test pool with the fitted CatBoost model.
pred_catboost = model_catboost.predict(test)

In [None]:
# Export the CatBoost test ranking; this file is read back later with load_submission.
save_submission(qid_test,pred_catboost, 'catboost1000.txt')

In [None]:
# Cache the CatBoost test scores as a .npy file.
np.save('pred_catboost.npy', pred_catboost)

In [11]:
def load_submission(filename):
    """Read a ``QueryId,DocumentId`` submission file.

    The first line is treated as a header and skipped. Lines that do not hold
    at least two non-empty comma-separated fields (blank lines, ``"3,"``) are
    ignored. Fields are stripped of surrounding whitespace first, so a trailing
    newline is never mistaken for a document id.

    Parameters
    ----------
    filename : str
        Path of the submission file.

    Returns
    -------
    tuple of numpy.ndarray
        ``(query_ids, doc_indices)`` in file order. Query ids are kept as
        strings; document ids are converted to ``int`` and shifted by -1
        (1-based id to 0-based index).

    Raises
    ------
    ValueError
        If a document id field is not an integer.
    """
    query_ids = []
    doc_indices = []
    # `with` closes the file even if a malformed id raises below.
    with open(filename, 'r') as fin:
        fin.readline()  # header
        for line in fin:
            fields = [field.strip() for field in line.split(',')]
            fields = [field for field in fields if len(field) > 0]
            if len(fields) < 2:
                continue
            query_ids.append(fields[0])
            doc_indices.append(int(fields[1]) - 1)
    return np.array(query_ids), np.array(doc_indices)

In [12]:
# Reload the cached XGBoost scores and both submission files from disk.
pred_xgboost = np.load('pred_my_xgboost_600.npy')
qid_catboost, sub_catboost = load_submission('catboost1000.txt')
qid_my, sub_my = load_submission('eta-0.5_depth-10_600.txt')

In [13]:
# Pseudo-labelling: keep the test documents that both models placed at the same
# position of their submission, labelled with the XGBoost score.
if sub_catboost.shape != sub_my.shape:
    raise ValueError('the two submissions do not have the same number of rows')

# sub_* hold 0-based test-row indices listed in *ranked* order, so the boolean
# comparison is indexed by submission position, not by test row. The rows the
# models agree on are therefore the VALUES of sub_my at the matching positions.
# Using the mask directly on the test arrays (as before) picked whichever row
# happened to sit at that position instead of the agreed document.
# Sorting keeps the selected rows in test order, i.e. grouped by ascending qid.
agreed_rows = np.sort(sub_my[sub_catboost == sub_my])

qid_test_train = qid_test[agreed_rows]
X_test_train = X_test[agreed_rows]
y_pred_train = pred_xgboost[agreed_rows]
group_test_train = np.unique(qid_test_train, return_counts=True)[1]

In [14]:
# Append the pseudo-labelled test rows after the real training rows.
# NOTE(review): if a query id occurs in both splits, grouping by qid value will
# merge those rows while the group-size vector keeps them apart — confirm the
# id ranges are disjoint.
X_train_new = sparse.vstack([X_train, X_test_train])
y_train_new = np.concatenate((y_train, y_pred_train))
group_train_new = np.concatenate((group_train, group_test_train))
qid_train_new = np.concatenate((qid_train, qid_test_train))

In [15]:
def gradient_new(pred, dtrain):
    """Custom XGBoost objective for the augmented training set.

    Same computation as ``gradient``, but rows are grouped by the module-level
    ``qid_train_new`` array: each group is handed to ``lambdas_and_w`` in a
    worker process and the per-query results are scattered back to row order.
    Rows are grouped by query-id *value*, so equal ids are treated as one query
    wherever they appear in ``qid_train_new``.

    Parameters
    ----------
    pred : array-like
        Current predictions, one per row of ``dtrain``.
    dtrain : object with ``get_label()``
        Training matrix; passed as ``obj=gradient_new`` to ``xgb.train`` in
        this notebook. Its rows must line up with ``qid_train_new``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(grad, hess)``, each shaped like the label vector.

    Raises
    ------
    ValueError
        If the number of labels differs from ``len(qid_train_new)``.
    """
    # Imported here and used by its qualified name: the notebook also imports
    # catboost's ``Pool``, so the bare name ``Pool`` is not reliably the
    # multiprocessing one.
    import multiprocessing

    Y = np.array(dtrain.get_label())
    pred = np.array(pred)
    if len(Y) != len(qid_train_new):
        raise ValueError('dtrain has {0} rows but qid_train_new has {1}'.format(len(Y), len(qid_train_new)))

    # Row indices of every query, computed once instead of scanning the qid
    # array several times per query. The stable sort keeps rows ascending inside each group.
    qids, inverse = np.unique(qid_train_new, return_inverse=True)
    rows_by_query = np.argsort(inverse, kind='mergesort')
    boundaries = np.cumsum(np.bincount(inverse))[:-1]
    row_groups = np.split(rows_by_query, boundaries)

    tasks = [(Y[rows], pred[rows], qid) for rows, qid in zip(row_groups, qids)]

    grad = np.zeros(Y.shape)
    hess = np.zeros(Y.shape)
    # The context manager shuts the workers down; this function runs once per
    # boosting round, so an unclosed pool would leak processes every round.
    with multiprocessing.Pool(6) as pool:
        results = pool.map(lambdas_and_w, tasks, chunksize=1)

    # pool.map preserves task order, so results line up with row_groups.
    for rows, (lambda_val, w_val, _) in zip(row_groups, results):
        grad[rows] = lambda_val
        hess[rows] = w_val
    return grad, hess

In [16]:
# Rebuild the XGBoost matrices; `dtrain` now holds the augmented training set.
dtrain = xgb.DMatrix(X_train_new, label=y_train_new)
dtrain.set_group(group_train_new)

dtest = xgb.DMatrix(X_test)
dtest.set_group(group_test)

In [None]:
# Re-bind `Pool` to multiprocessing's implementation (an earlier cell re-imports
# catboost's Pool under this name).
from multiprocessing import Pool

params = {
    'eta': 0.5,
    'max_depth': 8,
    'eval_metric': 'ndcg@5',
    'subsample': 0.5,
    'num_parallel_tree': 6,
    # 'tree_method': 'gpu_hist',
}

# Train from scratch for 500 rounds on the augmented data with the matching objective.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dtrain, 'dtrain')],
    obj=gradient_new,
    verbose_eval=True,
)

In [18]:
# Score the test matrix with the booster trained on the augmented data (overwrites the earlier `pred`).
pred = xgb_model.predict(dtest)

In [21]:
# Export the ranking of the retrained model in submission format.
save_submission(qid_test,pred, 'easy.txt')

# ListNet
Summarize findings here