In [188]:
import pandas as pd
import numpy as np
import gzip
import seaborn as sns
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
import datetime
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

# Чтение данных

In [133]:
full_train_data = load_svmlight_file('data/train.txt.gz', query_id=True, dtype=np.float16)

In [125]:
test_data = load_svmlight_file('data/test.txt.gz', query_id=True, dtype=np.float16)

In [144]:
# Выделяем трейн и валидацию
cut_off = 373134
train_data = (full_train_data[0][:cut_off], full_train_data[1][:cut_off], full_train_data[2][:cut_off])
valid_data = (full_train_data[0][cut_off:], full_train_data[1][cut_off:], full_train_data[2][cut_off:])

# Model

In [213]:
#full_train
full_train_dataset = xgb.DMatrix(full_train_data[0], label=full_train_data[1], missing=0)
full_train_unique, full_train_counts = np.unique(full_train_data[2], return_counts=True)
full_train_dataset.set_group(full_train_counts)

In [4]:
#train
train_dataset = xgb.DMatrix(train_data[0], label=train_data[1], missing=0)
train_unique, train_counts = np.unique(train_data[2], return_counts=True)
train_dataset.set_group(train_counts)

In [147]:
#valid
valid_dataset = xgb.DMatrix(valid_data[0], label=valid_data[1], missing=0)
valid_unique, valid_counts = np.unique(valid_data[2], return_counts=True)
valid_dataset.set_group(valid_counts)

In [26]:
#test
test_dataset = xgb.DMatrix(test_data[0], label=test_data[1], missing=0)
test_unique, test_counts = np.unique(test_data[2], return_counts=True)
test_dataset.set_group(test_counts)

## подбор параметров

#model
params = {'objective': 'rank:ndcg', 'eta':0.1, 'gamma':1.0, 'min_child_weight':0.1, 'max_depth': 6}

In [193]:
def hyperopt_xgb_score(params):
    model = xgb.train(params, train_dataset)
    eval_result = model.eval(valid_dataset)
    score = float(eval_result[eval_result.index('.')-1:])
    return -score

In [194]:
simple_space_xgb = {
            'objective': 'rank:ndcg', 
            'gamma':hp.quniform('gamma', 0.025, 2, 0.25),
            'min_child_weight':0.1, 
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
            'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
}

In [195]:
best = fmin(fn=hyperopt_xgb_score, space=simple_space_xgb, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

100%|█████████████████████████████████████████████████████████| 10/10 [10:12<00:00, 72.84s/trial, best loss: -0.880678]
best:
{'eta': 0.4, 'gamma': 0.75, 'max_depth': 9, 'n_estimators': 885}


## final model

In [211]:
additional_params = {'objective': 'rank:ndcg', 'min_child_weight':0.1}
final_params = dict(best, **additional_params)

In [221]:
final_params

{'eta': 0.4,
 'gamma': 0.75,
 'max_depth': 9,
 'n_estimators': 885,
 'objective': 'rank:ndcg',
 'min_child_weight': 0.1}

In [214]:
final_model = xgb.train(final_params, full_train_dataset)

In [215]:
#predictions
#full_train_prediction = final_model.predict(full_train_dataset)
test_prediction = final_model.predict(test_dataset)

# Submission

In [216]:
sample_submission = pd.read_csv('data/sample.made.fall.2019')

In [217]:
sample_submission['predicted'] = test_prediction

In [218]:
sample_submission.head()

Unnamed: 0,QueryId,DocumentId,predicted
0,19945,1,0.313466
1,19945,2,0.464219
2,19945,3,0.813083
3,19945,4,1.103751
4,19945,5,0.724924


In [219]:
# Находим топовые 5 документы для каждого запроса
df_agg = sample_submission.groupby(['QueryId', 'DocumentId']).agg({'predicted':'sum'})
g = df_agg['predicted'].groupby(level=0, group_keys=False)
sorted_g = g.nlargest(5)

In [220]:
# Готовим и сохраняем результат
result = sorted_g.reset_index().drop(['predicted'], axis=1)
result.to_csv('submission/submission_{}'.format(datetime.datetime.today().strftime("%y%m%d_%H-%M-%S")),
             index=False)