In [99]:
!pip install catboost



In [100]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ndcg_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from catboost import CatBoostRanker, Pool, CatBoostRegressor
from copy import deepcopy

In [101]:
train = pd.read_csv("data/train_df.csv")
test = pd.read_csv("data/test_df.csv")

#### NDCG метрика

In [102]:
#Усредняем NDCG по всем запросам
def NDCG(y_true, y_pred):
    """
    Формат:
    y_true = [[1, 0, 1], [0, 1]]
    y_pred = [[0, 1, 1], [0, 0]]

    Будем учитывать предсказания из одного обьекта, если он релевантен.
    """
    ndcg_sum = 0
    n = 0

    for i in range(len(y_true)):
        if len(y_true[i]) == 1 and y_true[i] == 1:
             ndcg_sum += y_pred[i][0]
             n += 1
        if len(y_true[i]) != 1:
            ndcg_sum += ndcg_score([y_true[i]], [y_pred[i]])
            n += 1

    return ndcg_sum / n


#для удобства работы с валидационной выборкой
def NDCG_df_res(df_res):
    """
    Формат
    df_res = pd.DataFrame(data={'search_id' : [..], 'y_pred' : [..], 'y_valid' : [..]})
    """
    ids = list(set(df_res['search_id']))

    y_true = []
    y_pred = []

    for id in ids:
        y_pred.append(list(df_res[df_res['search_id'] == id]['y_pred']))
        y_true.append(list(df_res[df_res['search_id'] == id]['y_valid']))

    return NDCG(y_true, y_pred)

#### Подготовим датасет

1) Заметим,что некотрые признаки принимают коснтантные значения.
Уберем их.

In [103]:
corr = train.corr();
print(corr['search_id'].isnull()[corr['search_id'].isnull()==True])
train = train.drop(['feature_0', 'feature_73', 'feature_74', 'feature_75'], axis=1)
test = test.drop(['feature_0', 'feature_73', 'feature_74', 'feature_75'], axis=1)

feature_0     True
feature_73    True
feature_74    True
feature_75    True
Name: search_id, dtype: bool


2) Уберем признаки сильно коррелирующие, к.Пирсона > 0.8

In [104]:
corr_vector = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
corr_vector = list(dict(corr_vector[corr_vector < 1][corr_vector > 0.8]).keys())
feature_list1 = set([f[0] for f in corr_vector])
feature_list2 = set([f[0] for f in corr_vector])

drop_features = list(set.union(feature_list1, feature_list2))
drop_features

['feature_55',
 'feature_53',
 'feature_63',
 'feature_62',
 'feature_46',
 'feature_65',
 'feature_12',
 'feature_71',
 'feature_3',
 'feature_76',
 'feature_78',
 'feature_11',
 'feature_54']

In [105]:
train = train.drop(drop_features, axis=1)
test = test.drop(drop_features, axis=1)

3) Подготовим search_id для удобной работы с NDCG. Разобьем тренировочный датасет на train_df и valid_df (valid_size=0.25)

In [106]:
idx = sorted(list(set(train['search_id'])))

drop_idx = []
for id in idx:
    targets = train[train['search_id'] == id]['target']
    if (np.sum(targets) == 0) and (len(targets) < 5):
        drop_idx.append(id)

idx = list(set(idx).difference(set(drop_idx)))
idx_train, idx_valid = idx[:int(len(idx)*0.75)] , idx[int(len(idx)*0.75):]
df_train, df_valid = train[train['search_id'] <= idx_train[-1]], train[train['search_id'] >= idx_valid[0]]

#### Обучение моделей

Максимальные метрики для тренировочной и валидационной выборок

In [107]:
X_train, X_valid = df_train.drop(['search_id', 'target'], axis=1), df_valid.drop(['search_id', 'target'], axis=1)
y_train, y_valid = df_train['target'], df_valid['target']

df_res_train = pd.DataFrame(data={'search_id' : df_train['search_id'], 'y_pred' : y_train, 'y_valid' : y_train})
df_res_valid = pd.DataFrame(data={'search_id' : df_valid['search_id'], 'y_pred' : y_valid, 'y_valid' : y_valid})

print("max_NDCG train =", NDCG_df_res(df_res_train), "max_NDCG valid =", NDCG_df_res(df_res_valid))

max_NDCG train = 0.21521739130434783 max_NDCG valid = 0.21023765996343693


1) Базовое решение: Логистичекая регрессия

In [98]:
param_grid = {'C': [0.001, 0.01, 1, 10],
              'solver': ['lbfgs', 'sag']}


for solver_param in param_grid['solver']:
    for c_param in param_grid['C']:
        X_train, X_valid = df_train.drop(['search_id', 'target'], axis=1), df_valid.drop(['search_id', 'target'], axis=1)
        y_train, y_valid = df_train['target'], df_valid['target']

        # попробуем стнадартизировать данные
        ss = StandardScaler()
        X_train_ss = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
        X_valid_ss = pd.DataFrame(ss.fit_transform(X_valid), columns=X_valid.columns)

        logreg = LogisticRegression(max_iter=5000, solver=solver_param, C=c_param)
        logreg.fit(X_train_ss, y_train)

        y_pred_train = logreg.predict_proba(X_train_ss)[:,1]
        y_pred_valid = logreg.predict_proba(X_valid_ss)[:,1]
        df_res_train = pd.DataFrame(data={'search_id' : df_train['search_id'], 'y_pred' : y_pred_train, 'y_valid' : y_train})
        df_res_valid = pd.DataFrame(data={'search_id' : df_valid['search_id'], 'y_pred' : y_pred_valid, 'y_valid' : y_valid})

        print(solver_param, c_param, "train = ", NDCG_df_res(df_res_train),
                               "valid = ", NDCG_df_res(df_res_valid))

lbfgs 0.001 train =  0.11256125625907597 valid =  0.11172149939159948
lbfgs 0.01 train =  0.11390205481166492 valid =  0.11242824011560545
lbfgs 1 train =  0.11611434136980149 valid =  0.11293652240768558
lbfgs 10 train =  0.11594822201083711 valid =  0.11293652240768558
sag 0.001 train =  0.11256125625907597 valid =  0.11172149939159948
sag 0.01 train =  0.11390205481166492 valid =  0.11284194100323054
sag 1 train =  0.11597202642026729 valid =  0.11293652240768558
sag 10 train =  0.11594822201083711 valid =  0.11293652240768558


Как мы видим Логистическая регрессия не дает удовлетворительных результатов - в выборке слишком много нулевых векторов. Модель стремиться предсказать нули. Попробуем более продвинутые алгоритмы, например, градиентный бустинг.

2) Градиентный бустинг sklearn

In [114]:
def train_loop(model, param_grid):
    NDCG_train_best = []
    NDCG_valid_best = []

    for lr_param in param_grid['learning_rate']:
        print("lr =", lr_param, ": ")
        for n_est_param in param_grid['n_estimators']:
            print("   n_est =", n_est_param, ": ")
            for depth_param in param_grid['depth']:
                X_train, X_valid = df_train.drop(['search_id', 'target'], axis=1), df_valid.drop(['search_id', 'target'], axis=1)
                y_train, y_valid = df_train['target'], df_valid['target']

                gb = model(learning_rate=lr_param, n_estimators=n_est_param, max_depth=depth_param, verbose=0)
                gb.fit(X_train, y_train)

                y_pred_train = gb.predict(X_train)
                y_pred_valid = gb.predict(X_valid)

                df_res_train = pd.DataFrame(data={'search_id' : df_train['search_id'], 'y_pred' : y_pred_train, 'y_valid' : y_train})
                df_res_valid = pd.DataFrame(data={'search_id' : df_valid['search_id'], 'y_pred' : y_pred_valid, 'y_valid' : y_valid})

                NDCG_train_best.append(NDCG_df_res(df_res_train))
                NDCG_valid_best.append(NDCG_df_res(df_res_valid))

                print("      max_depth =", depth_param, "train =", NDCG_df_res(df_res_train),
                                                        "valid =", NDCG_df_res(df_res_valid))
    return (NDCG_train_best, NDCG_valid_best)

In [109]:
param_grid = {'n_estimators' : [100, 300, 500],
              'learning_rate': [ 0.01, 0.1, 0.3],
              'depth': [3, 6] }

model = GradientBoostingRegressor

tr, vl = train_loop(model, param_grid)

max(tr), max(vl)

lr = 0.01 : 
   n_est = 100 : 
      max_depth = 3 train = 0.12434583274082367 valid = 0.1199896030925312
      max_depth = 6 train = 0.14901551724618017 valid = 0.1419139975973971
   n_est = 300 : 
      max_depth = 3 train = 0.136906891297934 valid = 0.12849300030407373
      max_depth = 6 train = 0.1884303953826852 valid = 0.17474315435774465
   n_est = 500 : 
      max_depth = 3 train = 0.14763875019814887 valid = 0.1376962817113187
      max_depth = 6 train = 0.2044789444140467 valid = 0.18804840235695164
lr = 0.1 : 
   n_est = 100 : 
      max_depth = 3 train = 0.15477183843238773 valid = 0.14486731550486792
      max_depth = 6 train = 0.21112712020577695 valid = 0.1931439161598087
   n_est = 300 : 
      max_depth = 3 train = 0.1931828060989805 valid = 0.17688352734620913
      max_depth = 6 train = 0.21521739130434783 valid = 0.197805269783824
   n_est = 500 : 
      max_depth = 3 train = 0.2069834490056029 valid = 0.19022002953013453
      max_depth = 6 train = 0.2152173913043

(0.21521739130434783, 0.197805269783824)

3) Градиентный бустинг Сatboost

In [116]:
# CatBoostRegressor

param_grid = {'n_estimators' : [100, 300, 500],
              'learning_rate': [ 0.01, 0.2, 0.5],
              'depth': [3, 6] }

model = CatBoostRegressor

tr, vl = train_loop(model, param_grid)

max(tr), max(vl)

lr = 0.01 : 
   n_est = 100 : 
      max_depth = 3 train = 0.11727752918093261 valid = 0.11662966014229795
      max_depth = 6 train = 0.14601817504613357 valid = 0.13873554876858965
   n_est = 300 : 
      max_depth = 3 train = 0.12530307758427978 valid = 0.12214695168489173
      max_depth = 6 train = 0.16179209875825837 valid = 0.15140445380443826
   n_est = 500 : 
      max_depth = 3 train = 0.13419081824442988 valid = 0.13058805088906758
      max_depth = 6 train = 0.17149143671808337 valid = 0.1590722479611344
lr = 0.2 : 
   n_est = 100 : 
      max_depth = 3 train = 0.15126752014066158 valid = 0.1419643871004417
      max_depth = 6 train = 0.18548039419079682 valid = 0.17268431804130926
   n_est = 300 : 
      max_depth = 3 train = 0.18809519567567415 valid = 0.1733670107163559
      max_depth = 6 train = 0.21172397053387307 valid = 0.19232282265013592
   n_est = 500 : 
      max_depth = 3 train = 0.20022654573072965 valid = 0.18511263660988697
      max_depth = 6 train = 0.2151

(0.21521739130434783, 0.19806590105198546)

Обратимся к CatBoostRanker. Минимизация функции потерь RMSE.  

In [117]:
X_train, X_valid = df_train.drop(['search_id', 'target'], axis=1), df_valid.drop(['search_id', 'target'], axis=1)
y_train, y_valid = df_train['target'], df_valid['target']

In [118]:
train_pool = Pool(
    data=X_train.values,
    label=y_train,
    group_id=df_train['search_id']
)

test_pool = Pool(
    data=X_valid.values,
    label=y_valid,
    group_id=df_valid['search_id']
)

In [126]:
default_parameters = {
    'iterations': 1000,
    'custom_metric': ['NDCG'],
    'verbose': 0,
    'random_seed': 0,
    'learning_rate' : 0.1
}

parameters = {}

In [127]:
def fit_model(loss_function, additional_params=None, train_pool=train_pool, test_pool=test_pool):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [128]:
model = fit_model('RMSE', {'custom_metric': ['NDCG']}, train_pool, test_pool)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [130]:
model.best_iteration_, model.best_score_

(999,
 {'learn': {'RMSE': 0.078011510609796},
  'validation': {'NDCG:type=Base': 0.9851887399984477,
   'RMSE': 0.08379707948358492}})

In [131]:
y_pred_train = model.predict(train_pool)
y_pred_valid = model.predict(test_pool)

df_res_valid = pd.DataFrame(data={'search_id' : df_valid['search_id'], 'y_pred' : y_pred_valid, 'y_valid' : y_valid})
df_res_train = pd.DataFrame(data={'search_id' : df_train['search_id'], 'y_pred' : y_pred_train, 'y_valid' : y_train})

NDCG_df_res(df_res_train), NDCG_df_res(df_res_valid)

(0.21470079849198195, 0.19433985710529053)

#### Обучение лучшей модели

Лучшая модель CatBoostRegeressor с метрикой на валидации 0.1980 (max 0.210). Обучим на всем тренировочном датасете и сделаем предсказания для тестовой.


In [133]:
cgb = CatBoostRegressor(learning_rate=0.5, max_depth=6, n_estimators=300, verbose=0)

cgb.fit(train.drop(['search_id', 'target'], axis=1), train['target'])

<catboost.core.CatBoostRegressor at 0x7e22cc8535e0>

In [137]:
res_pred = cgb.predict(test.drop(['search_id', 'target'], axis=1))
df_res_test = pd.DataFrame(data={'search_id' : test['search_id'], 'y_pred' : res_pred, 'y_valid' : test['target']})
NDCG_df_res(df_res_test)

0.11368645007059028