# Santander Customer Satisfaction

## Планы

* Байесовская оптимизация параметров:
    * hyperopt;
    * bayesboost;
    * moe;
    * https://github.com/HIPS/Spearmint.
* Усреднить с моделями, где выкидывал параметры, и с моделями с более сложным набором гиперпараметров, но сначала посмотреть на корреляции.
* Взвешенное усреднение.
* Vowpal Wabbit с полиномиальным представлением переменных и без него.
* Стекинг, что делать с вероятностями?
* Автоматическое решение для поиска лучшей комбинации моделей, причем идеи для более быстрого поиска лучших комбинаций моделей и их параметров можно брать из байесовской оптимизации.


## Решение

In [98]:
%matplotlib inline

import os
import random
import pickle
from importlib import reload

import numpy as np
import scipy as sp
import pandas as pd
import pylab

from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import StratifiedKFold, cross_val_score

import xgboost as xgb

import ml_training.common as common
import sklensemble.ensemblers.averagers as averagers
# Re-execute the local helper modules so that edits made to them on disk
# take effect without restarting the kernel.
reload(common)
reload(averagers)

<module 'sklensemble.ensemblers.averagers' from 'E:\\projects\\sklensemble\\ensemblers\\averagers.py'>

In [2]:
def find_constant_columns(df):
    """Return the names of the columns of ``df`` that hold fewer than two distinct values.

    Distinct values are counted with ``Series.unique()``, so a column without
    any rows is reported as constant as well.
    """
    constant = []
    for name in df.columns:
        distinct_values = df[name].unique()
        if len(distinct_values) < 2:
            constant.append(name)
    return constant

In [3]:
def find_equal_columns(df):
    """Return the names of columns that duplicate an earlier column of ``df``.

    Two columns are treated as equal when, in every row, they either hold
    equal values or are both missing. For each group of identical columns the
    first one (in column order) is kept and the later ones are reported.

    A value that is missing in only one of the two columns counts as a
    mismatch (a check based on ``(a - b).any()`` skips such rows and would
    report the columns as equal). Because no subtraction is involved, string
    columns can be compared too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.

    Returns
    -------
    list
        Names of the duplicate columns, in column order.
    """
    columns = list(df.columns)
    flagged = set()
    for position, first in enumerate(columns[:-1]):
        if first in flagged:
            # Anything equal to this column was already matched against the
            # earlier column it duplicates, so comparing again is redundant.
            continue
        left = df[first]
        left_missing = left.isnull()
        for second in columns[position + 1:]:
            if second in flagged:
                continue
            right = df[second]
            # Equal where the values match or where both sides are missing.
            same = (left == right) | (left_missing & right.isnull())
            if same.all():
                flagged.add(second)
    # Report in column order so that the result is deterministic.
    return [name for name in columns if name in flagged]

In [4]:
# Single seed shared by the CV splitter and the baseline xgboost models below.
RANDOM_STATE = 100
# Seeds only Python's built-in `random` module.
random.seed(RANDOM_STATE)

In [5]:
# Load the training table from input/train.csv (path relative to the working directory).
train_df = pd.read_csv(os.path.join("input", "train.csv"))

In [6]:
# Columns that carry no extra information: constant ones and copies of an earlier column.
constant_columns = find_constant_columns(train_df)
equal_columns = find_equal_columns(train_df)
# Merge both lists without repeating a name.
drop_columns = list(set(constant_columns) | set(equal_columns))

In [7]:
# Remove the uninformative columns. Rebinding instead of mutating in place, together with
# errors="ignore", keeps the cell re-runnable once the columns are already gone.
train_df = train_df.drop(drop_columns, axis=1, errors="ignore")

In [8]:
# Feature matrix: label-based column slice from "var3" through "var38" (both ends included).
train_x = train_df.loc[:, "var3": "var38"]
# Target labels.
train_y = train_df["TARGET"]

In [9]:
# Column masks produced by the project helpers in `common`.
# NOTE(review): the exact meaning of `tol` and `trashhold` is defined in ml_training.common — confirm there.
# The trailing comments (5e-7, 0.995) look like alternative settings that were tried — confirm.
if_mask = common.get_independent_features_mask(train_x.values, tol=0, is_plot=False) # 5e-7
uncorr_mask = common.find_corr_features_mask(train_x.values, trashhold=1.1) # 0.995
# Keep a feature only if both masks keep it.
res_mask = if_mask & uncorr_mask
# Number of features kept by each mask and by their intersection.
if_mask.sum(), uncorr_mask.sum(), res_mask.sum()

(306, 306, 306)

In [10]:
# Restrict the features to the columns selected by the combined mask.
# NOTE(review): this rebinds train_x, so re-running the cell applies the mask to the already filtered frame.
train_x = train_x.loc[:, res_mask]

In [11]:
# 5-fold stratified, shuffled split over the training labels (legacy sklearn.cross_validation API,
# which takes the labels in the constructor); reused by the CV calls below.
cv = StratifiedKFold(train_y, 5, shuffle=True, random_state=RANDOM_STATE)

In [100]:
# Baseline model and an identically configured copy; the copy is the one fitted with sample weights
# in the (currently commented-out) fit calls further down.
xgbc = xgb.XGBClassifier(n_estimators=150, max_depth=3, seed=RANDOM_STATE)
xgbc_weighted = xgb.XGBClassifier(n_estimators=150, max_depth=3, seed=RANDOM_STATE)
# Larger models: more, deeper trees with a lower learning rate plus row/column subsampling.
xgbc_hard = xgb.XGBClassifier(n_estimators=350, max_depth=5, seed=4242, colsample_bytree=0.85, learning_rate=0.03,
                              subsample=0.95)
xgbc_very_hard = xgb.XGBClassifier(n_estimators=570, max_depth=5, seed=1234, colsample_bytree=0.7, learning_rate=0.02,
                              subsample=0.68)
# Ensemble of the two baseline models.
# NOTE(review): the second argument presumably flags which estimators receive sample weights — confirm in sklensemble.
bc = averagers.BlendingClassifier([xgbc, xgbc_weighted], [False, True])

In [96]:
# Candidate grids; each holds a single combination, so the "search" is a CV evaluation of that setting.
baseline_params = {"n_estimators": [150], "max_depth": [3], "seed": [RANDOM_STATE]}
very_hard_params = {"n_estimators": [570], "max_depth": [5], "seed": [1234], "learning_rate": [0.02],
                    "subsample": [0.68], "colsample_bytree": [0.7]}
# Grid evaluated by this cell; switch to baseline_params to score the baseline setting instead.
params = very_hard_params
# refit=False: only the CV scores are needed here, the final model is fitted in a separate cell.
gs = GridSearchCV(xgbc, params, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, refit=False, verbose=10)
gs.fit(train_x, train_y)

[Parallel(n_jobs=3)]: Done   6 out of   5 | elapsed:  8.3min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   6 out of   5 | elapsed:  8.3min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   6 out of   5 | elapsed: 14.8min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   6 out of   5 | elapsed: 14.8min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 14.8min finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=True, random_state=100),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=100, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'max_depth': [5], 'n_estimators': [570], 'seed': [1234], 'subsample': [0.68], 'learning_rate': [0.02], 'colsample_bytree': [0.7]},
       pre_dispatch='2*n_jobs', refit=False, scoring='roc_auc', verbose=10)

In [97]:
# Mean and std of the CV ROC AUC per parameter combination (legacy `grid_scores_` attribute).
gs.grid_scores_

[mean: 0.84089, std: 0.00530, params: {'max_depth': 5, 'seed': 1234, 'n_estimators': 570, 'subsample': 0.68, 'learning_rate': 0.02, 'colsample_bytree': 0.7}]

mean: 0.84068, std: 0.00541, params: {'max_depth': 5, 'n_estimators': 350, 'colsample_bytree': 0.85, 'learning_rate': 0.03, 'subsample': 0.95, 'seed': 4242}

mean: 0.83952, std: 0.00440, params: {'seed': 100, 'n_estimators': 150, 'max_depth': 3, 'cc_trashhold': 0.995}

mean: 0.83896, std: 0.00439, params: {'max_depth': 3, 'n_estimators': 150, 'seed': 100, 'qr_tol': 0}

mean: 0.83866, std: 0.00442, params: {'max_depth': 3, 'n_estimators': 150, 'seed': 100, 'qr_tol': 5e-7}

mean: 0.83962, std: 0.00407, params: {'n_estimators': 150, 'seed': 100, 'max_depth': 3, 'fi_trashhold': 0}

mean: 0.83985, std: 0.00420, params: {'n_estimators': 150, 'seed': 100, 'max_depth': 3, 'fi_trashhold': 0.0025}

mean: 0.83874, std: 0.00497, params: {'n_estimators': 150, 'seed': 100, 'max_depth': 3, 'fi_trashhold': 0.0025, 'qr_tol': 5e-7}

In [None]:
# Cross-validated ROC AUC of the baseline model. The worker count is read from common.NCPU,
# as in the grid-search cell; a bare `NCPU` name is not defined in this notebook.
cross_val_score(xgbc, train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=10)

In [None]:
# NOTE(review): `params` is assigned here but not passed to the call below — confirm whether it is still needed.
params = {"n_estimators": 150, "max_depth": 3, "seed": RANDOM_STATE}
# Weighted cross-validation of the baseline model via the project helper.
# NOTE(review): `weights` is built in a later cell, which has to be run before this one.
common.cross_val_score_with_weights(xgbc, train_x.values, train_y.values, weights, "roc_auc", cv)

In [14]:
# Per-sample weights: rows with a zero label get w0, all other rows get w1, where w1 is
# half of (number of label-0 rows / number of label-1 rows).
class_counts = train_y.value_counts()
w0 = 1
w1 = class_counts[0] / class_counts[1] * 0.5
weights = np.where(train_y.values.astype(bool), w1, w0)

In [101]:
# Only the model used for the submission below is fitted; the other fit calls are kept
# commented out as alternatives.
#xgbc.fit(train_x.values, train_y.values)
#xgbc_weighted.fit(train_x.values, train_y.values, weights)
#xgbc_hard.fit(train_x.values, train_y.values)
xgbc_very_hard.fit(train_x.values, train_y.values)
#bc.fit(train_x.values, train_y.values, weights)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=570, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1234, silent=True, subsample=0.68)

In [37]:
# NOTE(review): `ac` is not defined in any cell visible in this notebook (the ensemble above is bound
# to `bc`) — presumably left over from an earlier session; confirm which estimator was scored here.
common.cross_val_score_with_weights(ac, train_x.values, train_y.values, weights, "roc_auc", cv)

array([ 0.84504669,  0.84399704,  0.84163646,  0.83647443,  0.8287601 ])

In [95]:
# Pairwise correlation between the models' test-set predictions (second predict_proba column),
# one row per model in the order listed.
compared_models = (xgbc, xgbc_weighted, xgbc_hard)
np.corrcoef(np.vstack([model.predict_proba(test_x)[:, 1] for model in compared_models]))

array([[ 1.        ,  0.9226716 ,  0.98413532],
       [ 0.9226716 ,  1.        ,  0.91288842],
       [ 0.98413532,  0.91288842,  1.        ]])

In [41]:
# Load the test table and apply the same column removal and feature selection as for the training data.
test_df = pd.read_csv(os.path.join("input", "test.csv")).drop(drop_columns, axis=1)
test_x = test_df.loc[:, "var3": "var38"].loc[:, res_mask]

In [102]:
# Second predict_proba column of the fitted model on the test features; written out as TARGET below.
test_y = xgbc_very_hard.predict_proba(test_x)[:, 1]

In [90]:
# Per-class precision, recall, F-score and support on the training data.
# NOTE(review): `ac` is not defined in any cell visible in this notebook — confirm which estimator is meant.
precision_recall_fscore_support(train_y, ac.predict(train_x))

(array([ 0.96075903,  0.92857143]),
 array([ 0.99997261,  0.00864362]),
 array([ 0.97997369,  0.0171278 ]),
 array([73012,  3008], dtype=int64))

In [103]:
# Write the submission: one row per test ID with the predicted value in the TARGET column.
# NOTE(review): if ID is integer-typed, stacking it with float predictions yields a float array —
# confirm that common.save_output writes the IDs in the expected format.
submission = np.column_stack((test_df["ID"].values, test_y))
common.save_output(submission, ["ID", "TARGET"])