# Santander Customer Satisfaction

## Планы

* Взять свой xgboost дома.
* Собрать его под mingw.
* Разобраться с pull request.


* Байесовская оптимизация параметров:
    * hyperopt;
    * bayesboost;
    * moe;
    * https://github.com/HIPS/Spearmint.
* Попробовать преобразования признаков.
* AdaBoost.
* Комбинации моделей:
    * простое усреднение;
    * проверить корреляцию результатов;
    * усреднение по правильным классам;
    * оптимизировать коэффициенты перед моделями.
* Фреймворк для объединения моделей.
* Автоматический поиск лучшей комбинации моделей; идеи для ускорения поиска комбинаций моделей и их параметров можно взять из байесовской оптимизации.


## Решение

In [1]:
%matplotlib inline

import os
import random
import multiprocessing
from importlib import reload

import numpy as np
import scipy as sp
import pandas as pd
import pylab

from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import StratifiedKFold, cross_val_score
import xgboost as xgb

import ml_training.common as common
reload(common)

<module 'ml_training.common' from 'F:\\Documents\\Projects\\ml_training\\common.py'>

In [2]:
def find_constant_columns(df):
    """Return the names of columns holding fewer than two distinct values.

    NaN counts as a value of its own, so a column that mixes NaN with a
    single other value is not reported. Names keep ``df.columns`` order.
    """
    constant = []
    for name in df.columns:
        distinct_values = df[name].unique()
        if len(distinct_values) < 2:
            constant.append(name)
    return constant

In [3]:
def find_equal_columns(df):
    """Return the names of columns that duplicate an earlier column.

    Two columns are equal when every row holds the same value in both, or is
    missing (NaN/None) in both. For each group of equal columns the first one
    (in ``df.columns`` order) is kept and the later ones are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.

    Returns
    -------
    list
        Names of the redundant columns, in ``df.columns`` order.
    """
    columns = list(df.columns)
    duplicates = []
    flagged = set()
    for position, column1 in enumerate(columns[:-1]):
        # A column already known to copy an earlier one cannot reveal new
        # duplicates: anything equal to it is equal to that earlier column.
        if column1 in flagged:
            continue
        first = df[column1]
        first_missing = first.isnull()
        for column2 in columns[position + 1:]:
            if column2 in flagged:
                continue
            second = df[column2]
            # Compare with == rather than subtraction: it also works for
            # non-numeric columns, and a NaN facing a real value counts as a
            # difference instead of being silently skipped.
            same = (first == second) | (first_missing & second.isnull())
            if same.all():
                flagged.add(column2)
                duplicates.append(column2)
    return duplicates

In [4]:
# Seed shared by the cross-validation split and the models below.
RANDOM_STATE = 100
# Use every core but one for parallel jobs. Clamp to at least one worker:
# on a single-core machine cpu_count() - 1 is 0, which is not a valid n_jobs.
NCPU = max(1, multiprocessing.cpu_count() - 1)
# Seeds only Python's built-in random module.
random.seed(RANDOM_STATE)

In [5]:
# Load the training set; the path is relative to the current working directory.
train_df = pd.read_csv(os.path.join("input", "train.csv"))

In [6]:
# Columns that carry no extra information: constant ones and exact copies of
# another column. The merged list has no meaningful order.
constant_columns = find_constant_columns(train_df)
equal_columns = find_equal_columns(train_df)
drop_columns = list(set(constant_columns) | set(equal_columns))

In [7]:
# Remove the redundant columns from the training frame in place
# (axis=1 selects columns rather than rows).
train_df.drop(drop_columns, inplace=True, axis=1)

In [8]:
# Features: every column from "var3" through "var38" (label slices in .loc
# include both endpoints). Target: the "TARGET" column.
train_x = train_df.loc[:, "var3": "var38"]
train_y = train_df["TARGET"]

In [9]:
# Build two boolean feature masks with project helpers and intersect them.
# NOTE(review): the helpers live in ml_training.common and are not visible
# here; judging by their names they select independent and weakly correlated
# features - confirm against that module.
# With tol=0 and trashhold=1.1 all three masks select the same number of
# features (306) in the recorded output. The trailing values (5e-7, 0.995)
# look like alternative settings that were tried - TODO confirm.
if_mask = common.get_independent_features_mask(train_x.values, tol=0, is_plot=False) # 5e-7
uncorr_mask = common.find_corr_features_mask(train_x.values, trashhold=1.1) # 0.995
res_mask = if_mask & uncorr_mask
# Displayed as the cell result: how many features each mask keeps.
if_mask.sum(), uncorr_mask.sum(), res_mask.sum()

(306, 306, 306)

In [10]:
# Keep only the feature columns selected by the combined boolean mask.
train_x = train_x.loc[:, res_mask]

In [11]:
# Shuffled, stratified 5-fold split of the training labels with a fixed seed.
cv = StratifiedKFold(
    train_y,
    n_folds=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# Baseline gradient-boosted classifier: 150 boosting rounds, trees of depth 3,
# seeded with the shared RANDOM_STATE.
xgbc = xgb.XGBClassifier(n_estimators=150, max_depth=3, seed=RANDOM_STATE)

In [None]:
# Grid with a single candidate: the search is used only to get the
# cross-validated ROC AUC of this configuration. refit=False means no final
# model is trained on the full data by the search itself.
params = {
    "n_estimators": [150],
    "max_depth": [3],
    "seed": [RANDOM_STATE],
}
gs = GridSearchCV(
    estimator=xgbc,
    param_grid=params,
    scoring="roc_auc",
    n_jobs=NCPU,
    refit=False,
    cv=cv,
    verbose=10,
)
gs.fit(train_x, train_y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Show the cross-validated score summary (mean, std, params) per candidate.
gs.grid_scores_

mean: 0.84068, std: 0.00541, params: {'max_depth': 5, 'n_estimators': 350, 'colsample_bytree': 0.85, 'learning_rate': 0.03, 'subsample': 0.95, 'seed': 4242}

mean: 0.83952, std: 0.00440, params: {'seed': 100, 'n_estimators': 150, 'max_depth': 3, 'cc_trashhold': 0.995}

mean: 0.83896, std: 0.00439, params: {'max_depth': 3, 'n_estimators': 150, 'seed': 100, 'qr_tol': 0}

mean: 0.83866, std: 0.00442, params: {'max_depth': 3, 'n_estimators': 150, 'seed': 100, 'qr_tol': 5e-7}

mean: 0.83962, std: 0.00407, params: {'n_estimators': 150, 'seed': 100, 'max_depth': 3, 'fi_trashhold': 0}

mean: 0.83985, std: 0.00420, params: {'n_estimators': 150, 'seed': 100, 'max_depth': 3, 'fi_trashhold': 0.0025}

mean: 0.83874, std: 0.00497, params: {'n_estimators': 150, 'seed': 100, 'max_depth': 3, 'fi_trashhold': 0.0025, 'qr_tol': 5e-7}

In [None]:
# Fit the classifier on the whole training set. The grid search ran with
# refit=False, so it did not leave a fitted estimator behind.
# .values passes plain NumPy arrays instead of pandas objects.
xgbc.fit(train_x.values, train_y.values)

In [None]:
# Prepare the test set exactly like the training set: drop the same redundant
# columns, take the same feature range, apply the same feature mask.
test_df = pd.read_csv(os.path.join("input", "test.csv"))
test_df = test_df.drop(drop_columns, axis=1)
test_x = test_df.loc[:, "var3": "var38"].loc[:, res_mask]

In [None]:
# Predicted probability of the class at index 1 for every test row.
# The model was fitted on a plain NumPy array (train_x.values), so the test
# features are passed as an array too: a DataFrame would carry column names
# that the fitted booster never saw.
test_y = xgbc.predict_proba(test_x.values)[:, 1]

In [None]:
# Hand a two-column (ID, TARGET) table to the project helper that saves it.
# NOTE(review): np.array([...]) gives both columns one common dtype, so the
# IDs are presumably upcast to float alongside the probabilities - confirm
# that common.save_output writes them back as integers.
common.save_output(np.array([test_df["ID"].values, test_y]).T, ["ID", "TARGET"])

In [None]:
#abc = AdaBoostClassifier(base_estimator=xgbc, random_state=RANDOM_STATE, )

In [None]:
#cross_val_score(abc, train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=NCPU, verbose=10)