# Santander Customer Satisfaction

## Планы

* Оценить полезность фич.
* Добавить небольшой БГК (размер потестировать).
* Попробовать блендинг 10 xgbc с разными seed.
* Можно попробовать подобрать лучший seed по LB (лидерборду).
* Просмотреть форум в конце.

## Решение

### Общее начало

In [1]:
%matplotlib inline

import os
import copy
import random
import pickle
from importlib import reload

import numpy as np
import scipy as sp
import pandas as pd
import pylab
import hyperopt

from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import StratifiedKFold, cross_val_score

import xgboost as xgb

import ml_training.common as common
import ml_training.vwutils as vwutils
import sklensemble.ensemblers.averagers as averagers

In [None]:
# Re-import ml_training.common so that edits to the module take effect in this session.
reload(common)

In [None]:
# Re-import ml_training.vwutils so that edits to the module take effect in this session.
reload(vwutils)

In [None]:
# Re-import sklensemble.ensemblers.averagers so that edits to the module take effect in this session.
reload(averagers)

In [2]:
def find_constant_columns(df):
    """Return the names of columns that hold fewer than two distinct values.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        List of column names, in the frame's column order.
    """
    constant = []
    for name in df.columns:
        distinct_values = df[name].unique()
        if len(distinct_values) < 2:
            constant.append(name)
    return constant

In [3]:
def find_equal_columns(df):
    """Return the names of columns that duplicate an earlier column.

    Two columns are duplicates when every row holds equal values, or is
    missing in both. The first column of each group of duplicates is kept;
    all later members are reported.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        List of redundant column names, in the frame's column order.
    """
    columns = list(df.columns)
    duplicates = set()
    for col_num, column1 in enumerate(columns[:-1]):
        # A column already known to be a copy cannot reveal new duplicates:
        # anything equal to it also equals the column it copies.
        if column1 in duplicates:
            continue
        left = df[column1]
        left_missing = left.isnull()
        for column2 in columns[col_num + 1:]:
            if column2 in duplicates:
                continue
            right = df[column2]
            # Compare by equality instead of subtraction: a difference of NaN
            # is skipped by .any(), which made columns that differ only where
            # one of them is missing look identical, and subtraction raises on
            # non-numeric columns.
            same = (left == right) | (left_missing & right.isnull())
            if same.all():
                duplicates.add(column2)
    # Report in column order so the result is deterministic.
    return [column for column in columns if column in duplicates]

In [4]:
# Seed value reused below for the cross-validation splitter.
RANDOM_STATE = 1
# Seeds Python's built-in `random` module only.
random.seed(RANDOM_STATE)

In [5]:
# Load the training set and split the label column off from the features.
train_df = pd.read_csv(os.path.join("input", "train.csv"))
train_y = train_df["TARGET"]
train_df.drop(["ID", "TARGET"], inplace=True, axis=1)
# Engineered feature "n0": number of zero-valued features in each row.
# Computed as one vectorized comparison and row sum rather than a Python
# lambda applied row by row, which is far slower on a wide frame.
train_df["n0"] = (train_df == 0).sum(axis=1)

In [6]:
# Columns that carry no information: constant ones and copies of earlier columns.
constant_columns = find_constant_columns(train_df)
equal_columns = find_equal_columns(train_df)
# De-duplicated union of both lists; reused later to drop the same columns from the test set.
drop_columns = list(set(constant_columns).union(equal_columns))

In [7]:
# Prepend a column of ones named "intercept". It is added after drop_columns was
# computed, so it is not removed as a constant column.
train_df.insert(0, "intercept", 1)
# Training feature matrix: all columns except the constant/duplicate ones.
train_x = train_df.drop(drop_columns, axis=1)

In [None]:
#if_mask = common.get_independent_features_mask(train_x.values, tol=5e-7, is_plot=False) # 5e-7
#uncorr_mask = common.find_corr_features_mask(train_x.values, trashhold=0.995) # 0.995
#res_mask = if_mask & uncorr_mask
#if_mask.sum(), uncorr_mask.sum(), res_mask.sum()

In [None]:
#train_x_if = train_x.loc[:, if_mask]
#train_x_uc = train_x.loc[:, uncorr_mask]

In [None]:
# Shuffled, stratified 5-fold splitter built from the labels; passed as `cv` to the
# cross-validation calls below. This is the sklearn.cross_validation form, where
# the labels go to the constructor.
cv = StratifiedKFold(train_y, 5, shuffle=True, random_state=RANDOM_STATE)

### Создание классификаторов

In [None]:
# Three XGBoost classifiers with identical hyper-parameters that differ only in seed.
_seeded_xgb_params = dict(
    n_estimators=560,
    max_depth=5,
    colsample_bytree=0.7,
    learning_rate=0.02,
    subsample=0.68,
)
xgbc = xgb.XGBClassifier(seed=1234, **_seeded_xgb_params)
xgbc2 = xgb.XGBClassifier(seed=1, **_seeded_xgb_params)
xgbc3 = xgb.XGBClassifier(seed=2, **_seeded_xgb_params)

# Classifier built from a stored hyper-parameter dictionary; no seed is passed here.
hyperopt_params = {
    'min_child_weight': 4.829459240879128,
    'gamma': 0.0,
    'reg_alpha': 1.7252032215472213,
    'max_depth': 6,
    'subsample': 0.8727431980042409,
    'learning_rate': 0.012397370469636557,
    'reg_lambda': 0.38434128103560095,
    'n_estimators': 690,
    'colsample_bytree': 0.5840573681076153,
}
xgbc_h = xgb.XGBClassifier(**hyperopt_params)

In [9]:
# Variant of xgbc with slightly shifted colsample_bytree, learning_rate and subsample.
# This is the model that is fitted and used for the submission further down.
xgbc_r = xgb.XGBClassifier(n_estimators=560, max_depth=5, seed=1234, colsample_bytree=0.701, learning_rate=0.0202048,
                            subsample=0.6815)

In [None]:
# Blend of independent copies of the three seed variants; no member is marked
# as weighted and no member gets a feature mask.
estimators = [copy.deepcopy(model) for model in (xgbc, xgbc2, xgbc3)]
is_weighted = [False] * 3
x_mask = [None] * 3
bc = averagers.BlendingClassifier(estimators, is_weighted, x_mask)

### Grid search

In [None]:
# Every parameter lists a single candidate, so this grid holds exactly one
# configuration and the search amounts to one cross-validated ROC AUC evaluation.
params = {"n_estimators": [570], "max_depth": [5], "seed": [1234], "learning_rate": [0.02], "subsample": [0.68],
          "colsample_bytree": [0.7]}
# refit=False: only the cross-validation scores are wanted, no final model is trained.
gs = GridSearchCV(xgbc, params, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, refit=False, verbose=10)
gs.fit(train_x, train_y)

### Веса

In [None]:
# Per-sample weights: rows with a falsy label get w0, rows with a truthy label
# get w1, which is half the ratio of label-0 count to label-1 count.
w0 = 1
class_counts = train_y.value_counts()
w1 = class_counts[0] / class_counts[1] * 0.5
weights = np.array([w1 if label else w0 for label in train_y])

### Обучение классификаторов

In [None]:
# Fit the single-seed model on the full training set (plain arrays, no sample weights).
xgbc.fit(train_x.values, train_y.values)

In [None]:
# Fit the blend on the full training set; the per-sample weights go in as the third argument.
bc.fit(train_x.values, train_y.values, weights)

In [11]:
# Fit the submission model on the full training set, with "auc" as the evaluation metric.
xgbc_r.fit(train_x.values, train_y.values, eval_metric="auc")

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.701,
       gamma=0, learning_rate=0.0202048, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=560, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1234, silent=True, subsample=0.6815)

### Выходная оценка

In [12]:
# Load the test set; the IDs are kept aside for the submission file.
test_df = pd.read_csv(os.path.join("input", "test.csv"))
test_id = test_df["ID"].values
test_df.drop(["ID"], inplace=True, axis=1)

# Apply the same feature engineering as for the training frame, in the same order:
# zero count first, then the intercept column, then drop the redundant columns.
# The zero count is one vectorized comparison and row sum rather than a Python
# lambda applied row by row, which is far slower on a wide frame.
test_df["n0"] = (test_df == 0).sum(axis=1)
test_df.insert(0, "intercept", 1)
test_x = test_df.drop(drop_columns, axis=1)

# Clamp each test feature to the [min, max] range seen in the training data.
for col in test_x.columns:
    mn = train_df[col].min()
    mx = train_df[col].max()

    test_x.loc[test_x[col] < mn, col] = mn
    test_x.loc[test_x[col] > mx, col] = mx

In [13]:
# Keep column 1 of predict_proba as the submission score.
test_y = xgbc_r.predict_proba(test_x.values)[:, 1]

In [14]:
# Show the first ten predicted scores.
test_y[: 10]

array([ 0.04610872,  0.05319918,  0.00164673,  0.00973685,  0.00163208,
        0.23983237,  0.02952436,  0.16404244,  0.0331476 ,  0.0205161 ], dtype=float32)

In [None]:
# Write the (ID, TARGET) pairs through the project helper.
# NOTE(review): stacking test_id and test_y into one ndarray coerces both to a
# common dtype, so the IDs are presumably stored as floats here - confirm that
# common.save_output writes them back as integers.
common.save_output(np.array([test_id, test_y]).T, ["ID", "TARGET"])

### Тестирование классификаторов

In [None]:
# Cross-validated ROC AUC of a fresh copy of xgbc on the shared folds.
cross_val_score(copy.deepcopy(xgbc), train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=1)

In [None]:
# Same call as the previous cell: cross-validated ROC AUC of a fresh copy of xgbc.
cross_val_score(copy.deepcopy(xgbc), train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=1)

In [None]:
# Cross-validated ROC AUC of a fresh copy of the blend, with the sample weights
# passed to the project helper.
common.cross_val_score_with_weights(copy.deepcopy(bc), train_x.values, train_y.values, weights, "roc_auc", cv)

In [None]:
# Correlation matrix between the test-set scores of five models (one row per model).
# NOTE(review): xgbc_weighted, xgbc_if and xgbc_uc are not defined anywhere in the
# visible notebook, so this cell raises NameError unless they are created elsewhere.
np.corrcoef(np.vstack([xgbc.predict_proba(test_x.values)[:, 1], xgbc_weighted.predict_proba(test_x.values)[:, 1],
                       xgbc_if.predict_proba(test_x.values)[:, 1], xgbc_uc.predict_proba(test_x.values)[:, 1],
                       bc.predict_proba(test_x.values)[:, 1]]))

In [None]:
# Precision / recall / F-score / support of xgbc on the training data itself (in-sample).
precision_recall_fscore_support(train_y.values, xgbc.predict(train_x.values))

In [None]:
# In-sample precision / recall / F-score / support of xgbc_weighted.
# NOTE(review): xgbc_weighted is not defined in the visible notebook.
precision_recall_fscore_support(train_y.values, xgbc_weighted.predict(train_x.values))

In [None]:
# In-sample precision / recall / F-score / support of xgbc_if.
# NOTE(review): xgbc_if is not defined in the visible notebook.
precision_recall_fscore_support(train_y.values, xgbc_if.predict(train_x.values))

In [None]:
# In-sample precision / recall / F-score / support of xgbc_uc.
# NOTE(review): xgbc_uc is not defined in the visible notebook.
precision_recall_fscore_support(train_y.values, xgbc_uc.predict(train_x.values))

In [None]:
# In-sample precision / recall / F-score / support of the blend.
precision_recall_fscore_support(train_y.values, bc.predict(train_x.values))

In [None]:
# Scatter plot of training-set scores: single model (x axis) against the blend (y axis).
pylab.figure(figsize=(16, 6))

pylab.grid(True)
pylab.scatter(xgbc.predict_proba(train_x.values)[:, 1], bc.predict_proba(train_x.values)[:, 1], alpha=0.5, color = 'red')

## Hyperopt

In [None]:
def hyperopt_train_test(params):
    """Score one hyper-parameter set by cross-validated ROC AUC.

    Args:
        params: keyword arguments forwarded to ``xgb.XGBClassifier``.

    Returns:
        Mean of the per-fold ROC AUC scores over the notebook-level ``cv``
        folds, evaluated on ``train_x`` / ``train_y``.
    """
    candidate = xgb.XGBClassifier(**params)
    fold_scores = cross_val_score(
        candidate,
        train_x,
        train_y,
        scoring="roc_auc",
        cv=cv,
        n_jobs=common.NCPU,
        verbose=1,
    )
    return fold_scores.mean()

In [None]:
# Hyperopt search space for the XGBClassifier keyword arguments.
# NOTE(review): the integer parameters (n_estimators, max_depth) are declared with
# hp.choice, i.e. as a pick from a list of options - check whether an ordered
# integer distribution such as hp.quniform would suit the optimizer better.
space = {
    "n_estimators": hyperopt.hp.choice("n_estimators", range(400, 800)),
    "max_depth": hyperopt.hp.choice("max_depth", range(3, 9)),
    "colsample_bytree": hyperopt.hp.uniform("colsample_bytree", 0.5, 0.95),
    "learning_rate": hyperopt.hp.uniform("learning_rate", 0.005, 0.03),
    "subsample": hyperopt.hp.uniform("subsample", 0.5, 0.95),
    "min_child_weight": hyperopt.hp.uniform("min_child_weight", 1.0, 5.0),
    "gamma": hyperopt.hp.choice("gamma", [0.0, 0.05, 0.1]),
    "reg_lambda": hyperopt.hp.uniform("reg_lambda", 0.0, 1.0),
    "reg_alpha": hyperopt.hp.uniform("reg_alpha", 0.0, 3.0),
}

In [None]:
# Progress trackers shared with f() across hyperopt evaluations.
count = 0
# Best cross-validated score seen so far. Deliberately not called `best`: the
# search cell rebinds `best` to the value returned by hyperopt.fmin, which would
# replace this number and break the `>` comparison the next time a search is started.
best_score = 0
best_params = {}


def f(params):
    """Hyperopt objective: evaluate ``params`` and report progress.

    Args:
        params: one sampled point of the search space, forwarded to
            ``hyperopt_train_test``.

    Returns:
        Dict with the negated score as ``"loss"`` (hyperopt minimizes) and
        ``hyperopt.STATUS_OK`` as ``"status"``.
    """
    global best_score, count, best_params
    acc = hyperopt_train_test(params)
    if acc > best_score:
        best_score = acc
        best_params = params
        print("new best - {0}, using - {1}".format(acc, params))

    # The counter is printed before it is incremented, so the first call reports 0.
    print("iters - {0}, best_acc - {1}".format(count, best_score))
    count += 1
    return {"loss": -acc, "status": hyperopt.STATUS_OK}

In [None]:
# Run 300 TPE-guided evaluations of f over `space`; `trials` records every evaluation.
# NOTE(review): for hp.choice dimensions fmin reports the index of the chosen option
# rather than its value - hyperopt.space_eval(space, best) maps it back; confirm
# before reusing the result as classifier parameters.
trials = hyperopt.Trials()
best = hyperopt.fmin(f, space, algo=hyperopt.tpe.suggest, max_evals=300, trials=trials)

### VW

In [None]:
# Export the training features together with the labels via vwutils.make_vw_input
# (presumably Vowpal Wabbit input format - confirm in vwutils).
input_train_file = os.path.join("temp", "scs.input_train")
vwutils.make_vw_input(train_x, input_train_file, y=train_y)

In [None]:
# Export the test features the same way; no labels are passed for the test set.
input_test_file = os.path.join("temp", "scs.input_test")
vwutils.make_vw_input(test_x, input_test_file)

### Тестирование кода

In [None]:
# Small fixtures for quick experiments: the first 1000 rows of the training
# features, labels and weights, and the first 1000 rows of the test features.
x = train_x.iloc[: 1000].values
y = train_y.iloc[: 1000].values
w = weights[: 1000]
x_t = test_x.iloc[: 1000].values