# Santander Customer Satisfaction

## Планы

* Просмотреть все важные переменные, построить графики, сравнить как себя ведут при разных значениях выходной переменной
* https://www.kaggle.com/c/santander-customer-satisfaction/forums/t/20275/data-preprocessing-xgb-python
* Изучить закладку в Разное

## Решение

### Общее начало

In [1]:
%matplotlib inline

import os
import copy
import random
import pickle
from importlib import reload

import numpy as np
import scipy as sp
import pandas as pd
import pylab
import hyperopt

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import get_scorer

import xgboost as xgb

import ml_training.common as common
import ml_training.vwutils as vwutils
import sklensemble.ensemblers.averagers as averagers

In [16]:
reload(common)

<module 'ml_training.common' from 'E:\\Documents\\Projects\\ml_training\\common.py'>

In [None]:
reload(vwutils)

In [None]:
reload(averagers)

In [2]:
def find_constant_columns(df):
    return [column for column in df.columns if len(df[column].unique()) < 2]

In [3]:
def find_equal_columns(df):
    equal_columns = []
    for col_num, column1 in enumerate(df.columns[: -1]):
        for column2 in df.columns[col_num+1:]:
            if not (df[column1] - df[column2]).any():
                equal_columns.append(column2)
    return list(set(equal_columns))

In [4]:
def make_feature_engineering(df):
    df.loc[df["var3"] == df["var3"].min(), "var3"] = df["var3"].mode()
    
    df["var38mc"] = np.isclose(df["var38"], df["var38"].mode())
    df["logvar38"] = df.loc[~df["var38mc"], "var38"].map(np.log)
    df["logvar38"].fillna(0, inplace=True)
    df.drop(["var38"], axis=1, inplace=True)

In [5]:
RANDOM_STATE = 777
random.seed(RANDOM_STATE)

In [6]:
train_df = pd.read_csv(os.path.join("input", "train.csv"))
train_y = train_df["TARGET"]
train_df.drop(["ID", "TARGET"], inplace=True, axis=1)
train_df["n0"] = train_df.apply(lambda x: sum(x == 0), axis=1)

In [7]:
constant_columns = find_constant_columns(train_df)
equal_columns = find_equal_columns(train_df)
drop_columns = list(set(constant_columns).union(equal_columns))

In [8]:
train_x = train_df.drop(drop_columns, axis=1)

In [9]:
make_feature_engineering(train_x)

In [None]:
#scaler = RobustScaler()
#scaled_train_x = scaler.fit_transform(train_x)
#pca = PCA(2)
#pca.fit(scaled_train_x)
#train_pca_df = pd.DataFrame([pca.transform(scaled_train_x)], columns=["pca1", "pca2"])
#train_x = pd.concat([train_x, train_pca_df], axis=1)

In [None]:
#if_mask = common.get_independent_features_mask(train_x.values, tol=5e-7, is_plot=False) # 5e-7
#uncorr_mask = common.find_corr_features_mask(train_x.values, trashhold=0.995) # 0.995
#res_mask = if_mask & uncorr_mask
#if_mask.sum(), uncorr_mask.sum(), res_mask.sum()

In [None]:
#train_x_if = train_x.loc[:, if_mask]
#train_x_uc = train_x.loc[:, uncorr_mask]

In [10]:
cv = StratifiedKFold(train_y, 5, shuffle=True, random_state=RANDOM_STATE)

### Создание классификаторов

In [None]:
xgbc = xgb.XGBClassifier(n_estimators=560, max_depth=5, seed=1234, colsample_bytree=0.7, learning_rate=0.02,
                         subsample=0.68)
xgbc2 = xgb.XGBClassifier(n_estimators=560, max_depth=5, seed=1, colsample_bytree=0.7, learning_rate=0.02,
                         subsample=0.68)
xgbc3 = xgb.XGBClassifier(n_estimators=560, max_depth=5, seed=2, colsample_bytree=0.7, learning_rate=0.02,
                         subsample=0.68)
hyperopt_params = {'max_depth': 6, 'reg_lambda': 0.3014164001762216, 'gamma': 0.0, 'reg_alpha': 1.5182701064783564,
                   'colsample_bytree': 0.5129337218504479, 'min_child_weight': 4.09637273431316, 'n_estimators': 783,
                   'learning_rate': 0.011831660694716906, 'subsample': 0.549934788902083, "seed": 3}
xgbc_h =  xgb.XGBClassifier(**hyperopt_params)

In [None]:
r_params = {"n_estimators": 560, "max_depth": 5, "seed": 1234, "colsample_bytree": 0.701, "learning_rate": 0.0202048,
            "subsample": 0.6815}
xgbc_r = xgb.XGBClassifier(**r_params)

xgbc_r_list = []
for i_seed in range(1, 11):
    i_seed_params = copy.deepcopy(r_params)
    i_seed_params["seed"] = i_seed
    xgbc_r_list.append(xgb.XGBClassifier(**i_seed_params))
    
xgbc_h_list = []
for i_seed in range(2, 7):
    i_seed_params = copy.deepcopy(hyperopt_params)
    i_seed_params["seed"] = i_seed
    xgbc_h_list.append(xgb.XGBClassifier(**i_seed_params))    

In [None]:
is_weighted = [False] * len(xgbc_h_list)
x_mask = [None] * len(xgbc_h_list)
bc = averagers.BlendingClassifier(xgbc_h_list, is_weighted, x_mask)

### Grid search

In [None]:
params = {"n_estimators": [570], "max_depth": [5], "seed": [1234], "learning_rate": [0.02], "subsample": [0.68],
          "colsample_bytree": [0.7]}
gs = GridSearchCV(xgbc, params, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, refit=False, verbose=10)
gs.fit(train_x, train_y)

### Веса

In [None]:
w0 = 1
w1 = train_y.value_counts()[0] / train_y.value_counts()[1] * 0.5
weights = np.array(list((map(lambda x: w1 if x else w0, train_y))))

### Обучение классификаторов

In [None]:
xgbc.fit(train_x.values, train_y.values)

In [None]:
bc.fit(train_x.values, train_y.values, weights)

In [None]:
xgbc_r.fit(train_x.values, train_y.values, eval_metric="auc")

In [None]:
xgbc_h.fit(train_x.values, train_y.values, eval_metric="auc")

### Выходная оценка

In [None]:
test_df = pd.read_csv(os.path.join("input", "test.csv"))
test_id = test_df["ID"].values
test_df.drop(["ID"], inplace=True, axis=1)

test_df["n0"] = test_df.apply(lambda x: sum(x == 0), axis=1)
test_x = test_df.drop(drop_columns, axis=1)

In [None]:
make_feature_engineering(test_x)

In [None]:
#scaled_test_x = scaler.transform(test_x)
#test_pca_df = pd.DataFrame(pca.transform(scaled_test_x), columns=["pca1", "pca2", "pca3"])
#test_x = pd.concat([test_x, test_pca_df], axis=1)

In [None]:
#test_y = xgbc_r.predict_proba(test_x.values)[:, 1]
#test_y = xgbc_h.predict_proba(test_x.values)[:, 1]
test_y = bc.predict_proba(test_x.values)[:, 1]

In [None]:
test_y[: 10]

In [None]:
common.save_output(np.array([test_id, test_y]).T, ["ID", "TARGET"])

### Тестирование классификаторов

In [None]:
cvs = cross_val_score(copy.deepcopy(xgbc_h), train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=1)
cvs, cvs.mean()

In [None]:
# as in r without intercept
# array([ 0.84362984,  0.83608526,  0.84830765,  0.84150639,  0.83651392]), 0.84120861257549406
# with 3 pca components
# array([ 0.84474928,  0.83518445,  0.84703559,  0.84057184,  0.83586845]), 0.84068191998743524
# 10 xgbc with different seeds
# array([ 0.84443436,  0.83580992,  0.84851709,  0.84200755,  0.83720884]), 0.84159554920533641
# with 2 pca components
# array([ 0.84360169,  0.83538863,  0.84814913,  0.8421763 ,  0.8365236 ]), 0.84116787178076746
# with var38mc and logvar38, without var38, cleaned var3
# array([ 0.84466562,  0.83450353,  0.84818809,  0.84334212,  0.83541293]), 0.84122245852502497
# after hyperopt with var38mc and logvar38, without var38, cleaned var3
# array([ 0.84644761,  0.83720953,  0.84990076,  0.84543031,  0.83589067]), 0.84297577420381453
# 10 xgbc with different seeds after hyperopt with var38mc and logvar38, without var38, cleaned var3
# array([ 0.84608496,  0.8374632 ,  0.85016673,  0.84540045,  0.83552255]), 0.84292757970569743

In [None]:
cvs = common.cross_val_score_with_weights(copy.deepcopy(bc), train_x.values, train_y.values, weights, "roc_auc", cv)
cvs, cvs.mean()

In [None]:
precision_recall_fscore_support(train_y.values, xgbc_r.predict(train_x.values))

## Hyperopt

In [None]:
def hyperopt_train_test(params):
    clf = xgb.XGBClassifier(**params)
    return cross_val_score(clf, train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=1).mean()

In [21]:
space = {
    "n_estimators": hyperopt.hp.choice("n_estimators", range(500, 800)),
    "max_depth": hyperopt.hp.choice("max_depth", range(4, 8)),
    "colsample_bytree": hyperopt.hp.uniform("colsample_bytree", 0.5, 0.9),
    "learning_rate": hyperopt.hp.uniform("learning_rate", 0.005, 0.02),
    "subsample": hyperopt.hp.uniform("subsample", 0.5, 0.9),
    "min_child_weight": hyperopt.hp.uniform("min_child_weight", 1.0, 5.0),
    "reg_lambda": hyperopt.hp.uniform("reg_lambda", 0.0, 0.5),
    "reg_alpha": hyperopt.hp.uniform("reg_alpha", 0.0, 2.0),
    "seed": hyperopt.hp.choice("seed", [RANDOM_STATE]),
}

In [None]:
count = 0
best = 0
best_params = {}
def f(params):
    global best, count, best_params
    acc = hyperopt_train_test(params)
    if acc > best:
        best = acc
        best_params = params
        print("new best - {0}, using - {1}".format(acc, params))
    
    print("iters - {0}, best_acc - {1}".format(count, best))
    count += 1        
    return {"loss": -acc, "status": hyperopt.STATUS_OK}

In [None]:
trials = hyperopt.Trials()
best = hyperopt.fmin(f, space, algo=hyperopt.tpe.suggest, max_evals=300, trials=trials)

In [17]:
hyperopt_tester = common.HyperoptTester(xgb.XGBClassifier, space, RANDOM_STATE, nf_test=4, nf_val=4)
scores = hyperopt_tester.optimize(train_x.values, train_y.values, "roc_auc", max_evals=100)
print(scores.mean(axis=0))

[Parallel(n_jobs=6)]: Done   2 out of   2 | elapsed:   29.1s finished


new best score - 0.839547415551626, best params - {'gamma': 0.0, 'min_child_weight': 3.6880165927588737, 'n_estimators': 471, 'max_depth': 5, 'learning_rate': 0.012373799498423075, 'reg_lambda': 0.10346921330960179, 'colsample_bytree': 0.5810213842368828, 'subsample': 0.6753993242909193, 'reg_alpha': 0.2242249504965924}, num test step - 0
iters - 0, num test step - 0


[Parallel(n_jobs=6)]: Done   2 out of   2 | elapsed:   37.0s finished


new best score - 0.8244822914566069, best params - {'gamma': 0.0, 'min_child_weight': 2.335721991195849, 'n_estimators': 408, 'max_depth': 8, 'learning_rate': 0.0058171277140996415, 'reg_lambda': 0.5467084761834082, 'colsample_bytree': 0.5788734447050233, 'subsample': 0.8629979461102661, 'reg_alpha': 0.3029726202909899}, num test step - 1
iters - 0, num test step - 1
num_cv_step - 0, num_test_step - 0
num_cv_step - 0, num_test_step - 1
num_cv_step - 1, num_test_step - 0
num_cv_step - 1, num_test_step - 1
[[ 0.84191389  0.83440525]
 [ 0.83824571  0.82672313]]


In [20]:
hyperopt_tester.opt_params

{0: [0.83954741555162604,
  {'colsample_bytree': 0.5810213842368828,
   'gamma': 0.0,
   'learning_rate': 0.012373799498423075,
   'max_depth': 5,
   'min_child_weight': 3.6880165927588737,
   'n_estimators': 471,
   'reg_alpha': 0.2242249504965924,
   'reg_lambda': 0.10346921330960179,
   'subsample': 0.6753993242909193}],
 1: [0.8244822914566069,
  {'colsample_bytree': 0.5788734447050233,
   'gamma': 0.0,
   'learning_rate': 0.0058171277140996415,
   'max_depth': 8,
   'min_child_weight': 2.335721991195849,
   'n_estimators': 408,
   'reg_alpha': 0.3029726202909899,
   'reg_lambda': 0.5467084761834082,
   'subsample': 0.8629979461102661}]}

### VW

In [None]:
input_train_file = os.path.join("temp", "scs.input_train")
vwutils.make_vw_input(train_x, input_train_file, y=train_y)

In [None]:
input_test_file = os.path.join("temp", "scs.input_test")
vwutils.make_vw_input(test_x, input_test_file)

### Тестирование кода

In [None]:
x = train_x.iloc[: 1000].values
y = train_y.iloc[: 1000].values
w = weights[: 1000]
x_t = test_x.iloc[: 1000].values

In [None]:
common.get_most_important_features(xgbc_r, train_x.columns.values)

## Feature engineering

In [None]:
df = copy.deepcopy(train_x)

In [None]:
df.loc[df["var3"] == df["var3"].min(), "var3"] = df["var3"].mode()

In [None]:
df["var38mc"] = np.isclose(df["var38"], df["var38"].mode())
df["logvar38"] = df.loc[~df["var38mc"], "var38"].map(np.log)
df["logvar38"].fillna(0, inplace=True)