# Santander Customer Satisfaction

## Планы

* VW на Ubuntu
* Hyperopt для xgboost

## Решение

### Общее начало

In [1]:
%matplotlib inline

import os
import copy
import random
import pickle
from importlib import reload

import numpy as np
import scipy as sp
import pandas as pd
import pylab

from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import StratifiedKFold, cross_val_score

import xgboost as xgb

import ml_training.common as common
import ml_training.vwutils as vwutils
import sklensemble.ensemblers.averagers as averagers

In [None]:
# Re-execute ml_training.common so that edits to the module on disk take effect in this session.
reload(common)

In [35]:
# Re-execute ml_training.vwutils so that edits to the module on disk take effect in this session.
reload(vwutils)

<module 'ml_training.vwutils' from 'E:\\Documents\\Projects\\ml_training\\vwutils.py'>

In [None]:
# Re-execute sklensemble.ensemblers.averagers so that edits to the module on disk take effect.
reload(averagers)

In [2]:
def find_constant_columns(df):
    """Return the labels of the columns of ``df`` holding fewer than two distinct values.

    A column qualifies when ``Series.unique()`` yields zero or one entry, so a
    column without any rows is reported as well. Labels keep their column order.
    """
    constant = []
    for label in df.columns:
        distinct_values = df[label].unique()
        if len(distinct_values) < 2:
            constant.append(label)
    return constant

In [3]:
def find_equal_columns(df):
    """Return the labels of columns that duplicate an earlier column of ``df``.

    Two columns are considered equal when every row holds equal values, or is
    missing (NaN/None) in both columns. Values are compared with ``==`` rather
    than by subtraction, so:

    * a value facing a missing entry counts as a difference (a subtraction
      would yield NaN, which ``Series.any()`` skips);
    * non-numeric columns (strings, booleans) can be compared without raising;
    * integer and float columns holding the same numbers still match.

    For every group of identical columns the first one is kept and the rest
    are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.

    Returns
    -------
    list
        Labels of the redundant columns, each listed once, in column order.
    """
    columns = list(df.columns)
    redundant = set()
    for position, column1 in enumerate(columns[:-1]):
        if column1 in redundant:
            # Anything equal to column1 already matched the column it duplicates.
            continue
        left = df[column1]
        left_missing = left.isnull()
        for column2 in columns[position + 1:]:
            if column2 in redundant:
                continue
            right = df[column2]
            same = left.eq(right) | (left_missing & right.isnull())
            if same.all():
                redundant.add(column2)
    # Report in column order so the result is deterministic across runs.
    return [column for column in columns if column in redundant]

In [4]:
# Seed shared across the notebook (it is also passed to the CV splitter below).
RANDOM_STATE = 100
# Seeds only Python's built-in `random` module; NumPy's global generator is not seeded here.
random.seed(RANDOM_STATE)

In [5]:
# Load the training set from the `input` directory.
train_df = pd.read_csv(os.path.join("input", "train.csv"))

In [6]:
# Collect uninformative columns: those with a single value and those duplicating another column.
constant_columns = find_constant_columns(train_df)
equal_columns = find_equal_columns(train_df)
# Union of both lists, de-duplicated through a set (so the order is arbitrary).
drop_columns = list(set(constant_columns).union(equal_columns))

In [7]:
# Remove the redundant columns from the training frame in place (axis=1 selects columns).
train_df.drop(drop_columns, inplace=True, axis=1)

In [8]:
# Features: label-based column slice from "var3" through "var38" (`.loc` slices include both ends).
train_x = train_df.loc[:, "var3": "var38"]
# Target: the TARGET column.
train_y = train_df["TARGET"]

In [9]:
# Build two column masks with the project helpers.
# NOTE(review): presumably if_mask keeps (numerically) independent features and uncorr_mask drops
# features correlated above the threshold — confirm against ml_training.common.
# `trashhold` is the helper's own keyword spelling and must be kept as is.
if_mask = common.get_independent_features_mask(train_x.values, tol=5e-7, is_plot=False) # 5e-7
uncorr_mask = common.find_corr_features_mask(train_x.values, trashhold=0.995) # 0.995
# Columns selected by both masks.
res_mask = if_mask & uncorr_mask
# Displayed as the cell output: how many columns each mask selects.
if_mask.sum(), uncorr_mask.sum(), res_mask.sum()

(256, 255, 222)

In [10]:
# Feature subsets selected by each mask (assumes the masks are boolean arrays over train_x's columns).
train_x_if = train_x.loc[:, if_mask]
train_x_uc = train_x.loc[:, uncorr_mask]

In [11]:
# 5-fold stratified splitter, shuffled with the shared seed. This is the legacy
# sklearn.cross_validation signature, where the labels are passed to the constructor.
cv = StratifiedKFold(train_y, 5, shuffle=True, random_state=RANDOM_STATE)

### Создание классификаторов

In [None]:
# Baseline gradient-boosted classifier; it is later fitted on the full feature set.
xgbc = xgb.XGBClassifier(
    n_estimators=570,
    max_depth=5,
    learning_rate=0.02,
    subsample=0.68,
    colsample_bytree=0.7,
    seed=1234,
)

In [None]:
# Same hyper-parameters as the baseline; the difference is that it is fitted with sample weights.
xgbc_weighted = xgb.XGBClassifier(
    n_estimators=570,
    max_depth=5,
    learning_rate=0.02,
    subsample=0.68,
    colsample_bytree=0.7,
    seed=1234,
)

In [None]:
# Same hyper-parameters as the baseline; fitted on the columns selected by if_mask.
xgbc_if = xgb.XGBClassifier(
    n_estimators=570,
    max_depth=5,
    learning_rate=0.02,
    subsample=0.68,
    colsample_bytree=0.7,
    seed=1234,
)

In [None]:
# Same hyper-parameters as the baseline; fitted on the columns selected by uncorr_mask.
xgbc_uc = xgb.XGBClassifier(
    n_estimators=570,
    max_depth=5,
    learning_rate=0.02,
    subsample=0.68,
    colsample_bytree=0.7,
    seed=1234,
)

In [None]:
# Blend of the four XGBoost variants. Each member is a deep copy, so fitting the blend
# leaves the stand-alone models untouched.
estimators = [copy.deepcopy(model) for model in (xgbc, xgbc_weighted, xgbc_if, xgbc_uc)]
# Per-member settings, aligned by position with `estimators`.
# NOTE(review): presumably is_weighted says which members receive sample weights and x_mask
# which feature mask each member uses (None = all features) — confirm in averagers.
is_weighted = [False, True, False, False]
x_mask = [None, None, if_mask, uncorr_mask]
bc = averagers.BlendingClassifier(estimators, is_weighted, x_mask)

### Grid search

In [None]:
# Small grid; NOTE: it is overwritten by the assignment right below, so only the second
# `params` dict is actually searched.
params = {"n_estimators": [150], "max_depth": [3], "seed": [RANDOM_STATE]}
# Single-point grid matching the hyper-parameters of the classifiers defined above.
params = {"n_estimators": [570], "max_depth": [5], "seed": [1234], "learning_rate": [0.02], "subsample": [0.68],
          "colsample_bytree": [0.7]}
# ROC AUC over the shared CV folds; refit=False means no final model is trained on the full data.
gs = GridSearchCV(xgbc, params, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, refit=False, verbose=10)
gs.fit(train_x, train_y)

### Веса

In [None]:
# Per-sample weights: rows with a falsy target (class 0) get w0, all other rows get w1,
# which is half of the class-0 / class-1 count ratio.
class_counts = train_y.value_counts()
w0 = 1
w1 = class_counts[0] / class_counts[1] * 0.5
weights = np.where(train_y.values, w1, w0)

### Обучение классификаторов

In [None]:
# Fit the baseline on all features (plain NumPy arrays, unweighted).
xgbc.fit(train_x.values, train_y.values)

In [None]:
# Fit on all features, passing the per-sample weights as the third positional argument.
xgbc_weighted.fit(train_x.values, train_y.values, weights)

In [None]:
# Fit on the if_mask feature subset (unweighted).
xgbc_if.fit(train_x_if.values, train_y.values)

In [None]:
# Fit on the uncorr_mask feature subset (unweighted).
xgbc_uc.fit(train_x_uc.values, train_y.values)

In [None]:
# Fit the blend on the full feature matrix with the per-sample weights.
# NOTE(review): presumably the blend applies x_mask / is_weighted per member internally — confirm.
bc.fit(train_x.values, train_y.values, weights)

### Выходная оценка

In [24]:
# Load the test set and apply the same preprocessing as for training:
# drop the redundant columns, then take the "var3".."var38" feature slice.
test_df = pd.read_csv(os.path.join("input", "test.csv"))
test_df.drop(drop_columns, inplace=True, axis=1)
test_x = test_df.loc[:, "var3": "var38"]

In [None]:
# Positive-class probability for every test row.
# NOTE(review): this cell originally called `xgbc_very_hard`, a name that is not defined
# anywhere in the notebook, so a fresh run failed with NameError. The fitted baseline
# `xgbc` is used instead — swap in another fitted model (e.g. `bc`) if that was the intent.
# `.values` matches how the models are fitted and queried in the other cells.
test_y = xgbc.predict_proba(test_x.values)[:, 1]

In [None]:
# Write the submission as two columns, ID and TARGET (one row per test sample after the transpose).
# NOTE(review): stacking the IDs with float probabilities in one array presumably upcasts
# the IDs to float — confirm that common.save_output writes them back as integers.
common.save_output(np.array([test_df["ID"].values, test_y]).T, ["ID", "TARGET"])

### Тестирование классификаторов

In [None]:
# Cross-validated ROC AUC of the baseline over the shared folds; a deep copy is scored,
# so the fitted `xgbc` itself is left untouched.
cross_val_score(copy.deepcopy(xgbc), train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=1)

In [None]:
# Cross-validated ROC AUC of a deep copy of the baseline over the shared folds.
# NOTE(review): this call is identical to the one in the preceding cell — possibly another
# estimator or feature subset was meant to be scored here; confirm.
cross_val_score(copy.deepcopy(xgbc), train_x, train_y, scoring="roc_auc", cv=cv, n_jobs=common.NCPU, verbose=1)

In [None]:
# ROC AUC of a deep copy of the blend over the shared folds, using the project helper that
# also takes the per-sample weights.
# NOTE(review): presumably the weights are sliced per fold and used for fitting — confirm.
common.cross_val_score_with_weights(copy.deepcopy(bc), train_x.values, train_y.values, weights, "roc_auc", cv)

In [None]:
# Correlation matrix of the test-set positive-class probabilities, one row per model
# (order: xgbc, xgbc_weighted, xgbc_if, xgbc_uc, bc).
np.corrcoef(np.vstack([
    model.predict_proba(test_x.values)[:, 1]
    for model in (xgbc, xgbc_weighted, xgbc_if, xgbc_uc, bc)
]))

In [None]:
# Per-class precision, recall, F-score and support of the baseline, measured on the training data itself.
precision_recall_fscore_support(train_y.values, xgbc.predict(train_x.values))

In [None]:
# Per-class precision, recall, F-score and support of the weighted model, measured on the training data itself.
precision_recall_fscore_support(train_y.values, xgbc_weighted.predict(train_x.values))

In [None]:
# Per-class precision, recall, F-score and support of xgbc_if on the training data.
# NOTE(review): xgbc_if is fitted on train_x_if but queried here with the full train_x — confirm
# whether train_x_if.values was intended.
precision_recall_fscore_support(train_y.values, xgbc_if.predict(train_x.values))

In [None]:
# Per-class precision, recall, F-score and support of xgbc_uc on the training data.
# NOTE(review): xgbc_uc is fitted on train_x_uc but queried here with the full train_x — confirm
# whether train_x_uc.values was intended.
precision_recall_fscore_support(train_y.values, xgbc_uc.predict(train_x.values))

In [None]:
# Per-class precision, recall, F-score and support of the blend, measured on the training data itself.
precision_recall_fscore_support(train_y.values, bc.predict(train_x.values))

In [None]:
# Scatter plot comparing the two models' positive-class probabilities on the training data:
# baseline on the x axis, blend on the y axis.
pylab.figure(figsize=(16, 6))

pylab.grid(True)
pylab.scatter(xgbc.predict_proba(train_x.values)[:, 1], bc.predict_proba(train_x.values)[:, 1], alpha=0.5, color = 'red')

### VW

In [36]:
# Export the training features together with the labels through the project's VW helper.
# NOTE(review): presumably this writes a Vowpal Wabbit input file at the given path — confirm in vwutils.
input_train_file = os.path.join("temp", "scs.input_train")
vwutils.make_vw_input(train_x, input_train_file, y=train_y)

In [37]:
# Export the test features through the project's VW helper; no labels are passed for the test set.
# NOTE(review): presumably this writes a Vowpal Wabbit input file at the given path — confirm in vwutils.
input_test_file = os.path.join("temp", "scs.input_test")
vwutils.make_vw_input(test_x, input_test_file)

### Тестирование кода

In [None]:
# Small samples for quick experiments: the first 1000 rows of the training features, labels
# and weights, and the first 1000 rows of the test features, all as NumPy arrays.
x = train_x.iloc[: 1000].values
y = train_y.iloc[: 1000].values
w = weights[: 1000]
x_t = test_x.iloc[: 1000].values