In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import Lasso
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from xgboost import XGBClassifier
import xgboost as xgb

In [2]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    np.set_printoptions(precision=2)
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [3]:
def perform_grid_search(model, x_train, y_train, param_grid, kfold_splits=10, num_jobs = -1, 
                        verbose_num = 20, scoring_method = None):
    
    kfold = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=100)
    
    if scoring_method = None:
        grid_search = GridSearchCV(model, param_grid, n_jobs=num_jobs, cv=kfold, 
                                   verbose=verbose_num)        
    else:
        grid_search = GridSearchCV(model, param_grid, n_jobs=num_jobs, cv=kfold, 
                                   verbose=verbose_num, scoring=scoring_method)
        
    grid_result = grid_search.fit(x_train, y_train)
    return grid_result

In [4]:
def summarize_cv_results(grid_result):
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

In [6]:
train_df = pd.read_csv('train.csv')
class_names = train_df.target.unique()
train_x = train_df.drop(columns=['ID_code', 'target'])
train_y = train_df['target']
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3)
x_train_with_eval, x_eval, y_train_with_eval, y_eval = train_test_split(x_train, y_train, 
                                                                        test_size = 0.3)
print(x_train_with_eval.shape, x_eval.shape, x_test.shape, y_train_with_eval.shape, 
      y_eval.shape, y_test.shape)

dtrain = xgb.DMatrix(x_train_with_eval, label=y_train_with_eval)
deval = xgb.DMatrix(x_eval, label=y_eval)

(98000, 200) (42000, 200) (60000, 200) (98000,) (42000,) (60000,)


  if getattr(data, 'base', None) is not None and \


In [20]:
max_depth = np.arange(2,10,2)
num_round = np.arange(200,1000,200)
# max_depth = [1,2]
# num_round = np.arange(20,50,20)
param = {'eta': [0.01], 'verbosity': [2], 'objective': ['binary:logistic'], 
         'eval_metric': ['auc'], 'subsample':[0.8], 'alpha': [0.2], 'early_stopping_rounds': [20],
        'n_jobs'=8}
param_grid = dict()
param_grid['max_depth'] = max_depth
param_grid['n_estimators'] = num_round

In [22]:
xgb_grid_search = XGBClassifier(param)
grid_result = perform_grid_search(xgb_grid_search, x_train=x_train_with_eval,
                                  y_train=y_train_with_eval,param_grid=param_grid, num_jobs=8, 
                                  kfold_splits=10, verbose_num=20)
summarize_cv_results(grid_result)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed: 11.1min
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed: 11.2min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed: 21.9min
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed: 21.9min
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed: 31.3min
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed: 31.3min
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed: 31.3min
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed: 31.3min
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed: 31.3min
[Parallel(

[Parallel(n_jobs=8)]: Done 133 tasks      | elapsed: 583.9min
[Parallel(n_jobs=8)]: Done 134 tasks      | elapsed: 584.4min
[Parallel(n_jobs=8)]: Done 135 tasks      | elapsed: 585.4min
[Parallel(n_jobs=8)]: Done 136 tasks      | elapsed: 585.6min
[Parallel(n_jobs=8)]: Done 137 tasks      | elapsed: 587.6min
[Parallel(n_jobs=8)]: Done 138 tasks      | elapsed: 588.0min
[Parallel(n_jobs=8)]: Done 139 tasks      | elapsed: 614.5min
[Parallel(n_jobs=8)]: Done 140 tasks      | elapsed: 614.7min
[Parallel(n_jobs=8)]: Done 141 tasks      | elapsed: 663.4min
[Parallel(n_jobs=8)]: Done 142 tasks      | elapsed: 664.0min
[Parallel(n_jobs=8)]: Done 143 tasks      | elapsed: 664.7min
[Parallel(n_jobs=8)]: Done 144 tasks      | elapsed: 665.1min
[Parallel(n_jobs=8)]: Done 145 tasks      | elapsed: 667.2min
[Parallel(n_jobs=8)]: Done 154 out of 160 | elapsed: 770.2min remaining: 30.0min
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed: 831.0min finished


Best: 0.885259 using {'max_depth': 4, 'n_estimators': 800}
0.846102 (0.006551) with: {'max_depth': 2, 'n_estimators': 200}
0.870627 (0.006504) with: {'max_depth': 2, 'n_estimators': 400}
0.880090 (0.006756) with: {'max_depth': 2, 'n_estimators': 600}
0.884976 (0.006440) with: {'max_depth': 2, 'n_estimators': 800}
0.864568 (0.005203) with: {'max_depth': 4, 'n_estimators': 200}
0.879724 (0.005579) with: {'max_depth': 4, 'n_estimators': 400}
0.884301 (0.005342) with: {'max_depth': 4, 'n_estimators': 600}
0.885259 (0.005710) with: {'max_depth': 4, 'n_estimators': 800}
0.866626 (0.006160) with: {'max_depth': 6, 'n_estimators': 200}
0.875630 (0.006489) with: {'max_depth': 6, 'n_estimators': 400}
0.876784 (0.006512) with: {'max_depth': 6, 'n_estimators': 600}
0.877232 (0.006282) with: {'max_depth': 6, 'n_estimators': 800}
0.859542 (0.005467) with: {'max_depth': 8, 'n_estimators': 200}
0.866821 (0.005464) with: {'max_depth': 8, 'n_estimators': 400}
0.870662 (0.004821) with: {'max_depth': 8, 'n

In [23]:
max_depth = np.arange(4,10,2)
num_round = np.arange(800,10000,2000)
# max_depth = [1,2]
# num_round = np.arange(20,50,20)
param = {'eta': [0.01], 'verbosity': [2], 'objective': ['binary:logistic'], 
         'eval_metric': ['auc'], 'subsample':[0.8], 'alpha': [0.2], 'early_stopping_rounds': [20],
        'n_jobs':[8]}
param_grid = dict()
param_grid['max_depth'] = max_depth
param_grid['n_estimators'] = num_round

xgb_grid_search = XGBClassifier(param)
grid_result = perform_grid_search(xgb_grid_search, x_train=x_train_with_eval,
                                  y_train=y_train_with_eval,param_grid=param_grid, num_jobs=8, 
                                  kfold_splits=5, verbose_num=20)
summarize_cv_results(grid_result)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed: 56.7min
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed: 56.7min
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed: 57.0min
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed: 57.0min
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed: 57.2min
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed: 167.5min
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed: 167.6min
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed: 167.7min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed: 212.1min
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed: 213.2min
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed: 322.8min
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed: 323.3min
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed: 323.9min
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed: 430.7min
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed: 431.2min


Traceback (most recent call last):
  File "/Users/abhishek/My Project/santander-transaction/venv/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py", line 833, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/Users/abhishek/My Project/santander-transaction/venv/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 521, in wrap_future_result
    return future.result(timeout=timeout)
  File "/usr/local/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/concurrent/futures/_base.py", line 427, in result
    self._condition.wait(timeout)
  File "/usr/local/Cellar/python/3.7.1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py", line 296, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/abhishek/My Project/santander-transaction/venv/lib/python3.7/site-packages/IPython

TypeError: can only concatenate str (not "list") to str

In [24]:
summarize_cv_results(grid_result)

Best: 0.885259 using {'max_depth': 4, 'n_estimators': 800}
0.846102 (0.006551) with: {'max_depth': 2, 'n_estimators': 200}
0.870627 (0.006504) with: {'max_depth': 2, 'n_estimators': 400}
0.880090 (0.006756) with: {'max_depth': 2, 'n_estimators': 600}
0.884976 (0.006440) with: {'max_depth': 2, 'n_estimators': 800}
0.864568 (0.005203) with: {'max_depth': 4, 'n_estimators': 200}
0.879724 (0.005579) with: {'max_depth': 4, 'n_estimators': 400}
0.884301 (0.005342) with: {'max_depth': 4, 'n_estimators': 600}
0.885259 (0.005710) with: {'max_depth': 4, 'n_estimators': 800}
0.866626 (0.006160) with: {'max_depth': 6, 'n_estimators': 200}
0.875630 (0.006489) with: {'max_depth': 6, 'n_estimators': 400}
0.876784 (0.006512) with: {'max_depth': 6, 'n_estimators': 600}
0.877232 (0.006282) with: {'max_depth': 6, 'n_estimators': 800}
0.859542 (0.005467) with: {'max_depth': 8, 'n_estimators': 200}
0.866821 (0.005464) with: {'max_depth': 8, 'n_estimators': 400}
0.870662 (0.004821) with: {'max_depth': 8, 'n

In [26]:
import pickle

In [27]:
filename = 'finalized_model.modelcv'
pickle.dump(grid_result, open(filename, 'wb'))