In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier

from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
# Read the training CSV into the working frame `df` used by the cells below.
df = pd.read_csv(filepath_or_buffer='../inputs/train.csv')

In [4]:
# Columns to remove as unnecessary, in two groups. Judging by the list names:
# columns holding a single unique value, and columns dominated by one value.
cat_var_template = 'cat_var_%d'
single_uniq_value_cols = [cat_var_template % n for n in (31, 35, 36, 37, 38, 40, 42)]
single_majority_value_cols = [
    cat_var_template % n
    for n in (23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 39, 41)
]

In [6]:
# Remove both groups of unneeded columns from `df` in place.
# Note: this cell is not re-runnable as-is; a second execution raises KeyError
# because the columns are already gone.
df.drop(labels=single_uniq_value_cols, axis='columns', inplace=True)
df.drop(labels=single_majority_value_cols, axis='columns', inplace=True)

In [15]:
# Replace missing values with the sentinel string 'NA', but only in object
# (string) columns. Filling the whole frame would also write 'NA' into any
# numeric column that has gaps, turning it into an object column; the later
# dtype-based split would then treat it as categorical instead of numeric.
# Numeric columns are left untouched, so any missing values there stay NaN.
na_fill_values = {col_name: 'NA' for col_name in df.columns[df.dtypes == object]}
df.fillna(na_fill_values, inplace=True)

In [7]:
# Print the dtype of every column currently in `df`.
print(df.dtypes)

transaction_id     object
num_var_1         float64
num_var_2         float64
num_var_3         float64
num_var_4         float64
num_var_5         float64
num_var_6         float64
num_var_7         float64
cat_var_1          object
cat_var_2          object
cat_var_3          object
cat_var_4          object
cat_var_5          object
cat_var_6          object
cat_var_7          object
cat_var_8          object
cat_var_9          object
cat_var_10         object
cat_var_11         object
cat_var_12         object
cat_var_13         object
cat_var_14         object
cat_var_15         object
cat_var_16         object
cat_var_17         object
cat_var_18         object
cat_var_19          int64
cat_var_20          int64
cat_var_21          int64
cat_var_22          int64
target              int64
dtype: object


In [68]:
# For every column except the id, compare the distinct values found in
# df_submit with those found in df. Each printed row is:
#   column name, #unique in df_submit, #unique in df, #unique values shared by both.
# NOTE(review): df_submit is not created in any visible cell -- presumably the
# test/submission set; it must be loaded before this cell can run.
col_list = df_submit.columns.tolist()
col_list.remove('transaction_id')
for col in col_list:
    submit_uniques = pd.DataFrame(df_submit[col].unique(), columns=[col])
    train_uniques = pd.DataFrame(df[col].unique(), columns=[col])
    # An inner join on the column keeps only the values present in both frames.
    common_len = len(submit_uniques.merge(train_uniques, how='inner', on=col))
    print('%10s, %6d, %6d, %6d' % (col, len(submit_uniques), len(train_uniques), common_len))

 num_var_1,  15184,  13385,  11273
 num_var_2,   5809,   5550,   5309
 num_var_3,     11,      8,      5
 num_var_4,   1061,   1006,    956
 num_var_5,   5321,   4622,   3737
 num_var_6,  13539,  11827,   9839
 num_var_7,  33188,  26213,  15224
 cat_var_1,    535,    535,    531
 cat_var_2,     62,     60,     59
 cat_var_3,    618,    617,    613
 cat_var_4,      2,      2,      2
 cat_var_5,      2,      2,      2
 cat_var_6,    516,    518,    515
 cat_var_7,     22,     20,     19
 cat_var_8,    464,    463,    460
 cat_var_9,      5,      5,      5
cat_var_10,     23,     23,     23
cat_var_11,      5,      5,      5
cat_var_12,      5,      5,      5
cat_var_13,     52,     52,     52
cat_var_14,     12,     12,     12
cat_var_15,      2,      2,      2
cat_var_16,      2,      2,      2
cat_var_17,      2,      2,      2
cat_var_18,      2,      2,      2
cat_var_19,      2,      2,      2
cat_var_20,      2,      2,      2
cat_var_21,      2,      2,      2
cat_var_22,      2, 

In [13]:
# Split the columns of `df` by dtype:
#   - int64/float64 columns are used as-is (minus the 'target' label),
#   - every other column is to be label-encoded (minus the 'transaction_id' key).
numeric_mask = df.dtypes.isin([np.int64, np.float64]).values

input_cols_xgb_encode = df.columns[~numeric_mask].tolist()
input_cols_xgb_encode.remove('transaction_id')

input_cols_xgb_numeric = df.columns[numeric_mask].tolist()
input_cols_xgb_numeric.remove('target')

target_cols_xgb = ['target']

In [22]:
# Build the feature matrix with the columns to encode first, then the numeric
# ones, so column i of X_xgb corresponds to input_cols_xgb_encode[i] for the
# first len(input_cols_xgb_encode) columns.
# `.values` replaces DataFrame.as_matrix(), which is deprecated and no longer
# exists in pandas >= 1.0.
X_xgb = df[input_cols_xgb_encode + input_cols_xgb_numeric].values

# One LabelEncoder per encoded column, kept in `label_enc` in the same order as
# input_cols_xgb_encode so the same mapping can be reused later.
label_enc = [LabelEncoder() for _ in input_cols_xgb_encode]

for i, encoder in enumerate(label_enc):
    print(input_cols_xgb_encode[i])
    # Fit on the column and overwrite it with its integer codes.
    X_xgb[:, i] = encoder.fit_transform(X_xgb[:, i])

# Every column is numeric now; cast away the object dtype inherited from the
# mixed-type frame so the estimator receives a plain float matrix.
X_xgb = X_xgb.astype(np.float64)

y_xgb = df['target']

cat_var_1
cat_var_2
cat_var_3
cat_var_4
cat_var_5
cat_var_6
cat_var_7
cat_var_8
cat_var_9
cat_var_10
cat_var_11
cat_var_12
cat_var_13
cat_var_14
cat_var_15
cat_var_16
cat_var_17
cat_var_18


In [31]:
# Hold out 40% of the rows, stratified on the target; the searches below fit on
# the remaining 60% only, because training on everything takes too long.
split_parts = train_test_split(
    X_xgb,
    y_xgb,
    test_size=0.4,
    stratify=y_xgb,
    random_state=42,
)
X_xgb_train, X_xgb_test, y_xgb_train, y_xgb_test = split_parts

In [32]:
# Search for the best number of trees at a fixed learning rate of 0.3,
# scored by 4-fold cross-validated ROC AUC on the training split.
parameters = dict(n_estimators=[10, 50, 100])
xgb_clf = XGBClassifier(learning_rate=0.3, n_jobs=-1, random_state=42)
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters, scoring='roc_auc', cv=4)
clf.fit(X_xgb_train, y_xgb_train)
print(clf.best_params_)

{'n_estimators': 100}


In [33]:
# Tabulate the cross-validation results held by `clf`: one row per candidate
# n_estimators value, with fit/score timings and per-split scores.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_estimators,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,1.71915,0.092863,0.718395,0.719305,10,{'n_estimators': 10},3,0.719194,0.719794,0.719752,0.718115,0.715884,0.719682,0.71875,0.719631,0.015807,0.0012,0.001492,0.00069
1,6.663242,0.13658,0.724462,0.733839,50,{'n_estimators': 50},2,0.726164,0.732198,0.726878,0.733261,0.721304,0.73478,0.723503,0.735118,0.240348,0.002321,0.002215,0.001178
2,15.405108,0.266954,0.724937,0.745599,100,{'n_estimators': 100},1,0.728943,0.743185,0.727138,0.745282,0.722644,0.746589,0.721025,0.747341,1.835164,0.06944,0.003219,0.001576


In [34]:
# Tune tree depth and minimum child weight jointly, with the learning rate
# (0.3) and number of trees (100) held fixed.
# Scored by 4-fold cross-validated ROC AUC on the training split.
parameters = {
    'max_depth': range(3, 10, 2),        # 3, 5, 7, 9
    'min_child_weight': range(1, 6, 2),  # 1, 3, 5
}
# n_jobs=-1 was missing here although every other search cell in this notebook
# passes it; added so this search is configured consistently.
xgb_clf = XGBClassifier(random_state=42, learning_rate=0.3, n_estimators=100,
                        n_jobs=-1)
clf = GridSearchCV(xgb_clf, parameters, cv=4, scoring='roc_auc')
clf.fit(X_xgb_train, y_xgb_train)
print(clf.best_params_)

{'max_depth': 7, 'min_child_weight': 1}


In [35]:
# Tabulate the cross-validation results held by `clf`: one row per
# (max_depth, min_child_weight) combination.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_min_child_weight,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,20.359841,0.31162,0.724937,0.745599,3,1,"{'max_depth': 3, 'min_child_weight': 1}",6,0.728943,0.743185,0.727138,0.745282,0.722644,0.746589,0.721025,0.747341,1.73445,0.010527,0.003219,0.001576
1,12.916319,0.196086,0.725855,0.746815,3,3,"{'max_depth': 3, 'min_child_weight': 3}",2,0.730507,0.745117,0.728371,0.7473,0.724328,0.747624,0.720214,0.74722,0.874035,0.008006,0.003941,0.000992
2,12.357609,0.194765,0.724889,0.74723,3,5,"{'max_depth': 3, 'min_child_weight': 5}",7,0.728444,0.745653,0.727379,0.746878,0.72238,0.748244,0.721352,0.748146,0.06492,0.003446,0.003067,0.001058
3,21.994885,0.330006,0.725286,0.797818,5,1,"{'max_depth': 5, 'min_child_weight': 1}",5,0.730341,0.79313,0.725124,0.798998,0.72458,0.802828,0.721101,0.796317,1.450363,0.032703,0.003301,0.003561
4,23.070078,0.316349,0.725624,0.796825,5,3,"{'max_depth': 5, 'min_child_weight': 3}",4,0.729511,0.793546,0.726839,0.792004,0.723877,0.802254,0.722269,0.799495,0.348643,0.008378,0.002779,0.004201
5,23.229631,0.324401,0.725851,0.794581,5,5,"{'max_depth': 5, 'min_child_weight': 5}",3,0.73088,0.792719,0.727261,0.789829,0.7242,0.797038,0.721062,0.798737,0.272008,0.02073,0.003638,0.003513
6,32.519854,0.465707,0.726367,0.862441,7,1,"{'max_depth': 7, 'min_child_weight': 1}",1,0.731673,0.860613,0.726544,0.865417,0.722059,0.860335,0.725192,0.863398,0.296082,0.024179,0.003469,0.002095
7,32.247919,0.452995,0.724829,0.851764,7,3,"{'max_depth': 7, 'min_child_weight': 3}",8,0.728593,0.851682,0.724043,0.853082,0.722027,0.850947,0.724655,0.851346,0.428996,0.026462,0.00238,0.000804
8,31.747664,0.411032,0.724326,0.84709,7,5,"{'max_depth': 7, 'min_child_weight': 5}",9,0.730793,0.845526,0.7248,0.849775,0.723891,0.845018,0.717819,0.848041,0.315663,0.008574,0.004598,0.001927
9,35.822519,0.501242,0.724222,0.917474,9,1,"{'max_depth': 9, 'min_child_weight': 1}",10,0.728286,0.917099,0.724349,0.920809,0.720412,0.918904,0.72384,0.913083,0.326213,0.010233,0.002793,0.002854


In [39]:
# Tune gamma over 0.0 .. 0.5 in steps of 0.1, keeping the previously chosen
# settings (max_depth=7, min_child_weight=1) fixed.
parameters = {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]}
fixed_params = dict(
    random_state=42,
    learning_rate=0.3,
    n_estimators=100,
    max_depth=7,
    min_child_weight=1,
    n_jobs=-1,
)
xgb_clf = XGBClassifier(**fixed_params)
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters, scoring='roc_auc', cv=4)
clf.fit(X_xgb_train, y_xgb_train)
print(clf.best_params_)

{'gamma': 0.0}


In [40]:
# Tabulate the cross-validation results held by `clf`: one row per gamma value.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,32.861532,0.481146,0.726367,0.862441,0.0,{'gamma': 0.0},1,0.731673,0.860613,0.726544,0.865417,0.722059,0.860335,0.725192,0.863398,1.720997,0.085661,0.003469,0.002095
1,31.80355,0.422498,0.72534,0.861758,0.1,{'gamma': 0.1},3,0.729765,0.860676,0.725719,0.862302,0.72284,0.86624,0.723035,0.857815,1.138195,0.029852,0.002797,0.003046
2,33.128219,0.414794,0.724744,0.861673,0.2,{'gamma': 0.2},4,0.729036,0.860953,0.726862,0.866847,0.721179,0.862586,0.721898,0.856308,1.320895,0.013766,0.003306,0.003772
3,32.789602,0.456254,0.724025,0.858571,0.3,{'gamma': 0.3},6,0.731211,0.860024,0.723478,0.856584,0.719924,0.858772,0.721486,0.858904,2.542959,0.069028,0.004336,0.001246
4,32.959458,0.425751,0.724383,0.861805,0.4,{'gamma': 0.4},5,0.730299,0.864796,0.722398,0.858763,0.720548,0.863083,0.724287,0.860576,2.068837,0.019188,0.003663,0.00231
5,34.217243,0.451944,0.725399,0.861777,0.5,{'gamma': 0.5},2,0.727275,0.860646,0.726453,0.858949,0.723917,0.863699,0.723952,0.863814,2.687178,0.042169,0.001493,0.002069


In [41]:
# Tune the row (subsample) and column (colsample_bytree) sampling fractions
# over 0.6 .. 1.0 in steps of 0.1, with the earlier choices held fixed.
sampling_fractions = [0.6, 0.7, 0.8, 0.9, 1.0]
parameters = {
    'subsample': list(sampling_fractions),
    'colsample_bytree': list(sampling_fractions),
}
fixed_params = dict(
    random_state=42,
    learning_rate=0.3,
    n_estimators=100,
    max_depth=7,
    min_child_weight=1,
    n_jobs=-1,
    gamma=0,
)
xgb_clf = XGBClassifier(**fixed_params)
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters, scoring='roc_auc', cv=4)
clf.fit(X_xgb_train, y_xgb_train)
print(clf.best_params_)

{'colsample_bytree': 0.8, 'subsample': 0.9}


In [42]:
# Tabulate the cross-validation results held by `clf`: one row per
# (colsample_bytree, subsample) combination.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_colsample_bytree,param_subsample,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,29.907971,0.46841,0.724413,0.85065,0.6,0.6,"{'colsample_bytree': 0.6, 'subsample': 0.6}",15,0.728044,0.84852,0.723493,0.853176,0.723247,0.849355,0.722868,0.851548,1.194023,0.096704,0.002108,0.00183
1,27.605926,0.446301,0.725267,0.85755,0.6,0.7,"{'colsample_bytree': 0.6, 'subsample': 0.7}",7,0.728432,0.85734,0.724025,0.859603,0.726137,0.856198,0.722476,0.857057,1.423763,0.033147,0.002242,0.001258
2,27.228731,0.505184,0.725038,0.85675,0.6,0.8,"{'colsample_bytree': 0.6, 'subsample': 0.8}",8,0.730111,0.850516,0.725108,0.859639,0.722847,0.858763,0.722085,0.858084,1.251284,0.063795,0.003133,0.003642
3,22.532653,0.431327,0.724171,0.858634,0.6,0.9,"{'colsample_bytree': 0.6, 'subsample': 0.9}",17,0.725092,0.856241,0.726358,0.859014,0.721512,0.858922,0.723724,0.860359,1.544426,0.015179,0.001796,0.001494
4,18.501383,0.393119,0.7262,0.847904,0.6,1.0,"{'colsample_bytree': 0.6, 'subsample': 1.0}",3,0.732711,0.84517,0.727408,0.851675,0.722813,0.851245,0.721868,0.843527,0.220653,0.005594,0.004304,0.003606
5,31.388328,0.444355,0.723183,0.854196,0.7,0.6,"{'colsample_bytree': 0.7, 'subsample': 0.6}",21,0.726743,0.850237,0.724678,0.856413,0.718104,0.854424,0.723206,0.85571,0.350804,0.018858,0.00319,0.002395
6,31.746616,0.482816,0.724641,0.86036,0.7,0.7,"{'colsample_bytree': 0.7, 'subsample': 0.7}",11,0.730176,0.858808,0.723253,0.864128,0.720318,0.86129,0.724816,0.857216,1.761759,0.039023,0.00358,0.002615
7,29.729116,0.465885,0.724464,0.859971,0.7,0.8,"{'colsample_bytree': 0.7, 'subsample': 0.8}",13,0.727309,0.857096,0.724803,0.861125,0.724414,0.86289,0.721331,0.858772,0.262298,0.009668,0.002123,0.002211
8,27.150066,0.456159,0.725674,0.859126,0.7,0.9,"{'colsample_bytree': 0.7, 'subsample': 0.9}",5,0.72845,0.860483,0.723956,0.85151,0.721404,0.862825,0.728885,0.861686,0.162329,0.024495,0.003131,0.004475
9,25.283763,0.442084,0.725945,0.855768,0.7,1.0,"{'colsample_bytree': 0.7, 'subsample': 1.0}",4,0.728624,0.854176,0.728416,0.857433,0.724867,0.854914,0.721875,0.856548,1.358577,0.025205,0.002784,0.001289


In [43]:
# Tune the reg_alpha regularization term over several orders of magnitude,
# with all previously selected settings held fixed.
parameters = {'reg_alpha': [0, 1e-5, 0.01, 0.1, 1, 100]}
fixed_params = dict(
    random_state=42,
    learning_rate=0.3,
    n_estimators=100,
    max_depth=7,
    min_child_weight=1,
    n_jobs=-1,
    gamma=0,
    colsample_bytree=0.8,
    subsample=0.9,
)
xgb_clf = XGBClassifier(**fixed_params)
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters, scoring='roc_auc', cv=4)
clf.fit(X_xgb_train, y_xgb_train)
print(clf.best_params_)

{'reg_alpha': 1e-05}


In [44]:
# Tabulate the cross-validation results held by `clf`: one row per reg_alpha value.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_reg_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,30.273136,0.420562,0.726831,0.866668,0.0,{'reg_alpha': 0},2,0.730161,0.862143,0.72832,0.870823,0.72437,0.86982,0.724471,0.863887,1.728407,0.017361,0.002496,0.003722
1,26.444616,0.403242,0.726831,0.866668,1e-05,{'reg_alpha': 1e-05},1,0.730161,0.862143,0.72832,0.870823,0.72437,0.86982,0.724471,0.863887,0.686379,0.011953,0.002496,0.003722
2,31.927022,0.523259,0.725391,0.866722,0.01,{'reg_alpha': 0.01},5,0.728916,0.866058,0.724583,0.867367,0.721762,0.866272,0.726301,0.867192,1.438631,0.126639,0.002602,0.000566
3,30.811125,0.438099,0.724738,0.867869,0.1,{'reg_alpha': 0.1},6,0.726777,0.868215,0.727051,0.867923,0.720369,0.870621,0.724755,0.864715,0.429789,0.00412,0.002674,0.0021
4,29.520211,0.419725,0.725792,0.871148,1.0,{'reg_alpha': 1},3,0.730175,0.868839,0.725824,0.874972,0.721705,0.871689,0.725466,0.869091,2.040765,0.017339,0.003001,0.002474
5,19.075361,0.253344,0.72541,0.741087,100.0,{'reg_alpha': 100},4,0.730635,0.739353,0.724072,0.741196,0.722776,0.741929,0.724157,0.741869,0.469993,0.005923,0.003066,0.001042


In [47]:
# Finally tune the learning rate, using more trees (500) than in the earlier
# searches and every other setting fixed at its selected value.
parameters = {'learning_rate': [0.001, 0.01, 0.05, 0.1]}
fixed_params = dict(
    random_state=42,
    n_estimators=500,
    max_depth=7,
    min_child_weight=1,
    n_jobs=-1,
    gamma=0,
    colsample_bytree=0.8,
    subsample=0.9,
    reg_alpha=1e-5,
)
xgb_clf = XGBClassifier(**fixed_params)
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters, scoring='roc_auc', cv=4)
clf.fit(X_xgb_train, y_xgb_train)
print(clf.best_params_)

{'learning_rate': 0.01}


In [49]:
# Tabulate the cross-validation results held by `clf`: one row per learning rate.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,155.233548,1.697775,0.724945,0.730845,0.001,{'learning_rate': 0.001},4,0.725074,0.729771,0.729469,0.729398,0.722801,0.732032,0.722437,0.73218,10.832112,0.115385,0.002801,0.001269
1,162.736508,2.081786,0.727185,0.764619,0.01,{'learning_rate': 0.01},1,0.73001,0.765308,0.728902,0.762186,0.724793,0.764514,0.725036,0.766466,10.988058,0.173284,0.002306,0.001567
2,139.030479,2.14573,0.726918,0.880714,0.05,{'learning_rate': 0.05},2,0.732961,0.880277,0.726596,0.878782,0.723646,0.88368,0.724468,0.880118,11.17666,0.180216,0.003651,0.001808
3,134.367225,2.070954,0.726077,0.925503,0.1,{'learning_rate': 0.1},3,0.73041,0.924448,0.722752,0.92577,0.723807,0.927897,0.727338,0.923896,11.533912,0.134065,0.003024,0.001541


In [51]:
# Final (unfitted) classifier, configured with the values the tuning cells
# above reported as best.
final_params = dict(
    random_state=42,
    learning_rate=0.01,
    n_estimators=500,
    max_depth=7,
    min_child_weight=1,
    n_jobs=-1,
    gamma=0,
    colsample_bytree=0.8,
    subsample=0.9,
    reg_alpha=1e-5,
)
xgb_clf = XGBClassifier(**final_params)

In [69]:
# Fit the final tuned classifier on the full data set.
# `clf` previously still referred to the GridSearchCV object from the
# learning-rate search, so calling clf.fit here re-ran that whole search on
# all rows instead of training the model configured in `xgb_clf`.
# Rebind `clf` to the final estimator so code that uses `clf` afterwards gets
# the model trained here.
clf = xgb_clf
clf.fit(X_xgb, y_xgb)

KeyboardInterrupt: 