In [44]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame as df
from IPython.display import display

import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,ShuffleSplit,StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,make_scorer,fbeta_score,roc_auc_score

import gini_metric as gm
import encoding as enc

In [5]:
# *load data*
# Read the train and test CSV files from the current working directory.
train_data,test_data=(pd.read_csv(csv_path) for csv_path in ('train.csv','test.csv'))

In [3]:
# data shape
# Print the (rows, columns) tuple of each raw frame, train first.
for shape_template,raw_frame in (('Train Shape: {}',train_data),('Test Shape: {}',test_data)):
    print(shape_template.format(raw_frame.shape))

Train Shape: (595212, 59)
Test Shape: (892816, 58)


In [28]:
# data types
#print('Train Data Types:\n{}\n'.format(train_data.dtypes))

In [29]:
# check class (im)balance 
#labels = train_data.groupby('target').size()
#print('Target Variable: {}'.format(labels))

In [99]:
# *split data into X and y*
# The 'target' column becomes the label array. The 'id' column is dropped from
# both feature frames; the test ids are kept in test_id.
id_col,target_col='id','target'
y=train_data[target_col].values
X=train_data.drop([target_col,id_col],axis=1)
test_id=test_data[id_col].values
X_test=test_data.drop([id_col],axis=1)

### Feature Engineering

In [100]:
# feature engineering
# Binary indicator columns that are summed into a single count feature.
IND_BIN_COLS=['ps_ind_06_bin','ps_ind_07_bin','ps_ind_08_bin','ps_ind_09_bin',
              'ps_ind_10_bin','ps_ind_11_bin','ps_ind_12_bin','ps_ind_13_bin',
              'ps_ind_16_bin','ps_ind_17_bin','ps_ind_18_bin']

def add_engineered_features(frame):
    """Append the engineered columns to `frame` in place and return it.

    Adds three columns:
      * ps_car_13_reg_03 -- ps_car_13 * ps_reg_03
      * ps_reg_01_02_03  -- ps_reg_01 * ps_reg_02 * ps_reg_03
      * ps_ind_bins_sum  -- row-wise sum of the IND_BIN_COLS indicator columns

    The columns are appended in this order, so every frame passed through the
    function ends up with the same trailing column layout. Re-running is safe:
    the source columns are never modified, so the derived columns are simply
    overwritten with the same values.
    """
    frame['ps_car_13_reg_03']=frame['ps_car_13']*frame['ps_reg_03']
    frame['ps_reg_01_02_03']=frame['ps_reg_01']*frame['ps_reg_02']*frame['ps_reg_03']
    frame['ps_ind_bins_sum']=frame[IND_BIN_COLS].sum(axis=1)
    return frame

# Train and test must go through the same transformation: the model is fitted
# on the engineered columns, so the test matrix needs them too.
X=add_engineered_features(X)
X_test=add_engineered_features(X_test)

In [82]:
# feature selection
#X.drop(['ps_ind_06_bin','ps_ind_07_bin','ps_ind_08_bin','ps_ind_09_bin',
        #'ps_ind_10_bin','ps_ind_11_bin','ps_ind_12_bin','ps_ind_13_bin','ps_ind_14'],axis=1,inplace=True)
#X=X.drop(['ps_ind_10_bin','ps_ind_11_bin','ps_ind_12_bin','ps_ind_13_bin'],axis=1)
#X=X.drop(['ps_calc_17_bin','ps_calc_20_bin','ps_car_10_cat','ps_calc_15_bin','ps_calc_01','ps_calc_04'
          #,'ps_calc_09','ps_calc_14'],axis=1)
# same treatment for test data 
#X_test.drop(['ps_ind_06_bin','ps_ind_07_bin','ps_ind_08_bin','ps_ind_09_bin','ps_ind_10_bin','ps_ind_11_bin',
             #'ps_ind_12_bin','ps_ind_13_bin','ps_ind_14','ps_ind_16_bin','ps_ind_17_bin','ps_ind_18_bin']
            #,axis=1,inplace=True)
#X_test=X_test.drop(['ps_ind_10_bin','ps_ind_11_bin','ps_ind_12_bin','ps_ind_13_bin'],axis=1)
#X_test=X_test.drop(['ps_calc_17_bin','ps_calc_20_bin','ps_car_10_cat','ps_calc_15_bin','ps_calc_01','ps_calc_04'
                    #,'ps_calc_09','ps_calc_14'],axis=1)

In [94]:
# list the feature columns currently present in X
X.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin', 'ps_ind_bins_sum'

In [31]:
# data summary 
#display(X.describe())

In [None]:
# claim-free %
#cf=1-list(y.values).count(1)/len(y.values)
#print('{:.2%}'.format(cf))

In [None]:
# label encoding
# le=LabelEncoder()
# y=df(le.fit_transform(y))

In [7]:
# *set missing values* 
#X_imp=X.replace(-1,np.nan)
#print(X_imp.isnull().sum())
#X_test_imp=X_test.replace(-1,np.nan)
# X_imp=X_imp.astype('float32')
# try Imputer and compare results to NaN
# imp=Imputer() # default imputes missing values as the mean
# X_imp=df(imp.fit_transform(X_imp))

In [17]:
#X_encoded=enc.one_hot_encode(X_imp)
#X_test_encoded=enc.one_hot_encode(X_test_imp)

In [None]:
# check skewness of input variables
# print('Input Variables:\n{}'.format(X_imp.skew())) # (+) right skewed and (-) left skewed

In [101]:
# *split train data into train and validation sets*
# Hold out a `test_size` fraction of the labelled rows for validation; the
# fixed seed is passed as random_state.
seed=4242
test_size=0.2
split_parts=train_test_split(X,y,test_size=test_size,random_state=seed)
X_train,X_val,y_train,y_val=split_parts
print('Train samples: {}\nValidation samples: {}'.format(X_train.shape[0],X_val.shape[0]))

Train samples: 476169
Validation samples: 119043


In [102]:
# *set data to XGBoost format*
# Wrap each split in a DMatrix; only the two labelled splits are given a label.
dtrain=xgb.DMatrix(data=X_train,label=y_train)
dval=xgb.DMatrix(data=X_val,label=y_val)
dtest=xgb.DMatrix(data=X_test)

In [None]:
# cross validation
# cv=KFold(n_splits=10,shuffle=True,random_state=seed)
# cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=seed)
# cv=ShuffleSplit(n_splits=10,random_state=seed)
# metric='roc_auc'
# cv=StratifiedShuffleSplit(n_splits=10,random_state=seed)
# scores=cross_val_score(model,X_train,y_train,cv=cv,scoring=metric,n_jobs=-1)
# print("Metric Chosen: %.3f%%" %(scores.mean()*100.0))

In [42]:
# fit model (XGBoost API) 
#params={}
#params['objective']='binary:logistic'
#params['silent']=True
#params['max_depth']=6
#params['eta']=0.001
#params['subsample']=0.8
#params['colsample_bytree']=0.8
#evals=[(dtrain,'train'),(dval,'val')]
#feval=gm.gini_xgb
#num_boost_round=10000
#early_stopping_rounds=0.1*num_boost_round
#evals_result={}
#verbose_eval=0.01*num_boost_round
#model = xgb.train(params=params,dtrain=dtrain,num_boost_round=num_boost_round,evals=evals,feval=feval,maximize=True,
                  #early_stopping_rounds=early_stopping_rounds,evals_result=evals_result,verbose_eval=verbose_eval)

In [103]:
# *set model (sklearn API)*
# Hyper-parameters are collected in one dict so they can be read at a glance.
model_params=dict(
    max_depth=5,
    n_estimators=250,
    learning_rate=0.1,
    n_jobs=-1,
    subsample=0.8,
    colsample_bytree=0.8,
)
model=XGBClassifier(**model_params)

In [104]:
# fit model (sklearn API)
early_stopping_rounds=50 # generally as a % (e.g. 10%) of training epochs
eval_metric='auc'
# Train split first, validation split second: the training log reports them as
# validation_0 and validation_1, and early stopping follows validation_1.
eval_set=[(X_train,y_train),(X_val,y_val)]
fit_options=dict(
    early_stopping_rounds=early_stopping_rounds,
    eval_metric=eval_metric,
    eval_set=eval_set,
    verbose=True,
)
model.fit(X_train,y_train,**fit_options)
# Per-round metric history, used for the learning curve.
results=model.evals_result()

[0]	validation_0-auc:0.599765	validation_1-auc:0.597497
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.615719	validation_1-auc:0.611052
[2]	validation_0-auc:0.618233	validation_1-auc:0.614041
[3]	validation_0-auc:0.622451	validation_1-auc:0.618467
[4]	validation_0-auc:0.62399	validation_1-auc:0.620794
[5]	validation_0-auc:0.625144	validation_1-auc:0.621076
[6]	validation_0-auc:0.626577	validation_1-auc:0.622695
[7]	validation_0-auc:0.62693	validation_1-auc:0.621867
[8]	validation_0-auc:0.628076	validation_1-auc:0.622363
[9]	validation_0-auc:0.62921	validation_1-auc:0.622582
[10]	validation_0-auc:0.629883	validation_1-auc:0.62283
[11]	validation_0-auc:0.630771	validation_1-auc:0.623512
[12]	validation_0-auc:0.630658	validation_1-auc:0.623858
[13]	validation_0-auc:0.63183	validation_1-auc:0.625307
[14]	validation_0-auc:0.631579	validation_1-auc:0.625322
[15

[142]	validation_0-auc:0.697812	validation_1-auc:0.640626
[143]	validation_0-auc:0.698069	validation_1-auc:0.640749
[144]	validation_0-auc:0.698346	validation_1-auc:0.640931
[145]	validation_0-auc:0.698751	validation_1-auc:0.640922
[146]	validation_0-auc:0.699039	validation_1-auc:0.640823
[147]	validation_0-auc:0.699466	validation_1-auc:0.640625
[148]	validation_0-auc:0.699889	validation_1-auc:0.640579
[149]	validation_0-auc:0.700238	validation_1-auc:0.6405
[150]	validation_0-auc:0.700498	validation_1-auc:0.640473
[151]	validation_0-auc:0.700657	validation_1-auc:0.640432
[152]	validation_0-auc:0.700892	validation_1-auc:0.640283
[153]	validation_0-auc:0.701321	validation_1-auc:0.640265
[154]	validation_0-auc:0.701712	validation_1-auc:0.640229
[155]	validation_0-auc:0.702154	validation_1-auc:0.640199
[156]	validation_0-auc:0.702703	validation_1-auc:0.640263
[157]	validation_0-auc:0.703133	validation_1-auc:0.640077
[158]	validation_0-auc:0.70341	validation_1-auc:0.640153
[159]	validation_

In [None]:
# 0.63915; 0.641272; 0.64142

In [None]:
# plot learning curve
# `results` is the history returned by model.evals_result() after fitting.
# Its keys follow the order of eval_set: 'validation_0' is the training split
# and 'validation_1' the held-out validation split; the metric key is 'auc'.
train_auc=results['validation_0']['auc']
val_auc=results['validation_1']['auc']
epochs=len(val_auc)
x_axis=range(0,epochs)
fig,ax=plt.subplots()
ax.plot(x_axis,train_auc,label='Train')
ax.plot(x_axis,val_auc,label='Validation')
ax.set_xlabel('Boosting round')
ax.set_ylabel('AUC')
ax.set_title('XGBoost learning curve')
ax.legend()
plt.show()

In [None]:
# predictions
# `model` is the sklearn-API XGBClassifier, so it is given the feature frame
# rather than a DMatrix. Column 1 of predict_proba is the positive-class
# probability; ranking metrics such as the AUC/Gini used in this notebook need
# these scores, not hard 0/1 labels. X_test must contain the same columns the
# model was fitted on.
pred=model.predict_proba(X_test)[:,1]
#pred_prb=model.predict_proba(X_val)[:,1]
#pred=[round(value) for value in pred_prb]

In [None]:
# model performance reports
#print(roc_auc_score(y_val,pred_prb))
#pred=[round(p) for p in pred_prb]
#print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_val,pred)))
#print('Classification Report:\n{}'.format(classification_report(y_val,pred)))

In [None]:
# grid search
#n_estimators=range(1000,5000,1000)
# Candidate values for the search: odd tree depths 1..9, three learning rates,
# and subsample ratios 0.1..1.0 in steps of 0.1.
max_depth=range(1,11,2)
learning_rate=[0.0001,0.001,0.01]
subsample=[tenth/10.0 for tenth in range(1,11)]
#colsample_bytree=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
#colsample_bylevel=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
#param_grid=dict(colsample_bytree=colsample_bytree)
# one parameter case
#param_type=colsample_bytree
#param_name='colsample_bytree'

#gs=GridSearchCV(model,param_grid,scoring='neg_log_loss',cv=cv,n_jobs=-1)
#scores=gs.fit(X,y)
#print("Best: %f using %s" % (scores.best_score_,scores.best_params_))
#means=scores.cv_results_['mean_test_score']
#stds=scores.cv_results_['std_test_score']
#params=scores.cv_results_['params']
#for mean,std,param in zip(means,stds,params):
    #print("%f (%f) with: %r" %(mean,std,param))

# plot scores
# one parameter case
#plt.errorbar(param_type,means,yerr=stds)
# two parameters case
# means=np.array(means).reshape(len(learning_rate),len(n_estimators))
# for i,value in enumerate(learning_rate):
    # plt.plot(n_estimators,means[i],label='depth: '+str(value))
#plt.legend()
#plt.xlabel(param_name)
#plt.ylabel('Log Loss')
#plt.show()

In [None]:
# plot single tree 
#plot_tree(model,num_trees=4,rankdir='LR') # plot fifth tree
#plt.show()

In [None]:
# plot feature importance
#print(model.feature_importances_)
#plt.bar(range(len(model.feature_importances_)),model.feature_importances_)
#plt.show()
# built-in function in XGBoost library that sorts features by importance 
#plot_importance(model)
#plt.show()

In [None]:
# built-in feature selection 
#thresholds=sort(model.feature_importances_)
#for threshold in thresholds:
    #selection=SelectFromModel(model,threshold=threshold,prefit=True)
    #X_select=selection.transform(X)
    #cv=StratifiedShuffleSplit(n_splits=10,random_state=seed)
    #scores=cross_val_score(model,X_select,y,cv=cv)
    #print("Threshold=%.3f, n=%d, Accuracy: %.2f%%" %(threshold,X_select.shape[1],scores.mean()*100.0))

In [None]:
# create submission file for Kaggle
# Two columns, 'id' then 'target', written without the DataFrame index.
sub=pd.DataFrame({'id':test_id,'target':pred},columns=['id','target'])
sub.to_csv('ps_xgb1.csv',index=False)