In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import seaborn as sns
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [21]:
#Import data

df = pd.read_csv('train_Df64byy.csv')
test = pd.read_csv('test_YCcRUnU.csv')

In [22]:
#Creating a copy of test

test_dumm = test.drop('ID', axis=1)

In [38]:
df.head()

Unnamed: 0,Region_Code,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,Age,City_Code_C1,City_Code_C10,City_Code_C11,City_Code_C12,...,Holding_Policy_Duration_14+,Holding_Policy_Duration_14.0,Holding_Policy_Duration_2.0,Holding_Policy_Duration_3.0,Holding_Policy_Duration_4.0,Holding_Policy_Duration_5.0,Holding_Policy_Duration_6.0,Holding_Policy_Duration_7.0,Holding_Policy_Duration_8.0,Holding_Policy_Duration_9.0
0,3213,3.0,22,11628.0,0,36.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1117,3.0,22,30510.0,0,48.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3732,1.0,19,7450.0,1,32.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4378,3.0,19,17780.0,0,50.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2190,1.0,16,10404.0,0,44.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [23]:
test_dumm

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,C1,156,Owned,Individual,30,30,No,,6.0,3.0,5,11934.0
1,C4,7,Owned,Joint,69,68,Yes,X1,3.0,3.0,18,32204.8
2,C1,564,Rented,Individual,28,28,No,X3,2.0,4.0,17,9240.0
3,C3,1177,Rented,Individual,23,23,No,X3,3.0,3.0,18,9086.0
4,C1,951,Owned,Individual,75,75,No,X3,,,5,22534.0
...,...,...,...,...,...,...,...,...,...,...,...,...
21800,C3,1044,Owned,Individual,45,45,No,X1,4.0,1.0,18,15884.0
21801,C4,266,Owned,Individual,59,59,No,X5,6.0,3.0,18,21390.0
21802,C12,2470,Owned,Individual,74,74,No,X3,,,1,17836.0
21803,C10,1676,Rented,Individual,25,25,No,X4,3.0,1.0,19,11568.0


In [24]:
#Simple EDA to check missing value percentage


def simple_eda(x):
    count = x.count()
    miss = x.isnull().sum()
    total = count + miss
    miss_perc = (miss/total) * 100
    
    return pd.Series([count, miss_perc, x.dtype],
                    index = ['Count','Miss_PERC','dtype'])

In [25]:
df.apply(lambda x: simple_eda(x)).T

Unnamed: 0,Count,Miss_PERC,dtype
ID,50882,0.0,int64
City_Code,50882,0.0,object
Region_Code,50882,0.0,int64
Accomodation_Type,50882,0.0,object
Reco_Insurance_Type,50882,0.0,object
Upper_Age,50882,0.0,int64
Lower_Age,50882,0.0,int64
Is_Spouse,50882,0.0,object
Health Indicator,39191,22.9767,object
Holding_Policy_Duration,30631,39.7999,object


In [26]:
#Drop ID, No use of ID 

df.drop('ID',axis=1, inplace=True)

In [15]:
#Missing value imputation

In [27]:
df['Health Indicator'] = df['Health Indicator'].fillna(df['Health Indicator'].mode()[0])
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].fillna(df['Holding_Policy_Duration'].mode()[0])
df['Holding_Policy_Type'] = df['Holding_Policy_Type'].fillna(df['Holding_Policy_Type'].mode()[0])

In [28]:
test_dumm['Health Indicator'] = test_dumm['Health Indicator'].fillna(test_dumm['Health Indicator'].mode()[0])
test_dumm['Holding_Policy_Duration'] = test_dumm['Holding_Policy_Duration'].fillna(test_dumm['Holding_Policy_Duration'].mode()[0])
test_dumm['Holding_Policy_Type'] = test_dumm['Holding_Policy_Type'].fillna(test_dumm['Holding_Policy_Type'].mode()[0])

In [29]:
#Derive Age as Average of Upper_age + Lower_Age and the drop Upper_age and Lower_age

df['Age'] = (df['Upper_Age'] + df['Lower_Age'])/2
test_dumm['Age'] = (test['Upper_Age'] + test['Lower_Age'])/2

In [30]:
df.drop(['Lower_Age','Upper_Age'],axis=1, inplace=True)
test_dumm.drop(['Lower_Age','Upper_Age'],axis=1, inplace=True)

In [31]:
#Converting categorical variable to numerical by creating dummies

df = pd.get_dummies(df)
test_dumm = pd.get_dummies(test_dumm)

In [32]:
#Seperate target and independent variables

X = df.drop('Response', axis=1)
Y = df['Response']

In [35]:
# Use StratifiedKFold as this is the best cross-validation for classification problem
# Create model using XGBClassifier. 

kf = StratifiedKFold(n_splits=10,shuffle=True)
pred_test_full =0
cv_score = [] 
i = 1
for train_index, test_index in kf.split(X,Y):
    print("{} of K-Folds".format(i,kf.n_splits))
    xtr,xvl = X.loc[train_index], X.loc[test_index]
    ytr,yvl = Y.loc[train_index], Y.loc[test_index]
    
    param_test = {'reg_alpha':[1e-5, 1e-2, 0.1, 100]}
    #model XGBoost
    xgbm = XGBClassifier(learning_rate =0.5,
              n_estimators=10,
              max_depth=5,
              min_child_weight=2,
              gamma=0.1,
              subsample=0.85,
              colsample_bytree=0.8,
              objective= 'binary:logistic',
              nthread=4,
              scale_pos_weight=1,
              seed=27, 
              param_grid = param_test,
              scoring='roc_auc',
              n_jobs=4,
              iid=False, 
              cv=2,
              verbose=10)
    xgbm.fit(xtr,ytr)
    score = roc_auc_score(yvl, xgbm.predict_proba(xvl)[:,1])
    print("ROC AUC Score : " , score)
    print('Accuracy Score :', accuracy_score(yvl, xgbm.predict(xvl)))
    cv_score.append(score)
    pred_test = xgbm.predict_proba(test_dumm)[:,1]
    pred_test_full +=pred_test
    i+=1

1 of K-Folds
Parameters: { cv, iid, param_grid, scoring, verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


ROC AUC Score :  0.6392079067880516
Accuracy Score : 0.7596777362939674
2 of K-Folds
Parameters: { cv, iid, param_grid, scoring, verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


ROC AUC Score :  0.6468418921883246
Accuracy Score : 0.7622322656710552
3 of K-Folds
Parameters: { cv, iid, param_grid, scoring, verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost cor

In [37]:
#Create submission file 

y_pred_xgb = pd.DataFrame(xgbm.predict_proba(test_dumm)[:,1],columns=['Response'])
y_pred_xgb = pd.concat([test['ID'], y_pred_xgb], axis=1)
y_pred_xgb.to_csv('submission_xgb_new_0.5.csv', index=False)