In [142]:
# import packages
# 12:07

# Data Manipulation
import pandas as pd
import numpy as np

# Training packages
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# Ensemble
# Is it true that you can just put sklearn compatible packages into this?
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

# Cross Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import gc

In [143]:
# Functions that might be useful

# 01 Gini coefficient calculations
# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# 02 Gini coefficient for xgb and lgb
# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282  (Version 1)
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

# 02 The Ensemble function
# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
#                y_holdout = y[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
#                cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
#                print("    cross_score: %.5f" % (cross_score.mean()))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [144]:
# Loading Files and Picking out NA values
# It seems that if we don't pick out NA values, there will be one missing value in the id column
print('loading files...')
train = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/train.csv', na_values=-1)
test = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/test.csv', na_values=-1)

# Change format to reduce memory usage
for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)  

# Print out the shape of train and test
print(train.shape,test.shape)

loading files...
(595212, 59) (892816, 58)


In [145]:
# Data Manipulation

# 01 Dropping Columns starting with Calc
# Note: This is used by almost all kernels online. 
# Justification: https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial
# At least in Gradient Boosting, the calc variables all show really low correlation with target.

col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
print(col_to_drop)
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1) 

# 02 Dropping more Columns:
# Justification: https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial
# Trying to drop more columns with less weight in Gradient Boosting test.

col_to_drop_2 = [a for a in train.columns if a.endswith('ps_ind_1')]
train = train.drop(col_to_drop_2, axis=1)  
test = test.drop(col_to_drop_2, axis=1) 

# 03 Treating missing values:
# Again, different ways are employed by different Kernels.
# a. Kernels that doesn't treat values (Keep NA in the data)
# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282  (Version 1)
# b. Kernels that keep NA values as -1
# c. Kernels that change NA values to 999/-999

# 04 Dealing with categorical variables
# a. Make up dummy variables for each of them
cat_features = [a for a in train.columns if a.endswith('cat')]
for column in cat_features:
    temp = pd.get_dummies(pd.Series(train[column]))
    train = pd.concat([train,temp],axis=1)
    train = train.drop([column],axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(test[column]))
    test = pd.concat([test,temp],axis=1)
    test = test.drop([column],axis=1)

print(train.shape,test.shape)

Index(['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10',
       'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],
      dtype='object')
(595212, 200) (892816, 199)


In [146]:
train.describe()

Unnamed: 0,id,target,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,...,95,96,97,98,99,100,101,102,103,104
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,...,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,743803.6,0.036448,1.900378,4.423318,0.393742,0.257033,0.163921,0.185304,0.000373,0.001692,...,0.005978,0.003483,0.002493,0.004788,0.020231,0.007468,0.01233,0.003533,0.040762,0.142946
std,429367.8,0.187401,1.983789,2.699902,0.488579,0.436998,0.370205,0.388544,0.019309,0.041097,...,0.077084,0.058912,0.04987,0.069031,0.140791,0.086094,0.110354,0.059336,0.197738,0.350018
min,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,371991.5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,743547.5,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1115549.0,0.0,3.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1488027.0,1.0,7.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [147]:
# Data Preparation for Training
# 01 Dropping columns, separating target function (y) and features (X)
#  X: The values of feature dataframe
#  y: target dataframe
#  features: The columns of feature dataframe
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values

# 02 Create and prepare the submission dataset
#  sub: The submission dataframe 
sub=test['id'].to_frame()
sub['target']=0

In [148]:
# Training parameters 
# 01 xgboost
# a. Baseline without feature engineering
# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282   (Version 1)
"""params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}"""
# local cv: 0.2848016, lb: 0.281

# b. XGBoost params with one-hot encoding
# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
xgb_params = {}
xgb_params['objective'] = 'binary:logistic'
xgb_params['learning_rate'] = 0.04
xgb_params['n_estimators'] = 490
xgb_params['max_depth'] = 4
xgb_params['subsample'] = 0.9
xgb_params['colsample_bytree'] = 0.9  
xgb_params['min_child_weight'] = 10


# 02 lightgbm
# a. Baseline without feature engineering
# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282   (Version 1)

# @鲲(China) lgbm is very sensitive with hyper parameters, my lgbm give me 0.281. Here's my suggestion, 
# use a small max_depth and a num_of_leaves smaller than 2**max_depth, also try bagging with a small bagging frequency
"""params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}"""
# local cv: 0.284789, lb: 0.281
# with dummy variables: local cv: 0.2852834 lb: 0.206 This is inconsistent

# b. With one-hot encoding
# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
# LightGBM params ver1
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['seed'] = 99

# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
# LightGBM params ver2
lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = 99

# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
# LightGBM params ver3
lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['seed'] = 99


# 04 RandomForest params with one-hot encoding
#rf_params = {}
#rf_params['n_estimators'] = 200
#rf_params['max_depth'] = 6
#rf_params['min_samples_split'] = 70
#rf_params['min_samples_leaf'] = 30


# 05 ExtraTrees params with one-hot encoding
#et_params = {}
#et_params['n_estimators'] = 155
#et_params['max_features'] = 0.3
#et_params['max_depth'] = 6
#et_params['min_samples_split'] = 40
#et_params['min_samples_leaf'] = 18

# 06 CatBoost params with one-hot encoding
cat_params = {}
cat_params['iterations'] = 900
cat_params['depth'] = 8
cat_params['rsm'] = 0.95
cat_params['learning_rate'] = 0.03
cat_params['l2_leaf_reg'] = 3.5  
cat_params['border_count'] = 8
cat_params['gradient_iterations'] = 4

In [149]:
# Cross Validation
# There has been some arguments on both sides for KFold and Stratified KFold. Need to investigate more.
# Discussion:
# @KALE I am now using Stratified 5Fold for my cv. But lgb cv score doesn't seem consistent with lb score. 
# lgb 0.282 - lb 0.279; lgb 0.281 - lb 0.280
# @鲲(China) 3 fold without Stratified is very consistent with lb in my side

# Personally, I feel that Stratified Kfold should be better, because the dataset has a imbalanced target, 
# and Stratified Kfold will try to balance out the different classes (0,1) in the target.

# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282   (Version 1)
nrounds=2000 
kfold = 2
skf = StratifiedKFold(n_splits=kfold, random_state=0)

In [150]:
# Actual Training
# 01 xgboost
# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282   (Version 1)
# Running time: ~45min

# About xgb_model.best_ntree_limit+50: 
# @Rudolph The credit for that goes to The1owl - it is not imperative but it seems to improve things a bit.
# @David Yang xgb_model.best_ntree_limit+50 seems to be unnecessary but it may get a good result.I think this way is useful to lgb, too. 
# You can change the +n/-n just like a parameter if it would imporove your result.
"""for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    prediction = xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
    sub['target'] += prediction
    
gc.collect()
sub.head(2)"""

"for i, (train_index, test_index) in enumerate(skf.split(X, y)):\n    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))\n    X_train, X_valid = X[train_index], X[test_index]\n    y_train, y_valid = y[train_index], y[test_index]\n    \n    d_train = xgb.DMatrix(X_train, y_train) \n    d_valid = xgb.DMatrix(X_valid, y_valid) \n    watchlist = [(d_train, 'train'), (d_valid, 'valid')]\n    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, \n                          feval=gini_xgb, maximize=True, verbose_eval=100)\n    prediction = xgb_model.predict(xgb.DMatrix(test[features].values), \n                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)\n    sub['target'] += prediction\n    \ngc.collect()\nsub.head(2)"

In [151]:
# 02 LightGBM
# https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282   (Version 1)
# Running Time:
"""skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=10, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (kfold)
"""

"skf = StratifiedKFold(n_splits=kfold, random_state=1)\nfor i, (train_index, test_index) in enumerate(skf.split(X, y)):\n    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))\n    X_train, X_eval = X[train_index], X[test_index]\n    y_train, y_eval = y[train_index], y[test_index]\n    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), nrounds, \n                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=10, \n                  feval=gini_lgb, early_stopping_rounds=100)\n    sub['target'] += lgb_model.predict(test[features].values, \n                        num_iteration=lgb_model.best_iteration) / (kfold)\n"

In [152]:
# 03 Trainging various other models
# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
lgb_model3 = LGBMClassifier(**lgb_params3)
#rf_model = RandomForestClassifier(**rf_params)
#et_model = ExtraTreesClassifier(**et_params)
xgb_model = XGBClassifier(**xgb_params)
#cat_model = CatBoostClassifier(**cat_params)
#gb_model = GradientBoostingClassifier(max_depth=5)
#ada_model = AdaBoostClassifier()
log_model = LogisticRegression()

In [153]:
# Using stacking for the models
# https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code (Version 8)
stack = Ensemble(n_splits=4,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3, xgb_model))        

test = test.drop(['id'],axis=1)
y_pred = stack.fit_predict(X, y, test)
sub['target'] = y_pred



Fit LGBMClassifier fold 1




Fit LGBMClassifier fold 2




Fit LGBMClassifier fold 3




Fit LGBMClassifier fold 4




Fit LGBMClassifier fold 1




Fit LGBMClassifier fold 2




Fit LGBMClassifier fold 3




Fit LGBMClassifier fold 4




Fit LGBMClassifier fold 1




Fit LGBMClassifier fold 2




Fit LGBMClassifier fold 3




Fit LGBMClassifier fold 4
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3
Fit XGBClassifier fold 4
Stacker score: 0.64352


In [154]:
sub.to_csv('submission_7.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

Unnamed: 0,id,target
0,0,0.029547
1,1,0.028325


In [155]:
sub.describe()

Unnamed: 0,id,target
count,892816.0,892816.0
mean,744153.5,0.036393
std,429683.0,0.021028
min,0.0,0.020521
25%,372021.8,0.027141
50%,744307.0,0.031319
75%,1116308.0,0.038425
max,1488026.0,0.990077
