Notebook07 A basic stacking model

Timeline: 2017/11/4

Goal: To have a first glance at stacking with xgboost and lightgbm

I. Import Packages, define functions and import files

In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
import gc

print('loading files...')
train = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/train.csv', na_values=999)
test = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/test.csv', na_values=999)


loading files...
(595212, 39) (892816, 38)


In [15]:
# Provide functions to calculate the gini coefficient
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

II. Data Manipulation

In [None]:
# Drop calc columns
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  

# Reduce memory usage
for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)    

print(train.shape, test.shape)

In [14]:
# Preparing for training
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub=test['id'].to_frame()
sub['target']=0

III. Training

In [17]:
# Initialize CV
nrounds=2000  # need to change to 2000
kfold = 5  # need to change to 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)

# Train lightgbm model
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=10, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (3*kfold)
    
sub.to_csv('total.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

# Train catboost model
model = CatBoostClassifier(
    learning_rate=0.05, 
    depth=6, 
    l2_leaf_reg = 14, 
    iterations = 650,
#    verbose = True,
    loss_function='Logloss'
)

for i, (train_index, test_index) in enumerate(skf.split(X,y)):
    
    # Create data for this fold
    y_train, y_valid = y[train_index], y[test_index]
    X_train, X_valid = X[train_index], X[test_index]
    print(' catboost kfold: {}  of  {} : '.format(i+1, kfold))
    
    # Run model for this fold
    fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    y_test_pred /= K  # Average test set predictions

    print( "\nGini for full training set:" )
    gini(y, y_valid_pred)
    sub['target'] += y_test_pred/3

gc.collect()
sub.head(2)

# Train xgboost model
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}


for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=10)
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (3*kfold)
gc.collect()
sub.head(2)

 lgb kfold: 1  of  5 : 
Training until validation scores don't improve for 100 rounds.
[10]	valid_0's auc: 0.624471	valid_0's gini: 0.248969
[20]	valid_0's auc: 0.627194	valid_0's gini: 0.254391
[30]	valid_0's auc: 0.627261	valid_0's gini: 0.254518
[40]	valid_0's auc: 0.627471	valid_0's gini: 0.254938
[50]	valid_0's auc: 0.628113	valid_0's gini: 0.256223
[60]	valid_0's auc: 0.62873	valid_0's gini: 0.257459
[70]	valid_0's auc: 0.628432	valid_0's gini: 0.256864
[80]	valid_0's auc: 0.628391	valid_0's gini: 0.256781
[90]	valid_0's auc: 0.628301	valid_0's gini: 0.256601
[100]	valid_0's auc: 0.628702	valid_0's gini: 0.257404
[110]	valid_0's auc: 0.62857	valid_0's gini: 0.257139
[120]	valid_0's auc: 0.628903	valid_0's gini: 0.257805
[130]	valid_0's auc: 0.628766	valid_0's gini: 0.257531
[140]	valid_0's auc: 0.629423	valid_0's gini: 0.258846
[150]	valid_0's auc: 0.629551	valid_0's gini: 0.259102
[160]	valid_0's auc: 0.629845	valid_0's gini: 0.25969
[170]	valid_0's auc: 0.629842	valid_0's gini:

[230]	valid_0's auc: 0.627576	valid_0's gini: 0.255152
[240]	valid_0's auc: 0.628023	valid_0's gini: 0.256046
[250]	valid_0's auc: 0.628199	valid_0's gini: 0.256398
[260]	valid_0's auc: 0.628681	valid_0's gini: 0.257361
[270]	valid_0's auc: 0.629512	valid_0's gini: 0.259024
[280]	valid_0's auc: 0.629872	valid_0's gini: 0.259744
[290]	valid_0's auc: 0.630342	valid_0's gini: 0.260684
[300]	valid_0's auc: 0.630882	valid_0's gini: 0.261764
[310]	valid_0's auc: 0.631148	valid_0's gini: 0.262296
[320]	valid_0's auc: 0.631309	valid_0's gini: 0.262618
[330]	valid_0's auc: 0.63182	valid_0's gini: 0.26364
[340]	valid_0's auc: 0.632271	valid_0's gini: 0.264542
[350]	valid_0's auc: 0.632604	valid_0's gini: 0.265207
[360]	valid_0's auc: 0.632955	valid_0's gini: 0.265911
[370]	valid_0's auc: 0.633488	valid_0's gini: 0.266976
[380]	valid_0's auc: 0.634011	valid_0's gini: 0.268023
[390]	valid_0's auc: 0.634383	valid_0's gini: 0.268766
[400]	valid_0's auc: 0.634611	valid_0's gini: 0.269223
[410]	valid_

[600]	valid_0's auc: 0.639263	valid_0's gini: 0.278527
[610]	valid_0's auc: 0.639456	valid_0's gini: 0.278913
[620]	valid_0's auc: 0.639472	valid_0's gini: 0.278943
[630]	valid_0's auc: 0.639573	valid_0's gini: 0.279146
[640]	valid_0's auc: 0.639709	valid_0's gini: 0.279417
[650]	valid_0's auc: 0.64006	valid_0's gini: 0.28012
[660]	valid_0's auc: 0.640211	valid_0's gini: 0.280422
[670]	valid_0's auc: 0.640288	valid_0's gini: 0.280577
[680]	valid_0's auc: 0.640289	valid_0's gini: 0.280578
[690]	valid_0's auc: 0.64039	valid_0's gini: 0.280781
[700]	valid_0's auc: 0.640451	valid_0's gini: 0.280903
[710]	valid_0's auc: 0.640454	valid_0's gini: 0.280908
[720]	valid_0's auc: 0.640655	valid_0's gini: 0.281311
[730]	valid_0's auc: 0.640786	valid_0's gini: 0.281571
[740]	valid_0's auc: 0.640873	valid_0's gini: 0.281746
[750]	valid_0's auc: 0.640914	valid_0's gini: 0.281829
[760]	valid_0's auc: 0.641078	valid_0's gini: 0.282157
[770]	valid_0's auc: 0.641199	valid_0's gini: 0.282397
[780]	valid_0

[1060]	valid_0's auc: 0.64651	valid_0's gini: 0.293019
[1070]	valid_0's auc: 0.646503	valid_0's gini: 0.293006
[1080]	valid_0's auc: 0.646519	valid_0's gini: 0.293037
[1090]	valid_0's auc: 0.646498	valid_0's gini: 0.292996
[1100]	valid_0's auc: 0.646437	valid_0's gini: 0.292875
[1110]	valid_0's auc: 0.64642	valid_0's gini: 0.292839
[1120]	valid_0's auc: 0.646469	valid_0's gini: 0.292938
[1130]	valid_0's auc: 0.646406	valid_0's gini: 0.292813
Early stopping, best iteration is:
[1033]	valid_0's auc: 0.646539	valid_0's gini: 0.293079
 lgb kfold: 5  of  5 : 
Training until validation scores don't improve for 100 rounds.
[10]	valid_0's auc: 0.618821	valid_0's gini: 0.237636
[20]	valid_0's auc: 0.621649	valid_0's gini: 0.243305
[30]	valid_0's auc: 0.623032	valid_0's gini: 0.246067
[40]	valid_0's auc: 0.623625	valid_0's gini: 0.247252
[50]	valid_0's auc: 0.624162	valid_0's gini: 0.248326
[60]	valid_0's auc: 0.624288	valid_0's gini: 0.248577
[70]	valid_0's auc: 0.624118	valid_0's gini: 0.24823

 catboost kfold: 1  of  5 : 


NameError: name 'eval_gini' is not defined

In [18]:
sub.describe()

Unnamed: 0,id,target
count,892815.0,892816.0
mean,744154.3,0.012159
std,429682.5,0.006365
min,0.0,0.002795
25%,372023.5,0.007933
50%,744308.0,0.01066
75%,1116309.0,0.014518
max,1488026.0,0.151974


In [19]:
sub =sub*3
sub.to_csv('stacking_submit.csv', float_format='%.6f', index=False)

In [20]:
sub.describe()

Unnamed: 0,id,target
count,892815.0,892816.0
mean,2232463.0,0.036477
std,1289047.0,0.019095
min,0.0,0.008386
25%,1116070.0,0.023798
50%,2232924.0,0.031981
75%,3348927.0,0.043554
max,4464078.0,0.455923


Insight:<br>
The first stacking model helps to combine three 0.281/0.282 model to reach a 0.283 model, so it's pretty useful.