In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from scipy.sparse import hstack
import time, os, random, sys
import math
import hyperopt.tpe
import hpsklearn.components
import hpsklearn.demo_support
import warnings
warnings.filterwarnings('ignore')
random.seed(1)



WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [3]:
def get_leaf_indices(ensemble, x):
    x = x.astype(np.float32)
    trees = ensemble.estimators_
    n_trees = trees.shape[0]
    indices = []

    for i in range(n_trees):
        tree = trees[i][0].tree_
        indices.append(tree.apply(x))

    indices = np.column_stack(indices)
    return indices

def gbdt_lr_train_test(File):
    #GBDT
    start = time.clock()
    train_df, test_df = train_test_split(File, train_size = 0.8)
    X_train = train_df.drop(train_df.columns[0], axis=1)
    y_train = train_df[train_df.columns[0]]
    X_test = test_df.drop(test_df.columns[0], axis=1)
    y_test = test_df[test_df.columns[0]]
    gbclf = GradientBoostingClassifier(n_estimators=20, max_depth=4, verbose=0)
    tuned_parameter = [{'n_estimators':[20,30,40,50], 'max_depth':[5, 6, 7, 8], 'max_features':[0.5]}]
    gs_clf = GridSearchCV(gbclf, tuned_parameter, cv=5, scoring='roc_auc')
    gs_clf.fit(X_train, y_train)
    print('best parameters set found: ')
    print(gs_clf.best_params_)
    
    y_pred_gbdt = gs_clf.predict_proba(X_test)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    
    gbclf=GradientBoostingClassifier(**gs_clf.best_params_)
    gbclf.fit(X_train,y_train)
    leaf = get_leaf_indices
    X_train_leaves=leaf(gbclf,X_train.values)
    X_test_leaves=leaf(gbclf,X_test.values)

    #GBDT+LR
    lr = LogisticRegression(penalty='l1', C=0.2)
    lr.fit(X_train_leaves, y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_test_leaves)[:,1]
    gbdtlr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('gbdt+lr auc 1: %.5f' % gbdtlr_auc1)
    
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_train_leaves, X_train])
    lr.fit(X_train_ext, y_train)
    X_test_ext = hstack([X_test_leaves, X_test])
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdtlr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('gbdt+lr auc 2: %.5f' % gbdtlr_auc2)
    f_time =time.clock()-start
    print('GBDT+LR time taken: %.2f'% f_time)
    
    #+NB

    
    start = time.clock()
    gnb= GaussianNB()

    gnb.fit(X_train_leaves, y_train)
    Y_pred_nb=gnb.predict_proba(X_test_leaves)[:,1]
    gnb_auc = roc_auc_score(y_test,Y_pred_nb)
    print('GBDT + GNB auc: %.5f'% gnb_auc)    
    '''    
    gnb.fit(X_train_ext, y_train)
    y_pred_gnb2=gnb.predict_proba(X_test_ext)[:,1]
    gnb_auc2=roc_auc_score(y_test,y_pred_gnb2)
    print('GNB auc2: %.5f' % gnb_auc2)
    '''
    #svc
    svc=SVC(probability=True)
    svc.fit(X_train_leaves, y_train)
    Y_pred_svc=svc.predict_proba(X_test_leaves)[:,1]
    svc_auc=roc_auc_score(y_test,Y_pred_svc)
    print('GBDT + SVC auc: %.5f' % svc_auc)
    
    svc.fit(X_train_ext, y_train)
    y_pred_svc2=svc.predict_proba(X_test_ext)[:, 1]
    svc_auc2=roc_auc_score(y_test,y_pred_svc2)
    print('GBDT + SVC auc2: %.5f' % svc_auc2)
    
    #KNN
    knn=KNeighborsClassifier(n_neighbors = 3)
    knn.fit(X_train_leaves, y_train)
    Y_pred_knn=knn.predict_proba(X_test_leaves)[:,1]
    knn_auc=roc_auc_score(y_test,Y_pred_knn)
    print('GBDT + KNN auc : %.5f' % knn_auc)
    
    knn.fit(X_train_ext, y_train)
    y_pred_knn2=knn.predict_proba(X_test_ext)[:, 1]
    knn_auc2=roc_auc_score(y_test,y_pred_knn2)
    print('GBDT + KNN auc2: %.5f' % knn_auc2)
    
    #perceptron

    perceptron = Perceptron()
    perceptron.fit(X_train_leaves, y_train)
    y_pred_perc=perceptron.predict(X_test_leaves)
    perc_auc=roc_auc_score(y_test,y_pred_perc)
    print('GBDT + Perceptron auc : %.5f' % perc_auc)
    
    perceptron.fit(X_train_ext, y_train)
    y_pred_perc2=perceptron.predict(X_test_ext)
    perc_auc2=roc_auc_score(y_test,y_pred_perc2)
    print('GBDT + Perceptron auc2 : %.5f' % perc_auc2 )
    
    #linear svc

    lin = LinearSVC()
    lin.fit(X_train_leaves, y_train)
    y_pred_lin=lin.predict(X_test_leaves)
    lin_auc=roc_auc_score(y_test,y_pred_lin)
    print('GBDT + Linear SVC auc : %.5f' % lin_auc)
    
    lin.fit(X_train_ext, y_train)
    y_pred_lin2=lin.predict(X_test_ext)
    lin_auc2=roc_auc_score(y_test,y_pred_lin2)
    print('GBDT + Linear SVC auc2 : %.5f' % lin_auc2)
    
    #SGD

    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train_leaves, y_train)
    Y_pred_sgd=sgd.predict_proba(X_test_leaves)[:, 1]
    sgd_auc=roc_auc_score(y_test,Y_pred_sgd)
    print('GBDT + SGD auc : %.5f' % sgd_auc)
    
    sgd.fit(X_train_ext, y_train)
    Y_pred_sgd2=sgd.predict_proba(X_test_ext)[:, 1]
    sgd_auc2=roc_auc_score(y_test,Y_pred_sgd2)
    print('GBDT + SGD auc2 : %.5f' % sgd_auc2)
    f_time =time.clock()-start
    print('GBDT + other classfier time taken: %.2f'% f_time)    
    
    #XGB
    start = time.clock()
    xgb=XGBClassifier()
    xgb.fit(X_train,y_train)
    Y_pred_xgb=xgb.predict_proba(X_test)[:,1]
    xgb_auc= roc_auc_score(y_test,Y_pred_xgb)
    print('XGB auc : %.5f' % xgb_auc)
    
    #XGB with leaves

    xgb.fit(X_train_leaves,y_train)
    Y_pred_xgb=xgb.predict(X_test_leaves)
    xgb_auc= roc_auc_score(y_test,Y_pred_xgb)
    print('GBDT + XGB auc: %.5f' % xgb_auc)
    
    #XGB with features ext
    
    xgb.fit(X_train_ext, y_train)
    y_pred_xgb2=xgb.predict(X_test_ext)
    xgb_auc2=roc_auc_score(y_test,y_pred_xgb2)
    print('GBDT + XGB auc2: %.5f' %xgb_auc2)
    
    f_time =time.clock()-start
    print('XGB time taken: %.2f'% f_time)
    
    #lightGBM
    start = time.clock()
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    
    
    # specify your configurations as a dict
    params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
    
    }

    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5,
                    verbose_eval=False)



    y_pred_lgb = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # eval
    lgb_auc=roc_auc_score(y_test,y_pred_lgb)
    print('lightGBM auc : %.5f' % lgb_auc)
    
    lgb_train=lgb.Dataset(X_train_leaves, y_train)
    lgb_eval=lgb.Dataset(X_test_leaves, y_test, reference=lgb_train)
    gbm = lgb.train(params,
                  lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5,
                   verbose_eval=False)
    y_pred_lgb2 =gbm.predict(X_test_leaves, num_iteration=gbm.best_iteration)
    lgb_auc2=roc_auc_score(y_test, y_pred_lgb2)
    
    print('GBDT + lightGBM auc : %.5f' % lgb_auc2)
    
    lgb_train=lgb.Dataset(X_train_ext, y_train)
    lgb_eval=lgb.Dataset(X_test_ext, y_test, reference=lgb_train)
    gbm = lgb.train(params,
                  lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5,
                   verbose_eval=False)
    y_pred_lgb3 =gbm.predict(X_test_ext, num_iteration=gbm.best_iteration)
    lgb_auc3=roc_auc_score(y_test, y_pred_lgb3)
    
    print('GBDT + lightGBM auc2 : %.5f' % lgb_auc3)
    
    
    f_time=time.clock()-start
    print('lightGBM time taken: %.2f'% f_time)

In [19]:
example = pd.read_csv('example.csv')

In [16]:
gbdt_lr_train_test(example) # V3 trial

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94744
gbdt+lr auc 1: 0.47496
gbdt+lr auc 2: 0.47697
GBDT+LR time taken: 40.24
NB auc:  0.728048687278
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.79614
KNN auc2: 0.61520
Perceptron auc : 0.51301
Perceptron auc2 : 0.50000
Linear SVC auc : 0.49594
Linear SVC auc2 : 0.52416
SGD auc : 0.56036
SGD auc2 : 0.49957
GBDT + other classfier time taken: 84.77
XGB auc : 0.86043
XGB auc with feature transformed: 0.53599
XGB time taken: 2.75


In [17]:
gbdt_lr_train_test(example) # V3 trial

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94873
gbdt+lr auc 1: 0.48734
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 32.58
NB auc:  0.843961860358
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.53852
Linear SVC auc2 : 0.50000
SGD auc : 0.66286
SGD auc2 : 0.50000
GBDT + other classfier time taken: 86.49
XGB auc : 0.86043
XGB auc with feature transformed: 0.69166
XGB time taken: 2.68


In [18]:
gbdt_lr_train_test(example) # V3 trial

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94657
gbdt+lr auc 1: 0.48735
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 32.95
NB auc:  0.843961860358
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.47925
Linear SVC auc2 : 0.48028
SGD auc : 0.62034
SGD auc2 : 0.50000
GBDT + other classfier time taken: 87.54
XGB auc : 0.86043
XGB auc with feature transformed: 0.69166
XGB time taken: 2.60


In [23]:
gbdt_lr_train_test(example) # V3 trial with n_estimator= 20

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94619
gbdt+lr auc 1: 0.48735
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 32.65
NB auc: 0.84396
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.47830
Linear SVC auc2 : 0.49935
SGD auc : 0.66557
SGD auc2 : 0.50715
GBDT + other classfier time taken: 85.66
XGB auc : 0.93941
XGB auc with feature transformed: 0.69166
XGB time taken: 2.68


In [32]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94810
gbdt+lr auc 1: 0.48738
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 32.39
NB auc: 0.84396
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.53175
Linear SVC auc2 : 0.55129
SGD auc : 0.58215
SGD auc2 : 0.50000
GBDT + other classfier time taken: 87.13
XGB auc : 0.93941
XGB auc with feature transformed: 0.69166
XGB time taken: 2.69


In [38]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94665
gbdt+lr auc 1: 0.48735
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 33.04
NB auc: 0.84396
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.52158
Linear SVC auc2 : 0.50000
SGD auc : 0.63871
SGD auc2 : 0.52030
GBDT + other classfier time taken: 88.46
XGB auc : 0.93941
XGB auc with feature transformed: 0.69166
XGB time taken: 2.65


In [46]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94557
gbdt+lr auc 1: 0.48735
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 32.15
NB auc: 0.84396
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.51555
Linear SVC auc2 : 0.55458
SGD auc : 0.63701
SGD auc2 : 0.45964
GBDT + other classfier time taken: 86.84
XGB auc : 0.93941
XGB auc with feature transformed: 0.69166
XGB time taken: 2.62
[1]	valid_0's l2: 0.235948	valid_0's auc: 0.909065
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.225072	valid_0's auc: 0.923392
[3]	valid_0's l2: 0.213547	valid_0's auc: 0.926876
[4]	valid_0's l2: 0.203327	valid_0's auc: 0.925996
[5]	valid_0's l2: 0.194702	valid_0's auc: 0.928332
[6]	valid_0's l2: 0.187084	valid_0's auc: 0.930558
[7]	valid_0's l2: 0.179158	valid_0's auc: 0.931759
[8]	valid_0's l2: 0.172098	valid_0's auc: 0.93179
[9]	val

In [52]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 6, 'max_features': 0.5, 'n_estimators': 100}
gbdt auc: 0.94738
gbdt+lr auc 1: 0.48735
gbdt+lr auc 2: 0.81641
GBDT+LR time taken: 32.82
NB auc: 0.84396
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.85452
KNN auc2: 0.60983
Perceptron auc : 0.61611
Perceptron auc2 : 0.50000
Linear SVC auc : 0.47487
Linear SVC auc2 : 0.50000
SGD auc : 0.56671
SGD auc2 : 0.49914
GBDT + other classfier time taken: 88.83
XGB auc : 0.93941
XGB auc with feature transformed: 0.69166
XGB time taken: 2.74
[1]	valid_0's l2: 0.235948	valid_0's auc: 0.909065
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.225072	valid_0's auc: 0.923392
[3]	valid_0's l2: 0.213547	valid_0's auc: 0.926876
[4]	valid_0's l2: 0.203327	valid_0's auc: 0.925996
[5]	valid_0's l2: 0.194702	valid_0's auc: 0.928332
[6]	valid_0's l2: 0.187084	valid_0's auc: 0.930558
[7]	valid_0's l2: 0.179158	valid_0's auc: 0.931759
[8]	valid_0's l2: 0.172098	valid_0's auc: 0.93179
[9]	val

In [68]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94642
gbdt+lr auc 1: 0.55789
gbdt+lr auc 2: 0.49749
GBDT+LR time taken: 44.90
NB auc: 0.83106
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.81779
KNN auc2: 0.60721
Perceptron auc : 0.41106
Perceptron auc2 : 0.50219
Linear SVC auc : 0.54468
Linear SVC auc2 : 0.46483
SGD auc : 0.45510
SGD auc2 : 0.49869
GBDT + other classfier time taken: 55.01
XGB auc : 0.93941
XGB auc with feature transformed: 0.73873
XGB time taken: 1.72
[1]	valid_0's l2: 0.235948	valid_0's auc: 0.909065
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.225072	valid_0's auc: 0.923392
[3]	valid_0's l2: 0.213547	valid_0's auc: 0.926876
[4]	valid_0's l2: 0.203327	valid_0's auc: 0.925996
[5]	valid_0's l2: 0.194702	valid_0's auc: 0.928332
[6]	valid_0's l2: 0.187084	valid_0's auc: 0.930558
[7]	valid_0's l2: 0.179158	valid_0's auc: 0.931759
[8]	valid_0's l2: 0.172098	valid_0's auc: 0.93179
[9]	vali

In [70]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94755
gbdt+lr auc 1: 0.72260
gbdt+lr auc 2: 0.75584
GBDT+LR time taken: 48.59
NB auc: 0.81587
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.69890
KNN auc2: 0.60992
Perceptron auc : 0.65405
Perceptron auc2 : 0.50000
Linear SVC auc : 0.58892
Linear SVC auc2 : 0.55888
SGD auc : 0.69478
SGD auc2 : 0.49852
GBDT + other classfier time taken: 54.99
XGB auc : 0.93941
XGB auc with feature transformed: 0.73586
XGB time taken: 1.74
[1]	valid_0's l2: 0.235948	valid_0's auc: 0.909065
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.225072	valid_0's auc: 0.923392
[3]	valid_0's l2: 0.213547	valid_0's auc: 0.926876
[4]	valid_0's l2: 0.203327	valid_0's auc: 0.925996
[5]	valid_0's l2: 0.194702	valid_0's auc: 0.928332
[6]	valid_0's l2: 0.187084	valid_0's auc: 0.930558
[7]	valid_0's l2: 0.179158	valid_0's auc: 0.931759
[8]	valid_0's l2: 0.172098	valid_0's auc: 0.93179
[9]	vali

In [85]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94734
gbdt+lr auc 1: 0.75421
gbdt+lr auc 2: 0.82609
GBDT+LR time taken: 46.33
NB auc: 0.56546
SVC auc: 0.50146
SVC auc2: 0.50073
KNN auc : 0.41379
KNN auc2: 0.60017
Perceptron auc : 0.73253
Perceptron auc2 : 0.50040
Linear SVC auc : 0.65259
Linear SVC auc2 : 0.51576
SGD auc : 0.57253
SGD auc2 : 0.50050
GBDT + other classfier time taken: 54.35
XGB auc : 0.93941
XGB auc with feature transformed: 0.44018
XGB time taken: 2.04
lightGBM auc : 0.93555
lightGBM auc2 : 0.73047
lightGBM time taken:  0.37850499999967724


In [87]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.93250
gbdt+lr auc 1: 0.62916
gbdt+lr auc 2: 0.42805
GBDT+LR time taken: 47.38
NB auc: 0.57196
SVC auc: 0.50000
SVC auc2: 0.50000
KNN auc : 0.46443
KNN auc2: 0.61054
Perceptron auc : 0.43689
Perceptron auc2 : 0.51866
Linear SVC auc : 0.50820
Linear SVC auc2 : 0.51632
SGD auc : 0.44440
SGD auc2 : 0.49403
GBDT + other classfier time taken: 53.71
XGB auc : 0.92600
XGB auc with feature transformed: 0.35298
XGB time taken: 1.77
lightGBM auc : 0.92576
lightGBM auc2 : 0.56333
lightGBM time taken:  0.3965550000002622


In [9]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.93637
gbdt+lr auc 1: 0.39040
gbdt+lr auc 2: 0.34028
GBDT+LR time taken: 45.04
NB auc: 0.54322
SVC auc: 0.50079
SVC auc2: 0.50000
KNN auc : 0.51652
KNN auc2: 0.59861
Perceptron auc : 0.51079
Perceptron auc2 : 0.51617
Linear SVC auc : 0.50114
Linear SVC auc2 : 0.50000
SGD auc : 0.49828
SGD auc2 : 0.49859
GBDT + other classfier time taken: 54.02
XGB auc : 0.92641
XGB auc with feature transformed: 0.40932
XGB time taken: 1.73
lightGBM auc : 0.92603
lightGBM auc with feature transformed : 0.23928
lightGBM time taken: 0.37


In [10]:
gbdt_lr_train_test(example) 

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94298
gbdt+lr auc 1: 0.50997
gbdt+lr auc 2: 0.60627
GBDT+LR time taken: 47.34
NB auc: 0.47256
SVC auc: 0.50073
SVC auc2: 0.50000
KNN auc : 0.38880
KNN auc2: 0.62608
Perceptron auc : 0.57129
Perceptron auc2 : 0.49260
Linear SVC auc : 0.60090
Linear SVC auc2 : 0.47130
SGD auc : 0.50228
SGD auc2 : 0.53551
GBDT + other classfier time taken: 54.45
XGB auc : 0.93026
XGB auc with feature transformed: 0.47466
XGB time taken: 1.73
lightGBM auc : 0.92769
lightGBM auc with feature transformed : 0.65718
lightGBM time taken: 0.36


In [12]:
gbdt_lr_train_test(example) # Lasso applied

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.95106
gbdt+lr auc 1: 0.61310
gbdt+lr auc 2: 0.55277
GBDT+LR time taken: 52.16
GBDT + GNB auc: 0.78652
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.29751
GBDT + KNN auc2: 0.59235
GBDT + Perceptron auc : 0.48839
GBDT + Perceptron auc2 : 0.49191
GBDT + Linear SVC auc : 0.69101
GBDT + Linear SVC auc2 : 0.50000
GBDT + SGD auc : 0.41851
GBDT + SGD auc2 : 0.49983
GBDT + other classfier time taken: 55.11
XGB auc : 0.94305
GBDT + XGB auc: 0.40058
XGB time taken: 1.73
lightGBM auc : 0.94243
GBDT + lightGBM auc : 0.78077
lightGBM time taken: 0.47


In [6]:
gbdt_lr_train_test(example) # Stronger lasso applied C=0.5

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94343
gbdt+lr auc 1: 0.42421
gbdt+lr auc 2: 0.62392
GBDT+LR time taken: 52.81
GBDT + GNB auc: 0.66043
GBDT + SVC auc: 0.50224
GBDT + SVC auc2: 0.50075
GBDT + KNN auc : 0.45883
GBDT + KNN auc2: 0.60745
GBDT + Perceptron auc : 0.46499
GBDT + Perceptron auc2 : 0.49549
GBDT + Linear SVC auc : 0.49307
GBDT + Linear SVC auc2 : 0.47914
GBDT + SGD auc : 0.44157
GBDT + SGD auc2 : 0.48545
GBDT + other classfier time taken: 51.96
XGB auc : 0.93490
GBDT + XGB auc: 0.48161
XGB time taken: 1.77
lightGBM auc : 0.93071
GBDT + lightGBM auc : 0.46530
lightGBM time taken: 0.41


In [10]:
gbdt_lr_train_test(example) # Stronger lasso applied C=0.2

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 40}
gbdt auc: 0.94281
gbdt+lr auc 1: 0.58251
gbdt+lr auc 2: 0.73086
GBDT+LR time taken: 50.61
GBDT + GNB auc: 0.47418
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.70226
GBDT + KNN auc2: 0.58391
GBDT + Perceptron auc : 0.46116
GBDT + Perceptron auc2 : 0.49601
GBDT + Linear SVC auc : 0.60985
GBDT + Linear SVC auc2 : 0.49945
GBDT + SGD auc : 0.58083
GBDT + SGD auc2 : 0.50000
GBDT + other classfier time taken: 44.69
XGB auc : 0.93832
GBDT + XGB auc: 0.55111
XGB time taken: 1.52
lightGBM auc : 0.93366
GBDT + lightGBM auc : 0.68184
lightGBM time taken: 0.42


In [9]:
gbdt_lr_train_test(example) # OHE applied

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94881
gbdt+lr auc 1: 0.73851
gbdt+lr auc 2: 0.76525
GBDT+LR time taken: 55.66
GBDT + GNB auc: 0.76210
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.48565
GBDT + KNN auc2: 0.58401
GBDT + Perceptron auc : 0.73822
GBDT + Perceptron auc2 : 0.50680
GBDT + Linear SVC auc : 0.28860
GBDT + Linear SVC auc2 : 0.54970
GBDT + SGD auc : 0.67403
GBDT + SGD auc2 : 0.55257
GBDT + other classfier time taken: 54.93
XGB auc : 0.93885
GBDT + XGB auc: 0.58941
XGB time taken: 1.83
lightGBM auc : 0.93846
GBDT + lightGBM auc : 0.69843
lightGBM time taken: 0.39


In [12]:
gbdt_lr_train_test(example) # features + transformed features on XGB and lightGBM

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94884
gbdt+lr auc 1: 0.31611
gbdt+lr auc 2: 0.32257
GBDT+LR time taken: 50.32
GBDT + GNB auc: 0.57865
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.45606
GBDT + KNN auc2: 0.57899
GBDT + Perceptron auc : 0.44331
GBDT + Perceptron auc2 : 0.52453
GBDT + Linear SVC auc : 0.38417
GBDT + Linear SVC auc2 : 0.49853
GBDT + SGD auc : 0.47072
GBDT + SGD auc2 : 0.52974
GBDT + other classfier time taken: 57.04
XGB auc : 0.93690
GBDT + XGB auc: 0.61455
GBDT + XGB auc2: 0.68177
XGB time taken: 3.65
lightGBM auc : 0.93619
GBDT + lightGBM auc : 0.39236
GBDT + lightGBM auc : 0.47664
lightGBM time taken: 0.59


In [13]:
gbdt_lr_train_test(example) # features + transformed features on XGB and lightGBM 2

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.93903
gbdt+lr auc 1: 0.76876
gbdt+lr auc 2: 0.59136
GBDT+LR time taken: 47.24
GBDT + GNB auc: 0.75700
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.52746
GBDT + KNN auc2: 0.60053
GBDT + Perceptron auc : 0.52003
GBDT + Perceptron auc2 : 0.55061
GBDT + Linear SVC auc : 0.64655
GBDT + Linear SVC auc2 : 0.50044
GBDT + SGD auc : 0.50436
GBDT + SGD auc2 : 0.54754
GBDT + other classfier time taken: 55.72
XGB auc : 0.92993
GBDT + XGB auc: 0.42822
GBDT + XGB auc2: 0.49972
XGB time taken: 3.48
lightGBM auc : 0.92864
GBDT + lightGBM auc : 0.69141
GBDT + lightGBM auc : 0.64543
lightGBM time taken: 0.54


In [14]:
gbdt_lr_train_test(example) # features + transformed features on XGB and lightGBM 3

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94096
gbdt+lr auc 1: 0.53550
gbdt+lr auc 2: 0.46523
GBDT+LR time taken: 50.19
GBDT + GNB auc: 0.49358
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.65001
GBDT + KNN auc2: 0.58170
GBDT + Perceptron auc : 0.52455
GBDT + Perceptron auc2 : 0.50074
GBDT + Linear SVC auc : 0.35266
GBDT + Linear SVC auc2 : 0.51834
GBDT + SGD auc : 0.48101
GBDT + SGD auc2 : 0.50478
GBDT + other classfier time taken: 57.01
XGB auc : 0.92904
GBDT + XGB auc: 0.53402
GBDT + XGB auc2: 0.60563
XGB time taken: 3.49
lightGBM auc : 0.92666
GBDT + lightGBM auc : 0.72257
GBDT + lightGBM auc : 0.65327
lightGBM time taken: 0.58


In [17]:
gbdt_lr_train_test(example) # features + transformed features on XGB and lightGBM 4

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 40}
gbdt auc: 0.93606
gbdt+lr auc 1: 0.79013
gbdt+lr auc 2: 0.80337
GBDT+LR time taken: 46.44
GBDT + GNB auc: 0.31026
GBDT + SVC auc: 0.50000
GBDT + SVC auc2: 0.50000
GBDT + KNN auc : 0.55358
GBDT + KNN auc2: 0.59317
GBDT + Perceptron auc : 0.70135
GBDT + Perceptron auc2 : 0.50000
GBDT + Linear SVC auc : 0.66977
GBDT + Linear SVC auc2 : 0.50446
GBDT + SGD auc : 0.66243
GBDT + SGD auc2 : 0.50013
GBDT + other classfier time taken: 46.59
XGB auc : 0.92765
GBDT + XGB auc: 0.70807
GBDT + XGB auc2: 0.73399
XGB time taken: 2.99
lightGBM auc : 0.92383
GBDT + lightGBM auc : 0.83976
GBDT + lightGBM auc2 : 0.79301
lightGBM time taken: 0.59


In [20]:
gbdt_lr_train_test(example)  # changed apply func

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94117
gbdt+lr auc 1: 0.92528
gbdt+lr auc 2: 0.92536
GBDT+LR time taken: 50.80
GBDT + GNB auc: 0.85217
GBDT + SVC auc: 0.83621
GBDT + SVC auc2: 0.59101
GBDT + KNN auc : 0.89043
GBDT + KNN auc2: 0.61965
GBDT + Perceptron auc : 0.81829
GBDT + Perceptron auc2 : 0.52439
GBDT + Linear SVC auc : 0.68912
GBDT + Linear SVC auc2 : 0.50179
GBDT + SGD auc : 0.71763
GBDT + SGD auc2 : 0.48776
GBDT + other classfier time taken: 55.47
XGB auc : 0.93507
GBDT + XGB auc: 0.86619
GBDT + XGB auc2: 0.86392
XGB time taken: 3.52
lightGBM auc : 0.93294
GBDT + lightGBM auc : 0.93162
GBDT + lightGBM auc2 : 0.93411
lightGBM time taken: 0.91


In [21]:
gbdt_lr_train_test(example)  # changed apply func 2

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.94292
gbdt+lr auc 1: 0.92293
gbdt+lr auc 2: 0.92309
GBDT+LR time taken: 48.67
GBDT + GNB auc: 0.85037
GBDT + SVC auc: 0.83145
GBDT + SVC auc2: 0.60346
GBDT + KNN auc : 0.89178
GBDT + KNN auc2: 0.60834
GBDT + Perceptron auc : 0.79659
GBDT + Perceptron auc2 : 0.48741
GBDT + Linear SVC auc : 0.81008
GBDT + Linear SVC auc2 : 0.50000
GBDT + SGD auc : 0.82762
GBDT + SGD auc2 : 0.50547
GBDT + other classfier time taken: 56.06
XGB auc : 0.93415
GBDT + XGB auc: 0.85581
GBDT + XGB auc2: 0.85506
XGB time taken: 3.59
lightGBM auc : 0.93698
GBDT + lightGBM auc : 0.93537
GBDT + lightGBM auc2 : 0.93563
lightGBM time taken: 0.91


In [22]:
gbdt_lr_train_test(example)  # changed apply func 3

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.93869
gbdt+lr auc 1: 0.91245
gbdt+lr auc 2: 0.91295
GBDT+LR time taken: 49.16
GBDT + GNB auc: 0.85332
GBDT + SVC auc: 0.78914
GBDT + SVC auc2: 0.60929
GBDT + KNN auc : 0.89101
GBDT + KNN auc2: 0.63190
GBDT + Perceptron auc : 0.66173
GBDT + Perceptron auc2 : 0.50000
GBDT + Linear SVC auc : 0.76114
GBDT + Linear SVC auc2 : 0.54184
GBDT + SGD auc : 0.82460
GBDT + SGD auc2 : 0.54032
GBDT + other classfier time taken: 55.63
XGB auc : 0.93361
GBDT + XGB auc: 0.85799
GBDT + XGB auc2: 0.85924
XGB time taken: 3.42
lightGBM auc : 0.93037
GBDT + lightGBM auc : 0.92904
GBDT + lightGBM auc2 : 0.92766
lightGBM time taken: 0.90


In [4]:
example = pd.read_csv('example2.csv')

In [8]:
gbdt_lr_train_test(example) # trying with other dataset 1

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.96370
gbdt+lr auc 1: 0.92933
gbdt+lr auc 2: 0.91187
GBDT+LR time taken: 86.87
GBDT + GNB auc: 0.82161
GBDT + SVC auc: 0.87826
GBDT + SVC auc2: 0.86028
GBDT + KNN auc : 0.92190
GBDT + KNN auc2: 0.91518
GBDT + Perceptron auc : 0.75335
GBDT + Perceptron auc2 : 0.71778
GBDT + Linear SVC auc : 0.82492
GBDT + Linear SVC auc2 : 0.72753
GBDT + SGD auc : 0.68741
GBDT + SGD auc2 : 0.71778
GBDT + other classfier time taken: 83.65
XGB auc : 0.95227
GBDT + XGB auc: 0.88142
GBDT + XGB auc2: 0.88665
XGB time taken: 4.89
lightGBM auc : 0.94759
GBDT + lightGBM auc : 0.95145
GBDT + lightGBM auc2 : 0.95132
lightGBM time taken: 0.98


In [6]:
gbdt_lr_train_test(example) # trying with other dataset 2

best parameters set found: 
{'max_depth': 7, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.95998
gbdt+lr auc 1: 0.93201
gbdt+lr auc 2: 0.92693
GBDT+LR time taken: 98.81
GBDT + GNB auc: 0.81533
GBDT + SVC auc: 0.89411
GBDT + SVC auc2: 0.88053
GBDT + KNN auc : 0.90855
GBDT + KNN auc2: 0.90549
GBDT + Perceptron auc : 0.83860
GBDT + Perceptron auc2 : 0.54481
GBDT + Linear SVC auc : 0.79475
GBDT + Linear SVC auc2 : 0.76097
GBDT + SGD auc : 0.82281
GBDT + SGD auc2 : 0.30434
GBDT + other classfier time taken: 91.42
XGB auc : 0.95143
GBDT + XGB auc: 0.87517
GBDT + XGB auc2: 0.87759
XGB time taken: 5.17
lightGBM auc : 0.94649
GBDT + lightGBM auc : 0.95125
GBDT + lightGBM auc2 : 0.94941
lightGBM time taken: 1.03


In [7]:
gbdt_lr_train_test(example) # trying with other dataset 3

best parameters set found: 
{'max_depth': 8, 'max_features': 0.5, 'n_estimators': 50}
gbdt auc: 0.95952
gbdt+lr auc 1: 0.92350
gbdt+lr auc 2: 0.92158
GBDT+LR time taken: 84.29
GBDT + GNB auc: 0.81042
GBDT + SVC auc: 0.84675
GBDT + SVC auc2: 0.84359
GBDT + KNN auc : 0.92180
GBDT + KNN auc2: 0.91417
GBDT + Perceptron auc : 0.81990
GBDT + Perceptron auc2 : 0.72683
GBDT + Linear SVC auc : 0.78273
GBDT + Linear SVC auc2 : 0.72034
GBDT + SGD auc : 0.75441
GBDT + SGD auc2 : 0.74351
GBDT + other classfier time taken: 80.03
XGB auc : 0.95045
GBDT + XGB auc: 0.89218
GBDT + XGB auc2: 0.89305
XGB time taken: 4.92
lightGBM auc : 0.94762
GBDT + lightGBM auc : 0.94962
GBDT + lightGBM auc2 : 0.94974
lightGBM time taken: 1.01
