In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from scipy.sparse import hstack
import time, os, random, sys
import math
import hyperopt.tpe
import hpsklearn.components
import hpsklearn.demo_support
import warnings
warnings.filterwarnings('ignore')
random.seed(1)



WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [3]:
def lightgbm_lr(File):

    start = time.clock()
    train_df, test_df = train_test_split(File, train_size = 0.8)
    X_train = train_df.drop(train_df.columns[0], axis=1)
    y_train = train_df[train_df.columns[0]]
    X_test = test_df.drop(test_df.columns[0], axis=1)
    y_test = test_df[test_df.columns[0]]
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # specify your configurations as a dict
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'num_leaves': 63,
        'num_trees': 30,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # number of leaves,will be used in feature transformation
    num_leaf = 63


    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=lgb_train,
                   verbose_eval=False)


    y_pred = gbm.predict(X_train,pred_leaf=True)

    X_train_leaves = np.zeros([len(y_pred),len(y_pred[0]) * num_leaf],dtype=np.int64)
    for i in range(0,len(y_pred)):
        temp = np.arange(len(y_pred[0])) * num_leaf - 1 + np.array(y_pred[i])
        X_train_leaves[i][temp] += 1


    y_pred = gbm.predict(X_test,pred_leaf=True)

    X_test_leaves = np.zeros([len(y_pred),len(y_pred[0]) * num_leaf],dtype=np.int64)
    for i in range(0,len(y_pred)):
        temp = np.arange(len(y_pred[0])) * num_leaf - 1 + np.array(y_pred[i])
        X_test_leaves[i][temp] += 1


    lm = LogisticRegression(penalty='l2',C=0.1) # logestic model construction
    lm.fit(X_train_leaves,y_train)  # fitting the data

    y_pred_est = lm.predict_proba(X_test_leaves)   # Give the probabilty on each label

    gbdtlr_auc1 = roc_auc_score(y_test, y_pred_est[:,1])
    print('lightGBDT+LR auc 1: %.5f' % gbdtlr_auc1)
    
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_train_leaves, X_train])
    lr.fit(X_train_ext, y_train)
    X_test_ext = hstack([X_test_leaves, X_test])
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdtlr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('lightGBDT+LR auc 2: %.5f' % gbdtlr_auc2)
    f_time =time.clock()-start
    print('lightGBDT+LR time taken: %.2f'% f_time)
    
    #+NB

    
    start = time.clock()
    gnb= GaussianNB()

    gnb.fit(X_train_leaves, y_train)
    Y_pred_nb=gnb.predict_proba(X_test_leaves)[:,1]
    gnb_auc = roc_auc_score(y_test,Y_pred_nb)
    print('GBDT + GNB auc: %.5f'% gnb_auc)    
    '''    
    gnb.fit(X_train_ext, y_train)
    y_pred_gnb2=gnb.predict_proba(X_test_ext)[:,1]
    gnb_auc2=roc_auc_score(y_test,y_pred_gnb2)
    print('GNB auc2: %.5f' % gnb_auc2)
    '''
    #svc
    svc=SVC(probability=True)
    svc.fit(X_train_leaves, y_train)
    Y_pred_svc=svc.predict_proba(X_test_leaves)[:,1]
    svc_auc=roc_auc_score(y_test,Y_pred_svc)
    print('GBDT + SVC auc: %.5f' % svc_auc)
    
    svc.fit(X_train_ext, y_train)
    y_pred_svc2=svc.predict_proba(X_test_ext)[:, 1]
    svc_auc2=roc_auc_score(y_test,y_pred_svc2)
    print('GBDT + SVC auc2: %.5f' % svc_auc2)
    
    #KNN
    knn=KNeighborsClassifier(n_neighbors = 3)
    knn.fit(X_train_leaves, y_train)
    Y_pred_knn=knn.predict_proba(X_test_leaves)[:,1]
    knn_auc=roc_auc_score(y_test,Y_pred_knn)
    print('GBDT + KNN auc : %.5f' % knn_auc)
    
    knn.fit(X_train_ext, y_train)
    y_pred_knn2=knn.predict_proba(X_test_ext)[:, 1]
    knn_auc2=roc_auc_score(y_test,y_pred_knn2)
    print('GBDT + KNN auc2: %.5f' % knn_auc2)
    
    #perceptron

    perceptron = Perceptron()
    perceptron.fit(X_train_leaves, y_train)
    y_pred_perc=perceptron.predict(X_test_leaves)
    perc_auc=roc_auc_score(y_test,y_pred_perc)
    print('GBDT + Perceptron auc : %.5f' % perc_auc)
    
    perceptron.fit(X_train_ext, y_train)
    y_pred_perc2=perceptron.predict(X_test_ext)
    perc_auc2=roc_auc_score(y_test,y_pred_perc2)
    print('GBDT + Perceptron auc2 : %.5f' % perc_auc2 )
    
    #linear svc

    lin = LinearSVC()
    lin.fit(X_train_leaves, y_train)
    y_pred_lin=lin.predict(X_test_leaves)
    lin_auc=roc_auc_score(y_test,y_pred_lin)
    print('GBDT + Linear SVC auc : %.5f' % lin_auc)
    
    lin.fit(X_train_ext, y_train)
    y_pred_lin2=lin.predict(X_test_ext)
    lin_auc2=roc_auc_score(y_test,y_pred_lin2)
    print('GBDT + Linear SVC auc2 : %.5f' % lin_auc2)
    
    #SGD

    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train_leaves, y_train)
    Y_pred_sgd=sgd.predict_proba(X_test_leaves)[:, 1]
    sgd_auc=roc_auc_score(y_test,Y_pred_sgd)
    print('GBDT + SGD auc : %.5f' % sgd_auc)
    
    sgd.fit(X_train_ext, y_train)
    Y_pred_sgd2=sgd.predict_proba(X_test_ext)[:, 1]
    sgd_auc2=roc_auc_score(y_test,Y_pred_sgd2)
    print('GBDT + SGD auc2 : %.5f' % sgd_auc2)
    f_time =time.clock()-start
    print('GBDT + other classfier time taken: %.2f'% f_time)    
    
    #XGB
    start = time.clock()
    xgb=XGBClassifier()
    xgb.fit(X_train,y_train)
    Y_pred_xgb=xgb.predict_proba(X_test)[:,1]
    xgb_auc= roc_auc_score(y_test,Y_pred_xgb)
    print('XGB auc : %.5f' % xgb_auc)
    
    #XGB with leaves

    xgb.fit(X_train_leaves,y_train)
    Y_pred_xgb=xgb.predict(X_test_leaves)
    xgb_auc= roc_auc_score(y_test,Y_pred_xgb)
    print('GBDT + XGB auc: %.5f' % xgb_auc)
    
    #XGB with features ext
    
    xgb.fit(X_train_ext, y_train)
    y_pred_xgb2=xgb.predict(X_test_ext)
    xgb_auc2=roc_auc_score(y_test,y_pred_xgb2)
    print('GBDT + XGB auc2: %.5f' %xgb_auc2)
    
    f_time =time.clock()-start
    print('XGB time taken: %.2f'% f_time)
    
    #lightGBM
    start = time.clock()
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    
    
    # specify your configurations as a dict
    params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
    
    }

    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5,
                    verbose_eval=False)



    y_pred_lgb = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # eval
    lgb_auc=roc_auc_score(y_test,y_pred_lgb)
    print('lightGBM auc : %.5f' % lgb_auc)
    
    lgb_train=lgb.Dataset(X_train_leaves, y_train)
    lgb_eval=lgb.Dataset(X_test_leaves, y_test, reference=lgb_train)
    gbm = lgb.train(params,
                  lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5,
                   verbose_eval=False)
    y_pred_lgb2 =gbm.predict(X_test_leaves, num_iteration=gbm.best_iteration)
    lgb_auc2=roc_auc_score(y_test, y_pred_lgb2)
    
    print('GBDT + lightGBM auc : %.5f' % lgb_auc2)
    
    lgb_train=lgb.Dataset(X_train_ext, y_train)
    lgb_eval=lgb.Dataset(X_test_ext, y_test, reference=lgb_train)
    gbm = lgb.train(params,
                  lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5,
                   verbose_eval=False)
    y_pred_lgb3 =gbm.predict(X_test_ext, num_iteration=gbm.best_iteration)
    lgb_auc3=roc_auc_score(y_test, y_pred_lgb3)
    
    print('GBDT + lightGBM auc2 : %.5f' % lgb_auc3)
    
    
    f_time=time.clock()-start
    print('lightGBM time taken: %.2f'% f_time)

In [7]:
example = pd.read_csv('example.csv')

In [18]:
lightgbm_lr(example)

gbdt+lr auc 1: 0.93285
gbdt+lr auc 2: 0.66409
GBDT+LR time taken: 1.46
GBDT + GNB auc: 0.87764
GBDT + SVC auc: 0.93378
GBDT + SVC auc2: 0.65948
GBDT + KNN auc : 0.90291
GBDT + KNN auc2: 0.61911
GBDT + Perceptron auc : 0.82928
GBDT + Perceptron auc2 : 0.50307
GBDT + Linear SVC auc : 0.83684
GBDT + Linear SVC auc2 : 0.50000
GBDT + SGD auc : 0.90660
GBDT + SGD auc2 : 0.52948
GBDT + other classfier time taken: 272.53
XGB auc : 0.93723
GBDT + XGB auc: 0.86437
GBDT + XGB auc2: 0.86131
XGB time taken: 35.44
lightGBM auc : 0.93477
GBDT + lightGBM auc : 0.92693
GBDT + lightGBM auc2 : 0.92838
lightGBM time taken: 1.29


In [20]:
lightgbm_lr(example)

lightGBDT+LR auc 1: 0.93531
lightGBDT+LR auc 2: 0.57736
lightGBDT+LR time taken: 1.23
GBDT + GNB auc: 0.87557
GBDT + SVC auc: 0.93586
GBDT + SVC auc2: 0.63296
GBDT + KNN auc : 0.89698
GBDT + KNN auc2: 0.59092
GBDT + Perceptron auc : 0.81886
GBDT + Perceptron auc2 : 0.54591
GBDT + Linear SVC auc : 0.82871
GBDT + Linear SVC auc2 : 0.46588
GBDT + SGD auc : 0.88501
GBDT + SGD auc2 : 0.48370
GBDT + other classfier time taken: 273.51
XGB auc : 0.93914
GBDT + XGB auc: 0.86303
GBDT + XGB auc2: 0.86624
XGB time taken: 35.62
lightGBM auc : 0.93678
GBDT + lightGBM auc : 0.93421
GBDT + lightGBM auc2 : 0.93481
lightGBM time taken: 1.36


In [21]:
lightgbm_lr(example)

lightGBDT+LR auc 1: 0.93708
lightGBDT+LR auc 2: 0.55838
lightGBDT+LR time taken: 1.21
GBDT + GNB auc: 0.87613
GBDT + SVC auc: 0.93309
GBDT + SVC auc2: 0.64572
GBDT + KNN auc : 0.89276
GBDT + KNN auc2: 0.60250
GBDT + Perceptron auc : 0.81466
GBDT + Perceptron auc2 : 0.50000
GBDT + Linear SVC auc : 0.83640
GBDT + Linear SVC auc2 : 0.50000
GBDT + SGD auc : 0.90296
GBDT + SGD auc2 : 0.49807
GBDT + other classfier time taken: 272.25
XGB auc : 0.93885
GBDT + XGB auc: 0.86197
GBDT + XGB auc2: 0.86210
XGB time taken: 35.98
lightGBM auc : 0.92947
GBDT + lightGBM auc : 0.93193
GBDT + lightGBM auc2 : 0.92949
lightGBM time taken: 1.39


In [None]:
example=pd.read_csv('example2.csv')

In [56]:
lightgbm_lr(example) # trying with other dataset 1

lightGBDT+LR auc 1: 0.95335
lightGBDT+LR auc 2: 0.62290
lightGBDT+LR time taken: 1.29
GBDT + GNB auc: 0.88576
GBDT + SVC auc: 0.94700
GBDT + SVC auc2: 0.90184
GBDT + KNN auc : 0.92145
GBDT + KNN auc2: 0.85685
GBDT + Perceptron auc : 0.86075
GBDT + Perceptron auc2 : 0.33242
GBDT + Linear SVC auc : 0.86186
GBDT + Linear SVC auc2 : 0.56348
GBDT + SGD auc : 0.92780
GBDT + SGD auc2 : 0.71571
GBDT + other classfier time taken: 344.54
XGB auc : 0.94792
GBDT + XGB auc: 0.88256
GBDT + XGB auc2: 0.87486
XGB time taken: 42.65
lightGBM auc : 0.94536
GBDT + lightGBM auc : 0.94502
GBDT + lightGBM auc2 : 0.94892
lightGBM time taken: 1.36


In [None]:
lightgbm_lr(example)# trying with other dataset2

lightGBDT+LR auc 1: 0.96087
lightGBDT+LR auc 2: 0.61448
lightGBDT+LR time taken: 1.05
GBDT + GNB auc: 0.89922
GBDT + SVC auc: 0.95530
GBDT + SVC auc2: 0.91675
GBDT + KNN auc : 0.93072
GBDT + KNN auc2: 0.88133
GBDT + Perceptron auc : 0.77934
GBDT + Perceptron auc2 : 0.50323
GBDT + Linear SVC auc : 0.87976
GBDT + Linear SVC auc2 : 0.55235
GBDT + SGD auc : 0.92308
GBDT + SGD auc2 : 0.49132
GBDT + other classfier time taken: 343.14
XGB auc : 0.95947
GBDT + XGB auc: 0.88968
GBDT + XGB auc2: 0.88518
XGB time taken: 44.83
lightGBM auc : 0.95718
GBDT + lightGBM auc : 0.95651
GBDT + lightGBM auc2 : 0.95730
lightGBM time taken: 1.47


In [None]:
lightgbm_lr(example)# trying with other dataset3

lightGBDT+LR auc 1: 0.95171
lightGBDT+LR auc 2: 0.61480
lightGBDT+LR time taken: 1.14
GBDT + GNB auc: 0.87419
GBDT + SVC auc: 0.94519
GBDT + SVC auc2: 0.91042
GBDT + KNN auc : 0.91728
GBDT + KNN auc2: 0.87366
GBDT + Perceptron auc : 0.84676
GBDT + Perceptron auc2 : 0.69394
GBDT + Linear SVC auc : 0.86568
GBDT + Linear SVC auc2 : 0.77678
GBDT + SGD auc : 0.92247
GBDT + SGD auc2 : 0.45208
GBDT + other classfier time taken: 340.66
XGB auc : 0.94640
