In [45]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_curve, auc, roc_auc_score
import lightgbm as lgb
import gc
from sklearn.grid_search import GridSearchCV

import warnings
warnings.filterwarnings('ignore')



In [46]:
# Load Data

file_name = "../data/train_preprocessed2.csv"
train_df = pd.read_csv(file_name, low_memory = False)

train_df.head()

Unnamed: 0,A..papers,A.papers,B.papers,C.papers,Dif.countries,Perc_non_australian,Number.people,PHD,Max.years.univ,Grants.succ,...,SEO.11,SEO.12,SEO.13,SEO.14,SEO.15,SEO.16,SEO.17,SEO.18,SEO.19,Grant.Status
0,4.0,2.0,0.0,0.0,1,0.0,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,6.0,12.0,2.0,2.0,1,1.0,1,1.0,20.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,7.0,20.0,20.0,7.0,2,0.75,4,2.0,50.0,0.0,...,0,0,2,0,0,0,0,0,0,1
3,0.0,3.0,13.0,3.0,1,1.0,2,2.0,15.0,0.0,...,0,0,2,0,0,0,0,0,0,1
4,3.0,0.0,1.0,0.0,1,0.0,1,1.0,10.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [47]:
#Setup data : Divide Test and Train set

array = train_df.values

data = array[:, 0:70]
target = array[:, 70]

data, target

seed = 7
test_size = 0.2

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = test_size, random_state = seed)



In [48]:
# XGB (parameter default) Result 

model = xgb.XGBClassifier(eval_metric = 'auc')

# make predictions with kfold cross validation score
kfold = KFold(n_splits = 5, random_state = 7)
results = cross_val_score(model, data, target, cv = kfold)
accuracy = results.mean()*100
print("Accuracy : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    

Accuracy : 84.78% (2.85%)


In [51]:
# XGB model Function

def XGB_Train_Model_using_KFold(min_child_weight, max_depth, gamma, subsample,  colsample_bytree) : 
    xgb_params = {
        'n_trees' : 20,
        'eta' : 0.2,
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'objective' : 'reg:linear', 
        'eval_metric' : 'auc',
        'silent' : 1,
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7)
    results = cross_val_score(model, data, target, cv = kfold)
    auc = results.mean()*100
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    return auc

In [53]:
#XGB (parameters tuned by Bayesian Optimization) result

xgb_params = {
    
    #Learning Rate 
    #'eta' : (0.01, 0.2),
    
    #Minimum sum of weights : to control overfitting
    'min_child_weight' : (1, 20), 
    
    #Maximum depth of a tree : to control overfitting
    'max_depth' : (2, 10),
    
    #minimum loss reduction required to make a split : makes algorithm conservative
    'gamma' : (0, 10), 
    
    #max_delta_step is not needed since data is not imbalanced
    #'max_delta_step' : (0, 10),
    
    #Fraction of observations to be randomly samples for each tree
    #Lower: prevent overfitting
    'subsample' : (0.5, 1),
    
    #Fraction of columns to be randomly samples for each tree
    'colsample_bytree' : (0.1, 1),
    
    #colsamble_bylevel is not needed since subsample and colsample_bytree will do the job
    #'colsample_bylevel' = (0.1, 1),
    
    #L2 regularization term on weights
    #'lambda' = (?, ?)
    
    #L1 regularization term on weight
    #'alpha' = (?, ?)
    
    #scale_pos_weight is not needed since data is not imbalanced
    #'scale_pos_weight' = (0, 10)
    #'n_splits_param' : (5, 10)
}


xgb_bayesOPT = BayesianOptimization(XGB_Train_Model_using_KFold, xgb_params)
xgb_bayesOPT.maximize(init_points = 5, n_iter = 100)


[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
AUC : 82.14% (5.83%)
    1 | 00m05s | [35m  82.14403[0m | [32m            0.6237[0m | [32m   7.4242[0m | [32m     5.0288[0m | [32m           12.6237[0m | [32m     0.6509[0m | 
AUC : 81.37% (5.61%)
    2 | 00m03s |   81.37448 |             0.9829 |    8.4029 |      3.0777 |            18.6693 |      0.8736 | 
AUC : 82.71% (5.84%)
    3 | 00m06s | [35m  82.70673[0m | [32m            0.7951[0m | [32m   6.1211[0m | [32m     6.5510[0m | [32m           12.9404[0m | [32m     0.8849[0m | 
AUC : 84.62% (3.79%)
    4 | 00m09s | [35m  84.62414[0m | [32m            0.8043[0m | [32m   2.0873[0m | [32m     7.9165[0m | [32m            3.1102[0m | [32m     0.5233[0m | 
AUC : 82.67% (5.38%)
    5 | 00m02s |   82.6

KeyboardInterrupt: 

In [9]:
#lightGBM (parameter default) result

lgb_train = lgb.Dataset(data_train, target_train)
lgb_eval = lgb.Dataset(data_test, target_test, reference=lgb_train)
lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0
}
cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=20, nfold=5, 
                verbose_eval=20, early_stopping_rounds=5)
cv_results['auc-mean'][-1]

[20]	cv_agg's auc: 0.94954 + 0.0060144	cv_agg's l2: 0.0937773 + 0.00539591


0.9495402224421168

In [10]:
#LGB model function (eval metric = auc, using kfold)

def LGB_Train_Model(learning_rate, max_depth, min_child_weight, colsample_bytree, subsample ) :
    lgb_train = lgb.Dataset(data_train, target_train)
    lgb_eval = lgb.Dataset(data_test, target_test, reference=lgb_train)
    
    # specify your configurations as a dict
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0,
        
        
    
    'learning_rate': max(learning_rate, 0),
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0)
    }

    cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=20, nfold=5, 
                    verbose_eval=20, early_stopping_rounds=5)

    return cv_results['auc-mean'][-1]
    

In [17]:
#lightGBM (parameters tuned with Bayesian Optimization) Result

lgb_params = {
    'learning_rate' : (0.01, 2), 
    'max_depth' : (2, 10), 
    'min_child_weight' : (1, 10), 
    'colsample_bytree' : (0.1, 10), 
    'subsample' : (0.5, 1)
    
}


lgb_bayesOPT = BayesianOptimization(LGB_Train_Model, lgb_params)
lgb_bayesOPT.maximize(init_points = 5, n_iter = 25)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   subsample | 
[20]	cv_agg's auc: 0.934641 + 0.00900282	cv_agg's l2: 0.110407 + 0.00650444
    1 | 00m00s | [35m   0.93464[0m | [32m            2.1226[0m | [32m         0.7158[0m | [32m     2.7484[0m | [32m            7.0980[0m | [32m     0.7328[0m | 
    2 | 00m00s | [35m   0.94739[0m | [32m            2.5823[0m | [32m         0.5638[0m | [32m     8.7595[0m | [32m            9.5459[0m | [32m     0.8920[0m | 
[20]	cv_agg's auc: 0.943509 + 0.00615786	cv_agg's l2: 0.098954 + 0.00503382
    3 | 00m00s |    0.94351 |             8.7015 |          0.3612 |      3.6977 |             4.5547 |      0.5885 | 
    4 | 00m00s |    0.92216 |             4.9546 |          1.7841 |      8.4787 |             8.2556 |      0.85

In [60]:
import time

xgb_clf = xgb.XGBClassifier(eval_metric = 'auc', n_trees = 20)

xgb_params = {
    'learning_rate' : [0.2],
    'min_child_weight' : np.arange(1, 20, 4),      # 5
    'max_depth' : np.arange(2, 10, 2),             # 4 
    'gamma' : np.arange(0, 10, 2),                 # 5
    #'subsample' : np.arange(0.5, 1.0, 0.1),        # 5
    #'colsample_bytree' : np.arange(0.1, 1.0, 0.2), # 5
    'objective' : ['reg:linear'],
    'silent' : [1],
}

GSCV = GridSearchCV(xgb_clf, xgb_params, cv = 5, scoring = 'accuracy', n_jobs = 1, verbose = 1)

start_time = time.time()
GSCV.fit(data, target)
elapsed_time = time.time() - start_time
print("%s seconds elpased."%elapsed_time)




Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  9.4min finished


566.7061882019043 seconds elpased.


In [61]:
best_parameters, score, _ = max(GSCV.grid_scores_, key=lambda x: x[1])
print('AUC score:', score)
print('best parameters:', best_parameters)

AUC score: 0.8352090032154341
best parameters: {'gamma': 6, 'learning_rate': 0.2, 'max_depth': 6, 'min_child_weight': 17, 'objective': 'reg:linear', 'silent': 1}
