In [35]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_curve, auc, roc_auc_score
import lightgbm as lgb

import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline



In [11]:
# Load Data

file_name = "../data/train_preprocessed2.csv"
train_df = pd.read_csv(file_name, low_memory = False)

train_df.head()

Unnamed: 0,A..papers,A.papers,B.papers,C.papers,Dif.countries,Perc_non_australian,Number.people,PHD,Max.years.univ,Grants.succ,...,SEO.11,SEO.12,SEO.13,SEO.14,SEO.15,SEO.16,SEO.17,SEO.18,SEO.19,Grant.Status
0,4.0,2.0,0.0,0.0,1,0.0,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,6.0,12.0,2.0,2.0,1,1.0,1,1.0,20.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,7.0,20.0,20.0,7.0,2,0.75,4,2.0,50.0,0.0,...,0,0,2,0,0,0,0,0,0,1
3,0.0,3.0,13.0,3.0,1,1.0,2,2.0,15.0,0.0,...,0,0,2,0,0,0,0,0,0,1
4,3.0,0.0,1.0,0.0,1,0.0,1,1.0,10.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
#Setup data : Divide Test and Train set

array = train_df.values

data = array[:, 0:70]
target = array[:, 70]

data, target

seed = 7
test_size = 0.2

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = test_size, random_state = seed)

In [13]:
# set XGB Model -> parameters set default

model = xgb.XGBClassifier()

# make predictions with kfold cross validation score
kfold = KFold(n_splits = 10, random_state = 7)
results = cross_val_score(model, data, target, cv = kfold)
accuracy = results.mean()*100
print("Accuracy : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    

Accuracy : 85.81% (4.06%)


In [None]:
def visualize 

In [21]:
# Making a Model function for bayesian optimization

def XGB_Train_Model_using_KFold(eta, min_child_weight, max_depth, gamma, subsample,  colsample_bytree, n_splits_param) : 
    xgb_params = {
        'n_trees' : 250,
        'eta' : max(eta, 0),
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'objective' : 'reg:linear', 
        'base_score' : np.mean(target),
        'silent' : 1,
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    n_splits = int(max(n_splits_param, 5))
    kfold = KFold(n_splits = n_splits, random_state = 7)
    results = cross_val_score(model, data, target, cv = kfold)
    accuracy = results.mean()*100
    print("Accuracy : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    '''
    model.fit(data_train, target_train)
    target_pred = model.predict(data_test)
    predictions = [round(value) for value in target_pred]
    accuracy = accuracy_score(target_test, predictions)
    
    '''
    return accuracy

In [22]:
xgb_params = {
    
    #Learning Rate 
    'eta' : (0.01, 0.2),
    
    #Minimum sum of weights : to control overfitting
    'min_child_weight' : (1, 20), 
    
    #Maximum depth of a tree : to control overfitting
    'max_depth' : (2, 10),
    
    #minimum loss reduction required to make a split : makes algorithm conservative
    'gamma' : (0, 10), 
    
    #max_delta_step is not needed since data is not imbalanced
    #'max_delta_step' : (0, 10),
    
    #Fraction of observations to be randomly samples for each tree
    #Lower: prevent overfitting
    'subsample' : (0.5, 1),
    
    #Fraction of columns to be randomly samples for each tree
    'colsample_bytree' : (0.1, 1),
    
    #colsamble_bylevel is not needed since subsample and colsample_bytree will do the job
    #'colsample_bylevel' = (0.1, 1),
    
    #L2 regularization term on weights
    #'lambda' = (?, ?)
    
    #L1 regularization term on weight
    #'alpha' = (?, ?)
    
    #scale_pos_weight is not needed since data is not imbalanced
    #'scale_pos_weight' = (0, 10)
    'n_splits_param' : (5, 10)
}


xgb_bayesOPT = BayesianOptimization(XGB_Train_Model_using_KFold, xgb_params)
xgb_bayesOPT.maximize(init_points = 5, n_iter = 25)


[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   n_splits_param |   subsample | 
Accuracy : 84.46% (2.40%)
    1 | 00m01s | [35m  84.46316[0m | [32m            0.1276[0m | [32m   0.0978[0m | [32m   1.6216[0m | [32m     6.5199[0m | [32m           17.0277[0m | [32m          5.7346[0m | [32m     0.6046[0m | 
Accuracy : 81.09% (5.26%)
    2 | 00m04s |   81.08705 |             0.9244 |    0.1938 |    9.2053 |      3.6389 |            16.9497 |           6.8325 |      0.8845 | 
Accuracy : 81.83% (5.34%)
    3 | 00m01s |   81.83375 |             0.2138 |    0.1013 |    8.7100 |      2.3717 |             1.2685 |           6.9365 |      0.5920 | 
Accuracy : 83.64% (4.20%)
    4 | 00m16s |   83.63574 |             0.6920 |    0.1515 |    9.6650

  " state: %s" % convergence_dict)


Accuracy : 83.67% (3.61%)
   20 | 00m12s |   83.66932 |             0.1000 |    0.1192 |    1.9424 |      9.6183 |             8.2421 |           9.0787 |      1.0000 | 
Accuracy : 85.24% (4.27%)
   21 | 00m24s |   85.24398 |             1.0000 |    0.0100 |    3.0837 |      8.7486 |            15.3883 |           8.4770 |      1.0000 | 
Accuracy : 83.89% (4.40%)
   22 | 00m15s |   83.88838 |             0.3934 |    0.0100 |    7.6518 |      4.6760 |             5.2616 |           7.3869 |      0.5000 | 
Accuracy : 85.25% (4.43%)
   23 | 00m23s |   85.25494 |             0.8045 |    0.0100 |    1.6985 |      9.0445 |            16.7597 |           7.7789 |      1.0000 | 


  " state: %s" % convergence_dict)


Accuracy : 85.50% (4.48%)
   24 | 00m31s |   85.49627 |             1.0000 |    0.0100 |    1.3573 |      9.2668 |            12.5456 |          10.0000 |      1.0000 | 
Accuracy : 83.79% (4.96%)
   25 | 00m13s |   83.78611 |             0.6099 |    0.1224 |    5.2340 |      7.0468 |            16.2569 |           5.7088 |      1.0000 | 
Accuracy : 84.77% (4.40%)
   26 | 00m39s |   84.77373 |             0.7462 |    0.0100 |    4.4719 |      9.5183 |             6.6811 |           9.7206 |      0.5000 | 
Accuracy : 84.91% (6.54%)
   27 | 00m18s |   84.91104 |             0.1436 |    0.0100 |    3.2550 |      8.2441 |             7.0053 |          10.0000 |      0.8234 | 
Accuracy : 84.29% (3.59%)
   28 | 00m20s |   84.29031 |             1.0000 |    0.2000 |   10.0000 |      6.5502 |             7.2513 |           7.4149 |      1.0000 | 
Accuracy : 86.40% (4.49%)
   29 | 00m32s | [35m  86.40367[0m | [32m            0.6076[0m | [32m   0.0100[0m | [32m   3.0492[0m | [32m     9.9

In [33]:
#lightGBM

def LGB_Train_Model(learning_rate, max_depth, min_child_weight, colsample_bytree, subsample ) :
    lgb_train = lgb.Dataset(data_train, target_train)
    lgb_eval = lgb.Dataset(data_test, target_test, reference=lgb_train)
    
    # specify your configurations as a dict
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0,
        
        
    
    'learning_rate': max(learning_rate, 0),
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0)
    }

    # train
    gbm = lgb.train(lgb_params,
                    lgb_train,
                    num_boost_round = 20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds = 5,
                    verbose_eval=False)



    y_pred_lgb = gbm.predict(data_test, num_iteration=gbm.best_iteration)
    # eval
    lgb_auc=roc_auc_score(target_test,y_pred_lgb)
    #print('lightGBM auc : %.5f' % lgb_auc)
    return lgb_auc


In [None]:
lgb_params = {
    'learning_rate' : (0.01, 2), 
    'max_depth' : (2, 10), 
    'min_child_weight' : (1, 10), 
    'colsample_bytree' : (0.1, 10), 
    'subsample' : (0.5, 1)
    
}


lgb_bayesOPT = BayesianOptimization(LGB_Train_Model, lgb_params)
lgb_bayesOPT.maximize(init_points = 5, n_iter = 25)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   subsample | 
    1 | 00m00s | [35m   0.94892[0m | [32m            7.1496[0m | [32m         0.9163[0m | [32m     6.9190[0m | [32m            1.0144[0m | [32m     0.5825[0m | 
    2 | 00m00s |    0.94178 |             5.5496 |          1.2430 |      6.9724 |             9.6937 |      0.9394 | 
    3 | 00m00s | [35m   0.96102[0m | [32m            5.5036[0m | [32m         0.1237[0m | [32m     7.6376[0m | [32m            8.7081[0m | [32m     0.6177[0m | 
    4 | 00m00s |    0.94265 |             0.3256 |          1.4206 |      2.6984 |             1.4903 |      0.5406 | 
    5 | 00m00s |    0.94412 |             9.7985 |          0.3609 |      2.0446 |             1.8776 |      0.7085 | 
[31mBayesian Optimization

  " state: %s" % convergence_dict)


   11 | 00m07s |    0.89843 |            10.0000 |          0.0100 |      2.0960 |             9.5775 |      0.5000 | 
   12 | 00m07s |    0.89843 |             4.9837 |          0.0100 |      2.5299 |             1.0080 |      0.9483 | 
   13 | 00m11s |    0.93069 |            10.0000 |          2.0000 |      6.8500 |             5.6449 |      0.5000 | 
   14 | 00m09s |    0.94865 |            10.0000 |          0.0100 |     10.0000 |             1.0000 |      0.5000 | 
   15 | 00m09s |    0.93081 |             0.1000 |          2.0000 |     10.0000 |             1.0000 |      0.5000 | 
   16 | 00m08s |    0.83801 |            10.0000 |          2.0000 |      2.0000 |             1.0000 |      0.5000 | 
   17 | 00m13s |    0.93069 |             0.1000 |          2.0000 |      6.3209 |             5.0912 |      1.0000 | 
   18 | 00m14s |    0.94944 |            10.0000 |          0.0100 |      6.1124 |             3.9197 |      1.0000 | 
   19 | 00m14s |    0.94681 |             0.1000

  " state: %s" % convergence_dict)


   21 | 00m13s |    0.93081 |             6.4136 |          2.0000 |     10.0000 |            10.0000 |      0.5000 | 
   22 | 00m11s |    0.89843 |             7.1253 |          0.0100 |      2.0000 |             5.3529 |      0.5000 | 
   23 | 00m12s |    0.94865 |             0.1000 |          0.0100 |     10.0000 |             5.8810 |      0.5000 | 
   24 | 00m10s |    0.94765 |             0.1000 |          0.0100 |      7.7256 |            10.0000 |      1.0000 | 
   25 | 00m10s |    0.89843 |             0.1000 |          0.0100 |      2.0000 |             1.0000 |      1.0000 | 
   26 | 00m12s |    0.91887 |             0.1000 |          2.0000 |      4.6441 |            10.0000 |      0.5000 | 
   27 | 00m10s |    0.93792 |            10.0000 |          0.0100 |      4.8637 |             1.0000 |      0.5000 | 
   28 | 00m10s |    0.94865 |             8.5471 |          0.0100 |     10.0000 |             4.7186 |      0.5000 | 
   29 | 00m10s |    0.94865 |             2.3472