In [7]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_curve, auc, roc_auc_score
import lightgbm as lgb
import gc
from sklearn.grid_search import GridSearchCV
import time
import warnings
warnings.filterwarnings('ignore')



In [8]:
# Load Data

file_name = "../data/train_preprocessed2.csv"
train_df = pd.read_csv(file_name, low_memory = False)

train_df.head()

Unnamed: 0,A..papers,A.papers,B.papers,C.papers,Dif.countries,Perc_non_australian,Number.people,PHD,Max.years.univ,Grants.succ,...,SEO.11,SEO.12,SEO.13,SEO.14,SEO.15,SEO.16,SEO.17,SEO.18,SEO.19,Grant.Status
0,4.0,2.0,0.0,0.0,1,0.0,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,6.0,12.0,2.0,2.0,1,1.0,1,1.0,20.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,7.0,20.0,20.0,7.0,2,0.75,4,2.0,50.0,0.0,...,0,0,2,0,0,0,0,0,0,1
3,0.0,3.0,13.0,3.0,1,1.0,2,2.0,15.0,0.0,...,0,0,2,0,0,0,0,0,0,1
4,3.0,0.0,1.0,0.0,1,0.0,1,1.0,10.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
#Setup data : Divide Test and Train set

array = train_df.values

data = array[:, 0:70]
target = array[:, 70]

data, target

seed = 7
test_size = 0.2

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = test_size, random_state = seed)



In [10]:
# XGB (parameter default) Result 

model = xgb.XGBClassifier(eval_metric = 'auc')

# make predictions with kfold cross validation score
kfold = KFold(n_splits = 5, random_state = 7)
results = cross_val_score(model, data, target, cv = kfold)
accuracy = results.mean()*100
print("Accuracy : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    

Accuracy : 84.78% (2.85%)


In [11]:
# XGB model Function

def XGB_Train_Model_using_KFold(min_child_weight, max_depth, gamma, subsample,  colsample_bytree) : 
    xgb_params = {
        'n_trees' : 20,
        'eta' : 0.2,
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'objective' : 'reg:linear', 
        'eval_metric' : 'auc',
        'silent' : 1,
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7)
    results = cross_val_score(model, data, target, cv = kfold)
    auc = results.mean()*100
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    return auc

In [12]:
#XGB (parameters tuned by Bayesian Optimization) result

xgb_params = {
    
    #Learning Rate 
    #'eta' : (0.01, 0.2),
    
    #Minimum sum of weights : to control overfitting
    'min_child_weight' : (1, 20), 
    
    #Maximum depth of a tree : to control overfitting
    'max_depth' : (2, 10),
    
    #minimum loss reduction required to make a split : makes algorithm conservative
    'gamma' : (0, 10), 
    
    #max_delta_step is not needed since data is not imbalanced
    #'max_delta_step' : (0, 10),
    
    #Fraction of observations to be randomly samples for each tree
    #Lower: prevent overfitting
    'subsample' : (0.5, 1),
    
    #Fraction of columns to be randomly samples for each tree
    'colsample_bytree' : (0.1, 1),
    
    #colsamble_bylevel is not needed since subsample and colsample_bytree will do the job
    #'colsample_bylevel' = (0.1, 1),
    
    #L2 regularization term on weights
    #'lambda' = (?, ?)
    
    #L1 regularization term on weight
    #'alpha' = (?, ?)
    
    #scale_pos_weight is not needed since data is not imbalanced
    #'scale_pos_weight' = (0, 10)
    #'n_splits_param' : (5, 10)
}


xgb_bayesOPT = BayesianOptimization(XGB_Train_Model_using_KFold, xgb_params)
start_time = time.time()
xgb_bayesOPT.maximize(init_points = 5, n_iter = 100)
elapsed_time = time.time() - start_time
print("elapsed time : %s"%elapsed_time)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
AUC : 81.88% (6.08%)
    1 | 00m04s | [35m  81.87992[0m | [32m            0.8075[0m | [32m   5.7409[0m | [32m     3.0852[0m | [32m           12.8399[0m | [32m     0.6040[0m | 
AUC : 82.33% (4.04%)
    2 | 00m04s | [35m  82.32733[0m | [32m            0.3411[0m | [32m   7.0890[0m | [32m     7.0304[0m | [32m            1.1893[0m | [32m     0.5380[0m | 
AUC : 84.92% (3.46%)
    3 | 00m07s | [35m  84.92274[0m | [32m            0.7121[0m | [32m   2.4828[0m | [32m     6.4921[0m | [32m           18.5506[0m | [32m     0.6307[0m | 
AUC : 83.26% (5.04%)
    4 | 00m03s |   83.25772 |             0.6164 |    0.4013 |      3.5158 |            19.2232 |      0.6045 | 
AUC : 84.83% (3.22%)
    5 | 00m08s |   84.8

AUC : 85.99% (3.18%)
   56 | 00m12s | [35m  85.99055[0m | [32m            0.3891[0m | [32m   2.0295[0m | [32m     6.2355[0m | [32m           19.7135[0m | [32m     1.0000[0m | 
AUC : 85.89% (2.97%)
   57 | 00m12s |   85.88720 |             0.3657 |    2.4536 |      6.2515 |            19.6551 |      1.0000 | 
AUC : 85.66% (2.78%)
   58 | 00m12s |   85.65752 |             0.3366 |    2.0257 |      6.1005 |            19.8501 |      1.0000 | 
AUC : 85.32% (3.56%)
   59 | 00m13s |   85.32457 |             0.5612 |    2.2080 |      6.3127 |            19.4655 |      1.0000 | 
AUC : 83.70% (0.89%)
   60 | 00m10s |   83.70480 |             0.1000 |    2.3613 |      6.3412 |            19.5207 |      1.0000 | 
AUC : 83.69% (5.16%)
   61 | 00m14s |   83.69418 |             0.6753 |    2.9041 |      6.6736 |            19.9540 |      0.6002 | 
AUC : 82.53% (5.95%)
   62 | 00m13s |   82.53452 |             0.5533 |    6.1109 |      6.7107 |             5.2662 |      0.7489 | 
AUC : 85

In [13]:
#lightGBM (parameter default) result

lgb_train = lgb.Dataset(data, target)
lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0
}
cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=20, nfold=5, 
                verbose_eval=20, early_stopping_rounds=5)
cv_results['auc-mean'][-1]

[20]	cv_agg's auc: 0.952692 + 0.00370755	cv_agg's l2: 0.0911132 + 0.00317306


0.9526915888168916

In [26]:
#LGB model function (eval metric = auc, using kfold)

def LGB_Train_Model(gamma, max_depth, min_child_weight, colsample_bytree, subsample, n_boost_roundP, num_iterations ) :
    lgb_train = lgb.Dataset(data, target)
    
    # specify your configurations as a dict
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0,
    'learning_rate': 0.2,
        
    'num_iterations' : int(num_iterations),
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0),
    'gamma' : max(gamma, 0), 
    }

    cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=int(n_boost_roundP), nfold=5, early_stopping_rounds=5)
    return cv_results['auc-mean'][-1]
    

In [36]:
#lightGBM (parameters tuned with Bayesian Optimization) Result

lgb_params = {
    'max_depth' : (2, 10), 
    'min_child_weight' : (1, 10), 
    'colsample_bytree' : (0.1, 10), 
    'subsample' : (0.5, 1),
    'gamma' : (0, 10),
    'num_iterations' : (50, 200),
    'n_boost_roundP': (20, 100)
}


lgb_bayesOPT = BayesianOptimization(LGB_Train_Model, lgb_params)
start_time = time.time()
lgb_bayesOPT.maximize(init_points = 5, n_iter = 25)
elapsed_time = time.time() - start_time
print("elpased time is %s"%elapsed_time)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   n_boost_roundP |   num_iterations |   subsample | 
    1 | 00m01s | [35m   0.94898[0m | [32m            9.3173[0m | [32m   7.9754[0m | [32m     2.8515[0m | [32m            1.6913[0m | [32m         24.6957[0m | [32m        150.1166[0m | [32m     0.6204[0m | 
    2 | 00m00s | [35m   0.95885[0m | [32m            1.4491[0m | [32m   2.0217[0m | [32m     7.7431[0m | [32m            6.2489[0m | [32m         36.8139[0m | [32m        132.1710[0m | [32m     0.9812[0m | 
    3 | 00m00s |    0.95729 |             5.6036 |    6.9760 |      5.7568 |             9.0937 |          80.5377 |         110.2887 |      0.8057 | 
    4 | 00m01s |    0.94770 |             3.6660 |    4.9478 |      2

In [37]:


import time

xgb_clf = xgb.XGBClassifier(eval_metric = 'auc', n_trees = 20)

xgb_params = {
    'learning_rate' : [0.2],
    'min_child_weight' : np.arange(1, 20, 5),      # 4
    'max_depth' : np.arange(2, 10, 2),             # 4 
    'gamma' : np.arange(0, 10, 2.5),                 # 4
    'subsample' : np.arange(0.5, 1.0, 0.25),        # 2
    'colsample_bytree' : np.arange(0.1, 1.0, 0.3), # 3
    'objective' : ['reg:linear'],
    'silent' : [1],
}

GSCV = GridSearchCV(xgb_clf, xgb_params, cv = 5, scoring = 'accuracy', n_jobs = 1, verbose = 1)

start_time = time.time()
GSCV.fit(data, target)
elapsed_time = time.time() - start_time
print("%s seconds elpased."%elapsed_time)




Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=1)]: Done 1920 out of 1920 | elapsed: 23.9min finished


1438.6167149543762 seconds elpased.


In [38]:
best_parameters, score, _ = max(GSCV.grid_scores_, key=lambda x: x[1])
print('AUC score:', score)
print('best parameters:', best_parameters)

AUC score: 0.8352090032154341
best parameters: {'colsample_bytree': 0.7000000000000001, 'gamma': 5.0, 'learning_rate': 0.2, 'max_depth': 8, 'min_child_weight': 16, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.75}


In [40]:

file_name1 = "../data/train_preprocessed1.csv"
file_name2 = "../data/train_preprocessed2.csv"

train_df1 = pd.read_csv(file_name1, low_memory = False)
train_df2 = pd.read_csv(file_name2, low_memory = False)

data_df1 = train_df1.drop(['Grant.Status'], axis = 1)
target_df1 = train_df1['Grant.Status']

data_df2 = train_df2.drop(['Grant.Status'], axis = 1)
target_df2 = train_df2['Grant.Status']

data1 = data_df1.values
target1 = target_df1.values
data2 = data_df2.values
target2 = target_df2.values


In [46]:

def preprocessing_XGB_KFold(preprocessing, min_child_weight, max_depth, gamma, subsample,  colsample_bytree) : 
    xgb_params = {
        'n_trees' : 20,
        'eta' : 0.2,
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'objective' : 'reg:linear', 
        'eval_metric' : 'auc',
        'silent' : 1,
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    preprocessing = round(preprocessing)
    if preprocessing == 1 : 
        data = data1
        target = target1
    else : 
        data = data2
        target = target2
    model = xgb.XGBClassifier(**xgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7)
    results = cross_val_score(model, data, target, cv = kfold)
    auc = results.mean()*100
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    return auc

In [47]:
#XGB (parameters tuned by Bayesian Optimization) result

xgb_params = {
    'preprocessing' : (1, 2),
    'min_child_weight' : (1, 20), 
    'max_depth' : (2, 10),
    'gamma' : (0, 10),
    'subsample' : (0.5, 1),
    'colsample_bytree' : (0.1, 1)
}


xgb_bayesOPT = BayesianOptimization(preprocessing_XGB_KFold, xgb_params)
start_time = time.time()
xgb_bayesOPT.maximize(init_points = 5, n_iter = 100)
elapsed_time = time.time() - start_time
print("elapsed time : %s"%elapsed_time)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   preprocessing |   subsample | 
AUC : 81.49% (5.91%)
    1 | 00m12s | [35m  81.48952[0m | [32m            0.7895[0m | [32m   8.5295[0m | [32m     9.8107[0m | [32m            5.0134[0m | [32m         1.7871[0m | [32m     0.5251[0m | 
AUC : 82.20% (5.98%)
    2 | 00m01s | [35m  82.20467[0m | [32m            0.2732[0m | [32m   1.7262[0m | [32m     4.6955[0m | [32m           10.2691[0m | [32m         1.0363[0m | [32m     0.6589[0m | 
AUC : 84.96% (3.00%)
    3 | 00m06s | [35m  84.95700[0m | [32m            0.9594[0m | [32m   0.3385[0m | [32m     5.9580[0m | [32m            1.2928[0m | [32m         1.8937[0m | [32m     0.7557[0m | 
AUC : 81.83% (5.75%)
    4 | 00m08s |   81.83394 |           

AUC : 85.58% (3.46%)
   48 | 00m14s |   85.57721 |             0.6671 |    2.9269 |      6.9001 |             3.2694 |          1.5832 |      0.9118 | 
AUC : 85.45% (3.58%)
   49 | 00m12s |   85.45090 |             0.3528 |    1.6198 |      5.7951 |             1.2725 |          2.0000 |      0.7994 | 
AUC : 82.17% (5.88%)
   50 | 00m14s |   82.16701 |             0.5660 |    6.7937 |      5.0224 |             4.9484 |          1.5012 |      0.7350 | 
AUC : 85.14% (3.34%)
   51 | 00m14s |   85.14082 |             0.5995 |    2.5486 |      6.7978 |             2.2762 |          1.8404 |      0.8801 | 
AUC : 81.93% (7.04%)
   52 | 00m13s |   81.92891 |             0.7416 |    3.9805 |      6.9019 |             2.5142 |          1.3407 |      1.0000 | 
AUC : 85.28% (3.31%)
   53 | 00m13s |   85.27858 |             0.4903 |    3.2451 |      6.3209 |             4.0432 |          1.9837 |      0.8335 | 
AUC : 81.94% (5.88%)
   54 | 00m19s |   81.93734 |             0.7452 |    7.7360 |     

AUC : 84.66% (3.59%)
  102 | 00m24s |   84.65868 |             0.7477 |    3.1513 |      5.4609 |             3.1625 |          1.8877 |      0.8340 | 
AUC : 85.24% (2.98%)
  103 | 00m25s |   85.24403 |             0.3771 |    1.8530 |      6.1530 |             1.4908 |          1.8888 |      0.8161 | 
AUC : 85.09% (3.07%)
  104 | 00m26s |   85.09483 |             0.5454 |    3.1790 |      7.0486 |             2.5516 |          1.6835 |      0.8414 | 
AUC : 84.70% (3.33%)
  105 | 00m28s |   84.70455 |             0.7973 |    3.0149 |      7.0695 |             3.5742 |          1.6017 |      0.8310 | 
elapsed time : 5927.012678861618
