In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_curve, auc, roc_auc_score
import lightgbm as lgb

import gc

import warnings
warnings.filterwarnings('ignore')



In [21]:
# Load Data

file_name = "../data/train_preprocessed2.csv"
train_df = pd.read_csv(file_name, low_memory = False)

train_df.head()

Unnamed: 0,A..papers,A.papers,B.papers,C.papers,Dif.countries,Perc_non_australian,Number.people,PHD,Max.years.univ,Grants.succ,...,SEO.11,SEO.12,SEO.13,SEO.14,SEO.15,SEO.16,SEO.17,SEO.18,SEO.19,Grant.Status
0,4.0,2.0,0.0,0.0,1,0.0,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,6.0,12.0,2.0,2.0,1,1.0,1,1.0,20.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,7.0,20.0,20.0,7.0,2,0.75,4,2.0,50.0,0.0,...,0,0,2,0,0,0,0,0,0,1
3,0.0,3.0,13.0,3.0,1,1.0,2,2.0,15.0,0.0,...,0,0,2,0,0,0,0,0,0,1
4,3.0,0.0,1.0,0.0,1,0.0,1,1.0,10.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [22]:
#Setup data : Divide Test and Train set

array = train_df.values

data = array[:, 0:70]
target = array[:, 70]

data, target

seed = 7
test_size = 0.2

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = test_size, random_state = seed)



In [23]:
target_test

array([0., 0., 0., ..., 0., 1., 0.])

In [33]:
# XGB Training default Result (kfold cross validation)

model = xgb.XGBClassifier(eval_metric = 'auc')

# make predictions with kfold cross validation score
kfold = KFold(n_splits = 5, random_state = 7)
results = cross_val_score(model, data, target, cv = kfold)
accuracy = results.mean()*100
print("Accuracy : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    

Accuracy : 84.78% (2.85%)


In [34]:
target

array([1., 1., 1., ..., 0., 1., 1.])

In [35]:
# Making a XGB Train Model Function

def XGB_Train_Model_using_KFold(eta, min_child_weight, max_depth, gamma, subsample,  colsample_bytree) : 
    xgb_params = {
        'n_trees' : 250,
        'eta' : max(eta, 0),
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'objective' : 'reg:linear', 
        'base_score' : np.mean(target),
        'eval_metric' : 'auc',
        'silent' : 1,
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    #n_splits = int(max(n_splits_param, 2))
    kfold = KFold(n_splits = 5, random_state = 7)
    results = cross_val_score(model, data, target, cv = kfold)
    accuracy = results.mean()*100
    print("Accuracy : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    '''
    model.fit(data_train, target_train)
    target_pred = model.predict(data_test)
    predictions = [round(value) for value in target_pred]
    accuracy = accuracy_score(target_test, predictions)
    
    '''
    return accuracy

In [36]:
xgb_params = {
    
    #Learning Rate 
    'eta' : (0.01, 0.2),
    
    #Minimum sum of weights : to control overfitting
    'min_child_weight' : (1, 20), 
    
    #Maximum depth of a tree : to control overfitting
    'max_depth' : (2, 10),
    
    #minimum loss reduction required to make a split : makes algorithm conservative
    'gamma' : (0, 10), 
    
    #max_delta_step is not needed since data is not imbalanced
    #'max_delta_step' : (0, 10),
    
    #Fraction of observations to be randomly samples for each tree
    #Lower: prevent overfitting
    'subsample' : (0.5, 1),
    
    #Fraction of columns to be randomly samples for each tree
    'colsample_bytree' : (0.1, 1),
    
    #colsamble_bylevel is not needed since subsample and colsample_bytree will do the job
    #'colsample_bylevel' = (0.1, 1),
    
    #L2 regularization term on weights
    #'lambda' = (?, ?)
    
    #L1 regularization term on weight
    #'alpha' = (?, ?)
    
    #scale_pos_weight is not needed since data is not imbalanced
    #'scale_pos_weight' = (0, 10)
    #'n_splits_param' : (5, 10)
}


xgb_bayesOPT = BayesianOptimization(XGB_Train_Model_using_KFold, xgb_params)
xgb_bayesOPT.maximize(init_points = 5, n_iter = 25)


[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   subsample | 
Accuracy : 82.94% (5.93%)
    1 | 00m06s | [35m  82.93637[0m | [32m            0.9360[0m | [32m   0.1114[0m | [32m   5.4282[0m | [32m     6.0369[0m | [32m            4.0633[0m | [32m     0.9237[0m | 
Accuracy : 82.04% (5.67%)
    2 | 00m07s |   82.04070 |             0.7221 |    0.1352 |    8.7700 |      7.6457 |            15.6985 |      0.7057 | 
Accuracy : 82.17% (5.82%)
    3 | 00m03s |   82.16702 |             0.7483 |    0.0968 |    6.8766 |      4.8334 |             9.9810 |      0.9797 | 
Accuracy : 85.15% (3.47%)
    4 | 00m08s | [35m  85.15239[0m | [32m            0.7184[0m | [32m   0.0988[0m | [32m   3.0976[0m | [32m     8.0649[0m | [32m           17.7317[0m | [32m     0.

In [30]:
#lightGBM default result

lgb_train = lgb.Dataset(data_train, target_train)
lgb_eval = lgb.Dataset(data_test, target_test, reference=lgb_train)
lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0
}
cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=20, nfold=5, 
                verbose_eval=20, early_stopping_rounds=5)
cv_results['auc-mean'][-1]

[20]	cv_agg's l2: 0.0937773 + 0.00539591	cv_agg's auc: 0.94954 + 0.0060144


0.9495402224421168

In [31]:
#lightGBM

def LGB_Train_Model(learning_rate, max_depth, min_child_weight, colsample_bytree, subsample ) :
    lgb_train = lgb.Dataset(data_train, target_train)
    lgb_eval = lgb.Dataset(data_test, target_test, reference=lgb_train)
    
    # specify your configurations as a dict
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'verbose': 0,
        
        
    
    'learning_rate': max(learning_rate, 0),
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0)
    }

    # train
    '''
    gbm = lgb.train(lgb_params,
                    lgb_train,
                    num_boost_round = 20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds = 5,
                    verbose_eval=False)
    y_pred_lgb = gbm.predict(data_test, num_iteration=gbm.best_iteration)
    # eval
    lgb_auc=roc_auc_score(target_test,y_pred_lgb)
    #print('lightGBM auc : %.5f' % lgb_auc)
    return lgb_auc

    '''
    cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=20, nfold=5, 
                    verbose_eval=20, early_stopping_rounds=5)

    return cv_results['auc-mean'][-1]
    

In [32]:
lgb_params = {
    'learning_rate' : (0.01, 2), 
    'max_depth' : (2, 10), 
    'min_child_weight' : (1, 10), 
    'colsample_bytree' : (0.1, 10), 
    'subsample' : (0.5, 1)
    
}


lgb_bayesOPT = BayesianOptimization(LGB_Train_Model, lgb_params)
lgb_bayesOPT.maximize(init_points = 5, n_iter = 25)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   subsample | 
    1 | 00m00s | [35m   0.94673[0m | [32m            6.3537[0m | [32m         0.7169[0m | [32m     6.7204[0m | [32m            1.1867[0m | [32m     0.7949[0m | 
    2 | 00m00s |    0.91605 |             8.7095 |          1.3835 |      6.9387 |             7.5584 |      0.5763 | 
    3 | 00m00s |    0.94121 |             0.9283 |          0.8129 |      5.3449 |             3.8678 |      0.5609 | 
    4 | 00m00s |    0.93694 |             4.4692 |          0.8433 |      8.2046 |             4.2076 |      0.6203 | 
[20]	cv_agg's l2: 0.0862046 + 0.00444534	cv_agg's auc: 0.953721 + 0.00482988
    5 | 00m00s | [35m   0.95372[0m | [32m            9.1434[0m | [32m         0.2678[0m | [32m     6.4696[0m | [32