In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split

import xgboost as xgb

from sklearn.metrics import accuracy_score

from bayes_opt import BayesianOptimization


In [3]:
# Load Data

file_name = "../data/train_preprocessed2.csv"
train_df = pd.read_csv(file_name, low_memory = False)

train_df.head()

Unnamed: 0,A..papers,A.papers,B.papers,C.papers,Dif.countries,Perc_non_australian,Number.people,PHD,Max.years.univ,Grants.succ,...,SEO.11,SEO.12,SEO.13,SEO.14,SEO.15,SEO.16,SEO.17,SEO.18,SEO.19,Grant.Status
0,4.0,2.0,0.0,0.0,1,0.0,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,6.0,12.0,2.0,2.0,1,1.0,1,1.0,20.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,7.0,20.0,20.0,7.0,2,0.75,4,2.0,50.0,0.0,...,0,0,2,0,0,0,0,0,0,1
3,0.0,3.0,13.0,3.0,1,1.0,2,2.0,15.0,0.0,...,0,0,2,0,0,0,0,0,0,1
4,3.0,0.0,1.0,0.0,1,0.0,1,1.0,10.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
#Setup data : Divide Test and Train set

array = train_df.values

data = array[:, 0:70]
target = array[:, 70]

data, target

seed = 7
test_size = 0.2

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = test_size, random_state = seed)

In [10]:
# set XGB Model -> parameters set default

model = xgb.XGBClassifier()
model.fit(data_train, target_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [13]:
#Make predictions with XGBoost Model
target_pred = model.predict(data_test)
predictions = [round(value) for value in target_pred]
accuracy = accuracy_score(target_test, predictions)
print("Accuracy : %.2f%%" %(accuracy * 100))


Accuracy : 88.46%


  if diff:


In [26]:
# Making a Model function for bayesian optimization

def XGB_Train_Model(max_depth, subsample, min_child_weight, gamma, colsample_bytree) : 
    xgb_params = {
        'n_trees' : 250,
        'eta' : 0.01,
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'objective' : 'reg:linear', 
        'base_score' : np.mean(target),
        'silent' : 1,
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(data_train, target_train)
    target_pred = model.predict(data_test)
    predictions = [round(value) for value in target_pred]
    accuracy = accuracy_score(target_test, predictions)
    return accuracy

In [27]:
xgb_params = {
    'min_child_weight' : (1, 20), 
    'gamma' : (0, 10), 
    'subsample' : (0.5, 1),
    'colsample_bytree' : (0.1, 1),
    'max_depth' : (2, 10)
}

xgb_bayesOPT = BayesianOptimization(XGB_Train_Model, xgb_params)
xgb_bayesOPT.maximize(init_points = 5, n_iter = 25)


[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 


  if diff:


    1 | 00m00s | [35m   0.85591[0m | [32m            0.4668[0m | [32m   8.9591[0m | [32m     2.1515[0m | [32m            8.1634[0m | [32m     0.5856[0m | 


  if diff:


    2 | 00m00s |    0.83984 |             0.1248 |    9.9767 |      6.0149 |            10.8635 |      0.7828 | 


  if diff:


    3 | 00m01s | [35m   0.87887[0m | [32m            0.7608[0m | [32m   5.1977[0m | [32m     6.5911[0m | [32m           14.4404[0m | [32m     0.7327[0m | 


  if diff:


    4 | 00m01s | [35m   0.88576[0m | [32m            0.3466[0m | [32m   3.6429[0m | [32m     8.8845[0m | [32m            6.6248[0m | [32m     0.5295[0m | 


  if diff:


    5 | 00m00s | [35m   0.89036[0m | [32m            0.6611[0m | [32m   0.3446[0m | [32m     3.9888[0m | [32m            2.2265[0m | [32m     0.6252[0m | 
[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
    6 | 00m09s |    0.81630 |             0.1000 |    0.0000 |      2.0000 |            20.0000 |      0.5000 | 


  if diff:
  if diff:


    7 | 00m14s | [35m   0.89782[0m | [32m            1.0000[0m | [32m   0.0000[0m | [32m    10.0000[0m | [32m           20.0000[0m | [32m     1.0000[0m | 


  if diff:


    8 | 00m13s |    0.89380 |             1.0000 |    0.0000 |     10.0000 |            10.7208 |      1.0000 | 


  if diff:


    9 | 00m12s |    0.86625 |             1.0000 |   10.0000 |     10.0000 |             1.0000 |      1.0000 | 


  if diff:


   10 | 00m13s |    0.89437 |             1.0000 |    0.0000 |     10.0000 |             1.0000 |      0.5000 | 


  if diff:
  " state: %s" % convergence_dict)


   11 | 00m12s |    0.86280 |             1.0000 |    8.9855 |     10.0000 |            20.0000 |      0.5000 | 


  if diff:


   12 | 00m08s |    0.86223 |             1.0000 |    4.4008 |      2.0000 |             1.0000 |      1.0000 | 


  if diff:


   13 | 00m11s |    0.89265 |             0.9607 |    0.0146 |      9.9220 |            17.1857 |      0.5106 | 


  if diff:


   14 | 00m08s |    0.88232 |             0.9976 |    0.3835 |      3.0179 |             7.6544 |      0.5666 | 


  if diff:


   15 | 00m07s |    0.87428 |             0.1000 |    0.0000 |      7.9888 |             1.0000 |      1.0000 | 


  if diff:


   16 | 00m09s |    0.85304 |             0.9137 |    9.7370 |      2.0774 |            19.8731 |      0.5693 | 


  if diff:
  " state: %s" % convergence_dict)


   17 | 00m11s |    0.89724 |             0.9901 |    0.8951 |      7.4353 |             4.7224 |      0.5337 | 


  if diff:
  " state: %s" % convergence_dict)


   18 | 00m10s |    0.87945 |             0.9982 |    5.0883 |      9.9318 |             9.1227 |      0.9071 | 


  if diff:
  " state: %s" % convergence_dict)


   19 | 00m11s |    0.88117 |             0.2014 |    3.7646 |      9.9595 |            19.9513 |      0.5622 | 


  if diff:


   20 | 00m16s |    0.87084 |             1.0000 |    5.3562 |     10.0000 |             1.0000 |      0.5000 | 


  if diff:


   21 | 00m14s | [35m   0.89897[0m | [32m            0.8051[0m | [32m   0.0995[0m | [32m     9.9124[0m | [32m            5.8883[0m | [32m     0.5494[0m | 


  if diff:


   22 | 00m18s |    0.88634 |             0.9668 |    2.9028 |      9.1309 |            19.9955 |      0.5300 | 


  if diff:


   23 | 00m15s |    0.85247 |             1.0000 |   10.0000 |      2.0000 |             1.0000 |      0.5000 | 


  if diff:


   24 | 00m17s |    0.89724 |             0.9487 |    0.1733 |      7.7481 |             8.6894 |      0.5018 | 


  if diff:


   25 | 00m21s | [35m   0.90011[0m | [32m            0.9893[0m | [32m   0.2843[0m | [32m     9.9486[0m | [32m            8.5741[0m | [32m     0.5636[0m | 


  if diff:


   26 | 00m22s |    0.89782 |             1.0000 |    0.0000 |     10.0000 |             7.5640 |      0.5000 | 


  if diff:


   27 | 00m19s |    0.87543 |             1.0000 |    0.0000 |      2.0000 |             1.0000 |      0.5000 | 


  if diff:


   28 | 00m17s |    0.88691 |             0.2235 |    1.7537 |      9.9879 |            11.8564 |      0.5255 | 


  if diff:
  " state: %s" % convergence_dict)


   29 | 00m17s |    0.86567 |             0.1000 |    0.0000 |     10.0000 |            20.0000 |      0.5000 | 


  if diff:


   30 | 00m17s |    0.86797 |             1.0000 |    3.4764 |      2.0000 |            14.6076 |      1.0000 | 
