In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc, roc_auc_score

import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc
from sklearn.grid_search import GridSearchCV
from bayes_opt import BayesianOptimization
import time
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt



In [2]:
col1 = np.arange(0, 5, 1)
col2 = np.arange(0, 10, 2)
df = pd.DataFrame(data ={'col1' : col1, 'col2' : col2})
df.head()

Unnamed: 0,col1,col2
0,0,0
1,1,2
2,2,4
3,3,6
4,4,8


In [3]:
# load preprocessed data 1

file_name = "./data/train_preprocessed1.csv"
train_df1 = pd.read_csv(file_name, low_memory = False, index_col = False)

train_df1.head()



Unnamed: 0.1,Unnamed: 0,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,...,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1
0,0,1,0.0,0.0,1.0,280199.0,100.0,0.0,0.0,0.0,...,3073.0,31.0,0.0,1.0,0.0,0.0,4.0,2.0,0.0,0.0
1,1,1,2.0,1.0,2.0,280103.0,30.0,280106.0,30.0,280203.0,...,2538.0,25.0,1.0,2.0,0.0,0.0,6.0,12.0,2.0,2.0
2,2,1,29.0,2.0,1.0,321004.0,60.0,321216.0,40.0,0.0,...,2923.0,25.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,2.0
3,3,1,40.0,2.0,3.0,270602.0,50.0,320602.0,50.0,0.0,...,2678.0,25.0,1.0,3.0,0.0,0.0,0.0,3.0,13.0,3.0
4,4,0,59.0,1.0,1.0,260500.0,34.0,280000.0,33.0,290000.0,...,2153.0,19.0,1.0,3.0,0.0,0.0,3.0,0.0,1.0,0.0


In [4]:
train_df1.drop(train_df1.columns[0], axis = 1, inplace = True)
train_df1.head()

Unnamed: 0,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,...,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1
0,1,0.0,0.0,1.0,280199.0,100.0,0.0,0.0,0.0,0.0,...,3073.0,31.0,0.0,1.0,0.0,0.0,4.0,2.0,0.0,0.0
1,1,2.0,1.0,2.0,280103.0,30.0,280106.0,30.0,280203.0,40.0,...,2538.0,25.0,1.0,2.0,0.0,0.0,6.0,12.0,2.0,2.0
2,1,29.0,2.0,1.0,321004.0,60.0,321216.0,40.0,0.0,0.0,...,2923.0,25.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,2.0
3,1,40.0,2.0,3.0,270602.0,50.0,320602.0,50.0,0.0,0.0,...,2678.0,25.0,1.0,3.0,0.0,0.0,0.0,3.0,13.0,3.0
4,0,59.0,1.0,1.0,260500.0,34.0,280000.0,33.0,290000.0,33.0,...,2153.0,19.0,1.0,3.0,0.0,0.0,3.0,0.0,1.0,0.0


In [5]:
file_name = "./data/train_preprocessed2.csv"
train_df2 = pd.read_csv(file_name, low_memory = False)

train_df2.head()

Unnamed: 0,A..papers,A.papers,B.papers,C.papers,Dif.countries,Perc_non_australian,Number.people,PHD,Max.years.univ,Grants.succ,...,SEO.11,SEO.12,SEO.13,SEO.14,SEO.15,SEO.16,SEO.17,SEO.18,SEO.19,Grant.Status
0,4.0,2.0,0.0,0.0,1,0.0,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,6.0,12.0,2.0,2.0,1,1.0,1,1.0,20.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,7.0,20.0,20.0,7.0,2,0.75,4,2.0,50.0,0.0,...,0,0,2,0,0,0,0,0,0,1
3,0.0,3.0,13.0,3.0,1,1.0,2,2.0,15.0,0.0,...,0,0,2,0,0,0,0,0,0,1
4,3.0,0.0,1.0,0.0,1,0.0,1,1.0,10.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
#Setup data : Divide Data and Target

data_df1 = train_df1.drop(['Grant.Status'], axis = 1)
target_df1 = train_df1['Grant.Status']

data_df2 = train_df2.drop(['Grant.Status'], axis = 1)
target_df2 = train_df2['Grant.Status']

data1 = data_df1.values
target1 = target_df1.values
data2 = data_df2.values
target2 = target_df2.values

In [7]:
cnt = 0

In [8]:
max_depth_BO = np.zeros(155)
min_child_weight_BO = np.zeros(155)
colsample_bytree_BO = np.zeros(155)
subsample_BO = np.zeros(155)
gamma_BO = np.zeros(155)
auc_BO = np.zeros(155)

In [9]:
max_depth_BO2 = np.zeros(155)
min_child_weight_BO2 = np.zeros(155)
colsample_bytree_BO2 = np.zeros(155)
subsample_BO2 = np.zeros(155)
gamma_BO2 = np.zeros(155)
auc_BO2 = np.zeros(155)

In [10]:
max_depth_DE = np.zeros(1500)
min_child_weight_DE = np.zeros(1500)
colsample_bytree_DE = np.zeros(1500)
subsample_DE = np.zeros(1500)
gamma_DE = np.zeros(1500)
auc_DE = np.zeros(1500)

In [11]:
max_depth_DE2 = np.zeros(1500)
min_child_weight_DE2 = np.zeros(1500)
colsample_bytree_DE2 = np.zeros(1500)
subsample_DE2 = np.zeros(1500)
gamma_DE2 = np.zeros(1500)
auc_DE2 = np.zeros(1500)

In [12]:
#We will be using only data2 from now.
#data1 will be used after optimizing range is expanded to preprocess part.


In [13]:
#XGB Result (using Default Parameter)

model = xgb.XGBClassifier(eval_metric = 'auc')

# make predictions with kfold cross validation score
kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
results = cross_val_score(model, data2, target2, cv = kfold)
accuracy = results.mean()*100
print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    

Accuracy : 87.93% (0.66%)


In [14]:
#LGB Result (using Default Parameter)

#LGB's default parameter is slightly different from xgb's 
#So several parameters should be setted differently according to xgb's

lgb_train = lgb.Dataset(data2, target2)
lgb_params = {
    'task': 'train',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'max_depth' : 6,
    'learning_rate' : 0.03,
    'reg_lambda' : 1.0
}
model = lgb.LGBMClassifier(**lgb_params)
    
kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
results = cross_val_score(model, data2, target2, cv = kfold)
auc = results.mean()*100
print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

AUC : 88.23% (1.01%)


In [16]:
#XGB Result (using GridSearch, Optimized Parameter)

def XGB_Train_Model(min_child_weight, max_depth, gamma, subsample, colsample_bytree) : 
    xgb_params = {
        #static parameters
        'n_trees' : 20,
        'eta' : 0.3,
        'objective' : 'reg:linear', 
        'eval_metric' : 'auc',
        'silent' : 1,
        
        #tuned parameters
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
    results = cross_val_score(model, data2, target2, cv = kfold)
    auc = results.mean()*100
    return auc

xgb_clf = xgb.XGBClassifier(eval_metric = 'auc', n_trees = 20, learning_rate = 0.3, objective = 'reg:linear', silent = 1)

xgb_params = {
    'min_child_weight' : np.arange(1, 20, 5),      # 4
    'max_depth' : np.arange(2, 10, 2),             # 4 
    'gamma' : np.arange(0, 10, 2.5),                 # 4
    'subsample' : np.arange(0.5, 1.0, 0.125),        # 4
    'colsample_bytree' : np.arange(0.1, 1.0, 0.3) # 3
    
}

GSCV = GridSearchCV(xgb_clf, xgb_params, cv = 5, scoring = 'roc_auc', n_jobs = 1, verbose = 1)

start_time = time.time()
GSCV.fit(data2, target2)
elapsed_time = time.time() - start_time
print("elapsed time : %s min %s sec"%(elapsed_time/60, elapsed_time%60)



Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=1)]: Done 3840 out of 3840 | elapsed: 46.5min finished


TypeError: not enough arguments for format string

In [17]:
#Best Parameter by GridSearch
best_parameters, score, _ = max(GSCV.grid_scores_, key=lambda x: x[1])
print('best parameters:', best_parameters)

best parameters: {'colsample_bytree': 0.7000000000000001, 'gamma': 2.5, 'max_depth': 4, 'min_child_weight': 16, 'subsample': 0.75}


In [20]:
#XGB Result(using Grid Search)

xgb_GS_result = XGB_Train_Model(16, 4, 2.5, 0.75, 0.7)

print('Grid Search(XGB) result(AUC) : '+ str(xgb_GS_result))

Grid Search(XGB) result(AUC) : 87.6551409875027


In [24]:
#LGB Result (using GridSearch, Optimized Parameter)

def LGB_Train_Model(min_child_weight, max_depth, gamma, subsample, colsample_bytree) :
    lgb_train = lgb.Dataset(data2, target2)
    
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'learning_rate' : 0.03,
    'reg_lambda' : 1.0,
    'num_leaves' : 1023,
        
    #tuned parameters
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0),
    'gamma' : max(gamma, 0), 
    }

    model = lgb.LGBMClassifier(**lgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
    results = cross_val_score(model, data2, target2, cv = kfold)
    auc = results.mean()*100
    return auc

lgb_clf = lgb.LGBMClassifier(task = 'train', metric = {'l2', 'auc'}, objective = 'regression', 
                            learning_rate = 0.03, reg_lambda = 1.0, num_leaves = 1023)

lgb_params = {
    'min_child_weight' : np.arange(1, 20, 5),      # 4
    'max_depth' : np.arange(2, 10, 2),             # 4 
    'gamma' : np.arange(0, 10, 2.5),                 # 4
    'subsample' : np.arange(0.5, 1.0, 0.125),        # 4
    'colsample_bytree' : np.arange(0.1, 1.0, 0.3), # 3
}

GSCV2 = GridSearchCV(lgb_clf, lgb_params, cv = 5, scoring = 'roc_auc', n_jobs = 1, verbose = 1)

start_time = time.time()
GSCV2.fit(data2, target2)
elapsed_time = time.time() - start_time
print("elapsed time : %s min %s sec"%(elapsed_time/60, elapsed_time%60))
best_parameters, score, _ = max(GSCV2.grid_scores_, key=lambda x: x[1])
print('best parameters:', best_parameters)




Fitting 5 folds for each of 768 candidates, totalling 3840 fits
elapsed time : 14.16251415014267 min 9.75084900856018 sec
best parameters: {'colsample_bytree': 0.7000000000000001, 'gamma': 0.0, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.75}


[Parallel(n_jobs=1)]: Done 3840 out of 3840 | elapsed: 14.2min finished


In [26]:

lgb_GS_result = LGB_Train_Model(1, 4, 0, 0.75, 0.7)
print('Grid Search(LGB) result(AUC) : '+ str(lgb_GS_result))

Grid Search(LGB) result(AUC) : 87.31062357105033


In [27]:
#XGB Train Model using BayesOpt

def XGB_Train_Model_BO(min_child_weight, max_depth, gamma, subsample, colsample_bytree) : 
    xgb_params = {
        #static parameters
        'n_trees' : 20,
        'eta' : 0.3,
        'objective' : 'reg:linear', 
        'eval_metric' : 'auc',
        'silent' : 1,
        
        #tuned parameters
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
    results = cross_val_score(model, data2, target2, cv = kfold)
    auc = results.mean()*100
    
    ##############################plot parameter saving part#########################################
    global cnt
    global max_depth_BO, subsample_BO, min_child_weight_BO, gamma_BO, colsample_bytree_BO, auc_BO
    max_depth_BO[cnt]        = max_depth
    subsample_BO[cnt]        = subsample
    min_child_weight_BO[cnt] = min_child_weight
    gamma_BO[cnt]            = gamma
    subsample_BO[cnt]        = subsample
    auc_BO[cnt]              = auc
    cnt = cnt + 1
    ##############################plot parameter saving part#########################################    
    
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    return auc


In [28]:
#XGB Result (using BayesOpt, Optimized Parameter)

xgb_params = {
    
    #Minimum sum of weights : to control overfitting
    'min_child_weight' : (1, 20), 
    
    #Maximum depth of a tree : to control overfitting
    'max_depth' : (2, 10),
    
    #minimum loss reduction required to make a split : makes algorithm conservative
    'gamma' : (0, 10), 
    
    #Fraction of observations to be randomly samples for each tree
    #Lower: prevent overfitting
    'subsample' : (0.5, 1),
    
    #Fraction of columns to be randomly samples for each tree
    'colsample_bytree' : (0.1, 1),
    
    }


xgb_bayesOPT = BayesianOptimization(XGB_Train_Model_BO, xgb_params)
start_time = time.time()
xgb_bayesOPT.maximize(init_points = 5, n_iter = 150)
elapsed_time = time.time() - start_time
print("elapsed time : %s min %s sec"%(elapsed_time/60, elapsed_time%60))
cnt = 0

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
AUC : 87.53% (1.18%)
    1 | 00m04s | [35m  87.52880[0m | [32m            0.6719[0m | [32m   4.5472[0m | [32m     5.3795[0m | [32m            4.4098[0m | [32m     0.9565[0m | 
AUC : 87.33% (0.92%)
    2 | 00m08s |   87.33359 |             0.8319 |    3.7101 |      6.0447 |             7.5197 |      0.5508 | 
AUC : 87.00% (0.88%)
    3 | 00m09s |   87.00058 |             0.8261 |    8.1116 |      9.1521 |             4.9323 |      0.9322 | 
AUC : 88.08% (1.09%)
    4 | 00m10s | [35m  88.08000[0m | [32m            0.8778[0m | [32m   2.3102[0m | [32m     9.6898[0m | [32m            2.1327[0m | [32m     0.8790[0m | 
AUC : 87.38% (0.94%)
    5 | 00m03s |   87.37951 |             0.3500 |    5.5479 |      6.9174 | 

AUC : 89.47% (0.58%)
   55 | 00m13s |   89.46950 |             0.2229 |    0.5320 |      7.0490 |             3.4266 |      1.0000 | 
AUC : 89.70% (0.70%)
   56 | 00m14s |   89.69916 |             0.3100 |    0.3141 |      7.4007 |             4.2386 |      1.0000 | 
AUC : 89.40% (0.77%)
   57 | 00m15s |   89.40058 |             0.7991 |    0.0000 |      7.1938 |             4.3156 |      1.0000 | 
AUC : 88.94% (0.93%)
   58 | 00m15s |   88.94124 |             0.5624 |    1.2357 |      7.4616 |             4.7721 |      0.7593 | 
AUC : 89.91% (0.65%)
   59 | 00m14s |   89.90595 |             0.3999 |    0.0000 |      7.0358 |             4.1242 |      1.0000 | 
AUC : 89.21% (0.83%)
   60 | 00m16s |   89.20531 |             0.6564 |    0.6580 |      7.0242 |             4.7917 |      0.8414 | 
AUC : 89.57% (0.90%)
   61 | 00m19s |   89.57289 |             0.8236 |    0.1182 |      8.4475 |             1.5068 |      0.8820 | 
AUC : 89.24% (1.18%)
   62 | 00m17s |   89.23980 |            

AUC : 88.99% (1.05%)
  117 | 00m32s |   88.98719 |             0.8288 |    0.9126 |      8.1646 |             1.2237 |      0.8582 | 
AUC : 89.66% (0.90%)
  118 | 00m30s |   89.66472 |             0.5713 |    0.1531 |      7.8361 |             1.7581 |      0.8702 | 
AUC : 89.55% (0.96%)
  119 | 00m37s |   89.54990 |             0.6325 |    0.0000 |      8.0409 |             1.8493 |      0.6470 | 
AUC : 89.66% (0.68%)
  120 | 00m33s |   89.66474 |             0.6360 |    0.0225 |      7.9305 |             1.7648 |      0.8059 | 
AUC : 89.45% (1.21%)
  121 | 00m33s |   89.44657 |             0.6581 |    0.4214 |      8.1709 |             2.1180 |      0.5000 | 
AUC : 89.68% (0.92%)
  122 | 00m30s |   89.67620 |             0.4778 |    0.2439 |      7.8432 |             1.8939 |      0.5000 | 
AUC : 89.66% (0.81%)
  123 | 00m33s |   89.66469 |             0.7715 |    0.2560 |      8.2265 |             1.9272 |      0.7569 | 
AUC : 89.22% (0.63%)
  124 | 00m31s |   89.21691 |            

In [29]:
#LGB Train Model

def LGB_Train_Model_BO(gamma, max_depth, min_child_weight, colsample_bytree, subsample) :
    lgb_train = lgb.Dataset(data2, target2)
    
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'learning_rate' : 0.03,
    'reg_lambda' : 1.0,
    'num_leaves' : 1023,
        
    #tuned parameters
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0),
    'gamma' : max(gamma, 0), 
    }

    model = lgb.LGBMClassifier(**lgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
    results = cross_val_score(model, data2, target2, cv = kfold)
    auc = results.mean()*100
    
    ##############################plot parameter saving part#########################################
    global cnt, optimizer
    global max_depth_BO2, subsample_BO2, min_child_weight_BO2, gamma_BO2, colsample_bytree_BO2, auc_BO2
    max_depth_BO2[cnt]        = max_depth
    subsample_BO2[cnt]        = subsample
    min_child_weight_BO2[cnt] = min_child_weight
    gamma_BO2[cnt]            = gamma
    subsample_BO2[cnt]        = subsample
    auc_BO2[cnt]              = auc
    cnt = cnt + 1
    ##############################plot parameter saving part#########################################
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
    return auc

In [30]:
#LGB Result (using BayesOpt, Optimized Parameter)
lgb_params = {
    'max_depth' : (2, 10), 
    'min_child_weight' : (1, 20), 
    'colsample_bytree' : (0.1, 1), 
    'subsample' : (0.5, 1),
    'gamma' : (0, 10)
}

lgb_bayesOPT = BayesianOptimization(LGB_Train_Model_BO, lgb_params)
start_time = time.time()
lgb_bayesOPT.maximize(init_points = 5, n_iter = 150)
elapsed_time = time.time() - start_time
print("elapsed time : %s min %s sec"%(elapsed_time/60, elapsed_time%60))
cnt = 0

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
AUC : 88.29% (0.67%)
    1 | 00m01s | [35m  88.28663[0m | [32m            0.2668[0m | [32m   6.1122[0m | [32m     5.2632[0m | [32m           17.1303[0m | [32m     0.9472[0m | 
AUC : 86.31% (0.92%)
    2 | 00m01s |   86.31135 |             0.2094 |    3.8180 |      6.2483 |             3.4086 |      0.7522 | 
AUC : 88.87% (0.68%)
    3 | 00m01s | [35m  88.87234[0m | [32m            0.4313[0m | [32m   1.6858[0m | [32m     7.1647[0m | [32m            6.0511[0m | [32m     0.6595[0m | 
AUC : 80.95% (0.94%)
    4 | 00m00s |   80.94854 |             0.1731 |    7.7629 |      2.8351 |             4.2741 |      0.8446 | 
AUC : 88.90% (0.94%)
    5 | 00m01s | [35m  88.89531[0m | [32m            0.5184[0m | [32m   

AUC : 88.87% (0.84%)
   56 | 00m09s |   88.87228 |             0.5163 |    2.3423 |      7.7785 |             9.0499 |      0.5000 | 
AUC : 89.52% (0.61%)
   57 | 00m08s |   89.51539 |             0.4000 |    1.0068 |      8.0047 |             7.7382 |      1.0000 | 
AUC : 88.31% (0.87%)
   58 | 00m08s |   88.30973 |             0.4359 |    5.8222 |      5.1954 |            16.6765 |      0.7904 | 
AUC : 88.84% (0.82%)
   59 | 00m08s |   88.83792 |             0.6117 |    3.9422 |      7.6632 |             6.7095 |      0.7073 | 
AUC : 88.18% (0.95%)
   60 | 00m08s |   88.18336 |             0.5819 |    5.4779 |      5.8257 |            15.0540 |      0.7585 | 
AUC : 89.08% (0.61%)
   61 | 00m10s |   89.07898 |             0.2893 |    1.1646 |      8.2346 |             7.8340 |      0.9510 | 
AUC : 88.46% (0.78%)
   62 | 00m09s |   88.45892 |             0.6415 |    3.0429 |      6.2821 |             5.9096 |      0.7404 | 
AUC : 88.67% (0.94%)
   63 | 00m10s |   88.66558 |            

AUC : 88.29% (0.81%)
  117 | 00m19s |   88.28670 |             0.3455 |    6.0721 |      5.0426 |            16.9400 |      0.8193 | 
AUC : 87.72% (1.09%)
  118 | 00m19s |   87.72401 |             0.5801 |    5.8209 |      4.9502 |            16.9360 |      0.9432 | 
AUC : 88.52% (0.71%)
  119 | 00m19s |   88.51633 |             0.4670 |    3.2009 |      6.9111 |             9.6198 |      0.6354 | 
AUC : 88.36% (0.89%)
  120 | 00m19s |   88.35560 |             0.4625 |    5.6272 |      5.8673 |            15.2729 |      0.5289 | 
AUC : 89.37% (0.53%)
  121 | 00m20s |   89.36613 |             0.3810 |    0.9015 |      8.3417 |             7.8927 |      1.0000 | 
AUC : 88.17% (0.66%)
  122 | 00m19s |   88.17189 |             0.3548 |    5.7381 |      5.2460 |            16.0231 |      0.7705 | 
AUC : 88.45% (0.99%)
  123 | 00m21s |   88.44748 |             0.7622 |    4.1881 |      7.9363 |             6.8251 |      0.6128 | 
AUC : 88.54% (0.83%)
  124 | 00m20s |   88.53934 |            

In [34]:

xgb_BO_data = {'min_child_weight' : min_child_weight_BO, 'max_depth' : max_depth_BO, 'gamma' : gamma_BO, 'subsample' : subsample_BO, 'colsample_bytree' : colsample_bytree_BO, 'auc' : auc_BO}
xgb_BO_df = pd.DataFrame(data = xgb_BO_data)
xgb_BO_df.to_csv("xgbBO.csv", sep = ',')

In [35]:
lgb_BO_data = {'min_child_weight' : min_child_weight_BO2, 'max_depth' : max_depth_BO2, 'gamma' : gamma_BO2, 'subsample' : subsample_BO2, 'colsample_bytree' : colsample_bytree_BO2, 'auc' : auc_BO2}
lgb_BO_df = pd.DataFrame(data = lgb_BO_data)
lgb_BO_df.to_csv("lgbBO.csv", sep = ',')

In [31]:
#XGB Train Model

def XGB_Train_Model_DE(params) : 
    min_child_weight = params[0]
    max_depth = params[1]
    gamma = params[2]
    subsample = params[3] 
    colsample_bytree = params[4]
    xgb_params = {
        #static parameters
        'n_trees' : 20,
        'eta' : 0.3,
        'objective' : 'reg:linear', 
        'eval_metric' : 'auc',
        'silent' : 1,
        
        #tuned parameters
        'max_depth' : int(max_depth),
        'subsample' : max(min(subsample, 1), 0),
        'min_child_weight' : int(min_child_weight),
        'gamma' : max(gamma, 0), 
        'colsample_bytree' : max(min(colsample_bytree, 1), 0)
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
    results = cross_val_score(model, data2, target2, cv = kfold)
    auc = results.mean()*100
    
    ##############################plot parameter saving part#########################################
    global cnt
    global max_depth_DE, subsample_DE, min_child_weight_DE, gamma_DE, colsample_bytree_DE, auc_DE
    max_depth_DE[cnt]        = max_depth
    subsample_DE[cnt]        = subsample
    min_child_weight_DE[cnt] = min_child_weight
    gamma_DE[cnt]            = gamma
    subsample_DE[cnt]        = subsample
    auc_DE[cnt]              = auc
    cnt = cnt + 1
    ##############################plot parameter saving part#########################################    
    
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    return 100 - auc


In [32]:
#XGB Result (using DiffEvolution, Optimized Parameter)

bounds = [(1,20), (2, 10), (0, 10), (0.5, 1), (0.1, 1)]
start_time = time.time()
result = differential_evolution(XGB_Train_Model_DE, bounds, maxiter = 10, popsize = 30)
elapsed_time = time.time() - start_time
print("elapsed time : %s min %s sec"%(elapsed_time/60, elapsed_time%60))
cnt = 0

result.x, result.fun

NameError: name 'differential_evolution' is not defined

In [None]:
#LGB Train Model

def LGB_Train_Model_DE(params) :
    min_child_weight = params[0]
    max_depth = params[1]
    gamma = params[2]
    subsample = params[3] 
    colsample_bytree = params[4]
    lgb_train = lgb.Dataset(data2, target2)
    
    lgb_params = {
        
    #static parameters
    'task': 'train',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'learning_rate' : 0.03,
    'reg_lambda' : 1.0,
    'num_leaves' : 1023,
        
    #tuned parameters
    'max_depth': int(max_depth),
    'min_child_weight' : int(min_child_weight),
    'colsample_bytree' : max(min(colsample_bytree, 1), 0),
    'subsample' : max(min(subsample, 1), 0),
    'gamma' : max(gamma, 0), 
    }

    model = lgb.LGBMClassifier(**lgb_params)
    
    kfold = KFold(n_splits = 5, random_state = 7, shuffle = True)
    results = cross_val_score(model, data2, target2, cv = kfold)
    auc = results.mean()*100
    
    ##############################plot parameter saving part#########################################
    global cnt
    global max_depth_DE2, subsample_DE2, min_child_weight_DE2, gamma_DE2, colsample_bytree_DE2, auc_DE2
    max_depth_DE2[cnt]        = max_depth
    subsample_DE2[cnt]        = subsample
    min_child_weight_DE2[cnt] = min_child_weight
    gamma_DE2[cnt]            = gamma
    subsample_DE2[cnt]        = subsample
    auc_DE2[cnt]              = auc
    cnt = cnt + 1
    ##############################plot parameter saving part#########################################
    print("AUC : %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    return 100 - auc

In [None]:
#LGB Result (using DiffEvolution, Optimized Parameter)
bounds = [(1,20), (2, 10), (0, 10), (0.5, 1), (0.1, 1)]
start_time = time.time()
result = differential_evolution(LGB_Train_Model_DE, bounds, maxiter = 10, popsize = 30)
elapsed_time = time.time() - start_time
print("elapsed time : %s min %s sec"%(elapsed_time/60, elapsed_time%60))
cnt = 0

result.x, result.fun

In [None]:
#Visualization Part#