# Parameter Tuning Demo

## Package Imports and Data Preparation

In [1]:
import pandas as pd
import numpy as np
import warnings
import time


from numpy import *
from sklearn import *
from sklearn.metrics import roc_curve, auc
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and convergence warnings; consider a narrower filter.
warnings.filterwarnings('ignore')

# Parse every sheet of the workbook; only sheets '7' and '1' are used below.
xl = pd.ExcelFile('data2.xlsx')
dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
data1 = dfs['7']
# One (Patient, Age at Diagnosis) row per patient. The row labelled 554 is dropped before
# de-duplication. NOTE(review): the reason for excluding row 554 is not visible here - confirm.
data2 = dfs['1'].loc[:, ['Patient', 'Age at Diagnosis']].drop([554]).drop_duplicates()
data3 = pd.read_csv('data1.csv')

# Join the three tables on the Patient key (DataFrame.join keeps the rows of data1).
combined_data = (
    data1.set_index('Patient')
    .join(data2.set_index('Patient'))
    .join(data3.set_index('Patient'))
)
# Binary target: 1 where 'Patient Type' is 'Healthy', 0 otherwise.
combined_data['label'] = (combined_data['Patient Type'] == 'Healthy').astype(int)
combined_data = combined_data.drop(['Patient Type'], axis=1)
print('The number of samples and features are %d and %d, respectively'%(combined_data.shape[0],combined_data.shape[1]))

# Select features and target by column name instead of by the hard-coded position 44, so the
# split stays correct if the number of columns changes. Missing values are replaced by 0 on a
# new frame rather than by masked assignment into a slice of combined_data.
x = combined_data.drop(columns=['label']).fillna(0)
y = combined_data['label']

The number of samples and features are 423 and 45, respectively


- Grid Search.

In [53]:
# time.process_time() reports CPU time of this process, not wall-clock time.
start = time.process_time()
n_outer = 5 # number of splits for outer loop
n_inner = 5 # number of splits for inner loop
rangeC = [10**-4, 10**2] # list, float, range of parameter C,eg.[10**-2, 10**2]
rangeGamma = [10**-6, 1] # list, float, range of parameter Gamma,eg.[10**-6, 1]
num_C = 10      # number of log-spaced C candidates
num_gamma = 5   # number of log-spaced gamma candidates
# NOTE(review): the full product is searched, so 'degree' is also varied for the linear/rbf
# kernels and 'gamma' for the linear kernel; a list of per-kernel grids may cut the number of
# fits - verify the parameter semantics against the SVC documentation before changing it.
parameters = {'kernel':['linear','rbf','poly'],
              'degree':[2, 3],
              'C':logspace(log10(rangeC[0]),log10(rangeC[1]),num_C),
              'gamma':logspace(log10(rangeGamma[0]),log10(rangeGamma[1]),num_gamma)}

##--- Classification with nested 5x5-fold cross-validation ---##
#--- x holds the features, y the labels
#--- Outer loop: stratified, shuffled K-fold with a fixed seed ---#
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
for i, (train_index, test_index) in enumerate(KF.split(x, y)):
    #--- Separate training set and test set; fillna returns new frames, so x is never mutated ---#
    x_train = x.iloc[train_index].fillna(0)
    x_test = x.iloc[test_index].fillna(0)
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    ##--- Tune the SVM by exhaustive grid search with inner-loop cross-validation ---##
    # GridSearchCV has no random_state parameter (passing one raises TypeError), so the
    # argument that was passed here has been removed.
    clf = model_selection.GridSearchCV(svm.SVC(kernel='rbf', probability=True),
                                       parameters, cv=n_inner, verbose=0, scoring='roc_auc')
    clf.fit(x_train, y_train)

    # ROC AUC of the refit best estimator on the held-out outer fold
    fpr, tpr, threshold = roc_curve(y_test.values, clf.predict_proba(x_test)[:, 1], pos_label=1)
    score_kfold.append(auc(fpr, tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))

end = time.process_time()
print('Grid Search takes '+str(end - start)+'seconds.\n') 
print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.90
Performance on 1-fold is 0.85
Performance on 2-fold is 0.92
Performance on 3-fold is 0.96
Performance on 4-fold is 0.93
Grid Search takes 3147.0seconds.

Mean score over k-fold outer loop is 0.91


- Random Search.

In [5]:
start = time.process_time()

# Fold counts for the nested cross-validation (outer = evaluation, inner = tuning).
n_outer = 5
n_inner = 5

# Bounds and number of log-spaced candidates for C and gamma.
rangeC = [10**-4, 10**2]
rangeGamma = [10**-6, 1]
num_C = 10
num_gamma = 10

# Candidate values that the randomized search samples from.
parameters = dict(
    kernel=['linear', 'rbf', 'poly'],
    degree=[2, 3],
    C=logspace(log10(rangeC[0]), log10(rangeC[1]), num_C),
    gamma=logspace(log10(rangeGamma[0]), log10(rangeGamma[1]), num_gamma),
)
n_iter_search = 20  # number of parameter settings sampled per outer fold

# Nested 5x5-fold cross-validation: x holds the features, y the labels.
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
i = 0
for train_index, test_index in KF.split(x, y):
    # Outer split; missing values are replaced by 0 in both partitions.
    x_train = x.iloc[train_index].fillna(0)
    x_test = x.iloc[test_index].fillna(0)
    y_train = y.iloc[train_index][:]

    # Tune the SVM by randomized search with inner-loop cross-validation.
    clf = model_selection.RandomizedSearchCV(
        svm.SVC(probability=True),
        parameters,
        cv=n_inner,
        verbose=0,
        scoring='roc_auc',
        n_iter=n_iter_search,
        random_state=920,
    )
    clf.fit(x_train, y_train)

    # ROC AUC on the held-out outer fold, using the predicted probability of class 1.
    y_true = y.iloc[test_index][:].values
    y_score = clf.predict_proba(x_test)[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, y_score, pos_label=1)
    score_kfold.append(auc(fpr, tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))
    i += 1

end = time.process_time()
print('Random Search takes '+str(end - start)+'seconds.\n') 
print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.92
Performance on 1-fold is 0.94
Performance on 2-fold is 0.88
Performance on 3-fold is 0.94
Performance on 4-fold is 0.94
Random Search takes 141.4375seconds.

Mean score over k-fold outer loop is 0.93


- Meta-heuristic (Genetic Algorithm).

In [4]:
# ---------------------------------------------------#   
#   Parameter Tuning with Meta-heuristic algorithms #
#--------------------------------------------------# 
from sko.GA import GA

##---   Define objective function   ---##
def obj_fun(parameter):
    """Objective for the genetic algorithm: negated mean inner-CV ROC AUC of an SVC.

    Reads the module-level ``x_train``, ``y_train`` and ``n_inner``; the outer
    cross-validation loop must assign them before the optimiser calls this function.

    Parameters
    ----------
    parameter : sequence of four numbers
        ``(C, kernel, degree, gamma)``. ``kernel`` is a numeric code:
        0 -> 'linear', 1 -> 'rbf', any other value -> 'poly'.

    Returns
    -------
    float
        Minus the mean ROC AUC over the ``n_inner`` stratified folds, so that
        minimising this value maximises the AUC.
    """
    C, kernel, degree, gamma = parameter
    if kernel == 0:
        kernel = 'linear'
    elif kernel == 1:
        kernel = 'rbf'
    else:
        kernel = 'poly'
    # The optimiser supplies numeric values; SVC needs an integer polynomial degree.
    degree = int(round(degree))

    KF = model_selection.StratifiedKFold(n_inner, shuffle=True, random_state=920)
    scorein_kfold = []
    for train_indexcv, test_indexcv in KF.split(x_train, y_train):
        x_traincv, x_testcv = x_train.iloc[train_indexcv], x_train.iloc[test_indexcv]
        y_traincv, y_testcv = y_train.iloc[train_indexcv], y_train.iloc[test_indexcv]
        # Pass hyper-parameters by keyword: SVC's constructor arguments are keyword-only in
        # current scikit-learn, so the positional form SVC(C, kernel, degree, gamma) fails.
        clf = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
                      probability=True).fit(x_traincv, y_traincv)
        # ROC AUC on the inner validation fold
        fpr, tpr, threshold = metrics.roc_curve(y_testcv.values, clf.predict_proba(x_testcv)[:,1], pos_label=1)
        scorein_kfold.append(metrics.auc(fpr, tpr))

    return -mean(scorein_kfold)

# time.process_time() reports CPU time of this process, not wall-clock time.
start = time.process_time()
n_outer = 5 # number of splits for outer loop
n_inner = 5 # number of splits for inner loop (read by obj_fun)

##---   Classification with nested 5*5-fold cross-validation   ---##
#---  define outer loop of K-fold cross validation ---#
# NOTE(review): this cell seeds the outer split with 1105 while the other tuning cells use 920,
# so the outer folds differ between methods - confirm whether that is intended.
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=1105)
score_kfold = []
for i, (train_index, test_index) in enumerate(KF.split(x, y)):
    #---  Separate training set and test set; obj_fun reads x_train / y_train as globals ---#
    x_train = x.iloc[train_index].fillna(0)
    x_test = x.iloc[test_index].fillna(0)
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    ##---  optimize SVM with Genetic Algorithm under inner loop cross validation   ---##
    # lb / ub / precision are ordered as (C, kernel code, degree, gamma), matching obj_fun.
    # NOTE(review): the integer precision entries presumably make scikit-opt treat the kernel
    # code and degree as integer-valued - confirm against the scikit-opt GA documentation.
    ga = GA(func=obj_fun, n_dim=4, size_pop=12, max_iter=20, lb=[10**-4, 0, 2, 10**-6], ub=[10**2, 2, 3, 1], precision=[1e-2, 1, 1, 1e-2])
    best_parameter, best_y = ga.run()
    C, kernel, degree, gamma = best_parameter
    # Decode the numeric kernel code: 0 -> linear, 1 -> rbf, anything else -> poly.
    if kernel == 0:
        kernel = 'linear'
    elif kernel == 1:
        kernel = 'rbf'
    else:
        kernel = 'poly'
    # SVC needs an integer degree and, in current scikit-learn, keyword-only constructor
    # arguments; the positional form SVC(C, kernel, degree, gamma) fails.
    degree = int(round(degree))
    clf = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
                  probability=True).fit(x_train, y_train)

    # ROC AUC of the refit model on the held-out outer fold
    fpr, tpr, threshold = roc_curve(y_test.values, clf.predict_proba(x_test)[:, 1], pos_label=1)
    score_kfold.append(auc(fpr, tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))

end = time.process_time()
print('Genetic Algorithm takes '+str(end - start)+'seconds.\n') 
print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.95
Performance on 1-fold is 0.86
Performance on 2-fold is 0.88
Performance on 3-fold is 0.94
Performance on 4-fold is 0.91
Genetic Algorithm takes 27078.703125seconds.

Mean score over k-fold outer loop is 0.91


- Bayesian optimization.

In [6]:
# ---------------------------------------------------#   
#   Parameter Tuning with Bayesian optimization #
#--------------------------------------------------# 

##---   Search space explored by the Bayesian optimiser   ---##
# NOTE(review): degree spans 1..3 and gamma reaches 1e+1 here, whereas the grid/random-search
# cells use degree in {2, 3} and gamma up to 1 - confirm whether the difference is intended.
search_spaces = dict(
    kernel=Categorical(['linear', 'poly', 'rbf']),
    degree=Integer(1, 3),
    C=Real(1e-4, 1e+2, prior='log-uniform'),
    gamma=Real(1e-6, 1e+1, prior='log-uniform'),
)

start = time.process_time()

# Fold counts for the nested cross-validation (outer = evaluation, inner = tuning).
n_outer = 5
n_inner = 5

# Nested 5x5-fold cross-validation; the outer split is stratified and shuffled with a fixed seed.
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
i = 0
for train_index, test_index in KF.split(x, y):
    # Outer split; missing values are replaced by 0 in both partitions.
    x_train = x.iloc[train_index].fillna(0)
    x_test = x.iloc[test_index].fillna(0)
    y_train = y.iloc[train_index][:]

    # Tune the SVM with Bayesian optimisation under inner-loop cross-validation.
    opt = BayesSearchCV(
        svm.SVC(probability=True),
        search_spaces,
        n_iter=50,
        cv=n_inner,
        scoring='roc_auc',
        random_state=920,
    )
    # Running the search; the value returned by fit() is used below for prediction.
    clf = opt.fit(x_train, y_train)

    # ROC AUC on the held-out outer fold, using the predicted probability of class 1.
    y_true = y.iloc[test_index][:].values
    y_score = clf.predict_proba(x_test)[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, y_score, pos_label=1)
    score_kfold.append(auc(fpr, tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))
    i += 1

end = time.process_time()
print('Bayesian optimization takes '+str(end - start)+'seconds.\n') 
print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.92
Performance on 1-fold is 0.86
Performance on 2-fold is 0.92
Performance on 3-fold is 0.98
Performance on 4-fold is 0.93
Bayesian optimization takes 3222.203125seconds.

Mean score over k-fold outer loop is 0.92
