# Machine Learning Algorithms

## Some Package Import and Data Preparation

In [2]:
import pandas as pd
import numpy as np
import warnings
import time


from numpy import *
from sklearn import *
from sklearn.metrics import roc_curve, auc
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

warnings.filterwarnings('ignore')

xl = pd.ExcelFile('data2.xlsx')
xl.sheet_names # we'll take 7th
dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
data1 = dfs['7']
data2 = dfs['1'].loc[:,['Patient','Age at Diagnosis']].drop([554]).drop_duplicates()
data3 = pd.read_csv('data1.csv')
combined_data = data1.set_index('Patient').join(data2.set_index('Patient')).join(data3.set_index('Patient'))
combined_data['label'] = (combined_data['Patient Type'] == 'Healthy').astype(int)
combined_data = combined_data.drop(['Patient Type'],axis=1)
print('The number of samples and features are %d and %d, respectively'%(combined_data.shape[0],combined_data.shape[1]))

x = combined_data.iloc[:, 0:44]
x[isnan(x)] = 0
y = combined_data.iloc[:,44]

The number of samples and features are 423 and 45, respectively


- SVM

In [3]:
# ---------------------------------------------------#   
#                       SVM                         #
#--------------------------------------------------# 

##---   Define search space   ---##
search_spaces = {'kernel': Categorical(['linear', 'poly', 'rbf']),
              'degree':Integer(1,3),
              'C':Real(1e-4, 1e+2, prior='log-uniform'),
              'gamma':Real(1e-6, 1e+1, prior='log-uniform')}

n_outer = 5 # number of splits for outer loop
n_inner = 5 # number of splits for inner loop

##---   Classification with nested 5*5-fold cross-validation   ---##
#---  define outer loop of K-fold cross validation for model evaluation ---#
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
i = 0
for train_index,test_index in KF.split(x,y):
    #---  Seperate traing set and test set ---#
    x_train, x_test = x.iloc[train_index][:], x.iloc[test_index][:]
    y_train = y.iloc[train_index][:]
        
    #---  Fill NaN age ---#
    x_train[isnan(x_train)] = 0
    x_test[isnan(x_test)] = 0  
        
    ##---  optimize SVM with Genetic Algorithm under inner loop cross validation   ---##
    opt = BayesSearchCV(svm.SVC(probability = True), search_spaces, n_iter=30, cv=n_inner, scoring='neg_brier_score', random_state=920)
    # executes bayesian optimization
    clf = opt.fit(x_train, y_train)
        
    # Calculate performance on test set
    fpr, tpr, threshold = roc_curve(y.iloc[test_index][:].values, clf.predict_proba(x_test)[:,1], pos_label=1)
    score_kfold.append(auc(fpr,tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))
    i+=1

 
print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.92
Performance on 1-fold is 0.94
Performance on 2-fold is 0.87
Performance on 3-fold is 0.94
Performance on 4-fold is 0.94
Mean score over k-fold outer loop is 0.92


- LASSO

In [4]:
# ---------------------------------------------------#   
#                        LASSO                      #
#--------------------------------------------------# 

##---   Define search space   ---##
search_spaces = {'alpha':Real(1e-2, 2, prior='log-uniform')}

n_outer = 5 # number of splits for outer loop
n_inner = 5 # number of splits for inner loop

##---   Classification with nested 5*5-fold cross-validation for model evaluation   ---##
#---  define outer loop of K-fold cross validation ---#
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
i = 0
for train_index,test_index in KF.split(x,y):
    #---  Seperate traing set and test set ---#
    x_train, x_test = x.iloc[train_index][:], x.iloc[test_index][:]
    y_train = y.iloc[train_index][:]
        
    #---  Fill NaN age ---#
    x_train[isnan(x_train)] = 0
    x_test[isnan(x_test)] = 0  
        
    ##---  optimize LASSO with Genetic Algorithm under inner loop cross validation   ---##
    opt = BayesSearchCV(linear_model.Lasso(), search_spaces, n_iter=30, cv=n_inner, scoring='neg_mean_absolute_error', random_state=920)
    # executes bayesian optimization
    clf = opt.fit(x_train, y_train)
        
    # Calculate performance on test set
    fpr, tpr, threshold = roc_curve(y.iloc[test_index][:].values, clf.predict(x_test), pos_label=1)
    score_kfold.append(auc(fpr,tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))
    i+=1

print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.90
Performance on 1-fold is 0.88
Performance on 2-fold is 0.87
Performance on 3-fold is 0.93
Performance on 4-fold is 0.90
Mean score over k-fold outer loop is 0.90


- Random Forest

In [7]:
# ---------------------------------------------------#   
#                  Random Forest                    #
#--------------------------------------------------# 

##---   Define search space   ---##
search_spaces = {'n_estimators':Integer (1e0, 1e+2, prior='log-uniform')}


n_outer = 5 # number of splits for outer loop
n_inner = 5 # number of splits for inner loop

##---   Classification with nested 5*5-fold cross-validation for model evaluation   ---##
#---  define outer loop of K-fold cross validation ---#
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
i = 0
for train_index,test_index in KF.split(x,y):
    #---  Seperate traing set and test set ---#
    x_train, x_test = x.iloc[train_index][:], x.iloc[test_index][:]
    y_train = y.iloc[train_index][:]
        
    #---  Fill NaN age ---#
    x_train[isnan(x_train)] = 0
    x_test[isnan(x_test)] = 0  
        
    ##---  optimize RF with Genetic Algorithm under inner loop cross validation   ---##
    opt = BayesSearchCV(ensemble.RandomForestClassifier(), search_spaces, n_iter=30, cv=n_inner, scoring='neg_mean_absolute_error', random_state=920)
    # executes bayesian optimization
    clf = opt.fit(x_train, y_train)
        
    # Calculate performance on test set
    fpr, tpr, threshold = roc_curve(y.iloc[test_index][:].values, clf.predict(x_test), pos_label=1)
    score_kfold.append(auc(fpr,tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))
    i+=1

print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.86
Performance on 1-fold is 0.85
Performance on 2-fold is 0.82
Performance on 3-fold is 0.88
Performance on 4-fold is 0.87
Mean score over k-fold outer loop is 0.86


- Multi Layer Perceptron

In [6]:
# ---------------------------------------------------#   
#              Multi Layer Perceptron               #
#--------------------------------------------------# 

##---   Define search space   ---##
search_spaces = {'activation': Categorical(['identity','logistic','tanh','relu']),
                 'hidden_layer_sizes':Integer(1,1e+3, prior='log-uniform'),
                 'alpha':Real(1e-4, 1e+2, prior='log-uniform'),
                 'learning_rate_init':Real(1e-6, 1e+1, prior='log-uniform')}

n_outer = 5 # number of splits for outer loop
n_inner = 5 # number of splits for inner loop

##---   Classification with nested 5*5-fold cross-validation   ---##
#---  define outer loop of K-fold cross validation for model evaluation ---#
KF = model_selection.StratifiedKFold(n_outer, shuffle=True, random_state=920)
score_kfold = []
i = 0
for train_index,test_index in KF.split(x,y):
    #---  Seperate traing set and test set ---#
    x_train, x_test = x.iloc[train_index][:], x.iloc[test_index][:]
    y_train = y.iloc[train_index][:]
        
    #---  Fill NaN age ---#
    x_train[isnan(x_train)] = 0
    x_test[isnan(x_test)] = 0  
        
    ##---  optimize MLP with Genetic Algorithm under inner loop cross validation   ---##
    opt = BayesSearchCV(neural_network.MLPClassifier(), search_spaces, n_iter=30, cv=n_inner, scoring='neg_brier_score', random_state=920)
    # executes bayesian optimization
    clf = opt.fit(x_train, y_train)
        
    # Calculate performance on test set
    fpr, tpr, threshold = roc_curve(y.iloc[test_index][:].values, clf.predict_proba(x_test)[:,1], pos_label=1)
    score_kfold.append(auc(fpr,tpr))
    print('Performance on %d-fold is %.2f' %(i,score_kfold[i]))
    i+=1

print('Mean score over k-fold outer loop is %.2f' %mean(score_kfold))

Performance on 0-fold is 0.93
Performance on 1-fold is 0.91
Performance on 2-fold is 0.88
Performance on 3-fold is 0.94
Performance on 4-fold is 0.89
Mean score over k-fold outer loop is 0.91
