In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import random
import numpy

from numpy import *
from sklearn import *
from pandas import *
from scipy import stats

# --- Load the raw tables ---------------------------------------------------
# Every sheet of the workbook is parsed into a dict keyed by sheet name.
xl = pd.ExcelFile('data2.xlsx')
xl.sheet_names # not displayed mid-cell; the sheet named '7' is the one used below
dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
data1 = dfs['7']
# Patient -> age table from sheet '1'; the row labelled 554 is dropped before
# de-duplicating (NOTE(review): presumably a bad/duplicate record - confirm).
data2 = dfs['1'].loc[:,['Patient','Age at Diagnosis']].drop([554]).drop_duplicates()
# import dataset 1
data3 = pd.read_csv('data1.csv')

# --- Join everything on the patient identifier -----------------------------
combined_data = data1.set_index('Patient').join(data2.set_index('Patient')).join(data3.set_index('Patient'))

# Binary target: 1 when 'Patient Type' is 'Healthy', 0 otherwise.
combined_data['label'] = (combined_data['Patient Type'] == 'Healthy').astype(int)
combined_data = combined_data.drop(['Patient Type'],axis=1)
print('The number of samples and features are %d and %d, respectively'%(combined_data.shape[0],combined_data.shape[1]))


# Features are the first 44 columns; missing values are replaced by 0.
# fillna() returns a new frame, so combined_data is never written through a
# slice (the masked assignment used before could trigger SettingWithCopyWarning).
x = combined_data.iloc[:, 0:44].fillna(0)
# NOTE(review): assumes column 44 is the 'label' column created above - confirm
# against the shape printed by this cell.
y = combined_data.iloc[:,44]

In [None]:
def SFLA_SVM(x_traincv, y_traincv,x_testcv, y_testcv,kernel,C,gamma=False,degree=False,coef0=False):
    '''
    Fit one SVC on a training fold and return its ROC AUC on the held-out fold.

    x_traincv, y_traincv: features / labels used to fit the classifier

    x_testcv, y_testcv: features / labels used to score the classifier

    kernel: str, SVC kernel name

    C: float, SVC regularisation parameter

    gamma, degree, coef0: optional kernel hyper-parameters. ``False`` means
        "not supplied"; in that case the argument is not forwarded and SVC
        falls back to its own default instead of receiving a literal False.
        ``degree`` is forwarded only for the 'poly' kernel and is cast to int.

    Returns the area under the ROC curve computed from the predicted
    probability of class 1 on the held-out fold.
    '''
    # Forward only what the caller actually supplied.
    svc_kwargs = {}
    if gamma is not False:
        svc_kwargs['gamma'] = gamma
    if coef0 is not False:
        svc_kwargs['coef0'] = coef0
    # degree used to be accepted but silently dropped. It only matters for the
    # polynomial kernel, so it is forwarded only there.
    if kernel == 'poly' and degree is not False:
        svc_kwargs['degree'] = int(round(degree))

    clf = svm.SVC(C=C, kernel=kernel, probability=True, random_state=920, **svc_kwargs).fit(x_traincv, y_traincv)
    # Column 1 of predict_proba is taken as the score of the positive class.
    y_score = clf.predict_proba(x_testcv)[:,1]
    fpr, tpr, _ = roc_curve(y_testcv, y_score, pos_label=1)
    roc_auc = auc(fpr,tpr)

    return roc_auc

In [None]:
def SFLA_SVM_CV(x_train, y_train,n,kernel,C,gamma=False,degree=False,coef0=False):
    '''
    Average the per-fold score of SFLA_SVM over a shuffled k-fold split.

    x_train, y_train: pandas objects (rows are selected with .iloc)

    n: number of splits for k-fold

    kernel, C, gamma, degree, coef0: passed through to SFLA_SVM unchanged

    Returns the mean of the per-fold values returned by SFLA_SVM.
    '''
    splitter = KFold(n_splits=n, shuffle=True, random_state=920)
    fold_scores = [
        SFLA_SVM(
            x_train.iloc[fit_idx], y_train.iloc[fit_idx],
            x_train.iloc[val_idx], y_train.iloc[val_idx],
            kernel, C, gamma=gamma, degree=degree, coef0=coef0,
        )
        for fit_idx, val_idx in splitter.split(x_train)
    ]
    return mean(fold_scores)

In [None]:
def SFLA_SIGMOID(num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma,rangeCoef0,x_train,y_train):
    '''
    Search (C, gamma, coef0) for an SVC with the shuffled frog leaping
    algorithm (SFLA). The fitness of a frog is the value returned by
    SFLA_SVM_CV for its parameters (higher is better).

    num_parameter: int, number of parameters to optimize; must be 3 here
                   (C, gamma, coef0)

    num_global: int, the maximum number of global iterations

    num_local: int, the maximum number of local iterations

    m : int, the number of memeplexes

    n : int, the number of frogs in each memeplex

    q : int, the number of frogs in submemeplex

    n1:  number of splits for cross validation for inner loop

    kernel: str, kernel name forwarded to SFLA_SVM_CV

    rangeC: list, float, range of parameter C,eg.[10**-2, 10**2]

    rangeGamma: list, float, range of parameter Gamma,eg.[10**-6, 1]

    rangeCoef0: list, float, range of parameter Coef0,eg.[0, 1]

    x_train: features (pandas DataFrame)

    y_train: labels (pandas Series)

    Returns a tuple (best, history):
        best: array [f, C, gamma, coef0] of the best frog found
        history: 1-D array holding the best fitness of the initial population
                 followed by the best fitness after every completed global
                 iteration (including the one that triggered termination)

    Raises ValueError if num_parameter is not 3.
    '''
    if num_parameter != 3:
        raise ValueError('SFLA_SIGMOID optimizes exactly 3 parameters (C, gamma, coef0); got num_parameter=%r' % (num_parameter,))

    #--- Step 0--Initialize parameters ---#
    sizeC = 2
    sizeGamma = 2
    sizeCoef0 = 2
    lower = numpy.array([rangeC[0], rangeGamma[0], rangeCoef0[0]], dtype=float)
    upper = numpy.array([rangeC[1], rangeGamma[1], rangeCoef0[1]], dtype=float)
    max_step = (upper - lower) / numpy.array([sizeC, sizeGamma, sizeCoef0]) # maximum step size

    def _feasible(position):
        # True when every parameter lies inside its [low, high] range.
        return bool(numpy.all((lower <= position) & (position <= upper)))

    def _random_frog():
        # C and gamma are drawn log-uniformly, coef0 uniformly.
        return numpy.array([
            10**numpy.random.uniform(numpy.log10(rangeC[0]), numpy.log10(rangeC[1])),
            10**numpy.random.uniform(numpy.log10(rangeGamma[0]), numpy.log10(rangeGamma[1])),
            numpy.random.uniform(rangeCoef0[0], rangeCoef0[1]),
        ])

    def _fitness(position):
        # gamma and coef0 are passed by keyword: passed positionally, the third
        # value lands in the `degree` slot and coef0 is never evaluated.
        return SFLA_SVM_CV(x_train, y_train, n1, kernel, position[0], gamma=position[1], coef0=position[2])

    #--- Step 1--Generate initial population ---#
    population_size = m*n
    frogC = 10**numpy.random.uniform(numpy.log10(rangeC[0]), numpy.log10(rangeC[1]), population_size)
    frogGamma = 10**numpy.random.uniform(numpy.log10(rangeGamma[0]), numpy.log10(rangeGamma[1]), population_size)
    frogCoef0 = numpy.random.uniform(rangeCoef0[0], rangeCoef0[1], population_size)
    frog = numpy.c_[frogC, frogGamma, frogCoef0]

    # Compute the performance value for each frog on validation data #
    f = numpy.array([_fitness(frog[i]) for i in range(population_size)])
    f_parameter = numpy.c_[f, frog]

    #--- Step 2--Rank frogs (best fitness first) ---#
    f_parameter = f_parameter[numpy.argsort(f_parameter[:,0])[::-1]]

    #######--- Global search start---######
    flag = 0 # number of global iterations whose best fitness moved by < 1e-4 (cumulative)
    history = [f_parameter[0,0]]
    weights = numpy.array([2*(n+1-j)/(n*(n+1)) for j in range(1,n+1)]) # weights of ranked frogs in each memeplex
    for i_global in range(num_global):
        # Full row [f, C, gamma, coef0] of the global best frog; the parameter
        # part drives the fallback step, the fitness part the stall check.
        frog_gb = f_parameter[0].copy()

        #--- Step 3--Partition frogs into memeplexes ---#
        # Frog i, i+m, i+2m, ... of the ranked population go to memeplex i.
        memeplexes = numpy.zeros((m,n,num_parameter+1)) # [memeplexes, frog in memeplex,[f,C,Gamma,Coef0] ]
        for i in range(m):
            memeplexes[i] = f_parameter[i::m]

        #######--- Local search start---######
        #--- Step 4--Memetic evolution within each memeplex ---#
        for im in range(m):
            for i_local in range(num_local):

                #--- Construct a submemeplex ---#
                rValue = numpy.random.random(n)*weights # random value with probability weights
                subindex = numpy.sort(numpy.argsort(rValue)[::-1][0:q]) # index of selected frogs in memeplex
                submemeplex = memeplexes[im][subindex] # rows stay in rank order

                #--- Improve the worst frog's position ---#
                Pb = submemeplex[0] # best frog in submemeplex
                Pw = submemeplex[q-1] # worst frog in submemeplex

                # 1) Learn from local best Pb.
                # NOTE(review): the step is the parameter gap scaled by the
                # fitness gap and is not clipped to max_step; kept as-is.
                Uq = Pw[1:] + (Pb-Pw)[1:]*(Pb-Pw)[0]
                fq = _fitness(Uq) if _feasible(Uq) else None

                if fq is None or fq < Pw[0]:
                    # 2) Infeasible or no improvement: random step towards the
                    # global best, limited to +/- max_step per parameter.
                    S = numpy.random.random(num_parameter)*(frog_gb-Pw)[1:]
                    S = numpy.clip(S, -max_step, max_step)
                    Uq = Pw[1:] + S
                    fq = _fitness(Uq) if _feasible(Uq) else None

                    if fq is None or fq < Pw[0]:
                        # 3) Still infeasible or no improvement: replace the
                        # worst frog by a freshly drawn random frog.
                        Uq = _random_frog()
                        fq = _fitness(Uq)

                #--- Upgrade the memeplex ---#
                memeplexes[im][subindex[q-1]] = numpy.r_[fq, Uq]
                memeplexes[im] = memeplexes[im][numpy.argsort(memeplexes[im][:,0])[::-1]]
        #######--- Local search end---######

        #--- Step 5--Shuffle memeplexes ---#
        f_parameter = memeplexes.reshape(m*n,num_parameter+1)
        f_parameter = f_parameter[numpy.argsort(f_parameter[:,0])[::-1]]
        history.append(f_parameter[0,0])

        #--- Step 6--Check convergence ---#
        if f_parameter[0,0] > 0.99:
            print('The program was terminated because it reached the optimization goal with f = %.3f' %f_parameter[0,0])
            break
        if abs(frog_gb[0] - f_parameter[0,0])<10**-4:
            flag +=1
        if flag > 30:
            break

    #######--- Global search end---######

    fBest_iteration = numpy.array(history)
    return (f_parameter[0],fBest_iteration)

In [None]:
#--- Ensemble ---#
def OptimizeSVM_SFLA_CV(x,y,n_splits,num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma = False,rangeDegree = False,rangeCoef0 = False):
    '''
    Nested cross-validation: for every outer fold, tune an SVC with the SFLA
    routine matching `kernel`, refit it on the fold's training part and score
    the held-out part. Metrics are computed on the pooled out-of-fold scores.

    x: features, pandas DataFrame (rows selected with .iloc; NaN filled with 0)

    y: labels, pandas Series aligned row-by-row with x

    n_splits: number of outer folds

    num_parameter, num_global, num_local, m, n, q, n1: forwarded unchanged to
        the SFLA_* optimizer

    kernel: one of 'poly', 'rbf', 'linear', 'sigmoid'

    rangeC, rangeGamma, rangeDegree, rangeCoef0: search ranges forwarded to
        the SFLA_* optimizer (only those relevant to the kernel are used)

    Returns (clf, a, p, r, f1score, roc_auc, y_pred, y_score, fpr, tpr) where
    clf is the classifier fitted on the LAST outer fold and every other item
    is computed from the pooled out-of-fold predictions.

    Raises ValueError for an unsupported kernel.
    '''
    # Fail before any expensive work instead of hitting an undefined `clf`.
    if kernel not in ('poly', 'rbf', 'linear', 'sigmoid'):
        raise ValueError("Unsupported kernel %r; expected 'poly', 'rbf', 'linear' or 'sigmoid'" % (kernel,))

    #---  define K-fold cross validation ---#
    KF = KFold(n_splits,shuffle=True, random_state=920)
    y_score = []
    y_test = []
    for train_index,test_index in KF.split(x):
        #---  Separate training set and test set; fill NaN with 0 ---#
        # fillna() returns new frames, so the caller's x is never modified.
        x_train = x.iloc[train_index].fillna(0)
        x_test = x.iloc[test_index].fillna(0)
        # KFold yields positions, hence .iloc for the labels as well.
        y_train = y.iloc[train_index]
        y_test_fold = y.iloc[test_index]

        ##---  optimize SVM with SFLA---##
        # f_parameter: [bestAUC, best parameters...]   fBest_iteration: bestAUC in each iteration
        if kernel == 'poly':
            f_parameter,fBest_iteration = SFLA_POLY(num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma,rangeDegree,rangeCoef0,x_train,y_train)
            # NOTE(review): SFLA_POLY is not defined in this part of the notebook;
            # degree is cast because SVC expects an integer degree.
            clf = svm.SVC(kernel=kernel,C=f_parameter[1],gamma=f_parameter[2],degree=int(round(f_parameter[3])),coef0=f_parameter[4],probability=True,random_state=920)
        elif kernel == 'rbf':
            f_parameter,fBest_iteration = SFLA_RBF(num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma,x_train,y_train)
            clf = svm.SVC(kernel=kernel,C=f_parameter[1],gamma=f_parameter[2],probability=True,random_state=920)
        elif kernel == 'linear':
            f_parameter,fBest_iteration = SFLA_LINEAR(num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,x_train,y_train)
            clf = svm.SVC(kernel=kernel,C=f_parameter[1],probability=True,random_state=920)
        else: # 'sigmoid'
            f_parameter,fBest_iteration = SFLA_SIGMOID(num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma,rangeCoef0,x_train,y_train)
            clf = svm.SVC(kernel=kernel,C=f_parameter[1],gamma=f_parameter[2],coef0=f_parameter[3],probability=True,random_state=920)

        print(f_parameter)
        clf.fit(x_train, y_train)

        # Score the held-out fold once and reuse the probabilities.
        fold_score = clf.predict_proba(x_test)[:, 1]
        y_score.extend(fold_score.tolist())
        y_test.extend(y_test_fold.tolist())
        fpr, tpr, threshold = roc_curve(y_test_fold, fold_score, pos_label=1)
        roc_auc = auc(fpr,tpr)
        print('AUC:',roc_auc)

    # ROC / AUC on the pooled out-of-fold scores
    fpr, tpr, threshold = roc_curve(y_test, y_score, pos_label=1)
    roc_auc = auc(fpr,tpr)
#     plt.plot(fpr, tpr, lw=2, label='SVM (AUC = %0.4f)' % roc_auc, linestyle='--')
    # Hard predictions: probability rounded to the nearest class label.
    y_pred = [round(score) for score in y_score]
    print(cm(y_test,y_pred))
    a = accuracy_score(y_test,y_pred)
    p = precision_score(y_test,y_pred)
    r = recall_score(y_test,y_pred)
    f1score = f1_score(y_test,y_pred)
    print('Accuracy is %0.2f\nPrecision is %0.2f\nRecall is %0.2f\nF1 is %0.2f\nAUC is %0.4f\n'% (a, p, r, f1score, roc_auc))
    return clf, a, p, r, f1score,roc_auc, y_pred,y_score,fpr, tpr

In [None]:
import time
# Run the full nested cross-validation experiment for the sigmoid kernel and
# report how long it took. time.process_time() counts CPU time of this
# process, not wall-clock time.
start = time.process_time()
n_splits = 10 # number of outer cross-validation folds
num_parameter = 3# number of parameters to optimize (C, gamma, coef0 for the sigmoid kernel)
num_global = 30# the maximum number of global (shuffle) iterations
num_local = 20# the maximum number of local iterations per memeplex
m =4 # the number of memeplexes
n = 8 # the number of frogs in each memeplex (population is m*n frogs)
q = 5 # the number of frogs drawn into each submemeplex
n1 = 10 # number of inner cross-validation folds used to score a frog
kernel = 'sigmoid'
rangeC = [10**-12, 10**12] # [low, high] search range of C, e.g. [10**-2, 10**2]
rangeGamma = [10**-6, 1] # [low, high] search range of gamma, e.g. [10**-6, 1]
rangeCoef0 = [0, 1] # [low, high] search range of coef0, e.g. [0, 1]
# rangeCoef0 is passed by keyword because rangeDegree precedes it in the signature.
clf, a, p, r, f1score,roc_auc, y_pred,y_score,fpr, tpr = OptimizeSVM_SFLA_CV(x,y,n_splits,num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma,rangeCoef0=rangeCoef0)
end = time.process_time()
print('OptimizeSVM_SFLA_CV algorithm takes '+str(end - start)+'seconds.\n') 

In [None]:
import csv
# Export the pooled out-of-fold predictions as one CSV row per sample.
fname = 'ASVM_result_Sigmoid.csv'

# header
tmp = [['Id', 'Prediction','Probability']]

# One row per sample: 1-based id, predicted label, predicted probability.
# The loop variables are deliberately not named `y`, so the label Series `y`
# used by the training cells is not overwritten when this cell runs.
for i, (pred, score) in enumerate(zip(y_pred, y_score), start=1):
    tmp.append([i, pred, score])

# write CSV file (newline='' lets the csv module control line endings)
with open(fname, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(tmp)