In [None]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,label_binarize
import matplotlib.pyplot as plt
import pandas as pd
import time
import random
import numpy

from numpy import *
from sklearn import *
from pandas import *
from scipy import stats

# --- Load dataset 2 (Excel workbook) ---
# The context manager closes the workbook handle once every sheet is parsed.
with pd.ExcelFile('data2.xlsx') as xl:
    dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
data1 = dfs['7']  # the sheet named '7'
# Sheet '1' contributes the 'Age at Diagnosis' column, de-duplicated.
# NOTE(review): the row labelled 554 is dropped before de-duplication; the
# reason is not visible in this notebook - confirm it is still the right row.
data2 = dfs['1'].loc[:, ['Patient', 'Age at Diagnosis']].drop([554]).drop_duplicates()

# --- Load dataset 1 (CSV) ---
data3 = pd.read_csv('data1.csv')

# Join the three tables on the 'Patient' key (DataFrame.join keeps the rows of
# the left-hand frame, i.e. of data1).
combined_data = (
    data1.set_index('Patient')
    .join(data2.set_index('Patient'))
    .join(data3.set_index('Patient'))
)

# --- Keep the classes of interest and encode the class label ---
# .copy() makes the filtered frame independent so adding 'label' below does not
# write into a slice of the joined table.
combined_data = combined_data[~combined_data['Patient Type'].isin(['Healthy', 'Duodenal Cancer'])].copy()
le = LabelEncoder().fit(combined_data['Patient Type'])
combined_data['label'] = le.transform(combined_data['Patient Type'])
combined_data = combined_data.drop(['Patient Type'], axis=1)
# The second number printed is the column count, which includes 'label'.
print('The number of samples and features are %d and %d, respectively'%(combined_data.shape[0],combined_data.shape[1]))

# --- Split into features (x) and encoded label (y) ---
# 'label' is appended last above, so selecting it by name replaces the former
# hard-coded positions (columns 0:44 for x, column 44 for y).
y = combined_data['label']
# Missing feature values are replaced by 0.
x = combined_data.drop(columns=['label']).fillna(0)

In [None]:
def auc_compute(n_classes,y_binary,y_score):
    """Compute one ROC curve and its area for each of the first ``n_classes`` columns.

    n_classes: int, number of columns to evaluate (indices 0 .. n_classes-1)

    y_binary: 2-D array; column i is passed to ``roc_curve`` as the true labels

    y_score: 2-D array; column i is passed to ``roc_curve`` as the scores

    Returns ``(roc_auc, fpr, tpr)``: three dicts keyed by column index.
    ``roc_auc[i]`` is a one-element list ``[area]``; ``fpr[i]`` / ``tpr[i]`` are
    the arrays returned by ``roc_curve``.
    """
    roc_auc, fpr, tpr = {}, {}, {}
    for cls in range(n_classes):
        false_pos, true_pos, _thresholds = roc_curve(y_binary[:, cls], y_score[:, cls])
        fpr[cls] = false_pos
        tpr[cls] = true_pos
        # Stored as a list so callers can read the value with roc_auc[i][0].
        roc_auc[cls] = [auc(false_pos, true_pos)]
    return roc_auc, fpr, tpr

In [None]:
def SFLA_SVM(x_traincv, y_traincv,x_testcv, y_testcv,kernel,C,gamma=False,degree=False,coef0=False):
    '''
    Fit a one-vs-rest SVC on one training fold and score it on the held-out fold.

    x_traincv, y_traincv: features / labels used to fit the classifier

    x_testcv, y_testcv: features / labels used to compute the score

    kernel, C: passed to svm.SVC

    gamma, degree, coef0: optional SVC hyper-parameters; the default False
    means "not supplied", in which case SVC keeps its own default for it

    Returns the mean of the per-class one-vs-rest ROC AUCs on the test fold.
    '''
    # Forward only the hyper-parameters the caller actually supplied; passing the
    # False placeholder through (e.g. gamma=False) is not a valid SVC setting, and
    # `degree` used to be dropped altogether.
    svc_params = {'C': C, 'kernel': kernel}
    for name, value in (('gamma', gamma), ('degree', degree), ('coef0', coef0)):
        if value is not False:
            svc_params[name] = value
    clf = OneVsRestClassifier(svm.SVC(**svc_params)).fit(x_traincv, y_traincv)

    # Binarise against the classes the model was fitted on so that column i of
    # y_binary and column i of y_score always describe the same class.
    y_binary = label_binarize(y_testcv, classes=clf.classes_)
    y_score = clf.decision_function(x_testcv)
    if y_score.ndim == 1:
        # Two-class case: a single score column matching label_binarize's single column.
        y_score = y_score.reshape(-1, 1)
    n_classes = y_binary.shape[1]

    roc_auc,_,_ = auc_compute(n_classes,y_binary,y_score)
    per_class_auc = [roc_auc[k][0] for k in range(n_classes)]
    # A class with no positive (or no negative) sample in the test fold has an
    # undefined AUC (NaN); average over the defined ones so one missing class
    # does not turn the whole fitness into NaN.
    return numpy.nanmean(per_class_auc)

In [None]:
def SFLA_SVM_CV(x_train, y_train,n,kernel,C,gamma=False,degree=False,coef0=False):
    '''
    Mean SFLA_SVM score of one hyper-parameter setting over a shuffled k-fold split.

    x_train, y_train: pandas objects (rows are selected with .iloc)

    n: number of splits for k-fold

    kernel, C, gamma, degree, coef0: forwarded unchanged to SFLA_SVM

    Returns the mean of the n per-fold scores.
    '''
    # Fixed seed: every candidate is scored on the same folds.
    KF = KFold(n_splits=n,shuffle=True, random_state=920)
    fold_scores = []
    for train_indexcv,test_indexcv in KF.split(x_train):
        x_traincv, x_testcv = x_train.iloc[train_indexcv], x_train.iloc[test_indexcv]
        y_traincv, y_testcv = y_train.iloc[train_indexcv], y_train.iloc[test_indexcv]
        # degree and coef0 were previously accepted but never passed on.
        fold_scores.append(
            SFLA_SVM(x_traincv, y_traincv, x_testcv, y_testcv, kernel, C, gamma, degree, coef0)
        )
    return numpy.mean(fold_scores)

In [None]:
def _sfla_random_frog(rangeC, rangeGamma):
    # Draw one [C, gamma] position log-uniformly inside the search box.
    return [10**numpy.random.uniform(numpy.log10(rangeC[0]), numpy.log10(rangeC[1])),
            10**numpy.random.uniform(numpy.log10(rangeGamma[0]), numpy.log10(rangeGamma[1]))]


def _sfla_is_feasible(position, rangeC, rangeGamma):
    # True when both C and gamma lie inside their closed search intervals.
    return (rangeC[0] <= position[0] <= rangeC[1]) and (rangeGamma[0] <= position[1] <= rangeGamma[1])


def SFLA_RBF(num_parameter,num_global,num_local,m,n,q,n1,kernel,rangeC,rangeGamma,x_train,y_train):
    '''
    Shuffled frog-leaping search over the SVC hyper-parameters (C, gamma).

    num_parameter: int, number of parameters to optimize (the code below
    handles exactly the two parameters C and gamma)

    num_global: int, the maximum number of global iterations

    num_local: int, the maximum number of local iterations

    m : int, the number of memeplexes

    n : int, the number of frogs in each memeplex

    q : int, the number of frogs in submemeplex

    n1:  number of splits for cross validation for inner loop

    rangeC: list, float, range of parameter C,eg.[10**-2, 10**2]

    rangeGamma: list, float, range of parameter Gamma,eg.[10**-6, 1]

    x_train: features

    y_train: labels

    Returns (best, history): best is the row [f, C, gamma] of the best frog
    after the last iteration, history holds the best f after the initial
    ranking followed by the best f of every completed global iteration.
    '''

    #--- Step 0--Initialize parameters ---#
    sizeC = 2
    sizeGamma = 2
    # maximum step size per parameter (half the width of each range)
    max_step = numpy.array([(rangeC[1]-rangeC[0])/sizeC, (rangeGamma[1]-rangeGamma[0])/sizeGamma])

    #--- Step 1--Generate initial population ---#
    frogC = 10**numpy.random.uniform(numpy.log10(rangeC[0]), numpy.log10(rangeC[1]), m*n)
    frogGamma = 10**numpy.random.uniform(numpy.log10(rangeGamma[0]), numpy.log10(rangeGamma[1]), m*n)
    frog = numpy.c_[frogC, frogGamma]

    # Compute the performance value for each frog on validation data #
    KF = KFold(n_splits=n1,shuffle=True, random_state=920)
    f = numpy.zeros((m*n, n1))
    for j, (train_indexcv, test_indexcv) in enumerate(KF.split(x_train)):
        x_traincv, x_testcv = x_train.iloc[train_indexcv], x_train.iloc[test_indexcv]
        y_traincv, y_testcv = y_train.iloc[train_indexcv], y_train.iloc[test_indexcv]
        for i in range(m*n):
            f[i,j] = SFLA_SVM(x_traincv, y_traincv,x_testcv, y_testcv,kernel,frog[i,0],frog[i,1])
    f_parameter = numpy.c_[f.mean(axis=1), frog]  # rows are [f, C, gamma]

    #--- Step 2--Rank frogs (best first) ---#
    f_parameter = f_parameter[numpy.argsort(f_parameter[:,0])[::-1]]

    #######--- Global search start---######
    fBest_iteration = f_parameter[0,0]
    # weights of ranked frogs in each memeplex (better rank -> larger weight)
    weights = numpy.array([2*(n+1-j)/(n*(n+1)) for j in range(1,n+1)])
    for _i_global in range(num_global):
        # Global best frog as a full [f, C, gamma] row. It used to hold only the
        # scalar fitness, so "(frog_gb-Pw)[1:]" subtracted C and gamma from a
        # fitness value instead of from the best frog's position.
        frog_gb = f_parameter[0].copy()

        #--- Step 3--Partition frogs into memeplexes ---#
        # Memeplex i receives the ranked frogs i, i+m, i+2m, ...
        memeplexes = numpy.zeros((m,n,num_parameter+1)) # [memeplexes, frog in memeplex,[f,C,Gamma] ]
        for i in range(m):
            memeplexes[i] = f_parameter[i::m]

        #######--- Local search start---######
        #--- Step 4--Memetic evolution within each memeplex ---#
        for im in range(m):
            for _i_local in range(num_local):

                #--- Construct a submemeplex ---#
                rValue = numpy.random.random(n)*weights # random value with probability weights
                # indices of the q selected frogs, ascending = best rank first
                subindex = numpy.sort(numpy.argsort(rValue)[::-1][0:q])
                submemeplex = memeplexes[im][subindex]

                #--- Improve the worst frog's position ---#
                Pb = submemeplex[0]   # best frog in the submemeplex
                Pw = submemeplex[q-1] # worst frog in the submemeplex

                # A candidate is kept unless it scores strictly below the worst frog.
                accepted = False

                # 1) Learn from the local best Pb.
                # NOTE(review): this step is scaled by the fitness gap (Pb-Pw)[0]
                # and is not limited by max_step - confirm this is intended.
                S = (Pb-Pw)[1:]*(Pb-Pw)[0]
                Uq = Pw[1:]+S
                if _sfla_is_feasible(Uq, rangeC, rangeGamma):
                    fq = SFLA_SVM_CV(x_train, y_train,n1,kernel,Uq[0],Uq[1])
                    accepted = not (fq < Pw[0])

                # 2) Infeasible or no improvement: learn from the global best.
                if not accepted:
                    S = numpy.random.random(num_parameter)*(frog_gb-Pw)[1:]
                    # Limit the step to [-max_step, +max_step]. The old code used
                    # min(S, -max_step) for negative steps, which enlarged them.
                    S = numpy.clip(S, -max_step, max_step)
                    Uq = Pw[1:]+S
                    if _sfla_is_feasible(Uq, rangeC, rangeGamma):
                        fq = SFLA_SVM_CV(x_train, y_train,n1,kernel,Uq[0],Uq[1])
                        accepted = not (fq < Pw[0])

                # 3) Still infeasible or no improvement: replace by a random frog.
                if not accepted:
                    Uq = _sfla_random_frog(rangeC, rangeGamma)
                    fq = SFLA_SVM_CV(x_train, y_train,n1,kernel,Uq[0],Uq[1])

                #--- Upgrade the memeplex and keep it ranked best first ---#
                memeplexes[im][subindex[q-1]] = numpy.r_[fq,Uq]
                memeplexes[im] = memeplexes[im][numpy.argsort(memeplexes[im][:,0])[::-1]]
        #######--- Local search end---######

        #--- Step 5--Shuffle memeplexes ---#
        f_parameter = memeplexes.reshape(m*n,num_parameter+1)
        f_parameter = f_parameter[numpy.argsort(f_parameter[:,0])[::-1]]

        # Record this iteration's best before the convergence test so the
        # history also contains the value that triggered the early stop.
        fBest_iteration = numpy.r_[fBest_iteration,f_parameter[0,0]]

        #--- Step 6--Check convergence ---#
        if f_parameter[0,0] > 0.99:
            print('The program was terminated because it reached the optimization goal with f = %.3f' %f_parameter[0,0])
            break

    #######--- Global search end---######

    return (f_parameter[0],fBest_iteration)

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any that scikit-learn emits while scoring the folds.
warnings.filterwarnings("ignore")
start = time.process_time()  # CPU time of this process, not wall-clock time
n_repeat = 10 # number of time that nested k-fold cross validation repeat
n_outer = 10 # number of splits for outer loop
n_inner = 10 # number of splits for inner loop
rangeC = [10**-12, 10**12] # list, float, range of parameter C,eg.[10**-2, 10**2]
rangeGamma = [10**-6, 1] # list, float, range of parameter Gamma,eg.[10**-6, 1]

num_parameter = 2# number of parameter to optimize
num_global = 30# the maximum number of global iterations
num_local = 20# the maximum number of local iterations
m =4 # the number of memeplexes
n = 8 # the number of frogs in each memeplex
q = 5 # the number of frogs in submemeplex
kernel = 'rbf'

roc_auc = dict()      # class index -> list with one AUC per outer fold
fold_binaries = []    # binarised test labels, one array per outer fold
fold_scores = []      # decision-function scores, one array per outer fold
for k in range(n_repeat):
    ##---Classification with nested 10*10-fold cross-validation---##
    #--- x is feature, y is label
    #--- a different seed per repeat gives a different outer partition ---#
    KF = StratifiedKFold(n_outer,shuffle=True, random_state=920+k)

    for train_index,test_index in KF.split(x,y):
        #---  Separate training set and test set; fill NaN with 0 ---#
        # fillna returns new frames, so nothing is written into a slice of x.
        x_train = x.iloc[train_index].fillna(0)
        x_test = x.iloc[test_index].fillna(0)
        y_train = y.iloc[train_index]

        ##---  optimize SVM with SFLA---##
        f_parameter,fBest_iteration = SFLA_RBF(num_parameter,num_global,num_local,m,n,q,n_inner,kernel,rangeC,rangeGamma,x_train,y_train)
        ##---  create and train the model; f_parameter is [f, C, gamma] ---##
        # NOTE(review): SFLA_SVM tunes the parameters with OneVsRestClassifier(SVC)
        # while a plain SVC is fitted here - confirm the difference is intended.
        clf = svm.SVC(kernel=kernel,C=f_parameter[1],gamma=f_parameter[2],probability=True,random_state=920)
        clf.fit(x_train, y_train)

        # Calculate the per-class AUC on the outer test fold
        n_classes = 7
        y_binary = label_binarize(y.iloc[test_index].values,classes=[0,1,2,3,4,5,6])
        y_score = clf.decision_function(x_test)

        # Collect each fold exactly once. The stacking used to sit inside the
        # class loop below, which appended every fold after the first 7 times.
        fold_binaries.append(y_binary)
        fold_scores.append(y_score)

        for i in range(n_classes):
            class_fpr, class_tpr, _thresholds = roc_curve(y_binary[:, i], y_score[:, i])
            roc_auc.setdefault(i, []).append(auc(class_fpr, class_tpr))
        print(roc_auc)

# Pooled out-of-fold labels and scores over all repeats and folds.
y_binary_all = numpy.vstack(fold_binaries)
y_score_all = numpy.vstack(fold_scores)
end = time.process_time()
print('ASVM takes '+str(end - start)+'seconds.\n')



In [None]:
# Per-class ROC curves for the classifier currently bound to `clf`.
# NOTE(review): `clf` is whatever the preceding cell fitted last, and it is
# scored here on the complete x / y, which includes the rows it was trained on -
# confirm this in-sample view is what the figures are meant to show.
n_classes = 7
y1 = label_binarize(y, classes=[0,1,2,3,4,5,6])
y_score = clf.decision_function(x)

# One ROC curve and one area per class (column i of y1 against column i of y_score)
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    curve_fpr, curve_tpr, _thresholds = roc_curve(y1[:, class_idx], y_score[:, class_idx])
    fpr[class_idx] = curve_fpr
    tpr[class_idx] = curve_tpr
    roc_auc[class_idx] = auc(curve_fpr, curve_tpr)

# One figure per class, with the chance diagonal drawn as a dashed line
for class_idx in range(n_classes):
    fig, ax = plt.subplots()
    ax.plot(fpr[class_idx], tpr[class_idx], label='ROC curve (area = %0.2f)' % roc_auc[class_idx])
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()