In [None]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix as cm
import matplotlib.pyplot as plt
import pandas as pd
import time

from numpy import *
from sklearn import *
from scipy import stats

In [None]:
# Parse every worksheet of the workbook into a dict keyed by sheet name.
# The context manager closes the workbook handle once all sheets are parsed.
with pd.ExcelFile('data2.xlsx') as xl:
    dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}

# Main table: the sheet named '7'.
data1 = dfs['7']
# Patient -> age-at-diagnosis lookup taken from sheet '1'.
# NOTE(review): row label 554 is removed before de-duplication; the reason is
# not recorded in this notebook -- confirm it is still the right row to drop.
data2 = dfs['1'].loc[:, ['Patient', 'Age at Diagnosis']].drop([554]).drop_duplicates()
# Additional per-patient columns from the first dataset (CSV).
data3 = pd.read_csv('data1.csv')

# Left-join everything on the patient identifier.
combined_data = data1.set_index('Patient').join(data2.set_index('Patient')).join(data3.set_index('Patient'))

# Binary target: 1 when 'Patient Type' is 'Healthy', 0 otherwise.
combined_data['label'] = (combined_data['Patient Type'] == 'Healthy').astype(int)
combined_data = combined_data.drop(['Patient Type'],axis=1)
print('The number of samples and features are %d and %d, respectively'%(combined_data.shape[0],combined_data.shape[1]))


# Select features and target by column name rather than by hard-coded
# position, so that a change in the number of joined columns cannot silently
# turn a feature column into the target. Missing feature values become 0.
# fillna returns a new frame, so combined_data itself is left unmodified.
x = combined_data.drop(columns='label').fillna(0)
y = combined_data['label']

In [None]:
def classificationCV(x,y,classifier,n):
    """Evaluate a binary classifier with stratified n-fold cross-validation.

    The data are split with a shuffled ``StratifiedKFold`` (fixed
    ``random_state=920``). For every fold a fresh copy of ``classifier`` is
    fitted on the training part and scored on the held-out part. The held-out
    scores of all folds are pooled and used to print a confusion matrix and
    one line with accuracy, precision, recall, F1 and pooled ROC AUC. The CPU
    time spent in the fold loop (``time.process_time``) is printed as well.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned with ``x``; label ``1`` is treated as the positive class.
    classifier : estimator
        Object providing ``fit`` and ``predict_proba``. It is cloned for each
        fold, so the instance passed in is not fitted by this function.
    n : int
        Number of folds.

    Returns
    -------
    tuple
        ``(y_test, y_score, min_auc, max_auc, std_auc, auc_kfold)`` where
        ``y_test`` and ``y_score`` are lists of the pooled held-out labels and
        scores (in fold order), ``auc_kfold`` is the list of per-fold AUCs and
        the three middle entries are its minimum, maximum and standard
        deviation.
    """
    # Local imports keep the function independent of the notebook's star imports.
    import numpy as np
    from sklearn.base import clone

    #---  define K-fold cross validation ---#
    KF = StratifiedKFold(n_splits=n, shuffle=True, random_state=920)
    y_score = []
    y_test = []
    auc_kfold = []
    start = time.process_time()
    for train_index, test_index in KF.split(x, y):
        #---  separate training set and test set ---#
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train = y.iloc[train_index]
        y_fold = y.iloc[test_index].values

        #---  create and train the model ---#
        # Clone so that no fitted state is carried from one fold to the next
        # and the caller's estimator is left untouched. safe=False falls back
        # to a deep copy for objects that are not scikit-learn estimators.
        clf = clone(classifier, safe=False)
        clf.fit(x_train, y_train)

        #---  predict ---#
        # Score the held-out fold once and reuse the result below.
        # Column 1 is used as the score for label 1.
        # NOTE(review): assumes clf.classes_ is ordered [0, 1] -- confirm.
        fold_score = clf.predict_proba(x_test)[:, 1]
        y_score.extend(fold_score.tolist())
        y_test.extend(y_fold)
        fpr, tpr, _ = roc_curve(y_fold, fold_score, pos_label=1)
        auc_kfold.append(auc(fpr, tpr))
    end = time.process_time()
    print('The algorithm takes '+str(end - start)+'seconds.\n')

    #--- calculate the AUC of the pooled held-out predictions ---#
    fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=1)
    roc_auc = auc(fpr, tpr)

    #--- calculate accuracy, precision_score, recall_score and f1_score ---#
    # Scores strictly above 0.5 are predicted as class 1. For scores in
    # [0, 1] this matches round(), which sends exactly 0.5 to 0.
    y_pred = [int(score > 0.5) for score in y_score]
    print(cm(y_test, y_pred))
    a = accuracy_score(y_test, y_pred)
    p = precision_score(y_test, y_pred)
    r = recall_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred)

    print('%0.2f %0.2f %0.2f %0.2f %0.3f'% (a, p, r, f1score, roc_auc))

    return y_test, y_score, min(auc_kfold), max(auc_kfold), np.std(auc_kfold), auc_kfold

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB

# One entry per model: (text printed before the run, legend tag, estimator, line style).
# random_state is fixed on the estimators that are otherwise randomised
# (random forest, SVC probability calibration) so that re-running the cell
# reproduces the same numbers.
models = [
    ('\n RF', 'RF', RandomForestClassifier(random_state=0), '-'),
    ('\n KNN', 'KNN', neighbors.KNeighborsClassifier(), '-.'),
    ('\n GPC', 'GPC', gaussian_process.GaussianProcessClassifier(kernel=1.0 * RBF(1.0)), '--'),
    ('\n GNB', 'GNB', GaussianNB(), ':'),
    ('\n Gradient Boosting', 'GBM',
     GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0), '-'),
    ('SVM', 'SVM', svm.SVC(kernel='rbf', gamma='scale', probability=True, random_state=0), '-.'),
]

fig = plt.figure(figsize=(10,10))
# The loop runs at cell level on purpose: after it finishes, y_test, y_score,
# auc_kfold, fpr, tpr, roc_auc, ... hold the values of the last model (SVM),
# exactly as they did when the six stanzas were written out by hand.
for banner, tag, classifier, linestyle in models:
    print(banner)
    y_test, y_score, min_auc, max_auc, std_auc, auc_kfold = classificationCV(x, y, classifier, 10)
    ##Plot ROC and calculate AUC
    fpr, tpr, threshold = roc_curve(y_test, y_score, pos_label=1)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label='%s      %0.3f ±%0.3f' % (tag, roc_auc, std_auc), linestyle=linestyle)
    print('auc_kfold')
    print(auc_kfold)

# The curve labels are only visible through a legend; also name the axes so
# the figure can be read on its own.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(title='Pooled AUC ± per-fold SD', loc='lower right');