In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import re, os, sys
import math
import torch
import warnings
warnings.filterwarnings("ignore")


Cross-Validation on the train dataset

In [2]:
def cv(clf, X, y, nr_fold):
    ix = []
    for i in range(0, len(y)):
        ix.append(i)
    ix = np.array(ix)
    
    allACC = []
    allSENS = []
    allSPEC = []
    allMCC = []
    allAUC = []
    for j in range(0, nr_fold):
        train_ix = ((ix % nr_fold) != j)
        test_ix = ((ix % nr_fold) == j)
        train_X, test_X = X[train_ix], X[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]
        clf.fit(train_X, train_y)        
        p = clf.predict(test_X)
        pr = clf.predict_proba(test_X)[:,1]   
        TP=0   
        FP=0
        TN=0
        FN=0
        for i in range(0,len(test_y)):
            if test_y[i]==1 and p[i]==1:
                TP+= 1
            elif test_y[i]==1 and p[i]==0:
                FN+= 1
            elif test_y[i]==0 and p[i]==1:
                FP+= 1
            elif test_y[i]==0 and p[i]==0:
                TN+= 1
        ACC = (TP+TN)/(TP+FP+TN+FN)
        SENS = TP/(TP+FN)
        SPEC = TN/(TN+FP)
        det = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
        if (det == 0):            
            MCC = 0                
        else:
            MCC = ((TP*TN)-(FP*FN))/det
        AUC = roc_auc_score(test_y, pr)
       
        allACC.append(ACC)
        allSENS.append(SENS)
        allSPEC.append(SPEC)
        allMCC.append(MCC)
        allAUC.append(AUC)
        
    return np.mean(allACC), np.mean(allSENS), np.mean(allSPEC), np.mean(allMCC), np.mean(allAUC)


Independent test on the test dataset

In [3]:
def test(clf, X, y, Xt, yt):
    train_X, test_X = X, Xt
    train_y, test_y = y, yt       
    p = clf.predict(test_X)
    pr = clf.predict_proba(test_X)[:,1]   
    TP=0   
    FP=0
    TN=0
    FN=0
    for i in range(0,len(test_y)):
        if test_y[i]==1 and p[i]==1:
            TP+= 1
        elif test_y[i]==1 and p[i]==0:
            FN+= 1
        elif test_y[i]==0 and p[i]==1:
            FP+= 1
        elif test_y[i]==0 and p[i]==0:
            TN+= 1
    ACC = (TP+TN)/(TP+FP+TN+FN)
    SENS = TP/(TP+FN)
    SPEC = TN/(TN+FP)
    det = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    if (det == 0):            
        MCC = 0                
    else:
        MCC = ((TP*TN)-(FP*FN))/det
    AUC = roc_auc_score(test_y, pr)
   
    return ACC, SENS, SPEC, MCC, AUC

Load dataset and split the dataset(Please change your paths)

In [8]:
pos_ade = torch.load("C:\\Windows\\System32\\PLMTHP\\data\\Feature\\pos_ade.pt")
neg_ade = torch.load("C:\\Windows\\System32\\PLMTHP\\data\\Feature\\neg_ade.pt")

pos = pos_ade.numpy()
neg = neg_ade.numpy()

all_data = np.concatenate((pos, neg), axis=0)
X = all_data
y=np.zeros(1302,dtype=int)
for i in range(1302):
    if i<651:
        y[i]=1
    else:
        y[i]=0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X=X_train
y=y_train
Xt=X_test
yt=y_test

del pos_ade,neg_ade,pos,neg,all_data

651.0
651


Tuning the best parameters of KNN classifier and train the model

In [5]:
param = [i for i in np.arange(1,100, dtype=int)]
acc = np.zeros(len(param)) 
sens = np.zeros(len(param)) 
spec = np.zeros(len(param)) 
mcc = np.zeros(len(param)) 
auc = np.zeros(len(param))

for i in range(1,len(param)):  
    clf = KNeighborsClassifier(n_neighbors = int(i))
    acc[i], sens[i], spec[i], mcc[i], auc[i] = cv(clf, X,y,10)
choose = np.argmax(auc)
clf = clf = KNeighborsClassifier(n_neighbors = param[choose]).fit(X,y)
print("Best n_neighbors:"+str(param[choose]))


Best n_neighbors:14


Cross-validation evaluation

In [7]:
acc, sens, spec, mcc, auc = cv(clf, X, y, 10) 
print("ACC:"+str(acc)+"\n"+"SENS:"+str(sens)+"\n"+"SPEC:"+str(spec)+"\n"+"MCC:"+str(mcc)+"\n"+"AUC:"+str(auc)+"\n")


ACC:0.8245945945945946
SENS:0.7888477215290215
SPEC:0.8594419487263991
MCC:0.6490361758120748
AUC:0.9082641538194066



Independent test evaluation

In [6]:
acc, sens, spec, mcc, auc = test(clf, X, y, Xt, yt) 
print("ACC:"+str(acc)+"\n"+"SENS:"+str(sens)+"\n"+"SPEC:"+str(spec)+"\n"+"MCC:"+str(mcc)+"\n"+"AUC:"+str(auc)+"\n")


ACC:0.826530612244898
SENS:0.7941176470588235
SPEC:0.8617021276595744
MCC:0.655819774718398
AUC:0.9164580725907384

