In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import cohen_kappa_score


In [None]:
def KNN_Function(input_1, filename,
                 city_codes=("SGP", "ICN", "NYC", "HKG", "TPE", "TYO", "ILR", "IEV", "VIE"),
                 k_values=range(3, 50), n_splits=5):
    """Train and evaluate a distance-weighted KNN classifier on one abundance table.

    Steps performed:
      1. Read ``input_1`` with ``pd.read_csv``, drop the optional ``"ovarall"``
         and ``"Unnamed: 0"`` columns, index by ``"species"`` and transpose, so
         that every remaining input column becomes one row (sample).
      2. Derive a class label per sample from its name: the first entry of
         ``city_codes`` contained in the name. Names matching no code are kept
         verbatim as their own label.
      3. Stratified 70/30 train/test split (``random_state=123``).
      4. Pick K by mean ``n_splits``-fold cross-validated score on the training
         split and save the score-vs-K plot via ``savefig(filename, dpi=300)``.
      5. Refit with the best K, then print the classification report and Cohen's
         kappa on the test split, the per-fold CV scores on the training split,
         and a classification report built from cross-validated predictions.

    Parameters
    ----------
    input_1 : str or path-like
        CSV file passed to ``pd.read_csv``; must contain a ``"species"`` column.
    filename : str
        Target passed to ``savefig`` for the score-vs-K figure.
    city_codes : sequence of str, optional
        Substrings identifying the class of a sample; earlier entries win when
        several match.
    k_values : iterable of int, optional
        Candidate neighbor counts. Defaults to 3..49.
    n_splits : int, optional
        Number of cross-validation folds. Defaults to 5.

    Returns
    -------
    None
        Results are printed / plotted; nothing is returned, so a notebook cell
        ending in this call shows no ``Out[]`` value.

    Raises
    ------
    ValueError
        If none of ``k_values`` is small enough for the training folds.
    """
    # ---- load and reshape --------------------------------------------------
    # NOTE(review): "ovarall" looks like a misspelling of "overall"; it is kept
    # as-is because it may match the real column header — confirm against the CSVs.
    samples = (
        pd.read_csv(input_1)
        .drop(columns=["ovarall", "Unnamed: 0"], errors="ignore")
        .set_index("species")
        .T
    )

    # ---- labels --------------------------------------------------------------
    def _city_of(sample_name):
        # First matching code wins; unmatched names fall through unchanged.
        return next((code for code in city_codes if code in sample_name), sample_name)

    # Labels live in their own Series, so the feature matrix never depends on
    # the label column happening to be the last one.
    X = samples
    Y = pd.Series([_city_of(name) for name in samples.index],
                  index=samples.index, name="Label")

    # ---- train / test split --------------------------------------------------
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, stratify=Y, random_state=123
    )

    # ---- choose the number of neighbors -------------------------------------
    # A KNN model cannot be queried for more neighbors than it was fitted on.
    # Inside n_splits-fold CV the model only sees the training part of each
    # fold, so K is capped at the smallest such part (assuming folds of
    # near-equal size). Without the cap, oversized K values yield failed/NaN
    # scores and an unreliable "best K".
    k_values = list(k_values)
    n_train = len(X_train)
    largest_validation_fold = -(-n_train // n_splits)  # ceiling division
    max_usable_k = n_train - largest_validation_fold
    candidate_ks = [k for k in k_values if k <= max_usable_k]
    if not candidate_ks:
        raise ValueError(
            "No candidate K fits the training folds: smallest K requested is "
            f"{min(k_values) if k_values else None}, but at most {max_usable_k} "
            "neighbors are available."
        )
    if len(candidate_ks) < len(k_values):
        print("Note: K values above", max_usable_k,
              "were skipped (too few training samples per fold).")

    acc = []
    for k in candidate_ks:
        candidate = KNeighborsClassifier(n_neighbors=k, weights="distance")
        fold_scores = cross_validate(candidate, X_train, Y_train, cv=n_splits)["test_score"]
        acc.append(fold_scores.mean())

    # nanargmax returns the first maximum (same tie-break as list.index(max(...)))
    # while ignoring any NaN score.
    best_pos = int(np.nanargmax(acc))
    best_k = candidate_ks[best_pos]
    print("Maximum accuracy:", acc[best_pos], "at K =", best_k)

    # Style is set before the axes exist so it applies to every figure,
    # including the one produced by the very first call.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(candidate_ks, acc, color='blue', linestyle='dashed',
            marker='o', markerfacecolor='red', markersize=10)
    ax.set_title('accuracy vs. K Value')
    ax.set_xlabel('k')
    ax.set_ylabel(f'Accuracy ({n_splits}-fold cross validation)')
    # The figure is deliberately not closed here: with `%matplotlib inline` the
    # notebook renders still-open figures when the cell finishes.
    fig.savefig(filename, dpi=300)

    # ---- final model: fit on train, evaluate on held-out test ---------------
    KNN = KNeighborsClassifier(n_neighbors=best_k, weights="distance")
    KNN.fit(X_train, Y_train)
    Y_pred = KNN.predict(X_test)
    print()
    print("classification report:")
    print(metrics.classification_report(Y_test, Y_pred))
    print("Cohen Kappa value:", cohen_kappa_score(Y_test, Y_pred))

    # ---- cross-validated scores of the chosen model on the training split ---
    cv_result = cross_validate(KNN, X_train, Y_train, cv=n_splits)
    print("Test score for each fold:", cv_result['test_score'])
    print(f"Mean test score for {n_splits} fold cross validation:",
          cv_result['test_score'].mean())

    # ---- classification report from out-of-fold predictions -----------------
    cv_predictions = cross_val_predict(KNN, X_train, Y_train, cv=n_splits)
    print(metrics.classification_report(Y_train, cv_predictions))

In [None]:
# Run the KNN workflow on top10.csv; the accuracy-vs-K plot is saved under "Top 10".
KNN_Function(input_1="top10.csv", filename="Top 10")

In [None]:
# Run the KNN workflow on top10_1.csv; the accuracy-vs-K plot is saved under "Top 10_1".
KNN_Function(input_1="top10_1.csv", filename="Top 10_1")

In [None]:
# Run the KNN workflow on top10_2.csv; the accuracy-vs-K plot is saved under "Top 10_2".
KNN_Function(input_1="top10_2.csv", filename="Top 10_2")

In [None]:
# Run the KNN workflow on top20.csv; the accuracy-vs-K plot is saved under "Top 20".
KNN_Function(input_1="top20.csv", filename="Top 20")

In [None]:
# Run the KNN workflow on top20_1.csv; the accuracy-vs-K plot is saved under "Top 20_1".
KNN_Function(input_1="top20_1.csv", filename="Top 20_1")

In [None]:
# Run the KNN workflow on top20_2.csv; the accuracy-vs-K plot is saved under "Top 20_2".
KNN_Function(input_1="top20_2.csv", filename="Top 20_2")

In [None]:
# Run the KNN workflow on top30.csv; the accuracy-vs-K plot is saved under "Top 30".
KNN_Function(input_1="top30.csv", filename="Top 30")

In [None]:
# Run the KNN workflow on top30_1.csv; the accuracy-vs-K plot is saved under "Top 30_1".
KNN_Function(input_1="top30_1.csv", filename="Top 30_1")

In [None]:
# Run the KNN workflow on top30_2.csv; the accuracy-vs-K plot is saved under "Top 30_2".
KNN_Function(input_1="top30_2.csv", filename="Top 30_2")

In [None]:
# Run the KNN workflow on top100.csv; the accuracy-vs-K plot is saved under "Top 100".
KNN_Function(input_1="top100.csv", filename="Top 100")

In [None]:
# Run the KNN workflow on top100_1.csv; the accuracy-vs-K plot is saved under "Top 100_1".
KNN_Function(input_1="top100_1.csv", filename="Top 100_1")

In [None]:
# Run the KNN workflow on top100_2.csv; the accuracy-vs-K plot is saved under "Top 100_2".
KNN_Function(input_1="top100_2.csv", filename="Top 100_2")

In [None]:
# Run the KNN workflow on the "bacteria" input; the accuracy-vs-K plot is saved under "all bacteria".
# NOTE(review): unlike every other input in this notebook, this path has no ".csv"
# extension — confirm the file is really named "bacteria".
KNN_Function(input_1="bacteria", filename="all bacteria")