In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

In [2]:
training_set = pd.read_csv('processed_data/training_set.csv')
testing_set = pd.read_csv('processed_data/testing_set.csv')

In [3]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6417 entries, 0 to 6416
Data columns (total 49 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    FC11   6417 non-null   float64
 1    FC12   6417 non-null   float64
 2    FC13   6417 non-null   float64
 3    FC14   6417 non-null   float64
 4    CA21   6417 non-null   float64
 5    CA22   6417 non-null   float64
 6    CA23   6417 non-null   float64
 7    CA24   6417 non-null   float64
 8    CA25   6417 non-null   float64
 9    CA26   6417 non-null   float64
 10   CA30   6417 non-null   float64
 11   CA31   6417 non-null   float64
 12   CA32   6417 non-null   float64
 13   CA33   6417 non-null   float64
 14   CA34   6417 non-null   float64
 15   CA36   6417 non-null   float64
 16   CA37   6417 non-null   float64
 17   CA38   6417 non-null   float64
 18   CA39   6417 non-null   float64
 19   CA40   6417 non-null   float64
 20   CA41   6417 non-null   float64
 21   CA42   6417 non-null   float64
 22  

In [4]:
testing_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 49 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    FC11   957 non-null    float64
 1    FC12   957 non-null    float64
 2    FC13   957 non-null    float64
 3    FC14   957 non-null    float64
 4    CA21   957 non-null    float64
 5    CA22   957 non-null    float64
 6    CA23   957 non-null    float64
 7    CA24   957 non-null    float64
 8    CA25   957 non-null    float64
 9    CA26   957 non-null    float64
 10   CA30   957 non-null    float64
 11   CA31   957 non-null    float64
 12   CA32   957 non-null    float64
 13   CA33   957 non-null    float64
 14   CA34   957 non-null    float64
 15   CA36   957 non-null    float64
 16   CA37   957 non-null    float64
 17   CA38   957 non-null    float64
 18   CA39   957 non-null    float64
 19   CA40   957 non-null    float64
 20   CA41   957 non-null    float64
 21   CA42   957 non-null    float64
 22   C

In [5]:
X_train = training_set.drop('Y', axis=1)
y_train = training_set['Y']
X_test = testing_set.drop('Y', axis=1)
y_test = testing_set['Y']

In [6]:
X_train.shape, X_train.shape

((6417, 48), (6417, 48))

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=20211008, 
                                                  stratify=y_train)

In [8]:
import itertools
n_neighbors = [3,5,7,9,11,13,15] 
weights = ["uniform", "distance"] # Weight function used in prediction.
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'] # Algorithm used to compute the nearest neighbors.
p = [2, 3, 5,7] # Power parameter for the Minkowski metric. 
parameters = [n_neighbors,weights,algorithm, p]  
parameters_combinations = list(itertools.product(*parameters))
len(parameters_combinations) 


224

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from imblearn.metrics import geometric_mean_score

In [10]:
best_acc_params = {"n_neighbors": None, "weights": None, "algorithm": None,
                         "p": None, "accuracy": {
                             "average_score": 0,
                             "f1_score_macro": 0,
                             "f1_score_micro": 0,
                             "MCC": 0,
                             "Gmean": 0
                         }}
for params in parameters_combinations:


    classifer = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=params[0], weights=params[1], 
                                                                       algorithm=params[2], p=params[3]))
    classifer = classifer.fit(X_train,y_train)
    y_pred = classifer.predict(X_val)
    

    f1_score_macro = f1_score(y_val, y_pred, average='macro')
    f1_score_micro = f1_score(y_val, y_pred, average='micro')
    MCC_score = matthews_corrcoef(y_val, y_pred)
    Gmean_score = geometric_mean_score(y_val,y_pred, average='macro')
    accuracy = (f1_score_macro + f1_score_micro + MCC_score + Gmean_score) / 4.0
    
    if accuracy > 0.50:
        print(f'parameters: \n criterion: {params[0]}, bootstrap: {params[1]}, max depth: {params[2]}, max features: {params[3]}')
        print(f'accuracy: {accuracy}')
        if accuracy > best_acc_params['accuracy']['average_score']:
            best_acc_params.update({"n_neighbors": params[0], "weights": params[1], "algorithm": params[2],
                         "p": params[3], "accuracy": {
                             "average_score": accuracy,
                             "f1_score_macro": f1_score_macro,
                             "f1_score_micro": f1_score_micro,
                             "MCC": MCC_score,
                             "Gmean": Gmean_score
                         }})

parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 2
accuracy: 0.8880829277560903
parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 3
accuracy: 0.8829471100328488
parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 5
accuracy: 0.875554484831802
parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 7
accuracy: 0.8741815033551875
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 2
accuracy: 0.8880829277560903
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 3
accuracy: 0.8829471100328488
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 5
accuracy: 0.875554484831802
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 7
accuracy: 0.8741815033551875
parameters: 
 criterion: 3, bootstrap: uniform, max depth: kd_tree, max features: 2
accuracy: 0.888082

parameters: 
 criterion: 7, bootstrap: uniform, max depth: kd_tree, max features: 3
accuracy: 0.8630731169133309
parameters: 
 criterion: 7, bootstrap: uniform, max depth: kd_tree, max features: 5
accuracy: 0.851105608458609
parameters: 
 criterion: 7, bootstrap: uniform, max depth: kd_tree, max features: 7
accuracy: 0.8495975274049876
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 2
accuracy: 0.8757697289761543
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 3
accuracy: 0.8630731169133309
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 5
accuracy: 0.851105608458609
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 7
accuracy: 0.8495975274049876
parameters: 
 criterion: 7, bootstrap: distance, max depth: auto, max features: 2
accuracy: 0.8855518414368733
parameters: 
 criterion: 7, bootstrap: distance, max depth: auto, max features: 3
accuracy: 0.87254565727726

parameters: 
 criterion: 11, bootstrap: distance, max depth: auto, max features: 5
accuracy: 0.8584195075849986
parameters: 
 criterion: 11, bootstrap: distance, max depth: auto, max features: 7
accuracy: 0.846239094269925
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 2
accuracy: 0.8838089745925211
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 3
accuracy: 0.8676985652296118
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 5
accuracy: 0.8584195075849986
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 7
accuracy: 0.846239094269925
parameters: 
 criterion: 11, bootstrap: distance, max depth: kd_tree, max features: 2
accuracy: 0.8838089745925211
parameters: 
 criterion: 11, bootstrap: distance, max depth: kd_tree, max features: 3
accuracy: 0.8676985652296118
parameters: 
 criterion: 11, bootstrap: distance, max depth: kd_tree, max featur

parameters: 
 criterion: 15, bootstrap: distance, max depth: kd_tree, max features: 5
accuracy: 0.8510245227129041
parameters: 
 criterion: 15, bootstrap: distance, max depth: kd_tree, max features: 7
accuracy: 0.8393558651458042
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 2
accuracy: 0.8755239550924925
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 3
accuracy: 0.8585497651708958
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 5
accuracy: 0.8510245227129041
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 7
accuracy: 0.8393558651458042


In [11]:
best_acc_params

{'n_neighbors': 3,
 'weights': 'distance',
 'algorithm': 'auto',
 'p': 2,
 'accuracy': {'average_score': 0.8923385370152274,
  'f1_score_macro': 0.8858051899901557,
  'f1_score_micro': 0.8831775700934581,
  'MCC': 0.8636147542181803,
  'Gmean': 0.9367566337591153}}

In [12]:
KNN = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=best_acc_params['n_neighbors'],
                                                           weights=best_acc_params['weights'], 
                                                           algorithm=best_acc_params['algorithm'], p=best_acc_params['p']))
KNN = KNN.fit(X_train,y_train)
y_pred = KNN.predict(X_test)




f1_score_macro = f1_score(y_test, y_pred, average='macro')
f1_score_micro = f1_score(y_test, y_pred, average='micro')
MCC_score = matthews_corrcoef(y_test, y_pred)
Gmean_score = geometric_mean_score(y_test,y_pred, average='macro')


print(f'f1_score (macro): {f1_score_macro}')
print(f'f1_score (micro): {f1_score_micro}')
print(f'MCC: {MCC_score}')
print(f'Gmean: {Gmean_score}')

    

f1_score (macro): 0.7766901067420064
f1_score (micro): 0.8213166144200627
MCC: 0.7776283395000059
Gmean: 0.906210733777856
