In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

In [30]:
# Load the dataset from the CSV file that sits next to the notebook.
dataset_path = 'balanced_filled_Dataset-vf.csv'
data = pd.read_csv(dataset_path)


In [31]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,Y
0,2892.0,75.0,7.0,95.0,9.0,1889.0,228.0,228.0,133.0,2371.0,0.79,39.0,3,42,Apple
1,3208.0,0.0,9.0,124.0,-2.0,5394.0,206.0,222.0,154.0,900.0,0.97,-740.0,1,42,Apple
2,3245.0,97.630426,3.0,564.0,66.0,4387.0,220.0,233.0,150.0,2650.0,0.55,4.0,1,42,Apple
3,3157.0,307.0,27.0,120.0,35.0,2971.0,138.0,213.0,211.0,2467.0,0.19,637.0,1,44,Apple
4,3246.0,18.0,9.0,120.0,11.0,4333.0,213.0,221.0,144.0,972.0,0.93,833.0,1,59,Apple


In [32]:
# Separate the target column 'Y' from the remaining feature columns.
Y = data['Y']
X = data.drop(columns=['Y'])

In [33]:
# Features and target must have the same number of rows.
(X.shape, Y.shape)

((7774, 14), (7774,))

In [34]:
# Two successive stratified 80/20 splits with the same seed:
#   1) all data        -> (train + validation) / test
#   2) train + validation -> train / validation
split_options = dict(train_size=0.8, random_state=20211008)
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, **split_options)

In [35]:
import itertools

# Hyper-parameter grid for KNeighborsClassifier.
n_neighbors = list(range(3, 16, 2))  # odd neighbor counts: 3, 5, ..., 15
weights = ["uniform", "distance"]  # weight function used in prediction
# Algorithm used to compute the nearest neighbors.
# NOTE(review): the recorded search output shows identical scores across these
# values for a fixed (n_neighbors, weights, p) -- confirm whether this axis is needed.
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
p = [2, 3, 5, 7]  # power parameter for the Minkowski metric

# Cartesian product of all axes, one tuple per candidate:
# (n_neighbors, weights, algorithm, p).
parameters = [n_neighbors, weights, algorithm, p]
parameters_combinations = [combo for combo in itertools.product(*parameters)]
len(parameters_combinations)


224

In [36]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from imblearn.metrics import geometric_mean_score

In [37]:
# Exhaustive validation-set search over every KNN configuration in
# `parameters_combinations`. Each candidate is a StandardScaler + KNN pipeline
# fitted on the training split and scored on the validation split.
#
# NOTE: the "accuracy" key/variable name is kept for backward compatibility,
# but the value stored there is the unweighted mean of macro-F1, micro-F1,
# MCC and macro G-mean -- it is not classification accuracy.
best_acc_params = {"n_neighbors": None, "weights": None, "algorithm": None,
                   "p": None, "accuracy": {
                       "average_score": 0,
                       "f1_score_macro": 0,
                       "f1_score_micro": 0,
                       "MCC": 0,
                       "Gmean": 0
                   }}
for params in parameters_combinations:
    # Unpack into names that do not clash with the grid lists
    # (`n_neighbors`, `weights`, `algorithm`, `p`) defined in an earlier cell.
    k, weight_fn, algo, minkowski_p = params

    classifer = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k, weights=weight_fn,
                                                                       algorithm=algo, p=minkowski_p))
    classifer = classifer.fit(X_train, y_train)
    y_pred = classifer.predict(X_val)

    f1_score_macro = f1_score(y_val, y_pred, average='macro')
    f1_score_micro = f1_score(y_val, y_pred, average='micro')
    MCC_score = matthews_corrcoef(y_val, y_pred)
    Gmean_score = geometric_mean_score(y_val, y_pred, average='macro')
    accuracy = (f1_score_macro + f1_score_micro + MCC_score + Gmean_score) / 4.0

    # Only log candidates above 0.50 to keep the output readable. The labels
    # now name the KNN hyper-parameters that are actually being printed.
    if accuracy > 0.50:
        print(f'parameters: \n n_neighbors: {k}, weights: {weight_fn}, algorithm: {algo}, p: {minkowski_p}')
        print(f'accuracy: {accuracy}')

    # Track the best candidate independently of the logging threshold, so a
    # configuration is always selected even when no score exceeds 0.50.
    # Strict '>' keeps the first candidate on ties.
    if best_acc_params["n_neighbors"] is None or accuracy > best_acc_params['accuracy']['average_score']:
        best_acc_params.update({"n_neighbors": k, "weights": weight_fn, "algorithm": algo,
                                "p": minkowski_p, "accuracy": {
                                    "average_score": accuracy,
                                    "f1_score_macro": f1_score_macro,
                                    "f1_score_micro": f1_score_micro,
                                    "MCC": MCC_score,
                                    "Gmean": Gmean_score
                                }})

parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 2
accuracy: 0.8588652862368236
parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 3
accuracy: 0.8313497595994134
parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 5
accuracy: 0.8140669752849207
parameters: 
 criterion: 3, bootstrap: uniform, max depth: auto, max features: 7
accuracy: 0.8090194184077337
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 2
accuracy: 0.8588652862368236
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 3
accuracy: 0.8313497595994134
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 5
accuracy: 0.8140669752849207
parameters: 
 criterion: 3, bootstrap: uniform, max depth: ball_tree, max features: 7
accuracy: 0.8090194184077337
parameters: 
 criterion: 3, bootstrap: uniform, max depth: kd_tree, max features: 2
accuracy: 0.8588

parameters: 
 criterion: 7, bootstrap: uniform, max depth: kd_tree, max features: 3
accuracy: 0.8294048587898425
parameters: 
 criterion: 7, bootstrap: uniform, max depth: kd_tree, max features: 5
accuracy: 0.8136930683350352
parameters: 
 criterion: 7, bootstrap: uniform, max depth: kd_tree, max features: 7
accuracy: 0.8074849158857917
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 2
accuracy: 0.8419132608505333
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 3
accuracy: 0.8294048587898425
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 5
accuracy: 0.8136930683350352
parameters: 
 criterion: 7, bootstrap: uniform, max depth: brute, max features: 7
accuracy: 0.8074849158857917
parameters: 
 criterion: 7, bootstrap: distance, max depth: auto, max features: 2
accuracy: 0.848726592617745
parameters: 
 criterion: 7, bootstrap: distance, max depth: auto, max features: 3
accuracy: 0.8331609425257

parameters: 
 criterion: 11, bootstrap: distance, max depth: auto, max features: 5
accuracy: 0.8086259149407776
parameters: 
 criterion: 11, bootstrap: distance, max depth: auto, max features: 7
accuracy: 0.7967159735583056
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 2
accuracy: 0.8339112986383679
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 3
accuracy: 0.8217550573945348
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 5
accuracy: 0.8086259149407776
parameters: 
 criterion: 11, bootstrap: distance, max depth: ball_tree, max features: 7
accuracy: 0.7967159735583056
parameters: 
 criterion: 11, bootstrap: distance, max depth: kd_tree, max features: 2
accuracy: 0.8339112986383679
parameters: 
 criterion: 11, bootstrap: distance, max depth: kd_tree, max features: 3
accuracy: 0.8217550573945348
parameters: 
 criterion: 11, bootstrap: distance, max depth: kd_tree, max feat

parameters: 
 criterion: 15, bootstrap: distance, max depth: kd_tree, max features: 7
accuracy: 0.7846181477321896
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 2
accuracy: 0.8308619979588443
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 3
accuracy: 0.8166408672046113
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 5
accuracy: 0.7919932460745583
parameters: 
 criterion: 15, bootstrap: distance, max depth: brute, max features: 7
accuracy: 0.7846181477321896


In [38]:
# Show the configuration selected by the validation search and its scores.
best_acc_params

{'n_neighbors': 3,
 'weights': 'uniform',
 'algorithm': 'auto',
 'p': 2,
 'accuracy': {'average_score': 0.8588652862368236,
  'f1_score_macro': 0.8512424635368102,
  'f1_score_micro': 0.8464630225080386,
  'MCC': 0.8219832566613708,
  'Gmean': 0.9157724022410747}}

In [39]:
# Final check on the held-out test split, using the hyper-parameters selected
# on the validation split. The pipeline is fitted on X_train only.
selected_params = {name: best_acc_params[name]
                   for name in ('n_neighbors', 'weights', 'algorithm', 'p')}
KNN = make_pipeline(StandardScaler(), KNeighborsClassifier(**selected_params))
KNN = KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)

# Same four metrics that were averaged during the search, now on the test split.
Gmean_score = geometric_mean_score(y_test, y_pred, average='macro')
MCC_score = matthews_corrcoef(y_test, y_pred)
f1_score_micro = f1_score(y_test, y_pred, average='micro')
f1_score_macro = f1_score(y_test, y_pred, average='macro')

print(f'f1_score (macro): {f1_score_macro}')
print(f'f1_score (micro): {f1_score_micro}')
print(f'MCC: {MCC_score}')
print(f'Gmean: {Gmean_score}')

    

f1_score (macro): 0.8498026847088772
f1_score (micro): 0.8450160771704179
MCC: 0.8203994971692073
Gmean: 0.9156783755122022
