## KNN MODELS

In [36]:
import os

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier


## OverSampling Method

In [37]:
def over_sampling(X_train, y_train, random_state=42):
    """Balance the training classes with SMOTE.

    Parameters
    ----------
    X_train : training features, passed unchanged to ``SMOTE.fit_resample``.
    y_train : training labels, passed unchanged to ``SMOTE.fit_resample``.
    random_state : int or None, default 42
        Seed forwarded to ``SMOTE`` so the synthetic samples (and every
        metric computed downstream) are reproducible across runs.
        Pass ``None`` to get a fresh, unseeded resampling.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
        The inputs are not modified by this function.
    """
    sampler = SMOTE(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

## Split Data

In [38]:
# Directory holding the pre-split CSV files. Later cells build file names with
# plain string concatenation (PATH + 'file.csv'), so PATH must end with a path
# separator. Set the KNN_DATA_PATH environment variable to run the notebook on
# another machine; when it is unset or empty the original location is used.
PATH = os.environ.get('KNN_DATA_PATH') or 'F:\\clustering-classification-dashboard\\data\\'
if not PATH.endswith(('/', '\\')):
    PATH += os.sep
def split_data(train_data, val_data, target='species'):
    """Separate features and target for the training and validation frames.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows, including the ``target`` column.
    val_data : pandas.DataFrame
        Validation rows, including the ``target`` column.
    target : str, default 'species'
        Name of the label column to split off.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` where each ``X_*`` is the frame
        without the ``target`` column and each ``y_*`` is that column.
        The input frames are left untouched (``drop`` returns a new frame).

    Raises
    ------
    KeyError
        If ``target`` is missing from either frame.
    """
    X_train, y_train = train_data.drop(columns=[target]), train_data[target]
    X_val, y_val = val_data.drop(columns=[target]), val_data[target]
    return X_train, y_train, X_val, y_val
    

## Model Method

In [39]:
def model(X_train, y_train, X_val, y_val, oversampling=True, n_neighbors=1):
    """Fit a KNN classifier and print its validation diagnostics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels. ``y_train`` must provide
        ``value_counts()`` (e.g. a pandas Series).
    X_val, y_val
        Validation features and true labels. These are never resampled.
    oversampling : bool, default True
        When true, the training split is first rebalanced through
        ``over_sampling``; otherwise it is used as given.
    n_neighbors : int, default 1
        Forwarded to ``KNeighborsClassifier``.

    Returns
    -------
    None
        Results are printed, in this order: class counts of the labels the
        classifier was fitted on, the confusion matrix, and the
        classification report for the validation split.
    """
    if oversampling:
        X_train, y_train = over_sampling(X_train, y_train)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Class balance of what was actually fitted (post-resampling if enabled).
    print(y_train.value_counts())

    # Predict once and reuse the result for both reports.
    y_pred = knn.predict(X_val)
    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred))


## KNN Without Anything

In [40]:
# Load the "without_anything" train/validation CSVs from the data directory.
train_data = pd.read_csv(PATH + 'train_data_without_anything.csv')
val_data = pd.read_csv(PATH + 'val_data_without_anything.csv')


In [41]:
# Separate the features from the label column for both splits.
X_train, y_train, X_val, y_val = split_data(train_data=train_data, val_data=val_data)

In [42]:
# Evaluate KNN with oversampling of the training split enabled.
model(X_train, y_train, X_val, y_val, oversampling=True)

0    121
1    121
2    121
Name: species, dtype: int64
[[26  0  4]
 [ 2 21  2]
 [ 7  0  7]]
              precision    recall  f1-score   support

           0       0.74      0.87      0.80        30
           1       1.00      0.84      0.91        25
           2       0.54      0.50      0.52        14

    accuracy                           0.78        69
   macro avg       0.76      0.74      0.74        69
weighted avg       0.79      0.78      0.78        69



In [43]:
# Same evaluation, but fitted on the training split as-is (no oversampling).
model(X_train, y_train, X_val, y_val, oversampling=False)

0    121
1     98
2     54
Name: species, dtype: int64
[[26  0  4]
 [ 2 22  1]
 [ 7  0  7]]
              precision    recall  f1-score   support

           0       0.74      0.87      0.80        30
           1       1.00      0.88      0.94        25
           2       0.58      0.50      0.54        14

    accuracy                           0.80        69
   macro avg       0.78      0.75      0.76        69
weighted avg       0.80      0.80      0.80        69



## KNN with Min_Max Scaler

In [44]:
# Load the min-max-scaler train/validation CSVs from the data directory.
train_data = pd.read_csv(PATH + 'train_data_with_min_max_scaler.csv')
val_data = pd.read_csv(PATH + 'val_data_with_min_max_scaler.csv')


In [45]:
# Separate the features from the label column for both splits.
X_train, y_train, X_val, y_val = split_data(train_data=train_data, val_data=val_data)


In [46]:
# Peek at the first five feature rows to sanity-check the loaded columns.
X_train.head(n=5)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,SEX_MALE,SEX_FEMALE,island_Dream,island_Biscoe,island_Torgersen
0,0.207547,0.464286,0.271186,0.055556,1.0,0.0,0.0,0.0,1.0
1,0.249057,0.547619,0.355932,0.138889,1.0,0.0,1.0,0.0,0.0
2,0.8,0.309524,1.0,0.819444,0.0,1.0,1.0,0.0,0.0
3,0.301887,0.607143,0.338983,0.375,0.0,1.0,1.0,0.0,0.0
4,0.622642,0.607143,0.355932,0.298611,0.0,1.0,0.0,1.0,0.0


In [47]:
# Evaluate KNN with oversampling of the training split enabled.
model(X_train, y_train, X_val, y_val, oversampling=True)

0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [48]:
# Same evaluation, but fitted on the training split as-is (no oversampling).
model(
    X_train,
    y_train,
    X_val,
    y_val,
    oversampling=False,
)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



## KNN with Std_Scaler

In [49]:
# Load the std-scaler train/validation CSVs from the data directory.
train_data = pd.read_csv(PATH + 'train_data_with_std_scaler.csv')
val_data = pd.read_csv(PATH + 'val_data_with_std_scaler.csv')


In [50]:
# Separate the features from the label column for both splits.
X_train, y_train, X_val, y_val = split_data(train_data=train_data, val_data=val_data)


In [51]:
# Evaluate KNN with oversampling of the training split enabled.
model(X_train, y_train, X_val, y_val, oversampling=True)


0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [52]:
# Same evaluation, but fitted on the training split as-is (no oversampling).
model(
    X_train,
    y_train,
    X_val,
    y_val,
    oversampling=False,
)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



## KNN with Robust_Scaler

In [53]:
# Load the robust-scaler train/validation CSVs.
# They must be bound to `train_data` / `val_data`: the next cell passes exactly
# those names to split_data(), so any other variable name here would leave the
# frames from the previous section in place and this section would evaluate
# the wrong data.
train_data = pd.read_csv(PATH + 'train_data_with_robust_scaler.csv')
val_data = pd.read_csv(PATH + 'val_data_with_robust_scaler.csv')


In [54]:
# Separate the features from the label column of the current train/val frames.
X_train, y_train, X_val, y_val = split_data(train_data=train_data, val_data=val_data)


In [55]:
# Evaluate KNN on the current split with oversampling of the training data enabled.
model(X_train, y_train, X_val, y_val, oversampling=True)


0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [56]:
# Same evaluation on the current split, fitted without oversampling.
model(
    X_train,
    y_train,
    X_val,
    y_val,
    oversampling=False,
)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

