## KNN MODELS

In [138]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


## OverSampling Method

In [139]:
def over_sampling(X_train,y_train):
    oversampling = SMOTE()
    X_train, y_train = oversampling.fit_resample(X_train, y_train)
    return X_train,y_train

## Split Data

In [140]:
PATH='F:\\clustering-classification-dashboard\\data\\'
def split_data(train_data,val_data):
    X_train = train_data.drop(columns=['species'])
    y_train = train_data['species']
    X_val = val_data.drop(columns=['species'])
    y_val = val_data['species']
    return X_train,y_train,X_val,y_val
    

## Model Method

In [141]:
def model(X_train, y_train, X_val, y_val, oversampling=True, n_neighbors=4):
    if oversampling:
        X_train, y_train = over_sampling(X_train, y_train)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    print(y_train.value_counts())
    print(confusion_matrix(y_val, knn.predict(X_val)))
    print(classification_report(y_val, knn.predict(X_val)))

def log_model(X_train, y_train, X_val, y_val, oversampling=True):
    if oversampling:
        X_train, y_train = over_sampling(X_train, y_train)
    log = LogisticRegression()
    log.fit(X_train, y_train)
    print(y_train.value_counts())
    print(confusion_matrix(y_val, log.predict(X_val)))
    print(classification_report(y_val, log.predict(X_val)))


## KNN Without Anything

In [142]:
train_data = pd.read_csv(
    PATH+'train_data_without_anything.csv')
val_data=pd.read_csv(
    PATH+'val_data_without_anything.csv')


In [143]:
X_train,y_train,X_val,y_val=split_data(train_data,val_data)

In [144]:
model(X_train,y_train,X_val,y_val)

0    121
1    121
2    121
Name: species, dtype: int64
[[27  0  3]
 [ 2 22  1]
 [ 8  0  6]]
              precision    recall  f1-score   support

           0       0.73      0.90      0.81        30
           1       1.00      0.88      0.94        25
           2       0.60      0.43      0.50        14

    accuracy                           0.80        69
   macro avg       0.78      0.74      0.75        69
weighted avg       0.80      0.80      0.79        69



In [145]:
log_model(X_train, y_train, X_val, y_val)


0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 1  0 13]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      1.00      1.00        25
           2       1.00      0.93      0.96        14

    accuracy                           0.99        69
   macro avg       0.99      0.98      0.98        69
weighted avg       0.99      0.99      0.99        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [146]:
model(X_train,y_train,X_val,y_val,oversampling=False)

0    121
1     98
2     54
Name: species, dtype: int64
[[28  0  2]
 [ 5 20  0]
 [12  0  2]]
              precision    recall  f1-score   support

           0       0.62      0.93      0.75        30
           1       1.00      0.80      0.89        25
           2       0.50      0.14      0.22        14

    accuracy                           0.72        69
   macro avg       0.71      0.63      0.62        69
weighted avg       0.73      0.72      0.69        69



In [147]:
log_model(X_train,y_train,X_val,y_val,oversampling=False)

0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 2  0 12]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        30
           1       1.00      1.00      1.00        25
           2       1.00      0.86      0.92        14

    accuracy                           0.97        69
   macro avg       0.98      0.95      0.96        69
weighted avg       0.97      0.97      0.97        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## KNN with Min_Max Scaler

In [148]:
train_data = pd.read_csv(
    PATH+'train_data_with_min_max_scaler.csv')
val_data = pd.read_csv(
    PATH+'val_data_with_min_max_scaler.csv')


In [149]:
X_train, y_train, X_val, y_val = split_data(train_data, val_data)


In [150]:
X_train.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_FEMALE,sex_MALE,island_Biscoe,island_Dream,island_Torgersen
0,0.207547,0.464286,0.271186,0.055556,1.0,0.0,0.0,0.0,1.0
1,0.249057,0.547619,0.355932,0.138889,1.0,0.0,1.0,0.0,0.0
2,0.8,0.309524,1.0,0.819444,0.0,1.0,1.0,0.0,0.0
3,0.301887,0.607143,0.338983,0.375,0.0,1.0,1.0,0.0,0.0
4,0.622642,0.607143,0.355932,0.298611,0.0,1.0,0.0,1.0,0.0


In [151]:
model(X_train,y_train,X_val,y_val)

0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [152]:
log_model(X_train,y_train,X_val,y_val)

0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [153]:
model(X_train, y_train, X_val, y_val, oversampling=False)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [154]:
log_model(X_train, y_train, X_val, y_val, oversampling=False)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



## KNN with Std_Scaler

In [155]:
train_data = pd.read_csv(
    PATH+'train_data_with_std_scaler.csv')
val_data = pd.read_csv(
    PATH+'val_data_with_std_scaler.csv')


In [156]:
X_train, y_train, X_val, y_val = split_data(train_data, val_data)


In [157]:
model(X_train, y_train, X_val, y_val)


0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [158]:
log_model(X_train, y_train, X_val, y_val)

0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [159]:
model(X_train, y_train, X_val, y_val, oversampling=False)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [160]:
log_model(X_train, y_train, X_val, y_val, oversampling=False)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



## KNN with Robust_Scaler

In [161]:
train_data_with_min_max = pd.read_csv(
    PATH+'train_data_with_robust_scaler.csv')
val_data_with_min_max = pd.read_csv(
    PATH+'val_data_with_robust_scaler.csv')


In [162]:
X_train, y_train, X_val, y_val = split_data(train_data, val_data)


In [163]:
model(X_train, y_train, X_val, y_val)


0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [164]:
log_model(X_train, y_train, X_val, y_val)


0    121
1    121
2    121
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [165]:
model(X_train, y_train, X_val, y_val, oversampling=False)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [166]:
log_model(X_train, y_train, X_val, y_val, oversampling=False)


0    121
1     98
2     54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

