## KNN MODELS

In [159]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report


## OverSampling Method

In [160]:
def over_sampling(X_train,y_train):
    oversampling = SMOTE()
    X_train, y_train = oversampling.fit_resample(X_train, y_train)
    return X_train,y_train
def under_sampling(X_train,y_train):
    undersample = RandomUnderSampler()
    X_train, y_train = undersample.fit_resample(X_train, y_train)
    return X_train,y_train


## Split Data

In [161]:
PATH='F:\\clustering-classification-dashboard\\data\\'
def split_data(train_data,val_data):
    X_train = train_data.drop(columns=['species'])
    y_train = train_data['species']
    X_val = val_data.drop(columns=['species'])
    y_val = val_data['species']
    return X_train,y_train,X_val,y_val
    

## Model Method

In [162]:
def model(X_train, y_train, X_val, y_val,undersampling=True, oversampling=False, n_neighbors=4):
    # if oversampling :
    #     X_train, y_train = over_sampling(X_train, y_train)
    if undersampling :
        X_train, y_train = under_sampling(X_train, y_train)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    print(y_train.value_counts())
    print(confusion_matrix(y_val, knn.predict(X_val)))
    print(classification_report(y_val, knn.predict(X_val)))

def log_model(X_train, y_train, X_val, y_val,undersampling=True, oversampling=False):
    if oversampling:
        X_train, y_train = over_sampling(X_train, y_train)
    elif undersampling:
        X_train, y_train = under_sampling(X_train, y_train)
    log = LogisticRegression( max_iter=1,C=20)
    log.fit(X_train, y_train)
    print(y_train.value_counts())
    print(confusion_matrix(y_val, log.predict(X_val)))
    print(classification_report(y_val, log.predict(X_val)))



## KNN Without Anything

In [163]:
train_data = pd.read_csv(
    PATH+'train_data_without_anything.csv')
val_data=pd.read_csv(
    PATH+'val_data_without_anything.csv')


In [164]:
train_data.columns

Index(['species', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm',
       'body_mass_g', 'sex_FEMALE', 'sex_MALE', 'island_Biscoe',
       'island_Dream', 'island_Torgersen'],
      dtype='object')

In [165]:
X_train,y_train,X_val,y_val=split_data(train_data,val_data)

In [166]:
model(X_train,y_train,X_val,y_val)

0    54
1    54
2    54
Name: species, dtype: int64
[[27  0  3]
 [ 6 18  1]
 [ 8  0  6]]
              precision    recall  f1-score   support

           0       0.66      0.90      0.76        30
           1       1.00      0.72      0.84        25
           2       0.60      0.43      0.50        14

    accuracy                           0.74        69
   macro avg       0.75      0.68      0.70        69
weighted avg       0.77      0.74      0.74        69



In [167]:
log_model(X_train, y_train, X_val, y_val)


0    54
1    54
2    54
Name: species, dtype: int64
[[ 0 30  0]
 [ 0 25  0]
 [ 0 14  0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.36      1.00      0.53        25
           2       0.00      0.00      0.00        14

    accuracy                           0.36        69
   macro avg       0.12      0.33      0.18        69
weighted avg       0.13      0.36      0.19        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [168]:
model(X_train,y_train,X_val,y_val,oversampling=False)

0    54
1    54
2    54
Name: species, dtype: int64
[[24  2  4]
 [ 1 24  0]
 [ 8  0  6]]
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        30
           1       0.92      0.96      0.94        25
           2       0.60      0.43      0.50        14

    accuracy                           0.78        69
   macro avg       0.75      0.73      0.73        69
weighted avg       0.77      0.78      0.77        69



In [169]:
log_model(X_train,y_train,X_val,y_val,oversampling=False)

0    54
1    54
2    54
Name: species, dtype: int64
[[ 0 30  0]
 [ 0 25  0]
 [ 0 14  0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.36      1.00      0.53        25
           2       0.00      0.00      0.00        14

    accuracy                           0.36        69
   macro avg       0.12      0.33      0.18        69
weighted avg       0.13      0.36      0.19        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN with Min_Max Scaler

In [170]:
train_data = pd.read_csv(
    PATH+'train_data_with_min_max_scaler.csv')
val_data = pd.read_csv(
    PATH+'val_data_with_min_max_scaler.csv')


In [171]:
X_train, y_train, X_val, y_val = split_data(train_data, val_data)


In [172]:
X_train.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_FEMALE,sex_MALE,island_Biscoe,island_Dream,island_Torgersen
0,0.523636,0.154762,0.762712,0.611111,1.0,0.0,1.0,0.0,0.0
1,0.450909,0.142857,0.745763,0.388889,0.0,1.0,1.0,0.0,0.0
2,0.538182,0.416667,0.338983,0.0,1.0,0.0,0.0,1.0,0.0
3,0.236364,0.964286,0.322034,0.305556,0.0,1.0,0.0,0.0,1.0
4,0.534545,0.357143,0.728814,0.777778,0.0,1.0,1.0,0.0,0.0


In [173]:
model(X_train,y_train,X_val,y_val)

0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [174]:
log_model(X_train,y_train,X_val,y_val)

0    54
1    54
2    54
Name: species, dtype: int64
[[12  7 11]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      0.40      0.57        30
           1       0.78      1.00      0.88        25
           2       0.56      1.00      0.72        14

    accuracy                           0.74        69
   macro avg       0.78      0.80      0.72        69
weighted avg       0.83      0.74      0.71        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [175]:
model(X_train, y_train, X_val, y_val, oversampling=False)


0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [176]:
log_model(X_train, y_train, X_val, y_val, oversampling=False)


0    54
1    54
2    54
Name: species, dtype: int64
[[11  8 11]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      0.37      0.54        30
           1       0.76      1.00      0.86        25
           2       0.56      1.00      0.72        14

    accuracy                           0.72        69
   macro avg       0.77      0.79      0.71        69
weighted avg       0.82      0.72      0.69        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## KNN with Std_Scaler

In [177]:
train_data = pd.read_csv(
    PATH+'train_data_with_std_scaler.csv')
val_data = pd.read_csv(
    PATH+'val_data_with_std_scaler.csv')


In [178]:
X_train, y_train, X_val, y_val = split_data(train_data, val_data)


In [179]:
model(X_train, y_train, X_val, y_val)


0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [180]:
log_model(X_train, y_train, X_val, y_val)

0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 1  0 13]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      1.00      1.00        25
           2       1.00      0.93      0.96        14

    accuracy                           0.99        69
   macro avg       0.99      0.98      0.98        69
weighted avg       0.99      0.99      0.99        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [181]:
model(X_train, y_train, X_val, y_val, oversampling=False)


0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [182]:
log_model(X_train, y_train, X_val, y_val, oversampling=False)


0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 1  0 13]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      1.00      1.00        25
           2       1.00      0.93      0.96        14

    accuracy                           0.99        69
   macro avg       0.99      0.98      0.98        69
weighted avg       0.99      0.99      0.99        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## KNN with Robust_Scaler

In [183]:
train_data= pd.read_csv(
    PATH+'train_data_with_robust_scaler.csv')
val_data = pd.read_csv(
    PATH+'val_data_with_robust_scaler.csv')


In [184]:
X_train, y_train, X_val, y_val = split_data(train_data, val_data)


In [185]:
model(X_train, y_train, X_val, y_val)


0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [186]:
log_model(X_train, y_train, X_val, y_val)


0    54
1    54
2    54
Name: species, dtype: int64
[[25  2  3]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        30
           1       0.93      1.00      0.96        25
           2       0.82      1.00      0.90        14

    accuracy                           0.93        69
   macro avg       0.92      0.94      0.92        69
weighted avg       0.94      0.93      0.93        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [187]:
model(X_train, y_train, X_val, y_val, oversampling=False)


0    54
1    54
2    54
Name: species, dtype: int64
[[30  0  0]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        14

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [188]:
log_model(X_train, y_train, X_val, y_val, oversampling=False)


0    54
1    54
2    54
Name: species, dtype: int64
[[24  2  4]
 [ 0 25  0]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      0.80      0.89        30
           1       0.93      1.00      0.96        25
           2       0.78      1.00      0.88        14

    accuracy                           0.91        69
   macro avg       0.90      0.93      0.91        69
weighted avg       0.93      0.91      0.91        69



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
