In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,Normalizer,MinMaxScaler,RobustScaler

In [2]:
#Reading the CSV file
df = pd.read_csv("C:\\Users\\Swayam\\Desktop\\ML\\Excel_Files_dataset\\diabetes.csv")

In [3]:
# Replace zeroes
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero_not_accepted:
    df[column] = df[column].replace(0, np.NaN)
    mean = int(df[column].mean(skipna=True))
    df[column] = df[column].replace(np.NaN, mean)

In [4]:
#Dependent and Independent Variables
y = df['Outcome']
x = df.drop(['Outcome'], axis = 1)

In [5]:
def model_building(ts,scaler,x,y): #Model Building Function

    from sklearn.model_selection import train_test_split

    x = scaler.fit_transform(x) #Normalizing the dataset

    x_train, x_test, y_train, y_test = train_test_split(
                                                        x,
                                                        y,
                                                        test_size=ts, 
                                                        random_state=5
                                                    )


    model = KNeighborsClassifier() #Declaring the model
    parametrs = {"n_neighbors":[5,7,9,11,13,15,17,19,21,23,25], "metric":["euclidean","cityblock","manhattan",]}
    clf = GridSearchCV(model,parametrs,cv=5)
    clf.fit(x_train,y_train)
    print("The best parameters are: ",clf.best_params_)
    print("The best score is: ",clf.best_score_)

    #validating on training data
    y_pred_training = clf.predict(x_train)
    con_matrix_training = confusion_matrix(y_train, y_pred_training) #Confusion Matrix
    print("\n\nThe confusion matrix for training data set is:\n",con_matrix_training)
    acc_training = accuracy_score(y_train,y_pred_training) #Accuracy Score 
    print("The accuracy score for training data set is: ",acc_training)

    #validating on test data
    y_pred_test = clf.predict(x_test)
    con_matrix_test = confusion_matrix(y_train, y_pred_training) #Confusion Matrix
    print("\n\nThe confusion matrix for test data set is:\n ",con_matrix_test)
    acc_test = accuracy_score(y_test,y_pred_test) #Accuracy Score
    print("The accuracy score for test data set is: ",acc_test)

    return(con_matrix_training,con_matrix_test,acc_training,acc_test)

In [6]:
test_size = [0.1, 0.2, 0.3, 0.4]
data_transformation = [StandardScaler,Normalizer,MinMaxScaler,RobustScaler]
for scaler in data_transformation:
    print("The Scaler is: ",scaler)
    for ts in test_size:
        print("\n\nFor the test size: ",ts)
        print(model_building(ts, scaler(),x,y))

The Scaler is:  <class 'sklearn.preprocessing._data.StandardScaler'>


For the test size:  0.1
The best parameters are:  {'metric': 'euclidean', 'n_neighbors': 23}
The best score is:  0.7612657699927015


The confusion matrix for training data set is:
 [[397  50]
 [ 99 145]]
The accuracy score for training data set is:  0.784370477568741


The confusion matrix for test data set is:
  [[397  50]
 [ 99 145]]
The accuracy score for test data set is:  0.8311688311688312
(array([[397,  50],
       [ 99, 145]], dtype=int64), array([[397,  50],
       [ 99, 145]], dtype=int64), 0.784370477568741, 0.8311688311688312)


For the test size:  0.2
The best parameters are:  {'metric': 'cityblock', 'n_neighbors': 11}
The best score is:  0.7525389844062376


The confusion matrix for training data set is:
 [[352  48]
 [ 80 134]]
The accuracy score for training data set is:  0.7915309446254072


The confusion matrix for test data set is:
  [[352  48]
 [ 80 134]]
The accuracy score for test data set is: 

**RESULTS:**
**NORMALIZATION METHODS USED ARE:**
1) Standard Scaler
2) Normalizer
3) Min-Max Scaler
4) Robust


| Normalization | Test size | N_neighbour | Distance | Test Accuracy |
|---|---|---|---|---|
| Standard Scaler | 0.1 | Euclidean | 23 | 83.116 |
| Normalizer | 0.1 | City block | 17 | 74 |
| Min-Max Scaler | 0.1 | City block | 19 | 83.116|
| Robust | 0.1 | Euclidean | 21 | 79 |


**Conclusion from the above table:**

| Normalization | Standard Scaler |
| ---|---|
| Test Size | 0.1 |
| N_neighbour | Euclidean |
| Distance | 23 |
| Test Accuracy | 83.116 |