In [1]:
import os

# Walk the Kaggle input mount and print the full path of every file found under it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output
# when a version is created with "Save & Run All".
# Temporary files can go in /kaggle/temp/, but they are not kept beyond the current session.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import pickle

In [3]:
# Read the diabetes CSV from the Kaggle input mount into the raw frame `data`.
data = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)

# Basic EDA and Statistical Analysis

In [4]:
# Print a concise summary of the raw frame (columns, dtypes, non-null counts).
data.info()

In [5]:
# Summary statistics of the raw frame, transposed so that each column becomes a row.
data.describe().transpose()

## Handling Missing Values
#### Some columns contain the value 0, which is not a possible measurement for them. These 0s are actually missing (null) values.

In [6]:
# Columns in which a 0 is treated as a missing measurement.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a deep copy so the raw frame `data` is left untouched.
data_copy = data.copy(deep=True)
data_copy[zero_as_missing] = data_copy[zero_as_missing].replace(0, np.nan)

# Report how many missing values each column now has.
print(data_copy.isna().sum())

In [7]:
# Histograms of the raw frame `data` (zeros not yet replaced by NaN).
# The return value is bound to `p`, so the cell has no trailing expression to display.
p = data.hist(figsize = (20,20))

In [8]:
# Impute the missing values column by column: mean for Glucose, BloodPressure and
# Insulin; median for SkinThickness and BMI.
#
# The fill values are passed to DataFrame.fillna as one mapping and the result is
# assigned back. Calling fillna(inplace=True) on a column selection is a chained
# in-place assignment: it is deprecated in recent pandas and leaves the parent frame
# unchanged once Copy-on-Write is active, so the NaNs would silently remain.
fill_values = {
    'Glucose': data_copy['Glucose'].mean(),
    'BloodPressure': data_copy['BloodPressure'].mean(),
    'SkinThickness': data_copy['SkinThickness'].median(),
    'Insulin': data_copy['Insulin'].mean(),
    'BMI': data_copy['BMI'].median(),
}
data_copy = data_copy.fillna(value=fill_values)

In [9]:
# Histograms of `data_copy`, for comparison with the histograms of the raw frame.
# The return value is bound to `p`, so the cell has no trailing expression to display.
p = data_copy.hist(figsize = (20,20))

## Checking the Correlation between Features

In [10]:
# Annotated correlation heatmap of the raw frame `data`.
plt.figure(figsize=(12, 10))
raw_corr = data.corr()
sns.heatmap(raw_corr, annot=True)

In [11]:
# Annotated correlation heatmap of `data_copy`, for comparison with the raw-data heatmap.
plt.figure(figsize=(12, 10))
imputed_corr = data_copy.corr()
sns.heatmap(imputed_corr, annot=True)

#### We can see from the distribution charts below that **Diabetic** patients seem to have relatively high **Glucose** and **BMI**.

In [12]:
# Distribution of Glucose and of BMI, each split by Outcome.
sns.displot(data=data_copy, x='Glucose', hue='Outcome')
sns.displot(data=data_copy, x='BMI', hue='Outcome')

In [13]:
# Summary statistics of `data_copy` (the working copy of the data).
data_copy.describe()

Splitting the dependent and independent variables.

In [14]:
# Target: the Outcome column. Features: every other column of data_copy.
y = data_copy['Outcome']
x = data_copy.drop(columns=['Outcome'])

## Scaling the Data

In [15]:
# Standardise the features and keep the fitted scaler in `sc`.
#
# The scaler is fitted on the features taken straight from data_copy rather than on
# `x` itself. With `x = f(x)`, re-running this cell would refit `sc` on values that
# are already scaled, and the scaler pickled at the end of the notebook could no
# longer transform raw inputs. Reading from data_copy makes the cell idempotent while
# producing the same `x` and `sc` on a clean top-to-bottom run.
#
# `x` is rebuilt from the transformed array, so it carries default integer column labels.
# NOTE(review): the scaler is fitted on all rows before the train/validation split below.
sc = StandardScaler()
x = pd.DataFrame(sc.fit_transform(data_copy.drop(columns=['Outcome'])))

In [16]:
# Summary statistics of the scaled feature frame.
x.describe()

## Hyperparameter Tuning of the KNN Classifier

In [17]:
# Hold out 30% of the rows for validation, stratified on the target, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [18]:
# Validation accuracy of a KNN classifier for every neighbour count from 1 to 20;
# accuracies[k - 1] holds the score for n_neighbors = k.
accuracies = np.zeros(20)
for slot, k in enumerate(range(1, 21)):
    # Fit on the training split, score on the validation split.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_val)
    accuracies[slot] = metrics.accuracy_score(y_val, y_pred)

accuracies

In [19]:
# Line plot of validation accuracy against the number of neighbours (1 to 20),
# with one x tick per neighbour count.
loc = np.arange(1, 21, step=1.0)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 21), accuracies)
ax.set_xticks(loc)
ax.set_xlabel('Number of Neighbors ')
ax.set_ylabel('Accuracy')
plt.show()

In [20]:
# Search space for the KNN grid search.
# n_neighbors covers the odd values 3, 5, ..., 19. They are generated with an integer
# range instead of a float linspace truncated by int(), so the values are exact and no
# comprehension variable shares a name with the feature matrix `x`.
grid_params = {
    'n_neighbors': list(range(3, 20, 2)),
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}

In [21]:
# Exhaustive search over grid_params with 3-fold cross-validation, using all CPU cores.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    verbose=1,
    cv=3,
    n_jobs=-1,
)

In [22]:
# Run the grid search on the full scaled feature set and keep the fitted search object.
g_model = gs.fit(X=x, y=y)

In [23]:
# Display the parameter combination selected by the grid search.
g_model.best_params_

## Building the model with best params

In [24]:
# Build the final classifier from the parameters selected by the grid search instead
# of re-typing them by hand, so the model cannot drift out of sync with the search
# when the data or the grid changes. The brute-force neighbour search is kept.
# NOTE(review): the previous literals were n_neighbors=9, weights='uniform',
# metric='manhattan' — confirm they match g_model.best_params_.
knn = KNeighborsClassifier(algorithm='brute', **g_model.best_params_)

In [25]:
# Mean 3-fold cross-validated accuracy of the final classifier on the full data.
cv_scores = cross_val_score(knn, x, y, scoring='accuracy', cv=3, n_jobs=-1)
print(cv_scores.mean())

In [26]:
# Fit the final classifier on all rows of the scaled features.
knn.fit(X=x, y=y)

## Saving the Model and StandardScaler

In [27]:
# Persist the fitted classifier together with its scaler as one (knn, sc) tuple.
# The file is opened in a `with` block so the handle is flushed and closed even if
# pickling raises; the bare open() call used before was never closed.
with open('knn_sc_diabetes.pkl', 'wb') as model_file:
    pickle.dump((knn, sc), model_file)