# Importing Libraries and the CSV File

In [1]:
import math
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [5]:
# Getting the data

# The CSV location can be overridden with the DIABETES_CSV environment variable,
# so the notebook also runs on machines that do not have this exact Windows path.
# When the variable is unset, the original location is used unchanged.
Data = os.environ.get('DIABETES_CSV', r'C:\Users\Windows 10 Pro\Downloads\diabetes.csv')
df = pd.read_csv(Data)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Print the DataFrame summary: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
# Share of rows for each Outcome value, expressed as a percentage

outcome_share = df["Outcome"].value_counts(normalize=True)
outcome_share * 100

0    65.104167
1    34.895833
Name: Outcome, dtype: float64

**Finding Unique Values of each column**

In [13]:
# Iterating a DataFrame yields its column labels, one per pass
for column in df:
    # Distinct values of this column, sorted ascending for easier scanning
    unique_values_sorted = sorted(df[column].unique())
    print(f"Unique values in {column} (ascending order): {unique_values_sorted}")


Unique values in Pregnancies (ascending order): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17]
Unique values in Glucose (ascending order): [0, 44, 56, 57, 61, 62, 65, 67, 68, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199]
Unique values in BloodPressure (ascending order): [0, 24, 30, 38, 40, 44, 46, 48, 50, 52, 54, 55, 56, 58, 60, 61, 62, 64, 65, 66, 68, 70, 72, 74, 75, 76, 78, 80, 82, 84, 85, 86, 88, 90, 92, 94, 95, 96, 98, 1

We see that columns such as Glucose, Insulin, BloodPressure, SkinThickness and BMI contain rows with a value of 0. We replace these zeros with NaN, because a value of 0 for any of these measurements is implausible for a living person and most likely marks a missing reading.

In [16]:
# In the selected measurement columns a 0 is treated as missing: swap it for NaN
columns_to_check = ['Glucose', 'Insulin', 'BloodPressure', 'SkinThickness', 'BMI']

# Build the cleaned columns first, then write them back over the originals
zeros_as_nan = df[columns_to_check].replace(to_replace=0, value=np.nan)
df[columns_to_check] = zeros_as_nan
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [20]:
# Impute the NaN entries with each column's mean so the model receives complete rows

# Per-column means of the selected columns; NaN entries are skipped by default
checked_columns = df[columns_to_check]
column_means = checked_columns.mean()

# Fill every NaN with the mean of its own column and write the result back
df[columns_to_check] = checked_columns.fillna(value=column_means)
# NOTE(review): the means are computed over every row of df, i.e. before the
# train/test split done further down, so test rows influence the imputed values.
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


**Now Split Dataset for training and testing**

In [24]:
# Select the predictors (x, independent variables) and the target (y, dependent variable)
feature_columns = [
    'Pregnancies', 'Glucose', 'Insulin', 'BloodPressure',
    'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
x = df[feature_columns]
y = df['Outcome']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=0
)
print("Training: ", train_x.shape, train_y.shape)
print("Testing: ", test_x.shape, test_y.shape)

Training:  (614, 8) (614,)
Testing:  (154, 8) (154,)


In [26]:
# Standardize the independent variables.
# The scaler learns its mean/scale from the training rows only ...
sc_X = StandardScaler().fit(train_x)
# ... and the same fitted statistics are then applied to both partitions.
train_x = sc_X.transform(train_x)
test_x = sc_X.transform(test_x)

**Fit the model for Training**

In [28]:
# Square root of the number of test samples, used below as a starting point for k.
# (`math` is imported in the import cell at the top of the notebook.)
math.sqrt(len(test_y))

12.409673645990857

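* For `n_neighbors` we take an odd number close to 12.4; here we use 11 (13 is the nearest odd value and would be an equally reasonable choice). An odd k avoids tied votes between the two classes.
* `p` is the power parameter of the Minkowski distance: `p=2` corresponds to the Euclidean distance (`p=1` would be the Manhattan distance). It is unrelated to the number of classes in the dependent variable `y`.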

In [29]:
# Define the model: K-NN with 11 neighbours and the Euclidean distance
knn_params = {'n_neighbors': 11, 'p': 2, 'metric': 'euclidean'}
classifier = KNeighborsClassifier(**knn_params)
# fit() is the last expression, so its return value is what the cell displays
classifier.fit(train_x, train_y)

**Predict the Model**

In [30]:
# Predict the class label for every row of the scaled test features
y_pred = classifier.predict(test_x)
# Bare last expression: display the predicted labels as the cell output
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

**Evaluate the Model**

In [37]:
 # Evaluate Model
# Compute every metric first, then report them together.
# confusion_matrix puts the true labels on the rows and the predictions on the columns.
cm = confusion_matrix(test_y, y_pred)
f1 = f1_score(test_y, y_pred)
accuracy = accuracy_score(test_y, y_pred)

print(cm)
print("F1 Score: ", f1)
print("Accuracy Score: ", accuracy)

[[94 13]
 [16 31]]
F1 Score:  0.6813186813186813
Accuracy Score:  0.8116883116883117


# Explanation of confusion matrix
True Negatives (TN): In this matrix, there are 94 instances that are correctly predicted as "No Diabetes" (negative class).

False Positives (FP): There are 13 instances where the model incorrectly predicted "Diabetes" (positive class) when the individuals do not actually have diabetes.

False Negatives (FN): There are 16 instances where the model incorrectly predicted "No Diabetes" when individuals actually have diabetes. 

True Positives (TP): There are 31 instances correctly predicted as "Diabetes." These are individuals who have diabetes, and the model correctly identified them as having diabetes.