In [10]:
# Import libraries
import math

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [11]:
# Read the diabetes records from the CSV file into a DataFrame
data = pd.read_csv("diabetes.csv")

# Sanity check: print the row count, then preview the first five rows
print(data.shape[0])
print(data.head(5))

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [12]:
# Columns in which a recorded 0 is treated as a missing reading rather than
# a real measurement
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

# NOTE(review): the means below are computed over the whole dataset, i.e. before
# the train/test split performed later in the notebook — confirm this is intended.
for column in zero_not_accepted:
    # Mark zeros as missing. Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    data[column] = data[column].replace(0, np.nan)
    # Mean of the remaining (non-missing) values, truncated to an integer
    mean = int(data[column].mean(skipna=True))
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # data[column]: the in-place form is chained assignment on a column
    # selection, which is deprecated in pandas 2.x and does not update `data`
    # once Copy-on-Write is enabled.
    data[column] = data[column].fillna(mean)

# Display the 'Glucose' column after imputation
print(data['Glucose'])
# `mean` still holds the value from the final loop iteration, i.e. the mean
# used for the last column in zero_not_accepted ('Insulin'), not 'Glucose'.
print(mean)

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64
155


In [13]:
# Feature matrix: every row, columns at positions 0 through 7
x = data.iloc[:, :8]
# Label vector: every row, the column at position 8
y = data.iloc[:, 8]

# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state=0 keeps the split repeatable between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Learn the scaling statistics from the training features only
sc_x = StandardScaler().fit(x_train)

# Apply that one fitted scaler to both the training and the test features
x_train, x_test = sc_x.transform(x_train), sc_x.transform(x_test)

In [15]:
# Square root of the test-set size (presumably a guide for picking
# n_neighbors in the next cell — confirm).
# `math` is imported in the notebook's import cell at the top rather than here,
# so all dependencies are declared in one place.
print(len(y_test))  # Number of samples in the test set
math.sqrt(len(y_test))  # Displayed as the cell's result

154


12.409673645990857

In [16]:
# Build a K-Nearest Neighbors classifier that votes over the 11 closest
# training points, measured with the Euclidean distance metric
classifier = KNeighborsClassifier(
    metric='euclidean',
    p=2,
    n_neighbors=11,
)

# Fit the classifier on the training features and labels
classifier.fit(x_train, y_train)

In [17]:
# Run the fitted classifier over the test features to get predicted labels
y_pred = classifier.predict(x_test)

# Show the predictions as the cell's result
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [18]:
# Confusion matrix of true test labels versus predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# F1 score for the same predictions (combines precision and recall)
print(f1_score(y_true=y_test, y_pred=y_pred))

[[94 13]
 [15 32]]
0.6956521739130436
