[B4] Implement the k-nearest neighbours (k-NN) algorithm on the diabetes.csv dataset.
Compute the confusion matrix, accuracy, error rate, precision, and recall on the given dataset.

Step 1: Import Required Libraries

In [7]:
# Importing necessary libraries
import pandas as pd           # For data manipulation and analysis
import numpy as np            # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.preprocessing import StandardScaler      # For feature scaling
from sklearn.neighbors import KNeighborsClassifier     # For k-NN algorithm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report  # For evaluation metrics

Step 2: Load the Dataset

In [8]:
# Read the diabetes records from CSV into a DataFrame.
# The relative path expects 'diabetes.csv' in the working directory; adjust it if the file lives elsewhere.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Plain-text preview of the leading rows (head() returns the first 5 by default)
print(data.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


Step 3: Preprocess the Data

In [14]:
# Count NaN/None entries per column (isna() is the canonical spelling of isnull()).
# NOTE(review): the head() preview shows 0 for SkinThickness and Insulin in some rows;
# presumably these zeros stand in for missing measurements - confirm against the data source.
# They are ordinary numbers, so this check does not count them.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [15]:
# Split the frame column-wise: 'Outcome' is the label, every other column is a predictor
y = data['Outcome']               # Target vector (diabetes outcome)
X = data.drop(columns='Outcome')  # Predictor matrix (all remaining columns)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Standardise the features before k-NN.
# The scaler is fitted on the training split only, and the same fitted scaler
# is then applied to both splits.
scaler = StandardScaler().fit(X_train)   # fit() returns the fitted scaler itself
X_train = scaler.transform(X_train)      # Training features, standardised
X_test = scaler.transform(X_test)        # Test features, using the training statistics

Step 4: Train the k-NN Model


In [18]:
# Number of neighbours that vote on each prediction; change this value to tune the model
k = 5
# Build the (still unfitted) k-NN classifier with that neighbour count
knn = KNeighborsClassifier(n_neighbors=k)

In [19]:
# Fit the classifier on the scaled training features and their labels
knn.fit(X=X_train, y=y_train)

Step 5: Make Predictions

In [20]:
# Predict an Outcome label for every row of the held-out test set
y_pred = knn.predict(X=X_test)

Step 6: Evaluate the Model

In [21]:
# Confusion matrix of true vs. predicted Outcome labels.
# Layout (scikit-learn convention): rows are the true class, columns the predicted class.
# labels=[0, 1] pins the class order and guarantees a 2x2 matrix even if one class
# happens to be absent from y_test / y_pred; without it, the shape is inferred from
# whichever labels actually occur.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[79 20]
 [27 28]]


In [22]:
# Accuracy: the fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6948051948051948


In [23]:
# Error rate is the complement of accuracy: the fraction of test rows that were misclassified
error_rate = 1.0 - accuracy
print("Error Rate:", error_rate)

Error Rate: 0.30519480519480524


In [24]:
# Per-class precision, recall, F1-score and support, plus overall averages, as formatted text
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77        99
           1       0.58      0.51      0.54        55

    accuracy                           0.69       154
   macro avg       0.66      0.65      0.66       154
weighted avg       0.69      0.69      0.69       154

