In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
from scipy.stats import t
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

In [60]:
def read_data(file, y_name):
    df = pd.read_csv(file)
    x = df.loc[:, df.columns != y_name]
    y = df.loc[:, df.columns == y_name]
    return x, y

x, y = read_data("/Users/abjain/Documents/Industry/ML/ML_practice/Project1/ML_heardisease/data/heart.csv",'target')
# print(x.shape)
# print(x.head())
# print(y.shape)
# y.head

In [61]:
np.random.seed(177372)

def shuffle(x, y):
    idx = np.random.permutation(x.index)
    x = x.reindex(idx)
    y = y.index(idx)
    return x, y

def data_split(x, y, frac):
    idx = np.random.permutation(x.index)
    train_idx = idx[:int(len(idx)*frac)]
    test_idx = idx[int(len(idx)*frac):len(idx)]

    train_x = x.iloc[train_idx,:]
    train_y = y.iloc[train_idx,:]
    test_x = x.iloc[test_idx,:]
    test_y = y.iloc[test_idx,:]

    return train_x, train_y, test_x, test_y

# 80% data in traing while for testing 20% 
train_x, train_y, test_x, test_y = data_split(x,y,0.8) 

In [62]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(242, 13)
(242, 1)
(61, 13)
(61, 1)


In [63]:
## If you go down in the section of the feature selection we have selected features based on the Lasso regression 
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, 
                                                test_size=0.2, 
                                                random_state=42)

In [64]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)


(242, 13)
(242, 1)
(61, 13)
(61, 1)
(242, 13)
(242, 1)
(61, 13)
(61, 1)


Scaling the training and testing dataset with the min-max transformation using sklearn. From now we will use sklearn. Since it is much easier and clean

In [65]:
from sklearn import preprocessing
scaler = sklearn.preprocessing.MinMaxScaler()
xtrain_scaled = scaler.fit_transform(xtrain)
xtest_scaled = scaler.fit_transform(xtest)


Applying the K nearest neighbor using Sklearn

In [None]:
# import pandas as pd
# print(pd.DataFrame(xtrain_scaled).head(2))
# ytrain

In [67]:
# KNeighborsClassifier paramteres
# https://scikit-learn.org/dev/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# weights{‘uniform’, ‘distance’}, callable or None, default=’uniform’
# Weight function used in prediction. Possible values:
# ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.
# ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
# [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.

# algorithm{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, default=’auto’
# Algorithm used to compute the nearest neighbors:
# ‘ball_tree’ will use BallTree
# ‘kd_tree’ will use KDTree
# ‘brute’ will use a brute-force search.
# ‘auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.
# Note: fitting on sparse input will override the setting of this parameter, using brute force.

# class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
knn_sklearn = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm = 'auto')
knn_sklearn.fit(xtrain_scaled, ytrain)

# Make predictions
predictions_sklearn = knn_sklearn.predict(xtest_scaled)
print("Scikit-learn KNN Predictions:\n", predictions_sklearn)


Scikit-learn KNN Predictions:
 [0 0 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0]


  return self._fit(X, y)


In [74]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(ytest, predictions_sklearn)
print(f"AccuracyL {accuracy:.2f}")


AccuracyL 0.85


In [70]:
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
conf_matrix = confusion_matrix(ytest, predictions_sklearn)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[25  4]
 [ 5 27]]


In [71]:
from sklearn.metrics import classification_report

# Print precision, recall, and F1 score
print("Classification Report:\n", classification_report(ytest, predictions_sklearn))


Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.85        29
           1       0.87      0.84      0.86        32

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



Tuning the Hyperparameters for Better Accuracy

Grid Search for Hyperparameter Tuning

To improve the accuracy or the F1 score, lets tune the parameters using GridSearchCV, which allows to test different hyperparameter combinations.

In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters to tune
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Use GridSearchCV to search for the best parameters
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(xtrain_scaled, ytrain)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

In [73]:
# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'}
Best Accuracy: 0.8017006802721088


In [79]:
### Using the best Paramters 
knn_sklearn = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm = 'auto')
knn_sklearn.fit(xtrain_scaled, ytrain)

# Make predictions
predictions_sklearn = knn_sklearn.predict(xtest_scaled)
print("Scikit-learn KNN Predictions:\n", predictions_sklearn)


Scikit-learn KNN Predictions:
 [0 0 1 0 1 1 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0]


  return self._fit(X, y)


In [80]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(ytest, predictions_sklearn)
print(f"AccuracyL {accuracy:.2f}")

from sklearn.metrics import classification_report

# Print precision, recall, and F1 score
print("Classification Report:\n", classification_report(ytest, predictions_sklearn))



AccuracyL 0.84
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.86      0.83        29
           1       0.87      0.81      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

