In [2]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Read heart.csv from the current working directory.
heart_df = pd.read_csv("heart.csv")

# Separate the label column ("target") from the remaining feature columns.
y = heart_df["target"]
X = heart_df.drop(columns="target")

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits.
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [5]:
# Fit a 3-nearest-neighbour classifier on the scaled training split.
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict on the scaled test split and report the three metrics.
y_pred = knn_classifier.predict(X_test_scaled)

for label, metric in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(label, metric(y_test, y_pred))



recall score:  0.78125
accuracy score:  0.8524590163934426
precision score:  0.9259259259259259


In [6]:
# Fit a 5-nearest-neighbour classifier on the scaled training split.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict on the scaled test split and report the three metrics.
y_pred = knn_classifier.predict(X_test_scaled)

for label, metric in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(label, metric(y_test, y_pred))

recall score:  0.875
accuracy score:  0.9016393442622951
precision score:  0.9333333333333333


In [7]:
# k=7 gives the highest recall, accuracy and precision on the test split
# of the three values tried by hand (3, 5, 7).
# NOTE(review): this ranking is based on the test split itself, so treat the
# scores as optimistic; cross-validating on the training split is the sounder
# way to choose k.
knn_classifier = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)

# Predict on the scaled test split and report the three metrics.
y_pred = knn_classifier.predict(X_test_scaled)

for label, metric in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(label, metric(y_test, y_pred))

recall score:  0.90625
accuracy score:  0.9180327868852459
precision score:  0.9354838709677419


In [15]:
# Tune n_neighbors with 5-fold cross-validation (GridSearchCV) on the training split.

from sklearn.model_selection import GridSearchCV

classifier = KNeighborsClassifier()
param_grid = {"n_neighbors": [3, 5, 7, 9]}

# No `scoring` argument is passed, so candidates are ranked by the default
# score (accuracy); pass scoring="recall" to rank them by recall instead.
classifierCV = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5)

# NOTE(review): X_train_scaled was standardised using the whole training split,
# so every validation fold has already contributed to the scaling statistics.
classifierCV.fit(X_train_scaled, y_train)

# Predictions come from the search object after fitting.
y_pred = classifierCV.predict(X_test_scaled)

for label, metric in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(label, metric(y_test, y_pred))

# Mean cross-validated score for each candidate value of n_neighbors.
res = pd.DataFrame(classifierCV.cv_results_)
print(res.loc[:, ["param_n_neighbors", "mean_test_score"]])

print(classifierCV.best_params_)

recall score:  0.875
accuracy score:  0.9016393442622951
precision score:  0.9333333333333333
   param_n_neighbors  mean_test_score
0                  3         0.805782
1                  5         0.814116
2                  7         0.801616
3                  9         0.801786
{'n_neighbors': 5}


In [18]:
# A Pipeline is used to prevent data leakage: the scaler becomes a step of the
# estimator, so it is fitted as part of the model rather than ahead of the search.
from sklearn.pipeline import Pipeline

# Same split settings as before (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier()),
])

# The "knn__" prefix addresses the n_neighbors parameter of the step named "knn".
param_grid = {"knn__n_neighbors": [3, 5, 7, 9]}

# Rank candidates by recall instead of the default accuracy score.
classifierCV = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring="recall")

# Unscaled features are passed in; scaling happens inside the pipeline.
classifierCV.fit(X_train, y_train)

y_pred = classifierCV.predict(X_test)

for label, metric in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(label, metric(y_test, y_pred))

recall score:  0.90625
accuracy score:  0.9180327868852459
precision score:  0.9354838709677419
