Import libraries

In [47]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler

**Upload Data**

We load the data into a pandas DataFrame.

In [48]:
# Location of the CSV. NOTE(review): this looks like a Colab-mounted Google Drive
# path, so the cell presumably only runs where that drive is mounted —
# TODO: make the data directory configurable.
vioPath = r'/content/drive/MyDrive/Colab Notebooks/DS/heart_disease_health_indicators.csv'
# Read the whole CSV into a DataFrame.
df = pd.read_csv(vioPath)

# Pre-Processing Data

In [49]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [50]:
# (rows, columns) of the loaded frame.
df.shape

(253680, 22)

**Check empty values**

In [51]:
# Percentage of missing cells over the whole frame (0.0 means no NaNs at all).
# np.prod replaces np.product, which is deprecated and removed in NumPy 2.0.
missing_pct = df.isnull().sum().sum() / np.prod(df.shape) * 100
print(round(missing_pct, 2))

0.0


**Split Data**

In [52]:
# Separate the label column from the predictors.
target_col = 'HeartDiseaseorAttack'
y = np.array(df[target_col])
X = df.drop(columns=[target_col])

**Balance Data**

We performed undersampling because our target is not balanced. 

In [53]:
# Randomly drop majority-class rows until both classes have the same size.
# A fixed seed keeps the resampled set (and every score derived from it)
# reproducible across kernel restarts.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [54]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced data for testing. The fixed seed makes the split
# reproducible; stratifying keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size = 0.2, random_state = 42, stratify = y_resampled)

In [55]:
# Sanity check: number of training labels.
y_train.shape

(38228,)

In [56]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

(38228, 21)

**Scale Data**

In [57]:
from sklearn.preprocessing import StandardScaler

In [58]:
# Standardize features (zero mean, unit variance). The statistics are learned
# from the training split only and then reused for the test split.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

**Normalize Data**

In [59]:
from sklearn.preprocessing import MinMaxScaler

In [60]:
# Rescale every feature using the min/max observed in the training split only.
minMax = MinMaxScaler()
minMax.fit(X_train)
# Transform with minMax itself. The earlier code called the StandardScaler
# (`scaler`) here, so the "min-max" arrays were really standardized data.
X_train_minMax = minMax.transform(X_train)
X_test_minMax = minMax.transform(X_test)

# KNN 

In [61]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

Note: We have chosen a range of odd numbers for the k, to try to avoid "ties" as much as possible. 

**KNN with standardized data**

In [62]:
# Evaluate KNN on the standardized features for every odd k from 1 to 19.
collector = []
for k in range(1, 20, 2):
  knn = KNeighborsClassifier(n_neighbors = k)
  knn.fit(X_train_std, y_train)
  prediction = knn.predict(X_test_std)
  accuracy = accuracy_score(y_test, prediction)
  # Append inside the loop so every k gets a row (previously only the last k
  # was stored). float() works whether accuracy_score returns a Python float
  # or a NumPy scalar, unlike .astype().
  collector.append({"k" : k, "accuracy": float(accuracy)})

accuracy_scores_std_df = pd.DataFrame(collector)

KeyboardInterrupt: ignored

In [None]:
# Render the accuracy table built from the standardized-feature KNN run.
display(accuracy_scores_std_df)

In [None]:
# Accuracy as a function of k for the standardized features. Markers keep the
# data visible even when the table holds a single row; labels let the figure
# stand on its own.
fig, ax = plt.subplots()
ax.plot(accuracy_scores_std_df['k'], accuracy_scores_std_df['accuracy'], marker='o')
ax.set(xlabel='k (number of neighbors)', ylabel='Test accuracy',
       title='KNN accuracy vs. k (standardized features)')
ax.grid(True)
plt.show()

**KNN with Min-Max Normalization**

In [None]:
# Evaluate KNN on the min-max scaled features for every odd k from 1 to 19.
collector = []
for k in range(1, 20, 2):
  knn = KNeighborsClassifier(n_neighbors = k)
  knn.fit(X_train_minMax, y_train)
  prediction = knn.predict(X_test_minMax)
  accuracy = accuracy_score(y_test, prediction)
  # Append inside the loop so every k gets a row (previously only the last k
  # was stored). float() works whether accuracy_score returns a Python float
  # or a NumPy scalar, unlike .astype().
  collector.append({"k" : k, "accuracy": float(accuracy)})

accuracy_scores_minMax_df = pd.DataFrame(collector)

In [None]:
# Render the accuracy table built from the min-max KNN run.
display(accuracy_scores_minMax_df)

In [None]:
# Accuracy as a function of k for the min-max scaled features. Markers keep the
# data visible even when the table holds a single row; labels let the figure
# stand on its own.
fig, ax = plt.subplots()
ax.plot(accuracy_scores_minMax_df['k'], accuracy_scores_minMax_df['accuracy'], marker='o')
ax.set(xlabel='k (number of neighbors)', ylabel='Test accuracy',
       title='KNN accuracy vs. k (min-max scaled features)')
ax.grid(True)
plt.show()

In [None]:
# Compare the mean recorded accuracy of the two scaling schemes. The second
# message used to be labelled "Standardization" as well, which made the two
# lines indistinguishable.
print(f"Average accuracy for Standardization {accuracy_scores_std_df['accuracy'].mean()}")
print(f"Average accuracy for Min-Max Normalization {accuracy_scores_minMax_df['accuracy'].mean()}")

## Optimize KNN with GridSearchCV

In [None]:
# Continue with the standardized features: from this cell on, X_train / X_test
# refer to the scaled arrays rather than the raw resampled split.
X_train, X_test = X_train_std, X_test_std

We optimize the KNN classifier with an exhaustive grid search (GridSearchCV), looking for the best number of neighbors and the best distance metric.


In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# KNN estimator with default settings; it is handed to the hyperparameter search below.
knn = KNeighborsClassifier()

We generate a list of possible values for k, which are all odd to avoid possible "ties".

In [None]:
# Candidate neighbor counts: every odd number from 1 through 19.
k = list(range(1, 20, 2))

We build the parameter grid to test in the search. The parameters are the number of neighbors and the power parameter p of the Minkowski distance: p=1 is equivalent to the Manhattan distance, and p=2 to the Euclidean distance.

In [None]:
# Search grid: neighbor counts crossed with the Minkowski power parameter.
params = dict(n_neighbors=k, p=[1, 2])

cv=5 means cross validation with 5 folds

In [None]:
# Despite the variable name, this is an exhaustive grid search: every
# combination in `params` is scored with 5-fold cross-validation.
random_search = GridSearchCV(estimator=knn, param_grid=params, cv=5)
random_search.fit(X_train, y_train)

Inspect the best hyperparameters found by the search.

In [None]:
# Parameter combination that achieved the best cross-validation score.
random_search.best_params_

In [None]:
# Pull the selected neighbor count out of the best parameter set.
best_k = random_search.best_params_.get('n_neighbors')
print(best_k)

In [None]:
# Selected Minkowski power parameter (1 = Manhattan, 2 = Euclidean).
p =random_search.best_params_.get('p')

Check if we get the same or very similar result: 

In [None]:
# Refit a KNN with the selected hyperparameters on the full training split and
# score it on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=best_k, p=p).fit(X_train, y_train)
prediction = knn.predict(X_test)
accuracy = accuracy_score(y_test, prediction)
accuracy

# Other Evaluations

## Confusion Matrix

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and draws the same
# matrix for the fitted classifier on the test split.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(knn, X_test, y_test)

We have gotten a relatively high number of false negatives. 