In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
cd /content/drive/MyDrive/

/content/drive/MyDrive


In [47]:
ls

[0m[01;34m'Colab Notebooks'[0m/   [01;34mFinalProject_DM[0m/   [01;34mProject[0m/


In [48]:
cd FinalProject_DM/

/content/drive/MyDrive/FinalProject_DM


In [49]:
pwd

'/content/drive/MyDrive/FinalProject_DM'

In [50]:
# Importing the important libraries

import numpy as np
import random
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score, f1_score

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV


import warnings
warnings.filterwarnings('ignore')


In [51]:
# Loading the dataset

df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [52]:
print(df.shape)
print(df["HeartDisease"].value_counts())

(918, 12)
1    508
0    410
Name: HeartDisease, dtype: int64


In [53]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [54]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [56]:
# Seperating the dependent and the independent columns

X = df.drop("HeartDisease" , axis = 1)
y = df['HeartDisease']

In [57]:
# Seperating the numerical and the categorical columns

cat_cols = list(df.select_dtypes(include=['object']).columns)
num_cols = []

for i in list(X.columns):
  if i not in cat_cols:
    num_cols.append(i)

In [58]:
print(cat_cols , num_cols)

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'] ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']


In [59]:
normal_cols = ["Age","MaxHR","RestingBP"]
for col in normal_cols:
    mean = np.mean(df[col])
    std = np.std(df[col])
    lower_range = mean - (3*std)
    upper_range = mean + (3*std)
    df[col] = np.where(((df[col] < lower_range) | (df[col] > upper_range))
                            ,random.randint(int(lower_range),int(upper_range)),df[col])

In [60]:
IQR = np.percentile(df["Cholesterol"],75) - np.percentile(df["Cholesterol"],25)
lower_bound = np.percentile(df["Cholesterol"],25) - 1.5 * IQR
upper_bound = np.percentile(df["Cholesterol"],75) + 1.5 * IQR
median_cholesterol = np.median(df["Cholesterol"])

df["Cholesterol"] = np.where(((df["Cholesterol"] > upper_bound) | (df["Cholesterol"] < lower_bound)) 
                                 ,random.randint(int(np.percentile(df["Cholesterol"],25)),
                                                 int(np.percentile(df["Cholesterol"],75))),df["Cholesterol"])

In [61]:
# Seperating the train and test dataset
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify = y , random_state=42,test_size=0.2)

In [62]:
x_train[:3]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
485,63,M,ATA,139,217,1,ST,128,Y,1.2,Flat
486,55,M,ATA,110,214,1,ST,180,N,0.4,Up
117,59,F,ASY,130,338,1,ST,130,Y,1.5,Flat


In [63]:
# Standardadising the features of training and testing dataset

scaler = StandardScaler()
scaler.fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])

In [64]:
X = pd.get_dummies(data = X , drop_first=True)

In [65]:
# Seperating the train and test dataset
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify = y , random_state=42,test_size=0.2)

In [66]:
x_train[:3]

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
485,1.006537,0.356867,0.166481,1.813758,-0.346192,0.293283,1,1,0,0,0,1,1,1,0
486,0.157954,-1.210356,0.13904,1.813758,1.697314,-0.457194,1,1,0,0,0,1,0,0,1
117,0.582246,-0.129513,1.273277,1.813758,-0.267596,0.574711,0,0,0,0,0,1,1,1,0


# KNN Algorithm

In [67]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train,y_train)
y_pred_knn = knn.predict(x_test)

print(knn.score(x_test,y_test))
print(classification_report(y_test,y_pred_knn))
print(confusion_matrix(y_test,y_pred_knn))
#compare with table

0.8967391304347826
              precision    recall  f1-score   support

           0       0.90      0.87      0.88        82
           1       0.90      0.92      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.90       184
weighted avg       0.90      0.90      0.90       184

[[71 11]
 [ 8 94]]


# SVM Algorithm

In [68]:
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=27)
classifier.fit(x_train, y_train)

SVC(random_state=27)

In [69]:
y_pred_svm = classifier.predict(x_test)

In [70]:
print(classifier.score(x_test,y_test))
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test,y_pred_svm))

0.8858695652173914
              precision    recall  f1-score   support

           0       0.89      0.85      0.87        82
           1       0.89      0.91      0.90       102

    accuracy                           0.89       184
   macro avg       0.89      0.88      0.88       184
weighted avg       0.89      0.89      0.89       184

[[70 12]
 [ 9 93]]


# Random Forest Algorithm

In [71]:
randomforest = RandomForestClassifier()
randomforest.fit(x_train,y_train)
y_pred_rf = randomforest.predict(x_test)

In [72]:
print('Train Accuracy %s' % round(accuracy_score(y_test, y_pred_rf),2))
print('Train F1-score %s' % f1_score(y_test, y_pred_rf, average=None))
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n",confusion_matrix(y_test, y_pred_rf))

Train Accuracy 0.86
Train F1-score [0.84146341 0.87254902]
              precision    recall  f1-score   support

           0       0.84      0.84      0.84        82
           1       0.87      0.87      0.87       102

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184

Confusion Matrix:
 [[69 13]
 [13 89]]


In [73]:
knn_acc = []
knn_tss = []
knn_prec = []
knn_tn = []
knn_tp = []
knn_fn = []
knn_fp = []

svm_acc = []
svm_tss = []
svm_prec = []
svm_tn = []
svm_tp = []
svm_fn = []
svm_fp = []

randmf_acc = []
randmf_tss = []
randmf_prec = []
randmf_tn = []
randmf_tp = []
randmf_fn = []
randmf_fp = []

for i in range(0,11):

  x_train,x_test,y_train,y_test = train_test_split(X,y,stratify = y , random_state=42,test_size=0.3)
  
  ## Running KNN Algorithm
  knn.fit(x_train,y_train)
  y_pred_knn = knn.predict(x_test)

  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel()
  knn_tn.append(tn)
  knn_tp.append(fp)
  knn_fn.append(fn)
  knn_fp.append(tp)

  acck = (tp + tn) / (tn + fp + fn + tp)
  tss = (tp / (tp - fn)) - (fp / (fp + tn))
  precision = tp / (tp + fp)

  knn_acc.append(acck)
  knn_tss.append(tss)
  knn_prec.append(precision)

  ## Running Random Forest Algorithm
  randomforest.fit(x_train,y_train)
  y_pred_rf = randomforest.predict(x_test)

  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
  randmf_tn.append(tn)
  randmf_tp.append(fp)
  randmf_fn.append(fn)
  randmf_fp.append(tp)

  accrf = (tp + tn) / (tn + fp + fn + tp)
  tss = (tp / (tp - fn)) - (fp / (fp + tn))
  precision = tp / (tp + fp)

  randmf_acc.append(accrf)
  randmf_tss.append(tss)
  randmf_prec.append(precision)

  ## Running SVM Algorithm
  classifier.fit(x_train, y_train)
  y_pred_svm = classifier.predict(x_test)

  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_svm).ravel()
  svm_tn.append(tn)
  svm_tp.append(fp)
  svm_fn.append(fn)
  svm_fp.append(tp)

  accs = (tp + tn) / (tn + fp + fn + tp)
  tss = (tp / (tp - fn)) - (fp / (fp + tn))
  precision = tp / (tp + fp)

  svm_acc.append(accs)
  svm_tss.append(tss)
  svm_prec.append(precision)

In [74]:
## Average: 
avg_knn_acc = sum(knn_acc) / len(knn_acc)
avg_knn_tss = sum(knn_tss) / len(knn_tss)
avg_knn_prec = sum(knn_prec) / len(knn_prec)
avg_knn_tn = sum(knn_tn) / len(knn_tn)
avg_knn_tp = sum(knn_tp) / len(knn_tp)
avg_knn_fn = sum(knn_fn) / len(knn_fn)
avg_knn_fp = sum(knn_fp) / len(knn_fp)


In [75]:
## Average: 
avg_svm_acc = sum(svm_acc) / len(svm_acc)
avg_svm_tss = sum(svm_tss) / len(svm_tss)
avg_svm_prec = sum(svm_prec) / len(svm_prec)
avg_svm_tn = sum(svm_tn) / len(svm_tn)
avg_svm_tp = sum(svm_tp) / len(svm_tp)
avg_svm_fn = sum(svm_fn) / len(svm_fn)
avg_svm_fp = sum(svm_fp) / len(svm_fp)

In [76]:
## Average: 
avg_randmf_acc = sum(randmf_acc) / len(randmf_acc)
avg_randmf_tss = sum(randmf_tss) / len(randmf_tss)
avg_randmf_prec = sum(randmf_prec) / len(randmf_prec)
avg_randmf_tn = sum(randmf_tn) / len(randmf_tn)
avg_randmf_tp = sum(randmf_tp) / len(randmf_tp)
avg_randmf_fn = sum(randmf_fn) / len(randmf_fn)
avg_randmf_fp = sum(randmf_fp) / len(randmf_fp)

In [77]:
table_cols = {'Algorithm' : ['KNN' , 'SVM' , 'Random Forest'] , 'TP' : [avg_knn_tp , avg_svm_tp , avg_randmf_tp] , 'FP' : [avg_knn_fp, avg_svm_fp, avg_randmf_fp] , 'FN' : [avg_knn_fn, avg_svm_fn, avg_randmf_fn] , 'TN' : [avg_knn_tn, avg_svm_tn, avg_randmf_tn] , 'ACC' : [avg_knn_acc, avg_svm_acc, avg_randmf_acc], 'TSS' : [avg_knn_tss , avg_svm_tss, avg_randmf_tss] , 'Precision' : [avg_knn_prec, avg_svm_prec, avg_randmf_prec]}


In [80]:
table_cols1 = pd.DataFrame.from_dict(table_cols)

In [79]:
table_cols1

Unnamed: 0,Algorithm,TP,FP,FN,TN,ACC,TSS,Precision
0,KNN,18.0,137.0,16.0,105.0,0.876812,0.98589,0.883871
1,SVM,14.0,143.0,10.0,109.0,0.913043,0.961367,0.910828
2,Random Forest,15.818182,139.818182,13.181818,107.181818,0.894928,0.975593,0.898436
