## Heart Disease Classification

#### Index
1. Loading libraries and dataset
2. EDA
3. Cleaning and Preprocessing
4. Train Test Split
5. Training baseline models (LR, SVC, DT, KNN, GaussNB, RF, AdaBoost)
6. Optimize best 4
5. PCA
7. Train models on reduced ds

#### EDA + Preproc

In [63]:
## importing necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [37]:
df = pd.read_csv("uci_ds.csv")
df.head(10)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,6,56,Male,Cleveland,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,7,62,Female,Cleveland,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
7,8,57,Female,Cleveland,asymptomatic,120.0,354.0,False,normal,163.0,True,0.6,upsloping,0.0,normal,0
8,9,63,Male,Cleveland,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,2
9,10,53,Male,Cleveland,asymptomatic,140.0,203.0,True,lv hypertrophy,155.0,True,3.1,downsloping,0.0,reversable defect,1


In [39]:
ds = df.drop(columns='id')
ds

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [31]:
ds['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [50]:
for i in range(0, len(ds['sex'])):
    if ds.iloc[i, 1] == 'Male':
        ds.iloc[i, 1] = 1
    else:
        ds.iloc[i, 1] = 0

In [33]:
ds = ds.drop(columns='dataset')

In [34]:
ds['cp'].unique()

array(['typical angina', 'asymptomatic', 'non-anginal', 'atypical angina'],
      dtype=object)

In [35]:
ohe = OneHotEncoder(handle_unknown='ignore')
cp_ohe = ohe.fit_transform(ds[['cp']]).toarray()
cp_ohe = pd.DataFrame(cp_ohe)
cp_ohe = cp_ohe.rename(columns={0: "typical angina", 1: "asymptomatic", 2: "non-anginal", 3: "atypical angina"})
cp_ohe

Unnamed: 0,typical angina,asymptomatic,non-anginal,atypical angina
0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0
...,...,...,...,...
915,1.0,0.0,0.0,0.0
916,0.0,0.0,0.0,1.0
917,1.0,0.0,0.0,0.0
918,1.0,0.0,0.0,0.0


In [45]:
res = pd.concat([ds, cp_ohe], axis=1, join='inner')
res = res.drop(columns='dataset')
res = res.drop(columns='cp')
ds = res.copy()

In [46]:
ds

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,typical angina,asymptomatic,non-anginal,atypical angina
0,63,Male,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,0.0,0.0,0.0,1.0
1,67,Male,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2,1.0,0.0,0.0,0.0
2,67,Male,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,1.0,0.0,0.0,0.0
3,37,Male,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,0.0,0.0,1.0,0.0
4,41,Female,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1,1.0,0.0,0.0,0.0
916,62,Male,,139.0,False,st-t abnormality,,,,,,,0,0.0,0.0,0.0,1.0
917,55,Male,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2,1.0,0.0,0.0,0.0
918,58,Male,,385.0,True,lv hypertrophy,,,,,,,0,1.0,0.0,0.0,0.0


In [47]:
ds['fbs'].unique()

array([True, False, nan], dtype=object)

In [52]:
for i in range(0, len(ds['fbs'])):
    if ds.iloc[i, 4] == True:
        ds.iloc[i, 4] = 1
    if ds.iloc[i, 4] == False:
        ds.iloc[i, 4] = 0

In [54]:
for i in range(0, len(ds['exang'])):
    if ds.iloc[i, 7] == True:
        ds.iloc[i, 7] = 1
    if ds.iloc[i, 7] == False:
        ds.iloc[i, 7] = 0

In [56]:
ds['restecg'].unique()  ## wait for nan removal

array(['lv hypertrophy', 'normal', 'st-t abnormality', nan], dtype=object)

In [57]:
ds['slope'].unique()    ## w8 for nan removal

array(['downsloping', 'flat', 'upsloping', nan], dtype=object)

In [58]:
ds['thal'].unique()

array(['fixed defect', 'normal', 'reversable defect', nan], dtype=object)

In [60]:
ds_np = np.array(ds)

In [62]:
ds_np[0,:]

array([63, 1, 145.0, 233.0, 1, 'lv hypertrophy', 150.0, 0, 2.3,
       'downsloping', 0.0, 'fixed defect', 0, 0.0, 0.0, 0.0, 1.0],
      dtype=object)

In [66]:
ohe = OneHotEncoder(handle_unknown='ignore')
cp_ohe = ohe.fit_transform(ds[['restecg']]).toarray()
cp_ohe = pd.DataFrame(cp_ohe)
cp_ohe = cp_ohe.rename(columns={0: "lv hypertrophy", 1: "normal", 2: "st-t abnormality", 3: 'nan'})
cp_ohe

Unnamed: 0,lv hypertrophy,normal,st-t abnormality,nan
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
915,0.0,0.0,1.0,0.0
916,0.0,0.0,1.0,0.0
917,0.0,0.0,1.0,0.0
918,1.0,0.0,0.0,0.0


In [67]:
res = pd.concat([ds, cp_ohe], axis=1, join='inner')
res = res.drop(columns='restecg')
ds = res.copy()
ds

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,slope,ca,thal,num,typical angina,asymptomatic,non-anginal,atypical angina,lv hypertrophy,normal,st-t abnormality,nan
0,63,1,145.0,233.0,1,150.0,0,2.3,downsloping,0.0,fixed defect,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,67,1,160.0,286.0,0,108.0,1,1.5,flat,3.0,normal,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,67,1,120.0,229.0,0,129.0,1,2.6,flat,2.0,reversable defect,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,37,1,130.0,250.0,0,187.0,0,3.5,downsloping,0.0,normal,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,41,0,130.0,204.0,0,172.0,0,1.4,upsloping,0.0,normal,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,127.0,333.0,1,154.0,0,0.0,,,,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
916,62,1,,139.0,0,,,,,,,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
917,55,1,122.0,223.0,1,100.0,0,0.0,,,fixed defect,2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
918,58,1,,385.0,1,,,,,,,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [77]:
ds = ds.drop(columns='thal')
ds

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,ca,num,typical angina,asymptomatic,non-anginal,atypical angina,lv hypertrophy,normal,st-t abnormality
0,63,1,145.0,233.0,1,150.0,0,2.3,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,67,1,160.0,286.0,0,108.0,1,1.5,3.0,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,67,1,120.0,229.0,0,129.0,1,2.6,2.0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,37,1,130.0,250.0,0,187.0,0,3.5,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,41,0,130.0,204.0,0,172.0,0,1.4,0.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,127.0,333.0,1,154.0,0,0.0,,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0
916,62,1,,139.0,0,,,,,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
917,55,1,122.0,223.0,1,100.0,0,0.0,,2,1.0,0.0,0.0,0.0,0.0,0.0,1.0
918,58,1,,385.0,1,,,,,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [80]:
ds_np = np.array(ds)

In [113]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
ds_np = imp_mean.fit_transform(ds_np)
ds_np

array([[ 63.        ,   1.        , 145.        , ...,   1.        ,
          0.        ,   0.        ],
       [ 67.        ,   1.        , 160.        , ...,   1.        ,
          0.        ,   0.        ],
       [ 67.        ,   1.        , 120.        , ...,   1.        ,
          0.        ,   0.        ],
       ...,
       [ 55.        ,   1.        , 122.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 58.        ,   1.        , 132.13240418, ...,   1.        ,
          0.        ,   0.        ],
       [ 62.        ,   1.        , 120.        , ...,   1.        ,
          0.        ,   0.        ]])

In [114]:
ds = pd.DataFrame(ds_np)
ds = ds.drop(columns=9)
ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16
0,63.0,1.0,145.000000,233.0,1.0,150.000000,0.000000,2.300000,0.000000,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,67.0,1.0,160.000000,286.0,0.0,108.000000,1.000000,1.500000,3.000000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,67.0,1.0,120.000000,229.0,0.0,129.000000,1.000000,2.600000,2.000000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,37.0,1.0,130.000000,250.0,0.0,187.000000,0.000000,3.500000,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,41.0,0.0,130.000000,204.0,0.0,172.000000,0.000000,1.400000,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54.0,0.0,127.000000,333.0,1.0,154.000000,0.000000,0.000000,0.676375,1.0,0.0,0.0,0.0,0.0,0.0,1.0
916,62.0,1.0,132.132404,139.0,0.0,137.545665,0.389595,0.878788,0.676375,0.0,0.0,0.0,1.0,0.0,0.0,1.0
917,55.0,1.0,122.000000,223.0,1.0,100.000000,0.000000,0.000000,0.676375,1.0,0.0,0.0,0.0,0.0,0.0,1.0
918,58.0,1.0,132.132404,385.0,1.0,137.545665,0.389595,0.878788,0.676375,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [115]:
features = np.array(ds)
features.shape

(920, 16)

In [116]:
target = ds_np[:, 9]
target.shape

(920,)

In [117]:
## saving as csv
feat_df = pd.DataFrame(features)
target_df = pd.DataFrame(target)

feat_df.to_csv('features.csv')
target_df.to_csv('target.csv')

#### Training basic ML models

In [136]:
## importing additional libraries

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [119]:
## loading the dataset

feat = pd.read_csv("features.csv")
targ = pd.read_csv("target.csv")
feat = feat.drop(columns='Unnamed: 0')
targ = targ.drop(columns='Unnamed: 0')
X = feat.to_numpy()
y = targ.to_numpy()


In [120]:
print(X.shape)
print(y.shape)

(920, 16)
(920, 1)


In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [131]:
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train.ravel())
# lr_pred = lr.predict(X_test)
lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5434782608695652

In [132]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train.ravel())
rf.score(X_test, y_test)

0.5434782608695652

In [134]:
sv = SVC()
sv.fit(X_train, y_train.ravel())
sv.score(X_test, y_test)

0.44565217391304346

In [138]:
adb = AdaBoostClassifier(algorithm='SAMME')
adb.fit(X_train, y_train.ravel())
adb.score(X_test, y_test)

0.5380434782608695