In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn import tree
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [2]:
# Data
df = pd.read_excel('heart_disease.xlsx', sheet_name='Heart_disease')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
0,63,Male,typical angina,145,233,True,lv hypertrophy,150,False,2.3,downsloping,fixed defect,0
1,41,Male,atypical angina,135,203,False,normal,132,False,0.0,flat,fixed defect,0
2,57,Male,asymptomatic,140,192,False,normal,148,False,0.4,flat,fixed defect,0
3,52,Male,typical angina,118,186,False,lv hypertrophy,190,False,0.0,flat,fixed defect,0
4,57,Male,asymptomatic,110,201,False,normal,126,True,1.5,flat,fixed defect,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,53,Male,asymptomatic,125,0,False,normal,120,False,1.5,upsloping,reversable defect,4
904,62,Male,asymptomatic,166,170,False,st-t abnormality,120,True,3.0,flat,reversable defect,4
905,56,Male,non-anginal,170,0,False,lv hypertrophy,123,True,2.5,downsloping,normal,4
906,56,Male,non-anginal,144,208,True,st-t abnormality,105,TURE,,downsloping,fixed defect,4


In [3]:
df.fillna(0, inplace=True)

In [4]:
df.drop_duplicates(inplace=True)
df = df.reset_index(drop=True)

In [5]:
outliers_trestbps = df[(df['trestbps'] < 80) | (df['trestbps'] > 180)]
outliers_trestbps

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
243,39,Male,atypical angina,190,241,False,normal,106,False,0.0,flat,normal,0
383,63,Female,atypical angina,181,0,False,normal,87,TURE,0.0,flat,fixed defect,0
384,74,Male,non-anginal,188,0,False,normal,132,FALSE,0.0,flat,normal,0
387,45,Male,non-anginal,193,236,False,normal,188,TURE,0.0,flat,reversable defect,0
396,55,Male,non-anginal,196,406,False,st-t abnormality,72,TURE,0.0,downsloping,fixed defect,0
487,54,Male,atypical angina,192,283,False,lv hypertrophy,195,False,0.0,upsloping,reversable defect,1
488,63,Male,asymptomatic,185,0,False,normal,98,True,0.0,upsloping,reversable defect,1
589,54,Male,asymptomatic,200,198,False,normal,142,True,2.0,flat,normal,1
765,69,Male,asymptomatic,184,236,True,normal,131,TURE,0.0,flat,normal,2
767,60,Male,asymptomatic,192,281,False,st-t abnormality,107,TURE,0.0,upsloping,normal,2


In [6]:
df1 = df.drop(outliers_trestbps.index)

In [7]:
df1

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
0,63,Male,typical angina,145,233,True,lv hypertrophy,150,False,2.3,downsloping,fixed defect,0
1,41,Male,atypical angina,135,203,False,normal,132,False,0.0,flat,fixed defect,0
2,57,Male,asymptomatic,140,192,False,normal,148,False,0.4,flat,fixed defect,0
3,52,Male,typical angina,118,186,False,lv hypertrophy,190,False,0.0,flat,fixed defect,0
4,57,Male,asymptomatic,110,201,False,normal,126,True,1.5,flat,fixed defect,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,53,Male,asymptomatic,125,0,False,normal,120,False,1.5,upsloping,reversable defect,4
903,62,Male,asymptomatic,166,170,False,st-t abnormality,120,True,3.0,flat,reversable defect,4
904,56,Male,non-anginal,170,0,False,lv hypertrophy,123,True,2.5,downsloping,normal,4
905,56,Male,non-anginal,144,208,True,st-t abnormality,105,TURE,0.0,downsloping,fixed defect,4


In [8]:
chol_out = df1[(df1['chol'] < 100) | (df1['chol'] > 420)]
chol_rows = chol_out.shape[0]
print("Number of outliers:", chol_rows)

Number of outliers: 179


In [9]:
thalch_out = df1[(df1['thalch'] < 60) | (df1['thalch'] > 200)]
thalch_rows = thalch_out.shape[0]
print("Number of outliers:", thalch_rows)

Number of outliers: 1


In [10]:
outliers_old = df1[(df1['oldpeak'] < -2) | (df1['oldpeak'] > 4)]

In [11]:
df2 = df1.drop(outliers_old.index)

In [12]:
df2

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
0,63,Male,typical angina,145,233,True,lv hypertrophy,150,False,2.3,downsloping,fixed defect,0
1,41,Male,atypical angina,135,203,False,normal,132,False,0.0,flat,fixed defect,0
2,57,Male,asymptomatic,140,192,False,normal,148,False,0.4,flat,fixed defect,0
3,52,Male,typical angina,118,186,False,lv hypertrophy,190,False,0.0,flat,fixed defect,0
4,57,Male,asymptomatic,110,201,False,normal,126,True,1.5,flat,fixed defect,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,53,Male,asymptomatic,125,0,False,normal,120,False,1.5,upsloping,reversable defect,4
903,62,Male,asymptomatic,166,170,False,st-t abnormality,120,True,3.0,flat,reversable defect,4
904,56,Male,non-anginal,170,0,False,lv hypertrophy,123,True,2.5,downsloping,normal,4
905,56,Male,non-anginal,144,208,True,st-t abnormality,105,TURE,0.0,downsloping,fixed defect,4


In [13]:
thalch_out = df2[(df2['thalch'] < 60) | (df2['thalch'] > 200)]
thalch_rows = thalch_out.shape[0]
print("Number of outliers:", thalch_rows)

Number of outliers: 1


In [14]:
outliers_tha = df2[(df2['thalch'] < 60) | (df2['thalch'] > 200)]
df3 = df2.drop(outliers_tha.index)

In [15]:
df3.shape

(878, 13)

In [16]:
chol_out = df3[(df3['chol'] < 100) | (df3['chol'] > 420)]
chol_rows = chol_out.shape[0]
print("Number of outliers:", chol_rows)

Number of outliers: 178


In [17]:
outliers_chol = df3[(df3['chol'] < 100) | (df3['chol'] > 420)]
df4 = df3.drop(outliers_chol.index)

In [18]:
df4.shape

(700, 13)

In [19]:
df4['exang'] = df4['exang'].replace({'FALSE': False, 'TURE': True})

In [20]:
numerical = ['age','trestbps', 'chol', 'thalch', 'oldpeak' ]
df_std = {}

for col in numerical:
    df_std[col] = StandardScaler()
    df4[col] = df_std[col].fit_transform(df4[numerical])

In [21]:
categorical = ['sex', 'cp', 'restecg', 'slope', 'thal', 'fbs', 'exang']
label_encoders = {}

for col in categorical:
    label_encoders[col] = LabelEncoder()
    df4[col] = label_encoders[col].fit_transform(df4[col])

In [22]:
df4.reset_index(drop=True, inplace=True)

In [23]:
data = df4.copy()

In [24]:
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,num
0,1.072096,1,3,1.072096,1.072096,1,0,1.072096,0,1.072096,0,0,0
1,-1.336405,1,1,-1.336405,-1.336405,0,1,-1.336405,0,-1.336405,1,0,0
2,0.415232,1,0,0.415232,0.415232,0,1,0.415232,0,0.415232,1,0,0
3,-0.132155,1,3,-0.132155,-0.132155,0,0,-0.132155,0,-0.132155,1,0,0
4,0.415232,1,0,0.415232,0.415232,0,1,0.415232,1,0.415232,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1.291050,1,0,1.291050,1.291050,0,1,1.291050,1,1.291050,0,2,4
696,0.962618,1,0,0.962618,0.962618,1,2,0.962618,1,0.962618,1,1,4
697,0.962618,1,0,0.962618,0.962618,0,2,0.962618,1,0.962618,1,2,4
698,0.305755,1,2,0.305755,0.305755,1,2,0.305755,1,0.305755,0,0,4


In [26]:
X = data.iloc[:,0:12]
y = data['num']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=5)
model.fit(X_train, y_train)

In [29]:
preds=model.predict(X_test)
pd.Series(preds).value_counts()

0    90
1    30
2    20
dtype: int64

In [30]:
pd.crosstab(y_test,preds)

col_0,0,1,2
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,73,8,3
1,11,13,7
2,0,2,5
3,5,6,2
4,1,1,3


In [31]:
np.mean(preds==y_test)

0.65