In [51]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt


In [52]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("redwankarimsony/heart-disease-data")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/heart-disease-data


In [53]:
df = pd.read_csv("/kaggle/input/heart-disease-data/heart_disease_uci.csv")
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


#Check for missing values

In [54]:
df.isnull().sum()

Unnamed: 0,0
id,0
age,0
sex,0
dataset,0
cp,0
trestbps,59
chol,30
fbs,90
restecg,2
thalch,55


#It appears that we have many null values so I need to fill them up

In [55]:
num_cols = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
cat_cols = ['fbs', 'exang', 'restecg', 'slope', 'thal']


In [56]:
for col in num_cols:
    skew_val = df[col].skew()
    if abs(skew_val) > 0.5:
        df[col] = df[col].fillna(df[col].median())
        print(f"Filled {col} with median (skew={skew_val:.2f})")
    else:
        df[col] = df[col].fillna(df[col].mean())
        print(f"Filled {col} with mean (skew={skew_val:.2f})")


Filled trestbps with mean (skew=0.21)
Filled chol with median (skew=-0.61)
Filled thalch with mean (skew=-0.21)
Filled oldpeak with median (skew=1.04)
Filled ca with median (skew=1.17)


In [57]:
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])
    print(f"Filled {col} with mode: {df[col].mode()[0]}")


Filled fbs with mode: False
Filled exang with mode: False
Filled restecg with mode: normal
Filled slope with mode: flat
Filled thal with mode: normal


  df[col] = df[col].fillna(df[col].mode()[0])


In [58]:
df.isnull().sum()

Unnamed: 0,0
id,0
age,0
sex,0
dataset,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalch,0


In [59]:
df = df.drop(["id","dataset"],axis=1)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


#I need to convert non numerical values to check for corrleation

In [60]:
le = LabelEncoder()

# Applying the label encoding to binary columns or odinal if I have ordinal columns
df['sex'] = le.fit_transform(df['sex'])      # Male=1, Female=0
df['fbs'] = df['fbs'].astype(int)            # True → 1, False → 0
df['exang'] = df['exang'].astype(int)        # True → 1, False → 0

In [61]:
# One-hot encoding for multi-category columns
df = pd.get_dummies(df, columns=['cp', 'restecg', 'slope', 'thal'], drop_first=True)


In [62]:
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,ca,num,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,63,1,145.0,233.0,1,150.0,0,2.3,0.0,0,False,False,True,False,False,False,False,False,False
1,67,1,160.0,286.0,0,108.0,1,1.5,3.0,2,False,False,False,False,False,True,False,True,False
2,67,1,120.0,229.0,0,129.0,1,2.6,2.0,1,False,False,False,False,False,True,False,False,True
3,37,1,130.0,250.0,0,187.0,0,3.5,0.0,0,False,True,False,True,False,False,False,True,False
4,41,0,130.0,204.0,0,172.0,0,1.4,0.0,0,True,False,False,False,False,False,True,True,False


In [63]:
df["num"].value_counts()

Unnamed: 0_level_0,count
num,Unnamed: 1_level_1
0,411
1,265
2,109
3,107
4,28


#It appears we have multiple classes but I'm trying to predict whether the person has a heart disease or no so I'll convert this problem to binary classfication as 0 doesn't have a disease and 1 does have the disease

In [64]:
df["num"]= df["num"].apply(lambda x:1 if x>0 else 0)
df["num"].value_counts()


Unnamed: 0_level_0,count
num,Unnamed: 1_level_1
1,509
0,411


In [65]:
target_corr = df.corr()['num'].sort_values(ascending=False)
target_corr


Unnamed: 0,num
num,1.0
exang,0.433605
oldpeak,0.366138
sex,0.307284
age,0.2827
thal_reversable defect,0.257029
ca,0.164755
fbs,0.108071
restecg_st-t abnormality,0.10475
trestbps,0.102922


In [66]:
target_corr.index

Index(['num', 'exang', 'oldpeak', 'sex', 'age', 'thal_reversable defect', 'ca',
       'fbs', 'restecg_st-t abnormality', 'trestbps', 'slope_flat',
       'cp_typical angina', 'restecg_normal', 'slope_upsloping',
       'cp_non-anginal', 'chol', 'thal_normal', 'thalch',
       'cp_atypical angina'],
      dtype='object')

In [67]:
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,ca,num,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,63,1,145.0,233.0,1,150.0,0,2.3,0.0,0,False,False,True,False,False,False,False,False,False
1,67,1,160.0,286.0,0,108.0,1,1.5,3.0,1,False,False,False,False,False,True,False,True,False
2,67,1,120.0,229.0,0,129.0,1,2.6,2.0,1,False,False,False,False,False,True,False,False,True
3,37,1,130.0,250.0,0,187.0,0,3.5,0.0,0,False,True,False,True,False,False,False,True,False
4,41,0,130.0,204.0,0,172.0,0,1.4,0.0,0,True,False,False,False,False,False,True,True,False


In [68]:
X = df.drop("num",axis=1)
Y = df["num"]
print(X,Y)

     age  sex    trestbps   chol  fbs      thalch  exang  oldpeak   ca  \
0     63    1  145.000000  233.0    1  150.000000      0      2.3  0.0   
1     67    1  160.000000  286.0    0  108.000000      1      1.5  3.0   
2     67    1  120.000000  229.0    0  129.000000      1      2.6  2.0   
3     37    1  130.000000  250.0    0  187.000000      0      3.5  0.0   
4     41    0  130.000000  204.0    0  172.000000      0      1.4  0.0   
..   ...  ...         ...    ...  ...         ...    ...      ...  ...   
915   54    0  127.000000  333.0    1  154.000000      0      0.0  0.0   
916   62    1  132.132404  139.0    0  137.545665      0      0.5  0.0   
917   55    1  122.000000  223.0    1  100.000000      0      0.0  0.0   
918   58    1  132.132404  385.0    1  137.545665      0      0.5  0.0   
919   62    1  120.000000  254.0    0   93.000000      1      0.0  0.0   

     cp_atypical angina  cp_non-anginal  cp_typical angina  restecg_normal  \
0                 False          

In [69]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,stratify=Y,test_size=0.2,shuffle=True)
print(X_train.shape,X_test.shape)

(736, 18) (184, 18)


In [70]:
scaler = StandardScaler()
X_train_standarized = scaler.fit_transform(X_train)
X_test_standarized = scaler.transform(X_test)



#I'm going to try different models just to see which one gives me the best result  

In [71]:
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
]
# storing the models in list of tuples just for easier code
for name, model in models:
    model.fit(X_train_standarized, Y_train)
    y_pred = model.predict(X_test_standarized)

    print(name)
    print("Accuracy:", accuracy_score(Y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))
    print("Classification Report:\n", classification_report(Y_test, y_pred))
    print("\n")


Logistic Regression
Accuracy: 0.7880434782608695
Confusion Matrix:
 [[62 20]
 [19 83]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.76      0.76        82
           1       0.81      0.81      0.81       102

    accuracy                           0.79       184
   macro avg       0.79      0.78      0.79       184
weighted avg       0.79      0.79      0.79       184



Random Forest
Accuracy: 0.8315217391304348
Confusion Matrix:
 [[64 18]
 [13 89]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.78      0.81        82
           1       0.83      0.87      0.85       102

    accuracy                           0.83       184
   macro avg       0.83      0.83      0.83       184
weighted avg       0.83      0.83      0.83       184



Gradient Boosting
Accuracy: 0.8206521739130435
Confusion Matrix:
 [[65 17]
 [16 86]]
Classification Report:
               pre