# Loading Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
url ='https://raw.githubusercontent.com/Adu9o9/Heart-Disease-Prediction-ML/refs/heads/main/heart_disease_uci.csv'
df = pd.read_csv(url)
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [None]:
print("--- Initial Data (First 5 Rows) ---")
print(df.head())

--- Initial Data (First 5 Rows) ---
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  


In [None]:
print("\n--- Data Info ---")
df.info()


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


## Data Preprocessing


In [None]:
print("\n--- Null Values Before Cleaning ---")
print(df.isnull().sum())


--- Null Values Before Cleaning ---
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [None]:
# We will fill the missing values in 'ca' with the median of its column.
# The median is a good choice as it's less sensitive to outliers than the mean.
df['ca'] = df['ca'].fillna(df['ca'].median())
df['thal'] = df['thal'].fillna(df['thal'].mode()[0])
df['slope'] = df['slope'].fillna(df['slope'].mode()[0])

In [None]:
print("\n--- Null Values After Cleaning ---")
print(df.isnull().sum())


--- Null Values After Cleaning ---
id           0
age          0
sex          0
dataset      0
cp           0
trestbps    59
chol        30
fbs         90
restecg      2
thalch      55
exang       55
oldpeak     62
slope        0
ca           0
thal         0
num          0
dtype: int64


In [None]:
df.shape


(920, 16)

In [None]:
df.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,920.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.227174,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.628936,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,0.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [None]:
# We convert 'num' (0,1,2,3,4) into 'disease' (0 or 1)
df['disease'] = df['num'].apply(lambda x: 1 if x > 0 else 0)


## Data Preparation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [None]:
# We use our new 'disease' column as the target.0 for no disease and 1 for some sort of heart disease
y = df['disease']
X = df.drop(columns=['num', 'disease'])
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,disease
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,flat,0.0,normal,1,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,flat,0.0,normal,0,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,flat,0.0,fixed defect,2,1
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,flat,0.0,normal,0,0


In [None]:
# Convert categorical text columns ('thal' and 'slope') into numbers
X = pd.get_dummies(X, columns=['thal', 'slope'], drop_first=True)


## Data Splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report

## Model Building & Evaluation
### Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
y_lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_lr_pred)
lr_precision = precision_score(y_test, y_lr_pred)

### RandomForest


In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_rf_pred)
rf_precision = precision_score(y_test, y_rf_pred, average='weighted') # Use 'weighted' average

In [None]:
from sklearn.metrics import classification_report

print("--- Multiclass Classification Report ---")
print(classification_report(y_test, y_rf_pred))

--- Multiclass Classification Report ---
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        82
           1       0.86      0.94      0.90       102

    accuracy                           0.88       184
   macro avg       0.89      0.87      0.88       184
weighted avg       0.88      0.88      0.88       184



## Model Comparison Table

In [None]:
df_models = pd.DataFrame({
    'Method': ['Logistic Regression', 'Random Forest'],
    'Test Accuracy': [lr_accuracy, rf_accuracy],
    'Test Weighted Precision': [lr_precision, rf_precision]
})

In [None]:
print("\n--- Model Comparison Table ---")
print(df_models.reset_index(drop=True))


--- Model Comparison Table ---
                Method  Test Accuracy  Test Weighted Precision
0  Logistic Regression       0.842391                 0.841121
1        Random Forest       0.880435                 0.883670


### CONCLUSION
"The objective of this project was to build a model to predict the presence of heart disease. Two models were trained and evaluated: Logistic Regression and Random Forest.

Based on the results, the Random Forest model was the superior performer, achieving an overall accuracy of 88.0% and a weighted precision of 88.4% on the test set. The detailed classification report reveals that the model was particularly effective at identifying patients with heart disease (Class 1), achieving a recall of 94%. This indicates a strong ability to correctly flag individuals who have the condition, which is a critical requirement for a medical diagnostic model. The Logistic Regression model was less effective, with an accuracy of 84.2%.

Therefore, the final recommendation is the Random Forest model for this prediction task."