In [1]:
#import the modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [6]:
#load dataset
data=pd.read_csv("heart_disease_uci.csv")
data.head(3)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1


Analyze dataset for null values

In [7]:
#check for null attributes
print("\nNULL ATTRIBUTES\n")
print(data.isnull().sum()) 



NULL ATTRIBUTES

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


Attributes info:

id (Unique id for each patient)

age (Age of the patient in years)

dataset (place of study)

sex (Male/Female)

cp chest pain type ([typical angina, atypical angina, non-anginal, asymptomatic])

trestbps resting blood pressure (resting blood pressure (in mm Hg on admission to the hospital))

chol (serum cholesterol in mg/dl)

fbs (if fasting blood sugar > 120 mg/dl)

restecg (resting electrocardiographic results) -- Values: [normal, stt abnormality, lv hypertrophy]

thalach: maximum heart rate achieved

exang: exercise-induced angina (True/ False)

oldpeak: ST depression induced by exercise relative to rest

slope: the slope of the peak exercise ST segment

ca: number of major vessels (0-3) colored by fluoroscopy

thal: [normal; fixed defect; reversible defect]

num: the predicted attribute(0: no chance, 1: chance)

In [8]:
#drop non required columns
data=data.drop(['dataset','id'], axis=1)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [9]:
#replace null values with mean
data['trestbps']=data['trestbps'].replace(np.nan,data['trestbps'].mean() )
data['chol']=data['chol'].replace(np.nan,data['chol'].mean() )
data['thalch']=data['thalch'].replace(np.nan,data['thalch'].mean() )
data['oldpeak']=data['oldpeak'].replace(np.nan,data['oldpeak'].mean() )
data['ca']=data['ca'].replace(np.nan,data['ca'].mean() )


In [10]:

#process categorical columns
# Categorical columns to process
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang','thal', 'slope']

# 1. Fill missing values in categorical columns with the most frequent value
for col in categorical_cols:
    mode_val = data[col].mode()[0]
    data[col] = data[col].fillna(mode_val)

# 2. Convert to categorical (optional)
for col in categorical_cols:
    data[col] = data[col].astype('category')

# 3. One-Hot Encode
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Preview
print(data_encoded.head(3))


   age  trestbps   chol  thalch  oldpeak   ca  num  sex_Male  \
0   63     145.0  233.0   150.0      2.3  0.0    0      True   
1   67     160.0  286.0   108.0      1.5  3.0    1      True   
2   67     120.0  229.0   129.0      2.6  2.0    1      True   

   cp_atypical angina  cp_non-anginal  cp_typical angina  fbs_True  \
0               False           False               True      True   
1               False           False              False     False   
2               False           False              False     False   

   restecg_normal  restecg_st-t abnormality  exang_True  thal_normal  \
0           False                     False       False        False   
1           False                     False        True         True   
2           False                     False        True        False   

   thal_reversable defect  slope_flat  slope_upsloping  
0                   False       False            False  
1                   False        True            False  
2 

  data[col] = data[col].fillna(mode_val)


In [11]:
final_data=data_encoded

In [12]:
# train using logistic regression
# Step 1: Split the data into features (X) and target (y)
X = final_data.drop('num', axis=1)  # Assuming 'num' is the target column
y = final_data['num']

# Step 2: Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the features (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Step 5: Predict on the test set
y_pred = model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy: {accuracy:.4f}")
print(f"Confusion Matrix (RF):\n{conf_matrix}")
print(f"Classification Report (RF):\n{class_report}")

Accuracy: 0.8043
Confusion Matrix (RF):
[[62 13]
 [23 86]]
Classification Report (RF):
              precision    recall  f1-score   support

           0       0.73      0.83      0.78        75
           1       0.87      0.79      0.83       109

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.81      0.80      0.81       184



In [13]:
#Random forest Algorithm


# Step 1: Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Step 2: Predict on the test set
y_pred_rf = rf_model.predict(X_test_scaled)

# Step 3: Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)

# Output the results
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print(f"Confusion Matrix (RF):\n{conf_matrix_rf}")
print(f"Classification Report (RF):\n{class_report_rf}")


Random Forest Accuracy: 0.8533
Confusion Matrix (RF):
[[64 11]
 [16 93]]
Classification Report (RF):
              precision    recall  f1-score   support

           0       0.80      0.85      0.83        75
           1       0.89      0.85      0.87       109

    accuracy                           0.85       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.86      0.85      0.85       184

