In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the data
data = pd.read_csv("HeartDisease.csv") 

In [3]:
# Check for missing values
print("Missing values:", data.isna().sum())

Missing values: age                    0
gender                 0
chest_pain             0
rest_bps               0
cholestrol             0
fasting_blood_sugar    0
rest_ecg               0
thalach                0
exer_angina            0
old_peak               0
slope                  0
ca                     0
thalassemia            0
target                 0
dtype: int64


In [27]:
# categorical features
df_cat =  data.nunique() < 10
categorical_features = list()
cat = dict(df_cat)
for x, y in cat.items():
    if y == True:
        categorical_features.append(x)
        
categorical_features


['gender',
 'chest_pain',
 'fasting_blood_sugar',
 'rest_ecg',
 'exer_angina',
 'slope',
 'ca',
 'thalassemia',
 'target']

In [28]:
numerical_columns = [x for x in data.columns.tolist() if x not in categorical_features]
numerical_columns

['age', 'rest_bps', 'cholestrol', 'thalach', 'old_peak']

In [24]:
# Feature scaling
data2 = pd.DataFrame()
scaler = StandardScaler()
numerical_features = [
    col for col in data.columns if col not in categorical_features and col != "target"
]
data2[numerical_features] = scaler.fit_transform(data[numerical_features])

In [34]:
df = pd.concat([data2, data[categorical_features]], axis=1)

In [35]:
df.sample(5)

Unnamed: 0,age,rest_bps,cholestrol,thalach,old_peak,gender,chest_pain,fasting_blood_sugar,rest_ecg,exer_angina,slope,ca,thalassemia,target
206,0.511041,-1.234996,-0.140381,-0.334401,0.138373,1,0,0,0,1,1,1,3,0
10,-0.040403,0.478391,-0.140381,0.452748,0.138373,1,0,0,1,0,2,0,2,1
47,-0.812425,0.364165,0.207478,0.277826,-0.896862,1,2,0,0,0,2,0,2,1
214,0.180175,-0.378302,0.052874,-0.24694,0.138373,1,0,1,0,1,1,1,2,0
272,1.393352,-0.663867,-0.179032,-3.439267,-0.034166,1,0,0,1,0,1,0,2,0


In [37]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,4.690051e-17,1.001654,-2.797624,-0.75728,0.069886,0.731619,2.49624
rest_bps,303.0,-7.035077e-16,1.001654,-2.148802,-0.663867,-0.092738,0.478391,3.905165
cholestrol,303.0,-1.113887e-16,1.001654,-2.32416,-0.681494,-0.121055,0.545674,6.140401
thalach,303.0,-6.800574e-16,1.001654,-3.439267,-0.706111,0.146634,0.715131,2.289429
old_peak,303.0,2.3450260000000003e-17,1.001654,-0.896862,-0.896862,-0.206705,0.483451,4.451851
gender,303.0,0.6831683,0.466011,0.0,0.0,1.0,1.0,1.0
chest_pain,303.0,0.9669967,1.032052,0.0,0.0,1.0,2.0,3.0
fasting_blood_sugar,303.0,0.1485149,0.356198,0.0,0.0,0.0,0.0,1.0
rest_ecg,303.0,0.5280528,0.52586,0.0,0.0,1.0,1.0,2.0
exer_angina,303.0,0.3267327,0.469794,0.0,0.0,0.0,1.0,1.0


In [40]:
df.drop_duplicates(inplace=True)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  302 non-null    float64
 1   rest_bps             302 non-null    float64
 2   cholestrol           302 non-null    float64
 3   thalach              302 non-null    float64
 4   old_peak             302 non-null    float64
 5   gender               302 non-null    int64  
 6   chest_pain           302 non-null    int64  
 7   fasting_blood_sugar  302 non-null    int64  
 8   rest_ecg             302 non-null    int64  
 9   exer_angina          302 non-null    int64  
 10  slope                302 non-null    int64  
 11  ca                   302 non-null    int64  
 12  thalassemia          302 non-null    int64  
 13  target               302 non-null    int64  
dtypes: float64(5), int64(9)
memory usage: 35.4 KB


In [42]:
# Separate features and target variable
X = df.drop("target", axis=1)
y = df["target"]

In [43]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# Function to evaluate and print model performance
def evaluate_model(model_name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)

    print(f"\n** {model_name} Performance **")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

In [45]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector": LinearSVC()
}

In [46]:
# Evaluate baseline models
for model_name, model in models.items():
    evaluate_model(model_name, model)


** Logistic Regression Performance **
Accuracy: 0.8361
Precision: 0.8667
Recall: 0.8125
F1-Score: 0.8387
AUC-ROC: 0.8373

** Random Forest Performance **
Accuracy: 0.8852
Precision: 0.9032
Recall: 0.8750
F1-Score: 0.8889
AUC-ROC: 0.8858


In [47]:
# Hyperparameter tuning with GridSearchCV
# Random Forest hyperparameter tuning
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 8],
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

evaluate_model("Tuned Random Forest", best_model)



** Tuned Random Forest Performance **
Accuracy: 0.9016
Precision: 0.9062
Recall: 0.9062
F1-Score: 0.9062
AUC-ROC: 0.9014
