In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the data
data = pd.read_csv("HeartDisease.csv") 

In [3]:
# Check for missing values
print("Missing values:", data.isna().sum())

Missing values: age                    0
gender                 0
chest_pain             0
rest_bps               0
cholestrol             0
fasting_blood_sugar    0
rest_ecg               0
thalach                0
exer_angina            0
old_peak               0
slope                  0
ca                     0
thalassemia            0
target                 0
dtype: int64


In [4]:
# categorical features
df_cat =  data.nunique() < 10
categorical_features = list()
cat = dict(df_cat)
for x, y in cat.items():
    if y == True:
        categorical_features.append(x)
        
categorical_features


['gender',
 'chest_pain',
 'fasting_blood_sugar',
 'rest_ecg',
 'exer_angina',
 'slope',
 'ca',
 'thalassemia',
 'target']

In [5]:
numerical_columns = [x for x in data.columns.tolist() if x not in categorical_features]
numerical_columns

['age', 'rest_bps', 'cholestrol', 'thalach', 'old_peak']

In [6]:
# Feature scaling
data2 = pd.DataFrame()
scaler = StandardScaler()
numerical_features = [
    col for col in data.columns if col not in categorical_features and col != "target"
]
data2[numerical_features] = scaler.fit_transform(data[numerical_features])

In [7]:
df = pd.concat([data2, data[categorical_features]], axis=1)

In [8]:
df.sample(5)

Unnamed: 0,age,rest_bps,cholestrol,thalach,old_peak,gender,chest_pain,fasting_blood_sugar,rest_ecg,exer_angina,slope,ca,thalassemia,target
140,-0.371269,-0.663867,0.941846,0.321556,-0.379244,0,2,0,0,0,2,0,2,1
194,0.62133,0.478391,-1.183957,0.234095,1.691225,1,2,0,0,0,1,0,2,0
267,-0.591847,-0.778093,-1.879674,-1.034089,-0.206705,1,2,0,0,0,2,3,2,0
276,0.400752,0.821069,-0.546216,-1.95243,0.828529,1,0,0,1,0,1,1,3,0
123,-0.040403,-1.349222,0.400733,0.758861,-0.896862,0,2,0,0,0,2,0,2,1


In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,4.690051e-17,1.001654,-2.797624,-0.75728,0.069886,0.731619,2.49624
rest_bps,303.0,-7.035077e-16,1.001654,-2.148802,-0.663867,-0.092738,0.478391,3.905165
cholestrol,303.0,-1.113887e-16,1.001654,-2.32416,-0.681494,-0.121055,0.545674,6.140401
thalach,303.0,-6.800574e-16,1.001654,-3.439267,-0.706111,0.146634,0.715131,2.289429
old_peak,303.0,2.3450260000000003e-17,1.001654,-0.896862,-0.896862,-0.206705,0.483451,4.451851
gender,303.0,0.6831683,0.466011,0.0,0.0,1.0,1.0,1.0
chest_pain,303.0,0.9669967,1.032052,0.0,0.0,1.0,2.0,3.0
fasting_blood_sugar,303.0,0.1485149,0.356198,0.0,0.0,0.0,0.0,1.0
rest_ecg,303.0,0.5280528,0.52586,0.0,0.0,1.0,1.0,2.0
exer_angina,303.0,0.3267327,0.469794,0.0,0.0,0.0,1.0,1.0


In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  302 non-null    float64
 1   rest_bps             302 non-null    float64
 2   cholestrol           302 non-null    float64
 3   thalach              302 non-null    float64
 4   old_peak             302 non-null    float64
 5   gender               302 non-null    int64  
 6   chest_pain           302 non-null    int64  
 7   fasting_blood_sugar  302 non-null    int64  
 8   rest_ecg             302 non-null    int64  
 9   exer_angina          302 non-null    int64  
 10  slope                302 non-null    int64  
 11  ca                   302 non-null    int64  
 12  thalassemia          302 non-null    int64  
 13  target               302 non-null    int64  
dtypes: float64(5), int64(9)
memory usage: 35.4 KB


In [12]:
# Separate features and target variable
X = df.drop("target", axis=1)
y = df["target"]

In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Function to evaluate and print model performance
def evaluate_model(model_name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)

    print(f"\n** {model_name} Performance **")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

In [24]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel="linear"),  
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(n_estimators=100),
}



In [25]:
# Evaluate baseline models
for model_name, model in models.items():
    evaluate_model(model_name, model)


** Logistic Regression Performance **
Accuracy: 0.8361
Precision: 0.8667
Recall: 0.8125
F1-Score: 0.8387
AUC-ROC: 0.8373

** Random Forest Performance **
Accuracy: 0.8852
Precision: 0.9032
Recall: 0.8750
F1-Score: 0.8889
AUC-ROC: 0.8858

** SVM Performance **
Accuracy: 0.8525
Precision: 0.8710
Recall: 0.8438
F1-Score: 0.8571
AUC-ROC: 0.8529

** KNN Performance **
Accuracy: 0.8689
Precision: 0.8750
Recall: 0.8750
F1-Score: 0.8750
AUC-ROC: 0.8685

** XGBoost Performance **
Accuracy: 0.8525
Precision: 0.8710
Recall: 0.8438
F1-Score: 0.8571
AUC-ROC: 0.8529


In [26]:
# Hyperparameter tuning with GridSearchCV
# Random Forest hyperparameter tuning
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 8],
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

evaluate_model("Tuned Random Forest", best_model)



** Tuned Random Forest Performance **
Accuracy: 0.9016
Precision: 0.9062
Recall: 0.9062
F1-Score: 0.9062
AUC-ROC: 0.9014


In [27]:
# Function to evaluate and print model performance with K-fold CV
def evaluate_model_cv(model_name, model, k):
    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    accuracy_scores, precision_scores, recall_scores, f1_scores, auc_scores = [], [], [], [], []
    for train_index, test_index in cv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred))

    print(f"\n** {model_name} Performance (K-fold CV - k={k})**")
    print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f} (Std. Dev: {np.std(accuracy_scores):.4f})")
    print(f"Mean Precision: {np.mean(precision_scores):.4f} (Std. Dev: {np.std(precision_scores):.4f})")
    print(f"Mean Recall: {np.mean(recall_scores):.4f} (Std. Dev: {np.std(recall_scores):.4f})")
    print(f"Mean F1-Score: {np.mean(f1_scores):.4f} (Std. Dev: {np.std(f1_scores):.4f})")
    print(f"Mean AUC-ROC: {np.mean(auc_scores):.4f} (Std. Dev: {np.std(auc_scores):.4f})")


In [28]:
# K-fold cross-validation (with k=5)
k = 5
for model_name, model in models.items():
    evaluate_model_cv(model_name, model, k)


** Logistic Regression Performance (K-fold CV - k=5)**
Mean Accuracy: 0.8311 (Std. Dev: 0.0372)
Mean Precision: 0.8237 (Std. Dev: 0.0549)
Mean Recall: 0.8840 (Std. Dev: 0.0362)
Mean F1-Score: 0.8509 (Std. Dev: 0.0281)
Mean AUC-ROC: 0.8251 (Std. Dev: 0.0404)

** Random Forest Performance (K-fold CV - k=5)**
Mean Accuracy: 0.7980 (Std. Dev: 0.0508)
Mean Precision: 0.8053 (Std. Dev: 0.0574)
Mean Recall: 0.8374 (Std. Dev: 0.0398)
Mean F1-Score: 0.8195 (Std. Dev: 0.0372)
Mean AUC-ROC: 0.7949 (Std. Dev: 0.0537)

** SVM Performance (K-fold CV - k=5)**
Mean Accuracy: 0.8245 (Std. Dev: 0.0554)
Mean Precision: 0.8150 (Std. Dev: 0.0715)
Mean Recall: 0.8922 (Std. Dev: 0.0434)
Mean F1-Score: 0.8487 (Std. Dev: 0.0375)
Mean AUC-ROC: 0.8185 (Std. Dev: 0.0605)

** KNN Performance (K-fold CV - k=5)**
Mean Accuracy: 0.8179 (Std. Dev: 0.0476)
Mean Precision: 0.8006 (Std. Dev: 0.0560)
Mean Recall: 0.8993 (Std. Dev: 0.0540)
Mean F1-Score: 0.8444 (Std. Dev: 0.0304)
Mean AUC-ROC: 0.8105 (Std. Dev: 0.0539)

*

In [29]:
from sklearn.model_selection import GridSearchCV

# Example hyperparameter grid for Random Forest
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 8],
}

# Create a GridSearchCV object with the tuned Random Forest model
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned Random Forest model with K-fold CV
evaluate_model_cv("Tuned Random Forest", best_rf_model, k=5)



** Tuned Random Forest Performance (K-fold CV - k=5)**
Mean Accuracy: 0.8277 (Std. Dev: 0.0565)
Mean Precision: 0.8226 (Std. Dev: 0.0615)
Mean Recall: 0.8807 (Std. Dev: 0.0607)
Mean F1-Score: 0.8484 (Std. Dev: 0.0448)
Mean AUC-ROC: 0.8242 (Std. Dev: 0.0591)


In [31]:
# Define hyperparameter grid for XGBoost (example)
xgb_param_grid = {
    "learning_rate": [0.1, 0.3],  # Adjust these values as needed
    "max_depth": [3, 5, 8],
    "n_estimators": [100, 200]
}

# Create a GridSearchCV object for XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), xgb_param_grid, cv=5)
grid_search_xgb.fit(X_train, y_train)
best_xgb_model = grid_search_xgb.best_estimator_

# Evaluate the tuned XGBoost model with K-fold CV
evaluate_model_cv("Tuned XGBoost", best_xgb_model, k=5)


** Tuned XGBoost Performance (K-fold CV - k=5)**
Mean Accuracy: 0.8111 (Std. Dev: 0.0234)
Mean Precision: 0.8168 (Std. Dev: 0.0486)
Mean Recall: 0.8480 (Std. Dev: 0.0536)
Mean F1-Score: 0.8295 (Std. Dev: 0.0201)
Mean AUC-ROC: 0.8080 (Std. Dev: 0.0259)
