In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform, randint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve


print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
X_train = pd.read_csv('../heart_disease_data/split_data/X_train_selected.csv')
X_test = pd.read_csv('../heart_disease_data/split_data/X_test_selected.csv')
y_train = pd.read_csv('../heart_disease_data/split_data/y_train.csv')['target_binary']
y_test = pd.read_csv('../heart_disease_data/split_data/y_test.csv')['target_binary']

print("*"*40, "All Data Loaded!!!", "*"*40)

**************************************** All Data Loaded!!! ****************************************


## GridSearch

#### Logistic regression

In [3]:
lr = LogisticRegression(random_state=42, max_iter=1000)

param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  
}

grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='roc_auc', n_jobs=-1)
grid_lr.fit(X_train, y_train)

best_lr = grid_lr.best_estimator_
print("Logistic Regression tuned\n")
print(best_lr)

Logistic Regression tuned

LogisticRegression(C=100, max_iter=1000, penalty='l1', random_state=42,
                   solver='liblinear')


#### Decision Tree

In [4]:
dt = DecisionTreeClassifier(random_state=42)

param_grid_dt = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

grid_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='roc_auc', n_jobs=-1)
grid_dt.fit(X_train, y_train)

best_dt = grid_dt.best_estimator_
print("Decision Tree tuned\n")
print(best_dt)

Decision Tree tuned

DecisionTreeClassifier(max_depth=7, min_samples_leaf=4, random_state=42)


#### Random Forest

In [5]:
rf = RandomForestClassifier(random_state=42)

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)
grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_
print("Random Forest tuned\n")
print(best_rf)

Random Forest tuned

RandomForestClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=10,
                       n_estimators=200, random_state=42)


#### SVM

In [6]:
svm = SVC(random_state=42, probability=True)

# # Hyperparameters to tune
param_grid_svm = {
    'C': [0.1, 1, 10, 100],           # Regularization 
    'kernel': ['rbf', 'linear'],      
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]  
}


grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='roc_auc', n_jobs=-1)
grid_svm.fit(X_train, y_train)
best_svm = grid_svm.best_estimator_
print("SVM tuned\n")
print(best_svm)

SVM tuned

SVC(C=1, kernel='linear', probability=True, random_state=42)


### Models Evaluation 

In [7]:

models_tuned = {
    'Logistic Regression': best_lr,
    'Decision Tree': best_dt,
    'Random Forest': best_rf,
    'SVM': best_svm
}

results_tuned = []

for name, model in models_tuned.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    results_tuned.append([acc, prec, rec, f1, auc])

# Create comparison table
results_tuned_df = pd.DataFrame(
    results_tuned,
    columns=['Accuracy', 'Precision', 'Recall', 'F1', 'AUC'],
    index=models_tuned.keys()
)

print("\nTuned Model Performance:")
print(results_tuned_df.round(3))


Tuned Model Performance:
                     Accuracy  Precision  Recall     F1    AUC
Logistic Regression     0.902      0.906   0.906  0.906  0.931
Decision Tree           0.836      0.893   0.781  0.833  0.878
Random Forest           0.885      0.931   0.844  0.885  0.954
SVM                     0.902      0.906   0.906  0.906  0.942


### RandomizedSearchCV

#### Logistic Regression

In [8]:
param_dist_lr = {
    'C': loguniform(0.01, 100),           
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']                
}

#### Decision Tree

In [9]:
param_dist_dt = {
    'max_depth': [None] + list(randint(3, 20).rvs(10)),  
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'criterion': ['gini', 'entropy']
}

#### Random Forest

In [10]:
param_dist_rf = {
    'n_estimators': randint(50, 300),
    'max_depth': [None] + list(randint(5, 30).rvs(10)),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

#### SVM

In [11]:
param_dist_svm = {
    'C': loguniform(0.01, 100),
    'kernel': ['rbf', 'linear'],
    'gamma': loguniform(0.001, 1)
}

In [12]:
# List of models and their parameter distributions
models = [
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000), param_dist_lr),
    ('Decision Tree', DecisionTreeClassifier(random_state=42), param_dist_dt),
    ('Random Forest', RandomForestClassifier(random_state=42), param_dist_rf),
    ('SVM', SVC(probability=True, random_state=42), param_dist_svm)
]

# Store best models and results
best_models_random = {}
random_results = []

# Run RandomizedSearchCV for each model
for name, model, param_dist in models:
    print(f"\nTuning {name} with RandomizedSearchCV...")
    
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=50,              
        cv=5,                    
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
        verbose=0
    )
    
  
    random_search.fit(X_train, y_train)
    
    # Save best model and results
    best_models_random[name] = random_search.best_estimator_
    
    # Evaluate on test set
    y_pred = random_search.predict(X_test)
    y_prob = random_search.predict_proba(X_test)[:, 1]
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    
    random_results.append([acc, prec, rec, f1, auc])
    



Tuning Logistic Regression with RandomizedSearchCV...

Tuning Decision Tree with RandomizedSearchCV...

Tuning Random Forest with RandomizedSearchCV...

Tuning SVM with RandomizedSearchCV...


In [13]:
for k, v in best_models_random.items():
    print(k)

Logistic Regression
Decision Tree
Random Forest
SVM


In [14]:

results_random_df = pd.DataFrame(
    random_results,
    columns=['Accuracy', 'Precision', 'Recall', 'F1', 'AUC'],
    index=best_models_random.keys()
)

print("\nRandomizedSearchCV Results:")
print(results_random_df.round(3))


RandomizedSearchCV Results:
                     Accuracy  Precision  Recall     F1    AUC
Logistic Regression     0.902      0.906   0.906  0.906  0.931
Decision Tree           0.885      0.903   0.875  0.889  0.952
Random Forest           0.902      0.933   0.875  0.903  0.957
SVM                     0.902      0.906   0.906  0.906  0.946


### Baseline results 

In [15]:
baseline_results= pd.read_csv("../results/model_performance_baseline.csv", index_col=0)
baseline_results

Unnamed: 0,Accuracy,Precision,Recall,F1,AUC
Logistic Regression,0.902,0.906,0.906,0.906,0.935
Decision Tree,0.77,0.781,0.781,0.781,0.77
Random Forest,0.852,0.897,0.812,0.852,0.912
SVM,0.902,0.906,0.906,0.906,0.944


### Model Comparison

In [16]:
output_dir = "../results"

results_baseline = baseline_results.copy()
results_grid = results_tuned_df.copy()
results_random = results_random_df.copy()

# Combine
comparison = pd.concat([
    results_baseline.add_suffix(' (Baseline)'),
    results_grid.add_suffix(' (Grid)'),
    results_random.add_suffix(' (Random)')
], axis=1)

# Save and show
comparison.round(3).to_csv(os.path.join(output_dir, "hyperparameter_comparison.csv"), index=False)

print(f"Model Comparison saved to: {os.path.join(output_dir, 'hyperparameter_comparison.csv')}")


Model Comparison saved to: ../results\hyperparameter_comparison.csv


In [17]:
comparison_df= pd.read_csv('../results/hyperparameter_comparison.csv', index_col=0) 
comparison_df

Unnamed: 0_level_0,Precision (Baseline),Recall (Baseline),F1 (Baseline),AUC (Baseline),Accuracy (Grid),Precision (Grid),Recall (Grid),F1 (Grid),AUC (Grid),Accuracy (Random),Precision (Random),Recall (Random),F1 (Random),AUC (Random)
Accuracy (Baseline),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.902,0.906,0.906,0.906,0.935,0.902,0.906,0.906,0.906,0.931,0.902,0.906,0.906,0.906,0.931
0.77,0.781,0.781,0.781,0.77,0.836,0.893,0.781,0.833,0.878,0.885,0.903,0.875,0.889,0.952
0.852,0.897,0.812,0.852,0.912,0.885,0.931,0.844,0.885,0.954,0.902,0.933,0.875,0.903,0.957
0.902,0.906,0.906,0.906,0.944,0.902,0.906,0.906,0.906,0.942,0.902,0.906,0.906,0.906,0.946


### Saving the best model


In [18]:
best_rf_random = best_models_random['Random Forest']



# Define output directory
output_dir = "../models/"
os.makedirs(output_dir, exist_ok=True)

# Save the best model
model_path = os.path.join(output_dir, "final_model.pkl")
joblib.dump(best_rf_random, model_path)

print(f"Best model (Random Forest - RandomizedSearchCV) saved to: {model_path}")

Best model (Random Forest - RandomizedSearchCV) saved to: ../models/final_model.pkl


### Loading the model

In [19]:
# Load the model
model_path = "../models/final_model.pkl"
loaded_model = joblib.load(model_path)

print("Model loaded successfully!\n")
print(f"Model type: {type(loaded_model)}")

Model loaded successfully!

Model type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_df(dff):
    df= dff.copy()

    

    # mode_val = df[['ca','thal']].mode()[0]
    # df[['ca','thal']].fillna(mode_val, inplace=True)

    # df['ca'].fillna(df['ca'].mode()[0], inplace=True)
    # df['thal'].fillna(df['thal'].mode()[0], inplace=True)

    df['ca'] = df['ca'].fillna(df['ca'].mode()[0])
    df['thal'] = df['thal'].fillna(df['thal'].mode()[0])

    df['ca'] = df['ca'].astype(int)
    df['thal'] = df['thal'].astype(int)


    df['thal_label'] = df['thal'].map({
    3: 'unknown',
    6: 'fixed_defect',
    7: 'reversible_defect'
    })

    df = pd.get_dummies(df, columns=['thal_label'], prefix='thal')

    categorical_cols= ['cp', 'slope', 'ca', 'restecg']

    df[categorical_cols] = df[categorical_cols].astype('category')

    
    df['cp'] = df['cp'].cat.reorder_categories(
    new_categories=[1, 2, 3, 4], 
    ordered=True
    )

    df['slope'] = df['slope'].cat.reorder_categories(
        new_categories=[1, 2, 3], 
        ordered=True
    )
    
    df['ca'] = df['ca'].cat.reorder_categories(
        new_categories=[0, 1, 2, 3], 
        ordered=True
    )
    
    
    
    df['restecg'] = df['restecg'].cat.reorder_categories(
        new_categories=[0, 1, 2], 
        ordered=True
    )


    df['target_binary'] = (df['target'] > 0).astype(int)

    df['target_binary'] = df['target_binary'].astype('category')
    df['target'] = df['target'].astype('int8')  


    #Feature and target selection 
    features= df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca',
                    'thal_unknown', 'thal_fixed_defect', 'thal_reversible_defect']]

    target= df['target_binary']


    #Data Splitting 
    X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
    )

    #Scaling 
    scaler = StandardScaler()
    numerical_cols = ['thalach', 'oldpeak']

    
  
    
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    final_features = [
    'oldpeak',
    'ca',
    'cp',
    'thalach',
    'exang',
    'slope',
    'sex',
    'thal_unknown',
    'thal_reversible_defect'
]

    
    # Extract X and y with final features
    X_train_final = X_train[final_features]
    X_test_final = X_test[final_features]
    
    print("Preprocessing Done!")
    return X_train_final, X_test_final, y_train, y_test, scaler, final_features
   


    

In [22]:
data= pd.read_csv("../data/heart_disease.csv")
# Run preprocessing
X_train, X_test, y_train, y_test, scaler, final_features = preprocess_df(data)

# Save artifacts
joblib.dump(scaler, "../models/scaler.pkl")
joblib.dump(final_features, "../models/final_features.pkl")

model_rf = joblib.load('../models/final_model.pkl')

# Train model
model_rf.fit(X_train, y_train)

Preprocessing Done!


In [23]:
y_pred = model_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Model trained! Test Accuracy: {acc:.3f}")

Model trained! Test Accuracy: 0.902
