In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
import pickle

In [2]:
# Load the dataset
url = "./heart.csv"
data = pd.read_csv(url)

In [3]:
data = data.apply(pd.to_numeric, errors='coerce')

In [4]:
# Checking the first few rows
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
# Handle missing values
data.replace('?', np.nan, inplace=True)

In [6]:
data.fillna(data.median(), inplace=True)

In [7]:

# Encode categorical variables
label_encoder = LabelEncoder()
for col in ['sex', 'cp', 'restecg', 'exang', 'slope', 'ca', 'thal']:
    data[col] = label_encoder.fit_transform(data[col])

In [8]:
# Convert target variable to binary
data['target'] = data['target'].apply(lambda x: 0 if x == 0 else 1)

In [9]:

# Splitting features (X) and target variable (y)
X = data.iloc[:, :-1]  # Features
y = data['target']  # Target variable

In [10]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    max_depth = trial.suggest_int("max_depth", 10, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth,
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap, random_state=42, n_jobs=-1
    )
    
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print(f"Best Hyperparameters: {study.best_params}")


[I 2025-02-11 16:37:01,724] A new study created in memory with name: no-name-268a0477-e79e-47ab-8083-abde01bf7aaf
[I 2025-02-11 16:37:02,362] Trial 0 finished with value: 0.9448051948051948 and parameters: {'n_estimators': 159, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 4, 'bootstrap': True}. Best is trial 0 with value: 0.9448051948051948.
[I 2025-02-11 16:37:02,816] Trial 1 finished with value: 0.9837662337662337 and parameters: {'n_estimators': 144, 'max_depth': 46, 'min_samples_split': 3, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 1 with value: 0.9837662337662337.
[I 2025-02-11 16:37:03,590] Trial 2 finished with value: 0.9902597402597403 and parameters: {'n_estimators': 283, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 2 with value: 0.9902597402597403.
[I 2025-02-11 16:37:03,812] Trial 3 finished with value: 0.9902597402597403 and parameters: {'n_estimators': 102, 'max_depth': 18, 'min_samples_spli

Best Hyperparameters: {'n_estimators': 283, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}


In [13]:
# Extract the best hyperparameters
best_params = study.best_params

# Train the optimized Random Forest model
optimized_rf = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    bootstrap=best_params["bootstrap"],
    random_state=42,
    n_jobs=-1
)

optimized_rf.fit(X_train, y_train)

# Make predictions
y_pred = optimized_rf.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized RF Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))


Optimized RF Accuracy: 99.03%
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       159
           1       1.00      0.98      0.99       149

    accuracy                           0.99       308
   macro avg       0.99      0.99      0.99       308
weighted avg       0.99      0.99      0.99       308



In [None]:
# Save the best model
with open("best_heart_disease_model.pkl", "wb") as model_file:
    pickle.dump(optimized_rf, model_file)