In [4]:
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load cleaned data
df = pd.read_csv('../data/heart_disease_cleaned.csv')

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Identify numeric and categorical columns (based on your dataset)
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Create a preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Build the main pipeline: preprocessing + model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define the hyperparameter grid
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [4, None],
    'classifier__min_samples_split': [2, 5]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Grid Search Cross-Validation
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=1
)

# Fit the model
grid.fit(X_train, y_train)

# Show results
print("✅ Best Parameters:", grid.best_params_)
print(f"✅ Test Accuracy: {grid.score(X_test, y_test):.3f}")

# Ensure the models directory exists
os.makedirs('../models', exist_ok=True)

# Save the best model pipeline
joblib.dump(grid.best_estimator_, '../models/final_model.pkl')
print("✅ Model saved to '../models/final_model.pkl'")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
✅ Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}
✅ Test Accuracy: 0.683
✅ Model saved to '../models/final_model.pkl'
