2.6 Hyperparameter Tuning

In [1]:
from google.colab import drive
import os

# Attach Google Drive to the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Drive folder used for this project's notebooks.
notebook_path = '/content/drive/My Drive/Colab_Notebooks/Heart_Disease_Project/note_books'

# exist_ok=True makes this cell safe to re-run once the folder is already present.
os.makedirs(name=notebook_path, exist_ok=True)
print(f"Directory created or already exists: {notebook_path}")

# This cell only prepares the folder; it does not save the notebook itself.
# In a standard Colab session the notebook is saved by hand from the 'File' menu;
# saving it programmatically would need environment-specific APIs or libraries.

Mounted at /content/drive
Directory created or already exists: /content/drive/My Drive/Colab_Notebooks/Heart_Disease_Project/note_books


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the reduced feature table from the project folder on Drive.
df = pd.read_csv('/content/drive/My Drive/Colab_Notebooks/Heart_Disease_Project/data/reduced_heart_disease.csv')

# 'num' is the target column; every remaining column is used as a feature.
y = df['num']
X = df.drop(columns=['num'])

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Untuned reference models, fitted on the training split with library defaults
# (apart from the fixed seed) so the tuned models have something to be compared against.

# Random forest baseline; fit() returns the estimator, so it can be chained.
baseline_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# SVM baseline; probability=True is required for predict_proba, which the AUC evaluation uses.
baseline_svm = SVC(random_state=42, probability=True)
baseline_svm.fit(X_train, y_train)

In [7]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for the random forest: 3 * 4 * 3 * 3 = 108 candidate settings.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Every candidate is scored by 5-fold cross-validated ROC AUC on the training split only;
# n_jobs=-1 spreads the fits over all available cores.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# Estimator carrying the winning parameter combination.
rf_best = rf_grid.best_estimator_

print("Best Random Forest parameters:", rf_grid.best_params_)
print("Best Random Forest AUC (CV):", rf_grid.best_score_)

Best Random Forest parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best Random Forest AUC (CV): 0.8562131202131201


In [8]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the SVM: 4 * 3 * 2 * 3 = 72 possible combinations.
svm_params = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
    degree=[2, 3, 4],
)

# Randomized search samples n_iter=10 of those combinations instead of trying them all.
# Each one is scored by 5-fold cross-validated ROC AUC; random_state fixes which are drawn.
svm_rand = RandomizedSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_distributions=svm_params,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
svm_rand.fit(X_train, y_train)

# Estimator carrying the best sampled parameter combination.
svm_best = svm_rand.best_estimator_

print("Best SVM parameters:", svm_rand.best_params_)
print("Best SVM AUC (CV):", svm_rand.best_score_)

Best SVM parameters: {'kernel': 'linear', 'gamma': 'scale', 'degree': 2, 'C': 0.1}
Best SVM AUC (CV): 0.8723010323010323


In [9]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : feature rows to predict on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall", "F1-score" and "AUC" (in that
        order) to the corresponding scikit-learn metric value.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for the AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics that compare hard label predictions with the true labels.
    label_metrics = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1-score": f1_score,
    }
    scores = {
        metric_name: metric_fn(y_test, predicted_labels)
        for metric_name, metric_fn in label_metrics.items()
    }
    # AUC is the only metric computed from scores rather than labels.
    scores["AUC"] = roc_auc_score(y_test, positive_scores)
    return scores

# Models to compare, in the order they should appear in the table.
models_to_score = [
    ('Baseline RF', baseline_rf),
    ('Optimized RF', rf_best),
    ('Baseline SVM', baseline_svm),
    ('Optimized SVM', svm_best),
]

# Every model is scored on the same held-out test split.
results = {
    label: evaluate(estimator, X_test, y_test)
    for label, estimator in models_to_score
}

# Transpose so each model is a row and each metric a column.
results_df = pd.DataFrame(results).T
print(results_df)

               Accuracy  Precision    Recall  F1-score       AUC
Baseline RF    0.850000   0.851852  0.821429  0.836364  0.918527
Optimized RF   0.816667   0.814815  0.785714  0.800000  0.925223
Baseline SVM   0.866667   0.916667  0.785714  0.846154  0.915179
Optimized SVM  0.833333   0.909091  0.714286  0.800000  0.921875


Best Hyperparameters Found
Random Forest (GridSearchCV)
Best parameters:
max_depth: 10
min_samples_leaf: 1
min_samples_split: 10
n_estimators: 100
Best cross-validated AUC: 0.856
SVM (RandomizedSearchCV)
Best parameters:
kernel: linear
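gamma: scale (has no effect with the linear kernel)
degree: 2 (has no effect with the linear kernel; only used by the poly kernel)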
C: 0.1
Best cross-validated AUC: 0.872

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from joblib import dump

from sklearn.base import clone

# Bundle preprocessing and the tuned model into one object, so the exported file
# applies the same scaling at prediction time as it did at fit time.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # clone() gives an unfitted copy that carries rf_best's hyperparameters.
    # Pipeline does not copy its steps, so passing rf_best itself would make
    # pipeline.fit() refit it in place on standardized features. The rf_best used
    # by the evaluation cell would then no longer be the estimator produced by the
    # grid search. Swap in clone(svm_best) to export the tuned SVM instead.
    ('model', clone(rf_best)),
])

# Fit on the training split only, so X_test remains a genuine hold-out set.
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline with joblib. The path is relative, so the file
# is written to the current working directory.
dump(pipeline, 'heart_disease_model_pipeline.pkl')
print("Model pipeline exported as heart_disease_model_pipeline.pkl")

Model pipeline exported as heart_disease_model_pipeline.pkl
