In [11]:
# Day 65 – Random Forest Hyperparameter Tuning

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fetch the breast-cancer Bunch; features go into a DataFrame labelled with
# the dataset's own feature names, the target array is used as-is.
data = load_breast_cancer()
y = data["target"]
X = pd.DataFrame(data["data"], columns=data["feature_names"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [12]:
# ---------------------------
# Baseline: Random Forest with default hyperparameters
# ---------------------------
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions on the held-out split; y_pred is read again in the final comparison.
y_pred = rf.predict(X_test)
print(" Baseline Accuracy:", accuracy_score(y_test, y_pred))

 Baseline Accuracy: 0.956140350877193


In [13]:

# ---------------------------
# Hyperparameter Tuning - GridSearchCV (exhaustive search)
# ---------------------------
# 3 x 3 x 2 x 3 = 54 candidate settings, each evaluated with 5-fold CV.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "max_features": ["sqrt", "log2"],
    "min_samples_split": [2, 5, 10],
}

# Estimator and grid are the first two positional parameters; fit() returns
# the search object, so the fitted search is bound in a single statement.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,  # use every available core
).fit(X_train, y_train)

print("\n Best Params (GridSearchCV):", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Score the winning configuration on the held-out test split
best_grid = grid_search.best_estimator_
y_pred_grid = best_grid.predict(X_test)
print(" Accuracy after GridSearchCV:", accuracy_score(y_test, y_pred_grid))


 Best Params (GridSearchCV): {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Best CV Score: 0.9604395604395606
 Accuracy after GridSearchCV: 0.956140350877193


In [14]:

# ---------------------------
# Hyperparameter Tuning - RandomizedSearchCV
# ---------------------------
# The search space uses native Python lists rather than np.arange arrays.
# The candidate values and their order are unchanged, but best_params_ now
# holds plain ints (150) instead of numpy scalars (np.int64(150)), which keeps
# the printed output readable and the params JSON-serialisable.
param_dist = {
    "n_estimators": list(range(50, 300, 50)),     # 50, 100, 150, 200, 250
    "max_depth": [None] + list(range(5, 20, 5)),  # None, 5, 10, 15
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": list(range(2, 11)),      # 2 .. 10 inclusive
}

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,  # number of random parameter combinations
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    random_state=42  # fixes which combinations are sampled
)

random_search.fit(X_train, y_train)
print("\n Best Params (RandomizedSearchCV):", random_search.best_params_)
print(" Best CV Score:", random_search.best_score_)

# Evaluate the best sampled configuration on the held-out test split
best_random = random_search.best_estimator_
y_pred_random = best_random.predict(X_test)
print(" Accuracy after RandomizedSearchCV:", accuracy_score(y_test, y_pred_random))


 Best Params (RandomizedSearchCV): {'n_estimators': np.int64(150), 'min_samples_split': np.int64(2), 'max_features': 'sqrt', 'max_depth': np.int64(10)}
 Best CV Score: 0.9604395604395606
 Accuracy after RandomizedSearchCV: 0.956140350877193


In [15]:
# ---------------------------
# Final Comparison
# ---------------------------
print("\n Final Model Comparison:")

# One row per model: (printed label, test-set predictions).
for label, predictions in (
    ("Baseline Accuracy:", y_pred),
    ("GridSearchCV Accuracy:", y_pred_grid),
    ("RandomizedSearchCV Accuracy:", y_pred_random),
):
    print(label, accuracy_score(y_test, predictions))

# Per-class precision / recall / F1 for the randomized-search model
print("\nClassification Report (Best RandomizedSearchCV Model):")
print(classification_report(y_test, y_pred_random))


 Final Model Comparison:
Baseline Accuracy: 0.956140350877193
GridSearchCV Accuracy: 0.956140350877193
RandomizedSearchCV Accuracy: 0.956140350877193

Classification Report (Best RandomizedSearchCV Model):
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

