In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# One seed shared by the train/test split and the forest, so reruns are repeatable.
SEED = 42

# Breast-cancer dataset: feature matrix X and target labels y
data = load_breast_cancer()
X = data.data
y = data.target

# Keep 20% of the samples aside for the final evaluation; stratifying on y
# keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Search space: 3 * 4 * 3 * 3 = 108 hyperparameter combinations
param_grid = dict(
    n_estimators=[50, 100, 200],     # trees in the forest
    max_depth=[None, 5, 10, 20],     # depth limit per tree (None = no explicit limit)
    min_samples_split=[2, 5, 10],    # samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],      # samples required at each leaf
)

# Base estimator whose hyperparameters are tuned
rf = RandomForestClassifier(random_state=SEED)

# Exhaustive search scored by accuracy with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel, verbose=1 prints a progress summary.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)  # only the training partition is used for tuning

# Report the winning combination and its cross-validated accuracy
print("Best hyperparameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

# Score the selected model on the held-out test partition
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {test_acc:.4f}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV accuracy: 0.9604395604395606
Test set accuracy: 0.9561
