In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import (
    RandomizedSearchCV, GridSearchCV, train_test_split
)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform

In [2]:
# Load the breast-cancer dataset directly as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% of the samples as a test set. The split is stratified on the
# labels and seeded, so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [3]:
# Search space for the randomized search.
# scipy conventions: `randint(low, high)` draws integers from [low, high) and
# `uniform(loc, scale)` draws floats from [loc, loc + scale] -- the second
# argument is a width, not an upper bound.
param_distributions = {
    'n_estimators': randint(50, 300),       # integers 50..299
    'learning_rate': uniform(0.01, 0.3),    # floats in [0.01, 0.31]
    'max_depth': randint(2, 10),            # integers 2..9
    'min_samples_split': randint(2, 20),    # integers 2..19
    'subsample': uniform(0.6, 0.4)          # floats in [0.6, 1.0]
}


# 80 sampled candidates x 5 CV folds = 400 fits, spread over all cores
# (n_jobs=-1). Both the estimator and the search are seeded so the sampled
# candidates are the same on every run.
random_search = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions,
    n_iter=80,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted while the search is running.
start_random = time.perf_counter()
random_search.fit(X_train, y_train)
time_random = time.perf_counter() - start_random

print(f"\nTemps: {time_random:.2f} secondes")
print(f"Meilleur score CV: {random_search.best_score_:.4f}")
print(f"Meilleurs parametres: {random_search.best_params_}")

Fitting 5 folds for each of 80 candidates, totalling 400 fits

Temps: 22.85 secondes
Meilleur score CV: 0.9802
Meilleurs parametres: {'learning_rate': np.float64(0.2287021504122962), 'max_depth': 2, 'min_samples_split': 6, 'n_estimators': 283, 'subsample': np.float64(0.7433862914177091)}


In [4]:
# Exhaustive grid: GridSearchCV evaluates every combination of these values.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'subsample': [0.7, 0.8, 0.9, 1.0]
}

# Single source of truth for the fold count, so the number of fits announced
# below always matches the `cv` value actually passed to GridSearchCV.
n_folds = 5

# Grid size = product of the number of values listed for each hyperparameter.
n_combinations = int(np.prod([len(values) for values in param_grid.values()]))
print(f"Nombre de combinaisons GridSearch: {n_combinations}")
print(f"Avec {n_folds} folds: {n_combinations * n_folds} entrainements")


grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=n_folds,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump during a long search.
start_grid = time.perf_counter()
grid_search.fit(X_train, y_train)
time_grid = time.perf_counter() - start_grid

# Held-out evaluation: score() applies the search's `scoring` metric
# (accuracy) to the test split, which was not used during the search.
test_score_gs = grid_search.score(X_test, y_test)
print(f"\nTemps: {time_grid:.2f} secondes")
print(f"Meilleur score CV: {grid_search.best_score_:.4f}")
print(f"Score test (GridSearch): {test_score_gs:.4f}")

Nombre de combinaisons GridSearch: 720
Avec 5 folds: 3600 entrainements
Fitting 5 folds for each of 720 candidates, totalling 3600 fits

Temps: 152.80 secondes
Meilleur score CV: 0.9780
Score test (GridSearch): 0.9561
