In [39]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


In [40]:
# 載入乳腺癌數據集
data = load_breast_cancer()
X = data.data
y = data.target

# 拆分數據為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
# 初始化隨機森林分類器
baseline_model = RandomForestClassifier(random_state=42)

# 訓練基線模型
baseline_model.fit(X_train, y_train)

# 預測測試集
y_pred_baseline = baseline_model.predict(X_test)

# 計算基線模型分數
baseline_f1 = f1_score(y_test, y_pred_baseline)
print(f"Baseline Macro F1: {baseline_f1:.5f}")

# 輸出模型的參數值
print("Baseline Model Parameters:")
print(baseline_model.get_params())

Baseline Macro F1: 0.97222
Baseline Model Parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [42]:
# 定義超參數範圍
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4]
}


### 網格搜索 (Grid Search)

In [43]:
# 初始化網格搜索
grid_search = GridSearchCV(estimator=baseline_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# 進行網格搜索
grid_search.fit(X_train, y_train)

# 輸出最佳參數和最佳分數
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score: 0.9626373626373625


### 隨機搜索 (Random Search)

In [44]:
from sklearn.model_selection import RandomizedSearchCV

# 初始化隨機搜索
random_search = RandomizedSearchCV(estimator=baseline_model, param_distributions=param_grid, n_iter=100, cv=5, n_jobs=-1, verbose=2, random_state=42)

# 進行隨機搜索
random_search.fit(X_train, y_train)

# 輸出最佳參數和最佳分數
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation score: {random_search.best_score_}")


Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'max_depth': None}
Best cross-validation score: 0.9626373626373625


In [45]:
# 使用最佳模型進行預測
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 計算分數
best_f1 = f1_score(y_test, y_pred)

print(f"Baseline Macro F1: {baseline_f1:.5f}")
print(f"Best Macro F1: {best_f1:.5f}")


Baseline Macro F1: 0.97222
Best Macro F1: 0.97222
