# import包

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import uniform
import random
import os

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
import optuna

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')



In [2]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

# 读取数据

In [3]:
path = './heart.csv'
df = pd.read_csv(path)

# 数据划分

In [4]:
target = 'output'
used = [x for x in df.columns if x != target]

In [5]:
random_state = 1

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df[used], df[target], stratify=df[target], 
                                                    random_state=random_state)

# 模型训练与计算准确率

In [7]:
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.8157894736842105

# GridSearch

In [8]:
parameters = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3],
    'min_samples_leaf': [2,3],
    'min_weight_fraction_leaf': [0, 0.1, 0.2]
}

clf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    parameters, refit=True, verbose=1,
)
clf.fit(x_train, y_train)

# 打印最优参数
print(clf.best_params_)

# 使用最优参数评估测试集
print(clf.best_estimator_.score(x_test, y_test))

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0}
0.8157894736842105


# Random Search 

In [9]:
parameters = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3],
    'min_samples_leaf': [2,3],
    'min_weight_fraction_leaf': uniform(loc=0.1, scale=0.3)
}

clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    parameters, refit=True, verbose=1, n_iter=10,
)

clf.fit(x_train, y_train)

# 打印最优参数
print(clf.best_params_)

# 使用最优参数评估测试集
print(clf.best_estimator_.score(x_test, y_test))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.23682099526511077}
0.7894736842105263


# Halving search

## HalvingGridSearchCV

In [10]:
parameters = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3],
    'min_samples_leaf': [2,3],
    'min_weight_fraction_leaf': [0, 0.1, 0.2]
}

clf = HalvingGridSearchCV(
    RandomForestClassifier(random_state=42),
    parameters, refit=True, verbose=1,
)
clf.fit(x_train, y_train)

# 打印最优参数
print(clf.best_params_)

# 使用最优参数评估测试集
print(clf.best_estimator_.score(x_test, y_test))

n_iterations: 3
n_required_iterations: 4
n_possible_iterations: 3
min_resources_: 20
max_resources_: 227
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 60
n_resources: 20
Fitting 5 folds for each of 60 candidates, totalling 300 fits
----------
iter: 1
n_candidates: 20
n_resources: 60
Fitting 5 folds for each of 20 candidates, totalling 100 fits
----------
iter: 2
n_candidates: 7
n_resources: 180
Fitting 5 folds for each of 7 candidates, totalling 35 fits
{'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.2}
0.8157894736842105


## HalvingRandomSearchCV

In [11]:
parameters = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3],
    'min_samples_leaf': [2,3],
    'min_weight_fraction_leaf': uniform(loc=0.1, scale=0.3)
}

clf = HalvingRandomSearchCV(
    RandomForestClassifier(random_state=42),
    parameters, refit=True, verbose=1
)

clf.fit(x_train, y_train)


# 打印最优参数
print(clf.best_params_)

# 使用最优参数评估测试集
print(clf.best_estimator_.score(x_test, y_test))

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 227
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 11
n_resources: 20
Fitting 5 folds for each of 11 candidates, totalling 55 fits
----------
iter: 1
n_candidates: 4
n_resources: 60
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.24769764400035457}
0.8026315789473685


# Bayesian optimization

In [12]:
# 数据划分
x_train_bayes, x_val_bayes, y_train_bayes, y_val_bayes =\
        train_test_split(x_train, y_train, stratify=y_train, random_state=42)

In [13]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int("max_depth", 1, 7),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 5),
        'min_samples_leaf': trial.suggest_int("min_samples_leaf", 2, 5),
        'min_weight_fraction_leaf': trial.suggest_float("min_weight_fraction_leaf", 0.1, 0.4)
    }
    
    clf = RandomForestClassifier(random_state=42, **params)
    clf.fit(x_train_bayes, y_train_bayes)
    
    preds = clf.predict(x_val_bayes)
    auc = roc_auc_score(y_val_bayes, preds)
    
    return auc

In [14]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

[32m[I 2022-10-21 15:16:32,935][0m A new study created in memory with name: no-name-23f559a1-c14b-4e61-812e-4d4a0bb6df0b[0m
[32m[I 2022-10-21 15:16:33,080][0m Trial 0 finished with value: 0.7847394540942929 and parameters: {'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 2, 'min_weight_fraction_leaf': 0.20284383056582334}. Best is trial 0 with value: 0.7847394540942929.[0m
[32m[I 2022-10-21 15:16:33,201][0m Trial 1 finished with value: 0.8008684863523574 and parameters: {'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 4, 'min_weight_fraction_leaf': 0.320926780584202}. Best is trial 1 with value: 0.8008684863523574.[0m
[32m[I 2022-10-21 15:16:33,322][0m Trial 2 finished with value: 0.8008684863523574 and parameters: {'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.2943519790633592}. Best is trial 1 with value: 0.8008684863523574.[0m
[32m[I 2022-10-21 15:16:33,443][0m Trial 3 finished with value: 0.8008684

In [15]:
# 使用最优参数进行训练
best_params = study.best_params

clf = RandomForestClassifier(random_state=42, **best_params)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.75