In [1]:
import matplotlib.pyplot as plt
import optuna
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

from model_creation import save_model_params, create_random_forest

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Apply matplotlib's "default" style sheet.
plt.style.use("default")

In [3]:
TARGET_COL_NAME = "Expert Diagnose"
# Columns that must not reach the model: the label itself and "Patient Number".
NON_FEATURE_COLS = [TARGET_COL_NAME, "Patient Number"]

# The train and test splits are read from separate CSV files.
dataset_train = pd.read_csv("../dataset/train.csv")
dataset_test = pd.read_csv("../dataset/test.csv")

X_train = dataset_train.drop(columns=NON_FEATURE_COLS)
y_train = dataset_train[TARGET_COL_NAME]

X_test = dataset_test.drop(columns=NON_FEATURE_COLS)
y_test = dataset_test[TARGET_COL_NAME]

# Display the shapes as a quick sanity check on both splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((90, 13), (90,), (30, 13), (30,))

In [4]:
def objective_random_forest(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean cross-validated one-vs-rest ROC AUC on the training split.

    Samples one hyperparameter configuration from ``trial``, builds the pipeline
    with ``create_random_forest`` and scores it with stratified 5-fold
    cross-validation on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean ``roc_auc_ovr`` score over the five folds.
    """
    # The keys are passed unchanged to create_random_forest; the suggestion order is kept fixed.
    params = {
        "scaler_name": trial.suggest_categorical("scaler_name", ["StandardScaler", "MinMaxScaler"]),
        # Upper bound is the number of available feature columns.
        "n_features_to_select": trial.suggest_int("n_features_to_select", 8, X_train.shape[1]),
        "n_estimators": trial.suggest_int("n_estimators", 300, 2000, step=10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "max_depth": trial.suggest_int("max_depth", 10, 500, step=10),
    }

    pipeline = create_random_forest(params)
    folds = StratifiedKFold(n_splits=5)
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=folds, scoring="roc_auc_ovr")
    return fold_scores.mean()


In [5]:
# Search settings, named so a reader can tune them in one place.
SEED = 42
N_TRIALS = 100
N_JOBS = -1  # -1 = use all CPU cores; set to 1 when the trial sequence must be exactly repeatable

# A fixed sampler seed makes the hyperparameter suggestions repeatable across kernel
# restarts. TPESampler is what Optuna uses by default for a single-objective study, so
# only the seed is new. With parallel workers the trials still finish in a
# nondeterministic order, so the history can differ between runs unless N_JOBS is 1.
random_forest_study = optuna.create_study(
    study_name="RandomForestStudy",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
random_forest_study.optimize(objective_random_forest, n_trials=N_TRIALS, n_jobs=N_JOBS)

[I 2025-12-29 13:11:43,924] A new study created in memory with name: RandomForestStudy
[I 2025-12-29 13:12:36,575] Trial 13 finished with value: 0.9746978021978023 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 540, 'min_samples_split': 9, 'max_depth': 500}. Best is trial 13 with value: 0.9746978021978023.
[I 2025-12-29 13:13:15,041] Trial 7 finished with value: 0.9753434065934066 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 10, 'n_estimators': 390, 'min_samples_split': 8, 'max_depth': 220}. Best is trial 7 with value: 0.9753434065934066.
[I 2025-12-29 13:13:43,817] Trial 9 finished with value: 0.9751304945054946 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 8, 'n_estimators': 360, 'min_samples_split': 2, 'max_depth': 170}. Best is trial 7 with value: 0.9753434065934066.
[I 2025-12-29 13:13:47,778] Trial 14 finished with value: 0.9772527472527474 and parameters: {'scaler_name': 'MinMax

In [6]:
# Hand the study's best hyperparameters to save_model_params under the name "random_forest"
# (how and where they are stored is defined in model_creation, not shown here),
# then display them as the cell output.
save_model_params("random_forest", random_forest_study.best_params)
random_forest_study.best_params

{'scaler_name': 'StandardScaler',
 'n_features_to_select': 10,
 'n_estimators': 1420,
 'min_samples_split': 2,
 'max_depth': 470}