In [65]:
import matplotlib.pyplot as plt
import optuna
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

from model_creation import save_model_params, create_random_forest

In [66]:
# Apply matplotlib's "default" style sheet so any figure drawn later in the
# notebook starts from the library's stock appearance.
plt.style.use("default")

In [64]:
# Name of the label column in both CSV files.
TARGET_COL_NAME = "Expert Diagnose"

# Load the pre-split train / test sets.
dataset_train = pd.read_csv("dataset/train.csv")
dataset_test = pd.read_csv("dataset/test.csv")

# Features are every column except the label and "Patient Number"
# (presumably a row identifier with no predictive value — confirm).
X_train = dataset_train.drop(columns=[TARGET_COL_NAME, "Patient Number"])
y_train = dataset_train[TARGET_COL_NAME]

X_test = dataset_test.drop(columns=[TARGET_COL_NAME, "Patient Number"])
y_test = dataset_test[TARGET_COL_NAME]

# Sanity check: show the shapes of all four objects as the cell output.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((90, 17), (90,), (30, 17), (30,))

In [67]:
def objective_random_forest(trial):
    """Optuna objective for tuning the random-forest pipeline.

    Samples one hyperparameter configuration from ``trial``, builds the model
    with ``create_random_forest`` and scores it on the training set with
    5-fold stratified cross-validation.

    Args:
        trial: The ``optuna`` trial used to draw hyperparameter values.

    Returns:
        The mean ``"roc_auc_ovr"`` (one-vs-rest ROC AUC) score over the five
        folds; the study is expected to maximise this value.
    """
    params = {
        "scaler_name": trial.suggest_categorical(
            "scaler_name", ["StandardScaler", "MinMaxScaler"]
        ),
        # The upper bound is the number of training features. This used to read
        # `len(X.columns)`, but no `X` is defined in the notebook, so the cell
        # only worked with a leftover kernel variable and failed with a
        # NameError after "Restart & Run All".
        "n_features_to_select": trial.suggest_int(
            "n_features_to_select", 9, len(X_train.columns)
        ),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1000),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 8),
    }

    # `create_random_forest` comes from the project's `model_creation` module;
    # it presumably returns a scikit-learn compatible estimator/pipeline,
    # since the result is handed straight to `cross_val_score`.
    pipe = create_random_forest(params)

    # Stratified folds keep the class proportions of `y_train` in each split.
    cv = StratifiedKFold(n_splits=5)
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc_ovr")
    return fold_scores.mean()


In [50]:
# Create an in-memory study that looks for the configuration with the highest
# objective value (mean cross-validated ROC AUC).
random_forest_study = optuna.create_study(
    study_name="RandomForestStudy",
    direction="maximize",
)

# Evaluate 50 configurations; n_jobs=-1 lets Optuna run trials in parallel
# using as many workers as there are CPUs, so trials may finish out of order.
# NOTE(review): the saved cell output shows a progress bar, which suggests the
# cell was last executed with show_progress_bar=True — confirm and re-run.
random_forest_study.optimize(
    objective_random_forest,
    n_trials=50,
    n_jobs=-1,
)

[I 2025-12-28 14:04:57,271] A new study created in memory with name: RandomForestStudy
Best trial: 8. Best value: 0.994121:   0%|          | 0/50 [01:37<?, ?it/s]

[I 2025-12-28 14:06:34,643] Trial 8 finished with value: 0.9941208791208792 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 16, 'n_estimators': 572, 'min_samples_split': 8}. Best is trial 8 with value: 0.9941208791208792.


Best trial: 8. Best value: 0.994121:   4%|▍         | 2/50 [02:43<1:02:59, 78.74s/it]

[I 2025-12-28 14:07:40,359] Trial 1 finished with value: 0.9934752747252749 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 500, 'min_samples_split': 7}. Best is trial 8 with value: 0.9941208791208792.


Best trial: 8. Best value: 0.994121:   6%|▌         | 3/50 [03:21<47:21, 60.46s/it]  

[I 2025-12-28 14:08:18,984] Trial 11 finished with value: 0.9919368131868133 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 15, 'n_estimators': 773, 'min_samples_split': 6}. Best is trial 8 with value: 0.9941208791208792.


Best trial: 8. Best value: 0.994121:   8%|▊         | 4/50 [03:29<30:25, 39.69s/it]

[I 2025-12-28 14:08:26,806] Trial 0 finished with value: 0.9893818681318681 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 15, 'n_estimators': 735, 'min_samples_split': 5}. Best is trial 8 with value: 0.9941208791208792.


Best trial: 3. Best value: 0.994245:  10%|█         | 5/50 [03:34<20:24, 27.21s/it]

[I 2025-12-28 14:08:31,953] Trial 3 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 542, 'min_samples_split': 5}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 3. Best value: 0.994245:  12%|█▏        | 6/50 [03:54<18:09, 24.76s/it]

[I 2025-12-28 14:08:51,975] Trial 13 finished with value: 0.9934752747252749 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 17, 'n_estimators': 668, 'min_samples_split': 8}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 3. Best value: 0.994245:  14%|█▍        | 7/50 [04:38<22:07, 30.88s/it]

[I 2025-12-28 14:09:35,360] Trial 10 finished with value: 0.9924587912087913 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 10, 'n_estimators': 551, 'min_samples_split': 6}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 3. Best value: 0.994245:  16%|█▌        | 8/50 [05:10<21:48, 31.15s/it]

[I 2025-12-28 14:10:07,222] Trial 7 finished with value: 0.9933516483516485 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 642, 'min_samples_split': 3}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 3. Best value: 0.994245:  18%|█▊        | 9/50 [05:17<16:18, 23.86s/it]

[I 2025-12-28 14:10:14,923] Trial 6 finished with value: 0.9909203296703296 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 14, 'n_estimators': 775, 'min_samples_split': 3}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 3. Best value: 0.994245:  20%|██        | 10/50 [05:26<12:45, 19.13s/it]

[I 2025-12-28 14:10:23,539] Trial 9 finished with value: 0.9902747252747254 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 11, 'n_estimators': 852, 'min_samples_split': 8}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 3. Best value: 0.994245:  20%|██        | 10/50 [06:21<12:45, 19.13s/it]

[I 2025-12-28 14:11:18,505] Trial 15 finished with value: 0.9925824175824177 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 17, 'n_estimators': 967, 'min_samples_split': 2}. Best is trial 3 with value: 0.9942445054945056.


Best trial: 5. Best value: 0.995783:  24%|██▍       | 12/50 [06:33<15:36, 24.64s/it]

[I 2025-12-28 14:11:30,693] Trial 5 finished with value: 0.9957829670329671 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 748, 'min_samples_split': 2}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  26%|██▌       | 13/50 [07:56<26:03, 42.25s/it]

[I 2025-12-28 14:12:53,458] Trial 2 finished with value: 0.9909203296703298 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 10, 'n_estimators': 952, 'min_samples_split': 6}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  28%|██▊       | 14/50 [08:03<19:02, 31.75s/it]

[I 2025-12-28 14:13:00,906] Trial 20 finished with value: 0.992706043956044 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 16, 'n_estimators': 893, 'min_samples_split': 7}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  30%|███       | 15/50 [08:19<15:36, 26.76s/it]

[I 2025-12-28 14:13:15,934] Trial 4 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 908, 'min_samples_split': 3}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  32%|███▏      | 16/50 [08:43<14:49, 26.15s/it]

[I 2025-12-28 14:13:40,660] Trial 18 finished with value: 0.9924587912087913 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 10, 'n_estimators': 597, 'min_samples_split': 8}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  34%|███▍      | 17/50 [08:52<11:30, 20.92s/it]

[I 2025-12-28 14:13:49,696] Trial 16 finished with value: 0.9925824175824177 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 14, 'n_estimators': 808, 'min_samples_split': 3}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  36%|███▌      | 18/50 [09:45<16:19, 30.60s/it]

[I 2025-12-28 14:14:42,755] Trial 14 finished with value: 0.9934752747252749 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 746, 'min_samples_split': 2}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 5. Best value: 0.995783:  38%|███▊      | 19/50 [09:57<12:57, 25.09s/it]

[I 2025-12-28 14:14:54,968] Trial 17 finished with value: 0.9916895604395604 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 14, 'n_estimators': 989, 'min_samples_split': 4}. Best is trial 5 with value: 0.9957829670329671.


Best trial: 22. Best value: 0.996676:  40%|████      | 20/50 [10:38<14:51, 29.70s/it]

[I 2025-12-28 14:15:35,442] Trial 22 finished with value: 0.9966758241758242 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 12, 'n_estimators': 576, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  42%|████▏     | 21/50 [12:26<25:40, 53.14s/it]

[I 2025-12-28 14:17:23,139] Trial 12 finished with value: 0.989010989010989 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 10, 'n_estimators': 870, 'min_samples_split': 2}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  44%|████▍     | 22/50 [13:09<23:21, 50.07s/it]

[I 2025-12-28 14:18:06,154] Trial 19 finished with value: 0.9901510989010991 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 9, 'n_estimators': 763, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  46%|████▌     | 23/50 [13:31<18:47, 41.77s/it]

[I 2025-12-28 14:18:28,697] Trial 25 finished with value: 0.9950137362637363 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 663, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  48%|████▊     | 24/50 [14:04<16:56, 39.09s/it]

[I 2025-12-28 14:19:01,002] Trial 26 finished with value: 0.9909203296703296 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 659, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  50%|█████     | 25/50 [14:09<12:02, 28.92s/it]

[I 2025-12-28 14:19:06,637] Trial 23 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 12, 'n_estimators': 912, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  52%|█████▏    | 26/50 [14:41<11:55, 29.82s/it]

[I 2025-12-28 14:19:38,498] Trial 27 finished with value: 0.9909203296703296 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 699, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  54%|█████▍    | 27/50 [14:46<08:37, 22.51s/it]

[I 2025-12-28 14:19:44,011] Trial 21 finished with value: 0.9933516483516485 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 990, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  56%|█████▌    | 28/50 [14:51<06:15, 17.08s/it]

[I 2025-12-28 14:19:48,433] Trial 28 finished with value: 0.9934752747252749 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 704, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  58%|█████▊    | 29/50 [15:30<08:20, 23.81s/it]

[I 2025-12-28 14:20:27,962] Trial 24 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 884, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  60%|██████    | 30/50 [15:49<07:23, 22.18s/it]

[I 2025-12-28 14:20:46,234] Trial 29 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 668, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  62%|██████▏   | 31/50 [16:16<07:32, 23.81s/it]

[I 2025-12-28 14:21:13,885] Trial 30 finished with value: 0.992706043956044 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'n_estimators': 695, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  64%|██████▍   | 32/50 [17:29<11:31, 38.43s/it]

[I 2025-12-28 14:22:26,333] Trial 31 finished with value: 0.9924587912087913 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 686, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  66%|██████▌   | 33/50 [18:02<10:26, 36.85s/it]

[I 2025-12-28 14:22:59,579] Trial 32 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 12, 'n_estimators': 679, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  68%|██████▊   | 34/50 [19:20<13:07, 49.25s/it]

[I 2025-12-28 14:24:17,794] Trial 33 finished with value: 0.9935989010989011 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 12, 'n_estimators': 680, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  70%|███████   | 35/50 [19:51<10:57, 43.82s/it]

[I 2025-12-28 14:24:48,943] Trial 34 finished with value: 0.9942445054945056 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 12, 'n_estimators': 698, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  72%|███████▏  | 36/50 [20:36<10:17, 44.08s/it]

[I 2025-12-28 14:25:33,618] Trial 43 finished with value: 0.9925824175824177 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 14, 'n_estimators': 514, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  74%|███████▍  | 37/50 [20:42<07:02, 32.54s/it]

[I 2025-12-28 14:25:39,258] Trial 38 finished with value: 0.9909203296703296 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 621, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  76%|███████▌  | 38/50 [20:44<04:43, 23.63s/it]

[I 2025-12-28 14:25:42,022] Trial 39 finished with value: 0.9950137362637363 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 608, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  78%|███████▊  | 39/50 [20:50<03:19, 18.13s/it]

[I 2025-12-28 14:25:47,350] Trial 35 finished with value: 0.9924587912087913 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 12, 'n_estimators': 705, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  80%|████████  | 40/50 [21:07<02:58, 17.87s/it]

[I 2025-12-28 14:26:04,648] Trial 44 finished with value: 0.9933516483516485 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 14, 'n_estimators': 513, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  82%|████████▏ | 41/50 [21:13<02:09, 14.37s/it]

[I 2025-12-28 14:26:10,915] Trial 40 finished with value: 0.9907967032967033 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 611, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  84%|████████▍ | 42/50 [21:17<01:29, 11.23s/it]

[I 2025-12-28 14:26:14,816] Trial 36 finished with value: 0.9915659340659342 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 699, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  86%|████████▌ | 43/50 [21:18<00:56,  8.09s/it]

[I 2025-12-28 14:26:15,449] Trial 37 finished with value: 0.9924587912087912 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 620, 'min_samples_split': 4}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  88%|████████▊ | 44/50 [21:33<01:01, 10.32s/it]

[I 2025-12-28 14:26:31,156] Trial 41 finished with value: 0.9924587912087913 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 623, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  90%|█████████ | 45/50 [21:51<01:01, 12.39s/it]

[I 2025-12-28 14:26:48,338] Trial 42 finished with value: 0.9915659340659342 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 11, 'n_estimators': 616, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  92%|█████████▏| 46/50 [22:14<01:02, 15.64s/it]

[I 2025-12-28 14:27:11,595] Trial 45 finished with value: 0.992706043956044 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 531, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  94%|█████████▍| 47/50 [22:21<00:39, 13.14s/it]

[I 2025-12-28 14:27:18,903] Trial 46 finished with value: 0.992706043956044 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 513, 'min_samples_split': 5}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  96%|█████████▌| 48/50 [22:32<00:25, 12.53s/it]

[I 2025-12-28 14:27:29,946] Trial 48 finished with value: 0.9966758241758242 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 527, 'min_samples_split': 7}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676:  98%|█████████▊| 49/50 [22:40<00:10, 10.94s/it]

[I 2025-12-28 14:27:37,269] Trial 47 finished with value: 0.991043956043956 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'n_estimators': 617, 'min_samples_split': 6}. Best is trial 22 with value: 0.9966758241758242.


Best trial: 22. Best value: 0.996676: 100%|██████████| 50/50 [22:53<00:00, 27.47s/it]

[I 2025-12-28 14:27:50,692] Trial 49 finished with value: 0.9893818681318682 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 9, 'n_estimators': 608, 'min_samples_split': 6}. Best is trial 22 with value: 0.9966758241758242.





In [58]:
"""
{'scaler_name': 'MinMaxScaler',
 'n_features_to_select': 12,
 'n_estimators': 576,
 'min_samples_split': 5}
"""
# Hand the study's best hyperparameters to the project helper
# `save_model_params` (imported from `model_creation`) under the name
# "random_forest"; where and how they are stored is decided by that helper.
save_model_params("random_forest", random_forest_study.best_params)
# Last expression of the cell: display the winning hyperparameters.
random_forest_study.best_params

{'scaler_name': 'MinMaxScaler',
 'n_features_to_select': 12,
 'n_estimators': 576,
 'min_samples_split': 5}