In [1]:
import matplotlib.pyplot as plt
import optuna
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

from model_creation import save_model_params, create_logistic_regression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select matplotlib's "default" style sheet so plots do not depend on a style set elsewhere.
plt.style.use("default")

In [3]:
# Label column; it and the "Patient Number" column are excluded from the feature matrices.
TARGET_COL_NAME = "Expert Diagnose"
NON_FEATURE_COLS = [TARGET_COL_NAME, "Patient Number"]

dataset_train = pd.read_csv("../dataset/train.csv")
dataset_test = pd.read_csv("../dataset/test.csv")

# Split each frame into features (X_*) and target (y_*).
X_train = dataset_train.drop(columns=NON_FEATURE_COLS)
y_train = dataset_train[TARGET_COL_NAME]
X_test = dataset_test.drop(columns=NON_FEATURE_COLS)
y_test = dataset_test[TARGET_COL_NAME]

# Last expression of the cell: the four shapes are shown as its output.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((90, 13), (90,), (30, 13), (30,))

In [4]:
def objective_logistic_regression(trial: optuna.trial.Trial):
    """Optuna objective: mean 5-fold cross-validated ``roc_auc_ovr`` on the training data.

    Draws one hyper-parameter configuration from ``trial``, passes it to
    ``create_logistic_regression`` and scores the returned estimator with
    ``cross_val_score`` on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyper-parameter values.

    Returns:
        The mean of the per-fold scores.
    """
    # The suggest_* calls are kept in their original order so the trial records
    # its parameters in the same sequence.
    search_point = {
        "scaler_name": trial.suggest_categorical("scaler_name", ["StandardScaler", "MinMaxScaler"]),
        # Upper bound is the number of columns in X_train.
        "n_features_to_select": trial.suggest_int("n_features_to_select", 5, len(X_train.columns)),
        "C": trial.suggest_float("C", 1e-3, 1e2, log=True),
        "solver": trial.suggest_categorical("solver", ['lbfgs', 'liblinear', 'saga']),
        "max_iter": trial.suggest_int("max_iter", 20, 1000, step=20),
    }
    estimator = create_logistic_regression(search_point)

    fold_scores = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=5),
        scoring="roc_auc_ovr",
    )
    return fold_scores.mean()


In [5]:
# Seed the sampler so a fresh "Restart & Run All" proposes the same sequence of trials;
# without a seed every run explores different points and may report different best params.
# TPESampler is Optuna's default single-objective sampler, so only the seeding changes.
# NOTE(review): estimators built by create_logistic_regression may carry their own
# randomness (e.g. the 'saga' solver) — confirm they are seeded too if exactly
# repeatable scores are required.
OPTUNA_SEED = 42
N_TRIALS = 300

study = optuna.create_study(
    study_name="LogisticRegressionStudy",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
)
study.optimize(objective_logistic_regression, n_trials=N_TRIALS)

[I 2025-12-29 20:34:13,195] A new study created in memory with name: LogisticRegressionStudy
[I 2025-12-29 20:34:13,245] Trial 0 finished with value: 0.9500824175824176 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'C': 0.005165512577349, 'solver': 'lbfgs', 'max_iter': 800}. Best is trial 0 with value: 0.9500824175824176.
[I 2025-12-29 20:34:13,321] Trial 1 finished with value: 0.9399793956043956 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 6, 'C': 0.0025723710126729364, 'solver': 'lbfgs', 'max_iter': 100}. Best is trial 0 with value: 0.9500824175824176.
[I 2025-12-29 20:34:13,370] Trial 2 finished with value: 0.9730082417582417 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 12, 'C': 0.11345304097776107, 'solver': 'lbfgs', 'max_iter': 660}. Best is trial 2 with value: 0.9730082417582417.
[I 2025-12-29 20:34:13,507] Trial 3 finished with value: 0.9338873626373628 and parameters: {'scaler_name': 'Stan

In [7]:
# Hand the study's best hyper-parameters to save_model_params under the name
# "logistic_regression" (presumably persisted for reuse elsewhere — verify in model_creation).
save_model_params("logistic_regression", study.best_params)
# Last expression of the cell: display the best parameters as its output.
study.best_params

{'scaler_name': 'MinMaxScaler',
 'n_features_to_select': 10,
 'C': 14.250702321022288,
 'solver': 'liblinear',
 'max_iter': 320}