In [18]:
import matplotlib.pyplot as plt
import optuna
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

from model_creation import save_model_params, create_logistic_regression

In [19]:
# Apply matplotlib's built-in "default" style sheet for all figures in this notebook.
plt.style.use("default")

In [20]:
TARGET_COL_NAME = "Expert Diagnose"
# Columns that must not be fed to the model: the label itself and the row identifier.
NON_FEATURE_COLS = [TARGET_COL_NAME, "Patient Number"]


def _split_features_target(frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split a raw dataset frame into ``(features, target)``.

    The features are every column except those in ``NON_FEATURE_COLS``;
    the target is the ``TARGET_COL_NAME`` column.
    """
    features = frame.drop(columns=NON_FEATURE_COLS)
    target = frame[TARGET_COL_NAME]
    return features, target


dataset_train = pd.read_csv("dataset/train.csv")
dataset_test = pd.read_csv("dataset/test.csv")

X_train, y_train = _split_features_target(dataset_train)
X_test, y_test = _split_features_target(dataset_test)

# Last expression of the cell: show the shapes as a quick sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((90, 13), (90,), (30, 13), (30,))

In [21]:
def objective_logistic_regression(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean cross-validated ROC AUC for one sampled configuration.

    Samples ``scaler_name``, ``n_features_to_select``, ``C``, ``solver`` and
    ``max_iter`` from the trial, hands them to ``create_logistic_regression``
    and scores the resulting estimator on the notebook-level ``X_train`` /
    ``y_train`` with 5-fold stratified cross-validation.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        The mean of the per-fold ``roc_auc_ovr`` scores (higher is better).
    """
    # Upper bound for feature selection is the number of available feature columns.
    n_features_max = len(X_train.columns)

    # Keep the suggestion order and key names stable: they define the study's
    # parameter names and are what ``create_logistic_regression`` receives.
    params = {
        "scaler_name": trial.suggest_categorical("scaler_name", ["StandardScaler", "MinMaxScaler"]),
        "n_features_to_select": trial.suggest_int("n_features_to_select", 5, n_features_max),
        "C": trial.suggest_float("C", 1e-3, 1e2, log=True),
        "solver": trial.suggest_categorical("solver", ['lbfgs', 'liblinear', 'saga']),
        "max_iter": trial.suggest_int("max_iter", 20, 1000, step=20),
    }

    # Presumably returns an sklearn-compatible pipeline built from ``params`` —
    # the helper lives in ``model_creation`` and is not visible here.
    estimator = create_logistic_regression(params)

    # No shuffling, so the fold assignment is the same for every trial.
    folds = StratifiedKFold(n_splits=5)
    fold_scores = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=folds,
        scoring="roc_auc_ovr",
    )
    return fold_scores.mean()


In [22]:
# Fixed seed for the sampler so that re-running the notebook from a fresh kernel
# proposes the same sequence of hyperparameters. This only yields identical
# scores if the objective itself is deterministic and trials run sequentially.
SAMPLER_SEED = 42
# Number of hyperparameter configurations to evaluate.
N_TRIALS = 300

study = optuna.create_study(
    study_name="LogisticRegressionStudy",
    direction="maximize",  # the objective returns ROC AUC, so higher is better
    # TPESampler is already Optuna's default for single-objective studies;
    # it is spelled out here only to be able to pass a seed.
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective_logistic_regression, n_trials=N_TRIALS)

[I 2025-12-29 15:31:22,055] A new study created in memory with name: LogisticRegressionStudy
[I 2025-12-29 15:31:22,106] Trial 0 finished with value: 0.9751648351648352 and parameters: {'scaler_name': 'StandardScaler', 'n_features_to_select': 13, 'C': 56.14435781658859, 'solver': 'liblinear', 'max_iter': 280}. Best is trial 0 with value: 0.9751648351648352.
[I 2025-12-29 15:31:22,180] Trial 1 finished with value: 0.9426648351648351 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 9, 'C': 0.002287312458374885, 'solver': 'lbfgs', 'max_iter': 520}. Best is trial 0 with value: 0.9751648351648352.
[I 2025-12-29 15:31:22,276] Trial 2 finished with value: 0.971929945054945 and parameters: {'scaler_name': 'MinMaxScaler', 'n_features_to_select': 8, 'C': 0.333786698345784, 'solver': 'saga', 'max_iter': 300}. Best is trial 0 with value: 0.9751648351648352.
[I 2025-12-29 15:31:22,344] Trial 3 finished with value: 0.9782417582417583 and parameters: {'scaler_name': 'MinMaxScal

In [23]:
# Best hyperparameters found by the study, read once and reused below.
logreg_best_params = study.best_params

# Persist under the "logistic_regression" key; a copy is passed so the dict
# displayed below is exactly what the study reported.
save_model_params("logistic_regression", dict(logreg_best_params))

# Last expression of the cell: display the chosen configuration.
logreg_best_params

{'scaler_name': 'StandardScaler',
 'n_features_to_select': 13,
 'C': 2.2541679013547404,
 'solver': 'lbfgs',
 'max_iter': 380}