In [1]:
import logging
from pathlib import Path

import optuna

from src.dataset import create_dataloaders
from src.train import train_and_evaluate_model
from src.utils import load_and_preprocess_data, split_and_scale_data

In [2]:
# Configure the root logger: INFO and above, written to training_logs.log with
# a "timestamp - level - message" layout.
# Note: logging.basicConfig does nothing if the root logger already has handlers.
_log_settings = {
    'filename': 'training_logs.log',
    'level': logging.INFO,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**_log_settings)

In [3]:
# Paths of the two input tables handed to the loader.
geo_csv_path = "dataframes/threshold_df.csv"
curated_csv_path = "dataframes/molab_df_curated.csv"

# Name of the target column; it is also the first entry of the exclude list.
label_col = 'label-1RN-0Normal'
exclude_columns = [
    label_col,
    'Patient ID',
    'id',
    'BASELINE_TIME_POINT',
    'CROSSING_TIME_POINT',
    'BASELINE_VOLUME',
]

# The loader returns the frame plus an exclude list, which replaces the one above.
geo_df, exclude_columns = load_and_preprocess_data(
    geo_csv_path, curated_csv_path, label_col, exclude_columns
)

# Every column of geo_df that is not in the (returned) exclude list is passed
# to the split/scale helper as its third argument.
kept_columns = [name for name in geo_df.columns if name not in exclude_columns]
geo_df_train, geo_df_test = split_and_scale_data(geo_df, label_col, kept_columns)

In [4]:

epochs = 50  # passed as num_epochs to every trial's training run


def objective(trial):
    """Optuna objective: train one model with trial-sampled hyperparameters.

    Reads the notebook-level names ``geo_df_train``, ``geo_df_test``,
    ``label_col``, ``exclude_columns`` and ``epochs``.

    Args:
        trial: The Optuna trial used to sample ``hidden_size``, ``batch_size``,
            ``num_layers`` and ``learning_rate``; it is also forwarded to
            ``train_and_evaluate_model``.

    Returns:
        The ``'auc'`` entry of the metrics returned by
        ``train_and_evaluate_model``.
    """
    # Define the hyperparameters to tune
    hidden_size = trial.suggest_categorical("hidden_size", [64, 128, 256, 512])
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64])
    num_layers = trial.suggest_int("num_layers", 1, 5)
    # suggest_loguniform is deprecated since Optuna 3.0; suggest_float with
    # log=True samples the same log-uniform range.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    # Create dataloaders from the training frame using the sampled batch size
    dataloaders, feature_columns = create_dataloaders(geo_df_train, label_col, exclude_columns, batch_size)

    # Train and evaluate the model
    metrics = train_and_evaluate_model(
        trial, dataloaders, feature_columns, geo_df_test, exclude_columns,
        num_epochs=epochs, hidden_size=hidden_size, num_layers=num_layers, batch_size=batch_size, learning_rate=learning_rate
    )

    # NOTE(review): geo_df_test is handed to train_and_evaluate_model; if
    # metrics['auc'] is computed on it, the search is tuning on the test split
    # rather than a validation split. Confirm in src.train.
    return metrics['auc']


# Run the hyperparameter search: 20 trials, maximizing the value returned by `objective`.
study_name = "pretrained-encoder"  # Unique identifier of the study.
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is Optuna's default single-objective sampler, so only the seed is new.
# NOTE(review): full reproducibility also needs seeded training inside src.train — confirm.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(study_name=study_name, direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=20)

# Get the trial data as a DataFrame
trial_data = study.trials_dataframe()

# Save the trial data to a CSV file. Create the output directory first so the
# save cannot fail on a missing folder after the whole search has already run.
results_path = Path('optuna_results') / 'optuna_results_clinical_cv.csv'
results_path.parent.mkdir(parents=True, exist_ok=True)
trial_data.to_csv(results_path, index=False)





[I 2025-04-28 14:25:16,906] A new study created in memory with name: pretrained-encoder
[I 2025-04-28 14:25:26,650] Trial 0 finished with value: 0.7514074074074074 and parameters: {'hidden_size': 256, 'batch_size': 64, 'num_layers': 2, 'learning_rate': 6.869613865010829e-05}. Best is trial 0 with value: 0.7514074074074074.
[I 2025-04-28 14:25:35,491] Trial 1 finished with value: 0.7064444444444445 and parameters: {'hidden_size': 64, 'batch_size': 16, 'num_layers': 3, 'learning_rate': 0.0002977918510977308}. Best is trial 0 with value: 0.7514074074074074.
[I 2025-04-28 14:25:47,565] Trial 2 finished with value: 0.7936296296296297 and parameters: {'hidden_size': 512, 'batch_size': 8, 'num_layers': 5, 'learning_rate': 0.014766110350098184}. Best is trial 2 with value: 0.7936296296296297.
[32m[I 2025-04-28 14:25:54,153][0m Trial 3 finished with value: 0.748962962962963 and parameters: {'hidden_size': 128, 'batch_size': 32, 'num_layers':