In [1]:
import logging
from pathlib import Path

import optuna
import pandas as pd

from src.dataset import create_dataloaders, ClinicalDataset, ImagingDataset
from src.utils import load_and_preprocess_data, split_and_scale_data
from src.train import train_and_evaluate_model
from src.models import SimpleNN, SimpleNNWithBatchNorm

In [2]:
# Send INFO-and-above records to a log file; each record carries a timestamp,
# its level name and the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='training_logs_clinical.log',
)

In [3]:
# Input modality for this run: "clinical", "imaging", or "multimodal".
modality = "clinical"

assert modality in ("clinical", "imaging", "multimodal"), f"Modality {modality} not supported"

# Shared settings used by the cells below.
label_col = 'label-1RN-0Normal'
geo_csv_path = "dataframes/threshold_df_new.csv"
curated_csv_path = "dataframes/molab_df_curated.csv"
img_seq_path = "representations/molab-hardy-leaf-97_embeddings.npy"

# Columns that are excluded from the feature set (the label column among them).
# An earlier variant of this list also excluded "BASELINE_VOLUME" and did not
# contain the four '<n>+<m>.0' columns.
exclude_columns = [
    'label-1RN-0Normal',
    'Patient ID',
    'id',
    'BASELINE_TIME_POINT',
    "CROSSING_TIME_POINT",
    "scan_date",
    '1+2.0',
    '2+2.0',
    '2+3.0',
    '2+1.0',
]
is_radiomics = True



In [4]:
# Pick the dataset class, the model class and the dataset constructor kwargs
# that match the configured modality. These three names are read later by the
# Optuna objective.
if modality == "imaging":
    ds_cls = ImagingDataset
    model = SimpleNNWithBatchNorm
    ds_cls_kwargs = {"data_dir": img_seq_path, "is_gap": True}

elif modality == "clinical":
    ds_cls = ClinicalDataset
    model = SimpleNN
    ds_cls_kwargs = {"columns_to_drop": exclude_columns}

elif modality == "multimodal":
    # TODO: Future implementation.
    # Fail here instead of silently continuing: otherwise ds_cls / model /
    # ds_cls_kwargs stay undefined (or keep stale values from an earlier run
    # of this cell) and the problem only shows up later during training.
    raise NotImplementedError('Modality "multimodal" is not implemented yet')

else:
    # Defensive: keeps this cell safe even if it is run without the
    # validation performed where `modality` is defined.
    raise ValueError(f"Modality {modality} not supported")


In [7]:
# Number of training epochs used for every trial.
epochs = 70


def objective(trial):
    """Optuna objective: train one sampled configuration and return its AUC.

    Samples the hidden size, batch size, number of layers, learning rate and
    a random state from ``trial``, builds dataloaders from the radiomics CSV
    and delegates training and evaluation to ``train_and_evaluate_model``.

    Args:
        trial: Optuna trial used to sample the hyperparameters. It is also
            forwarded to ``train_and_evaluate_model``.

    Returns:
        The ``'auc'`` entry of the metrics returned by
        ``train_and_evaluate_model``.
    """
    # Define the hyperparameters to tune
    hidden_size = trial.suggest_categorical("hidden_size", [64, 128, 256, 512])
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64])
    num_layers = trial.suggest_int("num_layers", 1, 5)
    # suggest_float(log=True) is the supported replacement for the deprecated
    # suggest_loguniform; it samples from the same log-uniform range.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    # NOTE(review): the random state is searched like a hyperparameter, so the
    # best trial may partly reflect a favourable seed - confirm this is intended.
    random_state = trial.suggest_int("random_state", 0, 10)

    # The data is read from the filtered radiomics CSV (rows with missing
    # values dropped); the geo/curated CSV paths are not used here.
    radiomics_df = pd.read_csv("dataframes/filtered_radiomics.csv").dropna()
    candidate_features = [col for col in radiomics_df.columns if col not in exclude_columns]
    geo_df_train, geo_df_test = split_and_scale_data(radiomics_df, label_col, candidate_features)

    # Create dataloaders
    dataloaders, feature_columns = create_dataloaders(
        geo_df_train,
        label_col,
        exclude_columns,
        batch_size,
        dataset_cls=ds_cls,
        dataset_kwargs=ds_cls_kwargs,
        random_state=random_state
    )

    input_size = len(feature_columns) if modality == "clinical" else 384 # TODO: Remove hardcoded value

    # Model kwargs for model agnostic training
    model_kwargs = {"input_size": input_size, "hidden_size": hidden_size, "num_layer": num_layers}

    # Train and evaluate the model
    metrics = train_and_evaluate_model(
        trial, dataloaders, geo_df_test, exclude_columns,
        num_epochs=epochs,
        batch_size=batch_size, learning_rate=learning_rate,
        model_cls=model, model_kwargs=model_kwargs,
        dataset_cls=ds_cls, dataset_kwargs=ds_cls_kwargs
    )

    # Return the AUC reported by train_and_evaluate_model as the objective value.
    # NOTE(review): the held-out frame geo_df_test is passed in above - confirm
    # whether this AUC is a validation or a test metric.
    return metrics['auc']


# Run the hyperparameter search: 25 trials, maximising the value returned by
# `objective`.
study_name = "pretrained-encoder"  # Unique identifier of the study.
study = optuna.create_study(study_name=study_name, direction="maximize")
study.optimize(objective, n_trials=25)

# Get the trial data as a DataFrame
trial_data = study.trials_dataframe()

# Save the trial data to a CSV file. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists first; otherwise
# the results of the whole search would be lost to an OSError.
results_path = Path(f'optuna_results/optuna_results_{modality}_cv.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
trial_data.to_csv(results_path, index=False)

[32m[I 2025-05-16 13:30:29,160][0m A new study created in memory with name: pretrained-encoder[0m
[32m[I 2025-05-16 13:30:37,411][0m Trial 0 finished with value: 0.9125925925925926 and parameters: {'hidden_size': 128, 'batch_size': 64, 'num_layers': 4, 'learning_rate': 0.028818882795035212, 'random_state': 2, 'weight_decay': 0.0012103000019560357}. Best is trial 0 with value: 0.9125925925925926.[0m
[32m[I 2025-05-16 13:30:46,989][0m Trial 1 finished with value: 0.8626666666666667 and parameters: {'hidden_size': 128, 'batch_size': 16, 'num_layers': 1, 'learning_rate': 0.0003111223556821864, 'random_state': 9, 'weight_decay': 0.5163973457775259}. Best is trial 0 with value: 0.9125925925925926.[0m
[32m[I 2025-05-16 13:30:59,906][0m Trial 2 finished with value: 0.8665925925925926 and parameters: {'hidden_size': 512, 'batch_size': 8, 'num_layers': 2, 'learning_rate': 0.07425568057855, 'random_state': 6, 'weight_decay': 0.0028380534369989874}. Best is trial 0 with value: 0.9125925