# Hyperautomatic Barrel Batching System (HBBS)

This notebook automates the process of sequentially running multiple user-defined experiments with different datasets and settings. Each experiment runs a user-defined number of trials for each user-defined model configuration (baseline, infused-fixed, infused-trainable) so that their performance can be evaluated and compared systematically. It collects the validation metrics of every run, aggregates them, and saves the results.

If you just want to run a single experiment, use automated_barrel_batching_system instead.

### ||RUN ON RESTART||

In [None]:
# Load dependencies

from utils import build_multilabel_dataset, multilabel_split, prep_infused_sweetnet, seed_everything
from tqdm.auto import tqdm

import os
import sys
import pickle
import torch
import yaml
import numpy
import time

from glycowork.ml.processing import split_data_to_train, dataset_to_dataloader
from glycowork.ml import model_training


In [None]:
# Load embeddings

pickle_file_path = 'glm_embeddings_1.pkl'

# --- Read the pickled embeddings; problems are reported via print, never raised ---
if not os.path.exists(pickle_file_path):
    # Nothing to load: tell the user which path was tried
    print(f"Error: File not found at '{pickle_file_path}'. Please check the filename and path.")
else:
    print(f"Loading embeddings from: {pickle_file_path}")
    try:
        # Pickles are binary, hence mode 'rb'
        with open(pickle_file_path, 'rb') as pkl_file:
            glm_embeddings = pickle.load(pkl_file)

        print("Embeddings loaded successfully!")

    except Exception as e:
        # On failure `glm_embeddings` is left unassigned by this cell
        print(f"An error occurred while loading the pickle file: {e}")

## Experimental setup
Change parameters here to define each Experiment.


In [None]:
# --- Fixed parameters (shared by every experiment in the batch) ---
BASE_RANDOM_STATE = 42  # Base seed; the whole sequence of experiments is derived from it
DATA_DIR = "Hyperoptimization"  # Directory the experiment outputs are written to
os.makedirs(DATA_DIR, exist_ok=True)  # No-op when the directory is already there

# Default values of the tunable parameters; an entry in `experiments` may override any of them
settings = dict(
    glycan_dataset='df_disease',         # The glycowork dataset to use
    glycan_class='disease_association',  # The class to predict from the chosen dataset
    num_runs=2,            # Number of trials per configuration (e.g., 5 or 10)
    epochs=2,              # Number of training epochs per run
    batch_size=128,        # 32 or 128 seems to work well
    train_size=0.7,        # Training fraction (0.7 = 70% train, 15% val, 15% test)
    learning_rate=0.005,   # Learning rate for the optimizer
    drop_last=False,       # Drop the last batch when it is smaller than batch_size
    augment_prob=0.0,      # Raise above 0 to enable augmentation during training
    generaliz_prob=0.2,    # Raise/lower to control generalization during training
    patience=25,           # Epochs without improvement before early stopping kicks in
    min_class_size=2,      # Classes with fewer samples than this are excluded
)
# ---- Available datasets and the classes they offer ------
#  'df_species': 'Species', 'Genus', 'Family', 'Order', 'Class', 'Phylum', 'Kingdom', 'Domain', 'ref'
#  'df_tissue': 'tissue_sample', 'tissue_species', 'tissue_id', 'tissue_ref'
#  'df_disease': 'disease_association', 'disease_sample', 'disease_direction', 'disease_species', 'disease_id', 'disease_ref'

# --- Define Experiments ---
# Each entry overrides some of the default `settings` for one experiment.
# Can be used to test different datasets and classes, or for crude hyperparameter tuning.
# NOTE: "name" must be unique - every output file of an experiment (summary CSV,
# epoch pickle, test set, saved models, settings YAML) is named after it, so two
# entries sharing a name silently overwrite each other's results.
experiments = [
    # --- learning_rate variations ---
    # Disease dataset
    {"name": "disease_lr_0005", "glycan_dataset": "df_disease", "glycan_class": "disease_association", "learning_rate": 0.0005},
    {"name": "disease_lr_001", "glycan_dataset": "df_disease", "glycan_class": "disease_association", "learning_rate": 0.001},
    {"name": "disease_lr_01", "glycan_dataset": "df_disease", "glycan_class": "disease_association", "learning_rate": 0.01},
    {"name": "disease_lr_05", "glycan_dataset": "df_disease", "glycan_class": "disease_association", "learning_rate": 0.05},
    # Kingdom dataset
    {"name": "kingdom_lr_0005", "glycan_dataset": "df_species", "glycan_class": "Kingdom", "learning_rate": 0.0005},
    {"name": "kingdom_lr_001", "glycan_dataset": "df_species", "glycan_class": "Kingdom", "learning_rate": 0.001},
    {"name": "kingdom_lr_01", "glycan_dataset": "df_species", "glycan_class": "Kingdom", "learning_rate": 0.01},
    {"name": "kingdom_lr_05", "glycan_dataset": "df_species", "glycan_class": "Kingdom", "learning_rate": 0.05},
    # Tissue dataset
    {"name": "tissue_lr_0005", "glycan_dataset": "df_tissue", "glycan_class": "tissue_sample", "learning_rate": 0.0005},
    {"name": "tissue_lr_001", "glycan_dataset": "df_tissue", "glycan_class": "tissue_sample", "learning_rate": 0.001},
    {"name": "tissue_lr_01", "glycan_dataset": "df_tissue", "glycan_class": "tissue_sample", "learning_rate": 0.01},
    {"name": "tissue_lr_05", "glycan_dataset": "df_tissue", "glycan_class": "tissue_sample", "learning_rate": 0.05},

    # --- batch_size variations ---
    # Disease dataset
    {"name": "disease_bs_32", "glycan_dataset": "df_disease", "glycan_class": "disease_association", "batch_size": 32},
    # Kingdom dataset
    {"name": "kingdom_bs_32", "glycan_dataset": "df_species", "glycan_class": "Kingdom", "batch_size": 32},
    # Tissue dataset
    {"name": "tissue_bs_32", "glycan_dataset": "df_tissue", "glycan_class": "tissue_sample", "batch_size": 32},

    # --- min_class_size variations ---
    # (named "*_mcs_*" so they no longer collide with the batch_size experiments above)
    # Disease dataset
    {"name": "disease_mcs_32", "glycan_dataset": "df_disease", "glycan_class": "disease_association", "min_class_size": 32},
    # Kingdom dataset
    {"name": "kingdom_mcs_32", "glycan_dataset": "df_species", "glycan_class": "Kingdom", "min_class_size": 32},
    # Tissue dataset
    {"name": "tissue_mcs_32", "glycan_dataset": "df_tissue", "glycan_class": "tissue_sample", "min_class_size": 32},
]

# --- Model configurations trained in every run of every experiment ---
# Each entry names a configuration and states how the embeddings are initialised
# ("random" or "external") and whether they stay trainable.
# Uncomment an entry below to include it in the batch.
experiment_configs = [
    dict(name="baseline_trainable", initialization_method="random", trainable_embeddings=True),
    # dict(name="baseline_fixed", initialization_method="random", trainable_embeddings=False),
    dict(name="infused_trainable", initialization_method="external", trainable_embeddings=True),
    # dict(name="infused_fixed", initialization_method="external", trainable_embeddings=False),
    # add more configurations as needed
]


# Seed once at setup time with the base seed, for reproducibility.
# (The experiment loop seeds again per experiment and per run.)
seed_everything(BASE_RANDOM_STATE)

# Blank line, then a one-line recap of how many experiments and configurations were defined
print()
print(f"A batch of {len(experiments)} experiments with {len(experiment_configs)} configurations have been set up")

## Run Experiment Loop

In [None]:
# --- Hyperautomation loop ---
# For every entry in `experiments`:
#   1. build that experiment's settings (defaults from `settings` + its overrides),
#   2. log the settings to YAML, load the dataset and split off a fixed test set,
#   3. for each run: re-split train/val with a run-specific seed and train every
#      configuration in `experiment_configs`,
#   4. after each run: write the epoch histories (pickle) and best validation
#      metrics (CSV) so partial results survive an interrupted batch.
from contextlib import redirect_stdout

# silence weird warnings for the sigmoid function
numpy.seterr(over='ignore')

# Result containers. They are re-created at the start of every experiment so that
# each experiment's files only contain that experiment's runs; after the loop they
# hold the results of the last experiment.
all_run_summary_results = {} # best metrics of each run, keyed "<config>_<metric>"
all_run_epoch_histories = {} # full epoch-wise histories, keyed "<config>_<run>"

print(f"initializing hyperautomation loop for {len(experiments)} experiments")

# Loop through each experiment
for experiment in tqdm(experiments,desc="Running hyperautomation loop"):

    # Defaults + this experiment's overrides. A fresh dict is built every time so an
    # override (e.g. a learning rate) never leaks into the experiments that follow,
    # and the `settings` defaults stay untouched if the cell is re-run.
    experiment_settings = {key: experiment.get(key, default) for key, default in settings.items()}

    experiment_name = experiment["name"]
    num_runs = experiment_settings['num_runs']

    # Start this experiment with empty result containers
    all_run_summary_results = {}
    all_run_epoch_histories = {}

    # generate the paths for saving results
    results_summary_path =      os.path.join(DATA_DIR,f"summary_{experiment_name}.csv")
    epoch_data_path =           os.path.join(DATA_DIR,f"epoch_data_{experiment_name}.pkl")
    test_set_path =             os.path.join(DATA_DIR,f"test_set_{experiment_name}.pkl")
    saved_models_dir =          os.path.join(DATA_DIR,f"saved_models_{experiment_name}")
    experiment_settings_path =  os.path.join(DATA_DIR,f"experiment_settings_{experiment_name}.yaml")

    # Created once per experiment instead of once per trained model
    os.makedirs(saved_models_dir, exist_ok=True)

    # generate yaml file to save the settings for this experiment
    experiment_log_data = {
        "experiment_name": experiment_name,
        "global_parameters_for_this_experiment": experiment_settings, # effective settings
        "experiment_specific_configs": experiment, # The specific entry from the 'experiments' list
        "all_model_configurations_tested": experiment_configs, # All configs tested within this experiment
        "base_random_state_for_experiment_batch": BASE_RANDOM_STATE # Log the base seed
    }
    try:
        with open(experiment_settings_path, 'w') as f:
            yaml.dump(experiment_log_data, f, default_flow_style=False, sort_keys=False) # sort_keys=False to preserve order
    except Exception as e:
        # Logging the settings is best-effort; the experiment itself still runs
        print(f"Error saving settings for experiment '{experiment_name}' to YAML: {e}")

    # Set the initial random seed for this experiment
    seed_everything(BASE_RANDOM_STATE, True)

    # Load the dataset/class selected by THIS experiment's settings
    glycans, labels, label_names = build_multilabel_dataset(glycan_dataset = experiment_settings['glycan_dataset'],
                                                        glycan_class = experiment_settings['glycan_class'],
                                                        min_class_size = experiment_settings['min_class_size'],
                                                        silent= True)

    num_classes = len(labels[0]) # Number of classes in the dataset

    # Split ratios for train/val/test with t = train_size:
    #   the test set takes (1 - t) / 2 of all data, so the remainder is (1 + t) / 2;
    #   the validation set takes (1 - t) / (1 + t) of that remainder, i.e. (1 - t) / 2 overall.
    train_size = experiment_settings['train_size']
    test_split_ratio = 1 - ((1 - train_size) / 2)
    val_split_ratio = 1 - ((1 - train_size) / (1 + train_size))

    # split out the test set outside of the run loop to stop data leakage
    temp_glycans, test_glycans, _, temp_labels, test_labels, _ = multilabel_split(glycans, labels, train_size = test_split_ratio,
                                                                random_state = BASE_RANDOM_STATE, no_test = True, silent= True)

    # load the test set into a dataloader
    test_dataloader = dataset_to_dataloader(glycan_list = test_glycans,
                                            labels = test_labels,
                                            batch_size = experiment_settings['batch_size'],
                                            drop_last = experiment_settings['drop_last'],
                                            augment_prob = experiment_settings['augment_prob'],
                                            generalization_prob = experiment_settings['generaliz_prob']
    )
    # Save the test set to a pickle file
    with open(test_set_path, 'wb') as f:
        pickle.dump(test_dataloader, f)

    for run_idx in tqdm(range(num_runs), desc=f"{experiment_name} runs"):

        # Each run gets its own seed, derived from the base seed
        random_state = BASE_RANDOM_STATE + run_idx
        seed_everything(random_state, True)

        # --- split data for this run outside of the config loop for efficiency ---
        # Split the remaining data into training and validation sets
        train_glycans, val_glycans, _, train_labels, val_labels, _ = multilabel_split(temp_glycans, temp_labels, train_size=val_split_ratio,
                                                                    random_state=random_state, no_test = True, silent= True)

        # Load into dataloaders for training and validation
        dataloaders = split_data_to_train(
            glycan_list_train = train_glycans, glycan_list_val = val_glycans, labels_train = train_labels, labels_val = val_labels,
            batch_size = experiment_settings['batch_size'],
            drop_last = experiment_settings['drop_last'],
            augment_prob = experiment_settings['augment_prob'],
            generalization_prob = experiment_settings['generaliz_prob']
        )

        # --- Loop through each configuration & train model ---
        for config in tqdm(experiment_configs, desc="Training models for each config", leave=False):

            config_name = config["name"]

            # --- Model Training ---
            # Initialize the model with the specified parameters
            model = prep_infused_sweetnet(
                        initialization_method = config["initialization_method"],
                        num_classes = num_classes,
                        embeddings_dict = glm_embeddings,
                        trainable_embeddings = config["trainable_embeddings"],
                        silent= True
                        )

            # Run the training setup function to prepare the model for training
            optimizer, scheduler, criterion = model_training.training_setup(model, experiment_settings['learning_rate'], num_classes = num_classes)

            # Time the training and silence its epoch-wise prints. The context managers
            # restore stdout and close the null device even if training raises.
            start_time = time.time()
            with open(os.devnull, 'w') as devnull, redirect_stdout(devnull):
                model_ft, current_run_metrics = model_training.train_model(model, dataloaders, criterion, optimizer, scheduler,
                                                                            num_epochs = experiment_settings['epochs'], mode = 'multilabel',
                                                                            return_metrics = True, patience = experiment_settings['patience'])
            time_elapsed_seconds = time.time() - start_time

            # Collect the epoch-wise metrics from this configuration
            config_run_identifier = f"{config_name}_{run_idx+1}"
            all_run_epoch_histories[config_run_identifier] = current_run_metrics

            # Save the trained model returned by train_model to a file
            model_filename = f"{config_name}_run_{run_idx+1}_state_dict.pth"
            model_filepath = os.path.join(saved_models_dir, model_filename)
            torch.save(model_ft.state_dict(), model_filepath)

            # --- Save the best metrics from this run to the summary results dictionary ---
            # I should add a way to turn certain metrics off for some runs, when doing hyperparameter optimization
            val_history = current_run_metrics['val']
            best_metrics = {
                "loss": min(val_history['loss']),
                "lrap": max(val_history['lrap']),
                "ndcg": max(val_history['ndcg']),
                "time": time_elapsed_seconds,
            }
            # One column per "<config>_<metric>", one list entry per run
            for metric_name, value in best_metrics.items():
                all_run_summary_results.setdefault(f"{config_name}_{metric_name}", []).append(value)

        # --- Export metrics at end of each run, in case of early termination ---

        # Save the epoch-wise metrics gathered so far to a pickle file
        with open(epoch_data_path, 'wb') as f:
            pickle.dump(all_run_epoch_histories, f)

        # Save the summary results to a CSV file: header row, then one row per finished run
        with open(results_summary_path, 'w') as f:
            f.write(','.join(all_run_summary_results) + '\n')
            # Every column has exactly one value per finished run, so zip yields the rows
            for row in zip(*all_run_summary_results.values()):
                f.write(','.join(map(str, row)) + '\n')

print("All experiments completed successfully!")

In [None]:
import numpy as np
def sigmoid(x: float, # input value
            cutoff: float = 700 # cutoff value for sigmoid transformation
           ) -> float: # sigmoid transformed value
    """Debug variant of the logistic function 1 / (1 + exp(-x)).

    Prints the type, dtype and value range of the input (and of the clipped
    input when clipping is active) before computing the result.

    With a `cutoff`, `x` is first clipped to [-cutoff, cutoff]; passing
    `cutoff=None` skips the clipping and its extra debug output.
    """
    # Describe the raw input
    print(f"DEBUG: Sigmoid input x type: {type(x)}, dtype: {x.dtype if hasattr(x, 'dtype') else 'N/A'}")
    print(f"DEBUG: Sigmoid input x min: {np.min(x)}, max: {np.max(x)}") # If x is an array

    # No cutoff requested: plain sigmoid on the unclipped input
    if cutoff is None:
        return 1 / (1 + np.exp(-x))

    x_clipped = np.clip(x, -cutoff, cutoff)

    # Describe what is about to be exponentiated
    print(f"DEBUG: x_clipped type: {type(x_clipped)}, dtype: {x_clipped.dtype if hasattr(x_clipped, 'dtype') else 'N/A'}")
    print(f"DEBUG: x_clipped min: {np.min(x_clipped)}, max: {np.max(x_clipped)}") # If x_clipped is an array
    print(f"DEBUG: np.exp(-x_clipped) input value: {-x_clipped}") # Print the actual value going into exp

    # Tip: calling np.seterr(all='raise') beforehand turns a RuntimeWarning
    # into an error, stopping execution right at its source.
    return 1 / (1 + np.exp(-x_clipped))

In [None]:
# Probe whichever `sigmoid` is currently bound with an extreme negative input
sigmoid(-11111100000.)

In [None]:
# Print the source of glycowork's own sigmoid so it can be compared with the debug copy.
# NOTE: this import rebinds the name `sigmoid`; if a local debug `sigmoid` was defined
# earlier in the session, subsequent calls use glycowork's version instead.
import inspect
from glycowork.ml.model_training import sigmoid # Assuming this is the correct import path

print(inspect.getsource(sigmoid))

In [None]:
# create test_dataloader
from glycowork.ml.processing import dataset_to_dataloader
# The keyword is `generalization_prob`, as in the hyperautomation loop's call to this
# function; `generaliz_prob` is only the key name used in the notebook's `settings`
# dict, and passing it as a keyword would raise a TypeError.
test_dataloader = dataset_to_dataloader(glycan_list = test_glycans, labels = test_labels, generalization_prob = 0)


In [None]:
# Developing function to test model here before migrating to utils.py

import torch
import torch.utils.data
import torch.nn
from typing import Dict # Import Dict for type hinting
from glycowork.ml.model_training import sigmoid
from sklearn.metrics import label_ranking_average_precision_score, ndcg_score

# Run on the first GPU when one is available, otherwise on the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

def test_model(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               criterion: torch.nn.Module) -> dict[str, float]:
    """
    Evaluates a multi-label model on a test set.

    Parameters
    ----------
    model: torch.nn.Module
        The trained model to evaluate. It is called on tensors moved to the
        module-level `device`, so it is expected to live on that device already.
    dataloader: torch.utils.data.DataLoader
        DataLoader containing the test split. Each batch must expose the
        attributes `labels`, `y`, `edge_index` and `batch`, and may expose
        `train_idx`.
    criterion: torch.nn.Module
        The loss function to calculate average loss during evaluation.

    Returns
    -------
    dict
        Evaluation metrics for the multi-label task, keyed like the validation
        history of the training loop: "loss" (per-batch losses averaged with
        the number of graphs per batch as weight), "lrap" and "ndcg".

    Raises
    ------
    ValueError
        If the dataloader yields no batches.
    """
    model.eval()  # Set the model to evaluation mode

    total_loss = 0.
    total_graphs = 0
    raw_output = []
    true_labels = []

    with torch.no_grad():
        for data in dataloader:
            # Get all relevant node attributes
            x, y, edge_index, batch = data.labels, data.y, data.edge_index, data.batch

            # batch.max() + 1 is the number of graphs in this batch; taking it as a
            # Python int keeps total_loss a float and avoids iterating the tensor
            # element by element, which the builtin max() would do.
            num_graphs = int(batch.max()) + 1

            prot = getattr(data, 'train_idx', None)
            if prot is not None:
                prot = prot.view(num_graphs, -1).to(device)
            x = x.to(device)
            y = y.view(num_graphs, -1).to(device)
            edge_index = edge_index.to(device)
            batch = batch.to(device)

            # --- Forward Pass ---
            if prot is not None:
                pred = model(prot, x, edge_index, batch)
            else:
                pred = model(x, edge_index, batch)

            # --- Calculate and Accumulate Loss ---
            # Weighted by the number of graphs so the final value is a per-graph
            # average (presumes the criterion returns a batch mean - TODO confirm)
            loss = criterion(pred, y.float())
            total_loss += loss.item() * num_graphs
            total_graphs += num_graphs

            # --- Collect Outputs and Labels ---
            # Moved to the CPU right away so the device does not hold the whole test set
            raw_output.append(pred.cpu())
            true_labels.append(y.cpu())

    if total_graphs == 0:
        raise ValueError("dataloader yielded no batches; cannot compute test metrics")

    # --- Metrics ---
    # Cast to float64 before the sigmoid: exp() of a large clipped logit overflows
    # float32. NOTE(review): applying sigmoid before the ranking metrics presumably
    # mirrors what train_model does for its validation metrics - confirm.
    scores = sigmoid(torch.cat(raw_output).double().numpy())
    targets = torch.cat(true_labels).numpy()

    return {
        "loss": total_loss / total_graphs,
        "lrap": float(label_ranking_average_precision_score(targets, scores)),
        "ndcg": float(ndcg_score(targets, scores)),
    }

In [None]:
# Evaluate on the current `test_dataloader`, using `model_ft` and `criterion` as left
# behind by the last training iteration of the hyperautomation loop
test_model(model_ft, test_dataloader, criterion)

In [None]:
# Load trial data
Dir = "Datasets"   # Directory where the datasets are stored
pickle_file_path = os.path.join(Dir, "test_set_kingdom1.pkl")

# --- Read the pickled object; problems are reported via print, never raised ---
if not os.path.exists(pickle_file_path):
    # Nothing to load: tell the user which path was tried
    print(f"Error: File not found at '{pickle_file_path}'. Please check the filename and path.")
else:
    print(f"Loading data from: {pickle_file_path}")
    try:
        # Pickles are binary, hence mode 'rb'
        with open(pickle_file_path, 'rb') as pkl_file:
            user_data_string_from_input = pickle.load(pkl_file)

        print("Data loaded successfully!")

    except Exception as e:
        # On failure `user_data_string_from_input` is left unassigned by this cell
        print(f"An error occurred while loading the pickle file: {e}")

In [None]:
# Sanity check: print the first item obtained by iterating over the loaded object
print(next(iter(user_data_string_from_input)))