# Imports

In [None]:
import os
import sys

import pandas as pd

from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved.
module_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if module_path not in sys.path:
    # Export the path through PYTHONPATH as well, so child processes inherit
    # it. Prepend instead of overwriting: entries that are already present in
    # PYTHONPATH must stay visible to those processes.
    existing_pythonpath = os.environ.get("PYTHONPATH")
    if existing_pythonpath:
        os.environ["PYTHONPATH"] = module_path + os.pathsep + existing_pythonpath
    else:
        os.environ["PYTHONPATH"] = module_path
    sys.path.append(module_path)

from src.train import train_model
from src.experiment import get_variables_combinations

# Training and Tuning a Machine Learning Model with Population-Based Training

In [None]:
def main(
    i: int,
    model: list,
    exp_name: str,
    file_name: str,
    n_samples: int,
    max_num_epochs: int,
    min_num_epochs: int,
    local_dir: str = "../ray_results",
):
    """Run one Ray Tune experiment scheduled with Population-Based Training.

    Args:
        i: Index into the list returned by ``get_variables_combinations``;
            only that single variable combination is searched.
        model: List of model identifiers handed to ``tune.grid_search``.
        exp_name: Experiment name passed to ``tune.run``.
        file_name: Data file name; stored in the config as ``data_file`` and
            passed to ``get_variables_combinations``.
        n_samples: Value for ``num_samples`` in ``tune.run``.
        max_num_epochs: Stored in the config as ``num_epochs`` and used as
            the ``training_iteration`` stopping criterion.
        min_num_epochs: Perturbation interval (in training iterations) of the
            PBT scheduler.
        local_dir: Directory handed to ``tune.run`` for its results; created
            if it does not exist.

    Returns:
        The object returned by ``tune.run``.
    """
    target_variable = "Flow_Kalltveit"
    datetime_variable = "Datetime"

    # Wrap the selected combination in a list because grid_search expects a
    # list of candidate values.
    variables = [get_variables_combinations(file_name, datetime_variable)[i]]

    config = {
        "data_file": file_name,
        "datetime": datetime_variable,
        "data": {
            "target_variable": target_variable,
            "sequence_length": tune.choice([25]),
            "batch_size": tune.choice([256]),
            "variables": tune.grid_search(variables),
            "split_size": {"train_size": 0.7, "val_size": 0.2, "test_size": 0.1},
        },
        "model": tune.grid_search(model),
        "model_arch": {
            # Resolved per trial from the sampled variable combination.
            # NOTE(review): the +1 presumably accounts for the target
            # variable being fed as an input too -- confirm in train_model.
            "input_size": tune.sample_from(
                lambda spec: len(spec.config.data["variables"]) + 1
            ),
            "hidden_size": tune.choice([32, 64]),
            "num_layers": tune.choice([1, 2, 3]),
            "output_size": 1,
        },
        "training": {
            "learning_rate": tune.loguniform(1e-5, 1e-1),
            "weight_decay": tune.loguniform(1e-5, 1e-1),
        },
        "num_epochs": max_num_epochs,
    }

    reporter = tune.JupyterNotebookReporter(
        metric_columns=["train_loss", "val_loss", "test_loss", "training_iteration"]
    )

    # PBT mutations must mirror the nesting of `config`. Flat or dotted keys
    # do not address `config["training"][...]`; they only add unrelated
    # top-level entries, so the optimiser settings would never be perturbed.
    # NOTE(review): assumes train_model reads the learning rate and weight
    # decay from config["training"] -- confirm.
    # Architecture keys (hidden_size / num_layers) are deliberately not
    # mutated: a PBT exploit step restores another trial's checkpoint, and a
    # model rebuilt with a different architecture would presumably be unable
    # to load it. They are still sampled once per trial via `config`.
    scheduler_population = PopulationBasedTraining(
        time_attr="training_iteration",
        perturbation_interval=min_num_epochs,
        hyperparam_mutations={
            "training": {
                "weight_decay": tune.uniform(0.0, 0.3),
                "learning_rate": tune.loguniform(1e-5, 1e-1),
            },
        },
    )

    stop = {
        "training_iteration": max_num_epochs,
    }

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(local_dir, exist_ok=True)

    results = tune.run(
        train_model,
        resources_per_trial={"cpu": 12, "gpu": 1},
        config=config,
        num_samples=n_samples,
        scheduler=scheduler_population,
        progress_reporter=reporter,
        name=exp_name,
        local_dir=local_dir,
        metric="val_loss",
        mode="min",
        stop=stop,
        keep_checkpoints_num=1,
        checkpoint_score_attr="val_loss",
    )
    return results

# Training and Evaluating Multiple Machine Learning Models on Cleaned Data

In [None]:
# Directory expected to hold the cleaned CSV files, resolved against the
# current working directory.
data_dir = "./data"
clean_data_dir = os.path.abspath(os.path.join(data_dir, "clean_data"))

# Experiment-name suffix -> model identifier forwarded to `main`.
model_dict = {
    "test-lstm": "LSTM",
    "test-temp": "LSTMTemporalAttention",
    "test-spa_temp": "LSTMSpatioTemporalAttention",
    "test-fcn": "FCN",
}

# Every run uses the same data file, so everything derived from its name is
# computed once up front.
filename = "cleaned_data_4.csv"
# Full path of the file (not forwarded to `main`, which receives `filename`).
file_path = os.path.join(clean_data_dir, filename)
# Dataset number embedded in the file name, e.g. "4" for cleaned_data_4.csv.
num = filename.split("_")[2].split(".")[0]

# Settings shared by all runs.
run_settings = {
    "file_name": filename,
    "n_samples": 25,
    "max_num_epochs": 100,
    "min_num_epochs": 25,
}

# One entry per (index, model) run, in execution order.
results = []

# `i` is forwarded to `main` as its first argument; each index is run once
# for every model in `model_dict`.
for i in range(4):
    for exp_name, model in model_dict.items():
        experiment = f"data_{num}-{exp_name}"
        analysis = main(i, [model], exp_name=experiment, **run_settings)
        results.append(analysis)

# Combining and Displaying Results of Machine Learning Model Training and Evaluation

In [None]:
# Columns kept from each experiment's trial table. Each loss is selected
# exactly once; selecting "train_loss" twice would yield a duplicate column
# and leave the test loss out of the summary.
# NOTE(review): assumes the trainable reports "test_loss" (it is listed in the
# progress reporter's metric columns) -- confirm.
result_columns = [
    "train_loss",
    "val_loss",
    "test_loss",
    "config/model",
    "time_total_s",
    "config/data/variables",
]

# One dataframe per experiment, restricted to the columns above.
dfs = [analysis.dataframe()[result_columns] for analysis in results]

# Concatenate all results into a single dataframe. pd.concat raises on an
# empty list, so fall back to an empty frame with the expected columns.
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=result_columns)
print(combined_df)