In [1]:
# Set to True when running on Google Colab: Drive is then mounted and the project paths are configured.
colab=False
if colab:
    from google.colab import drive
    drive.mount("/content/drive")
    # NOTE(review): a `!` command presumably runs in its own subshell, so this activation
    # likely does not persist in the kernel; the sys.path.append below is what exposes
    # the environment's packages — confirm.
    !source /content/drive/MyDrive/my_colab_env/bin/activate
    import sys
    import os
    # make the packages installed in the Drive-hosted environment importable
    sys.path.append("/content/drive/MyDrive/my_colab_env/lib/python3.10/site-packages")
    # work from the starting-kit folder so that the relative paths used below resolve
    os.chdir("/content/drive/MyDrive/ml4science/ml4physim_startingkit")

# Packed Ensemble Application to the AirfRANS dataset

### Prerequisites (Install the required packages) <a id='prerequisites'></a>

Install the LIPS framework if it is not already installed. For more information, see the LIPS framework [GitHub repository](https://github.com/IRT-SystemX/LIPS).

In [2]:
# !pip install -r requirements.txt
# or
# !pip install -U .


Install the AirfRANS package

In [3]:
# !pip install airfrans

### Generic Step (Load the required data) <a id='generic_step'></a>

In [4]:
import math
import os
from lips import get_root_path

In [5]:
# indicate required paths
# value returned by LIPS' get_root_path(); used below as the prefix of the log file path
LIPS_PATH = get_root_path()
# folder that holds the dataset (it is downloaded there when missing)
DIRECTORY_NAME = '../ml4physim_startingkit/Dataset'
# benchmark name handed to the benchmark loader and to the evaluator
BENCHMARK_NAME = "Case1"
# plain string concatenation: assumes LIPS_PATH is a str ending with a path separator — TODO confirm
LOG_PATH = LIPS_PATH + "lips_logs.log"

Define the paths of the configuration files, which describe the specific characteristics of the use case or of the augmented simulator.

In [6]:
# Both paths are relative to the current working directory.
BENCH_CONFIG_PATH = os.path.join("airfoilConfigurations", "benchmarks",
                                 "confAirfoil.ini")  #Configuration file related to the benchmark
SIM_CONFIG_PATH = os.path.join("airfoilConfigurations", "simulators", "torch_fc.ini")  #Configuration file related to the simulator

Download the data

In [7]:
# True means the dataset folder is missing
not os.path.isdir(DIRECTORY_NAME)

False

In [8]:
from lips.dataset.airfransDataSet import download_data

# Fetch the dataset only when its folder is not already present, so re-running this cell is cheap.
if not os.path.isdir(DIRECTORY_NAME):
    download_data(root_path=".", directory_name=DIRECTORY_NAME)

Loading the dataset with the dedicated class used by the LIPS platform offers several advantages:

1. Ease the importing of datasets
1. A set of functions to organize the `inputs` and `outputs` required by augmented simulators


In [9]:
# Load the required benchmark datasets
from lips.benchmark.airfransBenchmark import AirfRANSBenchmark
import pickle

# The loaded benchmark is cached in a local pickle file so that later runs can skip
# the load from DIRECTORY_NAME.
# SECURITY: unpickling can execute arbitrary code; only reuse this cache when the file
# was written by this notebook on a trusted machine.
BENCHMARK_CACHE_PATH = 'benchmark.pkl'

try:
    with open(BENCHMARK_CACHE_PATH, 'rb') as f:
        benchmark = pickle.load(f)
except Exception as cache_error:
    # `Exception` (rather than a bare `except:`) keeps the best-effort fallback for a missing
    # or unreadable cache while letting KeyboardInterrupt / SystemExit propagate.
    print(f"Benchmark cache not used ({cache_error!r}); loading the benchmark from {DIRECTORY_NAME}")
    benchmark = AirfRANSBenchmark(benchmark_path=DIRECTORY_NAME,
                                config_path=BENCH_CONFIG_PATH,
                                benchmark_name=BENCHMARK_NAME,
                                log_path=LOG_PATH)
    benchmark.load(path=DIRECTORY_NAME)
    # refresh the cache for the next run
    with open(BENCHMARK_CACHE_PATH, 'wb') as f:
        pickle.dump(benchmark, f)

# Model selection (Cross validation)

Importing the necessary dependencies, as well as the `packed_ensemble` methods

In [10]:
# On Colab, make the modules located in the working directory importable.
# `sys` is only imported by the first cell when `colab` is True, which is also the only case it is used here.
if colab:
    sys.path.append(os.getcwd())

In [11]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import itertools as it

In [12]:
from my_packed_ensemble import *

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def build_k_indices(num_row, k_fold, seed):
    """Build the row indices of each fold for k-fold cross validation.

    The rows are shuffled with a seeded permutation and cut into `k_fold`
    folds of equal size ``num_row // k_fold``. When `num_row` is not a
    multiple of `k_fold`, the last ``num_row % k_fold`` shuffled rows are
    left out so that the result stays a rectangular array.

    Parameters
    ----------
    num_row : int
        Number of rows in the dataset.
    k_fold : int
        Number of folds, between 1 and `num_row`.
    seed : int
        Seed for random generator

    Returns
    -------
    k_indices : np.array
        Array of shape ``(k_fold, num_row // k_fold)``; row ``k`` holds the
        indices of fold ``k``.

    Raises
    ------
    ValueError
        If `k_fold` is smaller than 1 or larger than `num_row` (the folds
        would be empty).
    """
    if k_fold < 1 or k_fold > num_row:
        raise ValueError(
            f"k_fold must be between 1 and num_row ({num_row}), got {k_fold}")

    # integer floor division: exact for any int, unlike int(num_row / k_fold)
    interval = num_row // k_fold

    # the global NumPy generator is seeded on purpose: this keeps the exact
    # permutation (and the seeding side effect) of the previous implementation
    np.random.seed(seed)
    indices = np.random.permutation(num_row)

    # keep the first k_fold * interval shuffled rows and give one row per fold
    return indices[:k_fold * interval].reshape(k_fold, interval)

Cross-validate the hyperparameters of the model

In [14]:
#from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from lips.dataset.scaler.standard_scaler import StandardScaler

In [15]:
def save_training_validation_losses_plot(train_losses_list: list, val_losses_list: list,
                                         hyperparam_dict: dict, folder: str, plot_name: str):
    """
    Saves the training and validation losses plot.

    The figure is drawn on its own Figure/Axes, written to
    ``folder/plot_name`` and then closed, so repeated calls neither share
    state through pyplot's current figure nor accumulate open figures.

    Parameters
    ----------
    train_losses_list : list
        List containing the training losses.
    val_losses_list : list
        List containing the validation losses.
    hyperparam_dict : dict
        Hyperparameters shown in the title; must contain the keys
        "hidden_sizes", "dropout", "M", "alpha", "gamma" and "lr".
    folder : str
        Destination folder; created (with its parents) when missing.
    plot_name : str
        File name of the saved figure inside `folder`.
    """

    # create folder if it does not exist (exist_ok avoids the check-then-create race)
    os.makedirs(folder, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.plot(train_losses_list, label='Training loss', color='blue')
        ax.plot(val_losses_list, label='Validation loss', color='red')

        ax.set_ylabel('Loss')
        ax.set_xlabel('Epoch')

        ax.set_title(f'Losses for hidden_sizes={hyperparam_dict["hidden_sizes"]}, dropout={hyperparam_dict["dropout"]}, M={hyperparam_dict["M"]}, \n alpha={hyperparam_dict["alpha"]}, gamma={hyperparam_dict["gamma"]}, lr={hyperparam_dict["lr"]}')
        ax.legend()
        fig.savefig(os.path.join(folder, plot_name))
    finally:
        # release the figure even when plotting or saving fails
        plt.close(fig)

In [16]:
def hyperparameters_tuning(param_grid: dict, k_folds: int, num_epochs: int, batch_size: int = 128000,
                            shuffle: bool = False, n_workers: int = 0, seed: int=42, scaler: Scaler=None):
    """
    Performs hyperparameter tuning using K-fold cross validation.

    Every combination of the values listed in `param_grid` is evaluated: for
    each fold a fresh `PackedMLP` is trained on the other folds and validated
    on the held-out one, and a loss curve is saved under ``CV_plots``. The
    training data is read from the notebook-level ``benchmark.train_dataset``.

    Parameters
    ----------
    param_grid : dict
        Dictionary containing the values for each hyperparameter to be tested.
        Must provide the keys "hidden_sizes", "dropout", "M", "alpha", "gamma"
        and "lr".
    k_folds : int
        Number of folds to be used in the cross validation.
    num_epochs : int
        Number of epochs to be used in the training.
    batch_size : int
        Batch size to be used in the training.
    shuffle : bool
        Whether to shuffle the training dataset.
    n_workers : int
        Number of workers to be used in the training.
    seed : int
        Random seed to be used in the training.
    scaler : Scaler
        Scaler forwarded to every `PackedMLP` that is built (None by default).

    Returns
    -------
    results_df : pd.DataFrame
        DataFrame containing the results of the hyperparameter tuning: one row
        per combination, with the `param_grid` keys as columns plus
        ``mean_loss``, the average over the folds of the mean of the
        validation losses returned by `train`.
    """

    # one dictionary per point of the grid, in the order produced by itertools.product
    param_names = list(param_grid)
    combinations = [dict(zip(param_names, values))
                    for values in it.product(*(param_grid[name] for name in param_names))]

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    torch.manual_seed(seed)
    dataset = benchmark.train_dataset
    input_size, output_size = infer_input_output_size(dataset)

    extract_x, extract_y = dataset.extract_data()

    # rows are collected here and turned into a DataFrame once at the end, so the value is
    # stored under the declared "mean_loss" column and tuple values (hidden_sizes) are kept
    # as single cells
    records = []

    for i, params in enumerate(tqdm(combinations)):
        print(f'Hyperparameters: {i}/hidden_sizes={params["hidden_sizes"]}, \
              dropout={params["dropout"]}, M={params["M"]}, alpha={params["alpha"]}, \
              gamma={params["gamma"]}, lr={params["lr"]}')

        # define the K-fold Cross Validator (same seed, hence same folds, for every combination)
        k_indices = build_k_indices(extract_y.shape[0], k_folds, seed=seed)
        fold_losses = []

        # k-fold Cross Validation model evaluation
        for fold in range(k_folds):
            print(f"fold: {fold}")

            # initialize the Packed MLP model
            # NOTE(review): the same `scaler` object is handed to every model; if PackedMLP
            # fits it in place, its state is carried from one fold to the next — confirm.
            model = PackedMLP(
                input_size=input_size,
                output_size=output_size,
                hidden_sizes=params["hidden_sizes"],
                activation=F.relu,
                device=device,
                dropout=params["dropout"],
                M=params["M"],
                alpha=params["alpha"],
                gamma=params["gamma"],
                scaler=scaler
            )
            model.to(device)

            val_ids = k_indices[fold]
            # every fold except the held-out one: a 2-D block of indices (k_folds - 1, fold size)
            train_ids = k_indices[~(np.arange(k_indices.shape[0]) == fold)]

            train_x = extract_x[train_ids]
            train_y = extract_y[train_ids]

            # indexing with the 2-D block adds a leading fold axis; merge it back into the row axis
            train_x = train_x.reshape(train_x.shape[0] * train_x.shape[1], -1)
            train_y = train_y.reshape(train_y.shape[0] * train_y.shape[1], -1)

            val_x = extract_x[val_ids]
            val_y = extract_y[val_ids]

            trainloader = model.process_dataset(data=(train_x, train_y), training=True, batch_size=batch_size, shuffle=shuffle, n_workers=n_workers)
            validateloader = model.process_dataset(data=(val_x, val_y), training=False, batch_size=batch_size, shuffle=shuffle, n_workers=n_workers)

            model, train_losses, val_losses = train(model=model, train_loader=trainloader, val_loader=validateloader, epochs=num_epochs, device=device, lr=params["lr"], verbose=True)

            # as_tensor accepts a list of losses as well as a tensor; float() keeps plain
            # numbers (not tensors) in the results table
            fold_losses.append(float(torch.as_tensor(val_losses, dtype=torch.float64).mean()))

            # saving the curve
            save_training_validation_losses_plot(train_losses_list=train_losses, val_losses_list=val_losses,
                                                 hyperparam_dict=params, folder="CV_plots", plot_name=f'hyperparameters_{i}_fold_{fold}.png')

        mean_total_loss = sum(fold_losses) / k_folds
        # print the result averaged over all folds
        print(f'{k_folds}-FOLD RESULTS FOR {i}th HYPERPARAMETERS')
        print(f'Average validation loss: {mean_total_loss}')
        print('--------------------------------')

        records.append({**params, "mean_loss": mean_total_loss})

    return pd.DataFrame(records, columns=[*param_names, "mean_loss"])

In [17]:
# Full search grid: 3 * 2 * 2 * 3 * 1 * 3 = 108 combinations.
# The next cell rebinds `param_grid` to a single combination, so this grid is only used
# when that cell is skipped.
param_grid = {
    'hidden_sizes': [(48, 128, 48), (128, 256, 128), (256, 512, 256)],
    'dropout': [True, False],
    "alpha": [2, 4],
    "gamma": [1, 2, 4],
    "M": [4],
    'lr': [3e-4,1e-2,1e-3]
}

In [18]:
# Reduced grid with a single combination; assigning to `param_grid` here replaces the
# full grid defined in the previous cell.
param_grid = {
    'hidden_sizes': [(48, 128, 48)],
    'dropout': [True],
    "alpha": [2],
    "gamma": [1],
    "M": [4],
    'lr': [3e-4],
}

In [19]:
# check whether PyTorch can see a CUDA device
torch.cuda.is_available()

True

In [21]:
# Cross-validate the current `param_grid` with 4 folds and 2 epochs per fold.
# NOTE(review): the saved output of this cell ends with KeyboardInterrupt; when the run is
# interrupted, `results_df` is never assigned and the cell that writes it to disk fails.
results_df = hyperparameters_tuning(param_grid, k_folds=4, num_epochs=2, batch_size=1280000, shuffle=True, n_workers=6, scaler=StandardScaler())
#results_df.to_csv("results.csv", index=False)

cuda:0


  0%|          | 0/1 [00:00<?, ?it/s]

Hyperparameters: 0/hidden_sizes=(48, 128, 48),               dropout=True, M=4, alpha=2,               gamma=1, lr=0.0003
fold: 0



  0%|          | 0/11 [02:02<?, ?it/s]
Epochs:   0%|          | 0/2 [02:02<?, ?it/s]
  0%|          | 0/1 [02:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
CV_RESULTS_FOLDER = "CV/results"

# create folder if it does not exist (exist_ok avoids the check-then-create race)
os.makedirs(CV_RESULTS_FOLDER, exist_ok=True)

# persist the cross-validation table next to the other CV artefacts
results_df.to_csv(os.path.join(CV_RESULTS_FOLDER, "results.csv"), index=False)

# Model training

In [None]:
# Build the training loader from the whole training set and read the layer sizes from it.
# NOTE(review): the cross-validation code calls `model.process_dataset(...)` as a method, whereas
# this is a free function, presumably provided by the star import of my_packed_ensemble — confirm.
train_loader = process_dataset(benchmark.train_dataset, training=True, n_workers=6)
input_size, output_size = infer_input_output_size(benchmark.train_dataset)

In [None]:
# device: first CUDA GPU when available, CPU otherwise
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# M, alpha, gamma and scaler are not passed here, so PackedMLP's own defaults apply
model = PackedMLP(input_size=input_size,
                  output_size=output_size,
                  hidden_sizes=(50, 100, 50),
                  activation=F.relu,
                  device=device,
                  dropout=True,
                  )
model.to(device)
# last expression of the cell: displays the device stored on the model
model.device

In [None]:
# print the model's string representation
print(model)

In [None]:
# Single-epoch training run without a validation loader; the third value returned by `train`
# (bound to `val_losses` in the cross-validation code) is discarded.
model, train_losses, _ = train(model, train_loader, epochs=1, device=device, lr=3e-4)

##### Prediction on `test_dataset`
This dataset has the same distribution as the training set

In [None]:
# Run the trained model on the test split; `predictions` and `observations` are indexed
# by field name in the following cells.
predictions, observations = predict(model, benchmark._test_dataset, device=device)

In [None]:
# fields compared between the model output and the reference data
FIELD_NAMES = ("x-velocity", "y-velocity", "pressure", "turbulent_viscosity")

print("Prediction dimensions: ", *(predictions[name].shape for name in FIELD_NAMES))
print("Observation dimensions:", *(observations[name].shape for name in FIELD_NAMES))

# only claim success after actually comparing the shapes field by field
mismatched_fields = [name for name in FIELD_NAMES
                     if predictions[name].shape != observations[name].shape]
if mismatched_fields:
    print("Dimension mismatch for:", ", ".join(mismatched_fields))
else:
    print("We have good dimensions!")

In [None]:
from lips.evaluation.airfrans_evaluation import AirfRANSEvaluation

# The evaluator is configured with the same benchmark configuration, data folder and log file
# as the benchmark itself.
evaluator = AirfRANSEvaluation(config_path=BENCH_CONFIG_PATH,
                               scenario=BENCHMARK_NAME,
                               data_path=DIRECTORY_NAME,
                               log_path=LOG_PATH)

# metadata of the test split (the split the predictions above were computed on)
observation_metadata = benchmark._test_dataset.extra_data
metrics = evaluator.evaluate(observations=observations,
                             predictions=predictions,
                             observation_metadata=observation_metadata)
print(metrics)

##### Prediction on `test_ood_dataset`
This dataset has a different distribution in comparison to the training set.

In [None]:
# Predict on the out-of-distribution split and evaluate it with a fresh evaluator.
predictions, observations = predict(model, benchmark._test_ood_dataset, device=device)
evaluator = AirfRANSEvaluation(config_path=BENCH_CONFIG_PATH,
                               scenario=BENCHMARK_NAME,
                               data_path=DIRECTORY_NAME,
                               log_path=LOG_PATH)

# The metadata must come from the split being evaluated: reusing the metadata of
# `_test_dataset` would describe different simulations than these observations.
observation_metadata_ood = benchmark._test_ood_dataset.extra_data
metrics = evaluator.evaluate(observations=observations,
                             predictions=predictions,
                             observation_metadata=observation_metadata_ood)
print(metrics)