# Convert COMSOL data to PARC data

This notebook is used to convert COMSOL simulation data into tensors suitable for deep learning.
It aggregates the data from a parametric sweep into tensors of shape (N_trajectories, N_timesteps, X, Y) for 
each feature.

The input data (exported from COMSOL) is in the form of a csv file with the following columns:

x0 | y0 | t1 | t2 | t3 | ... | tN

x0 | y1 | t1 | t2 | t3 | ... | tN

x0 | y2 | t1 | t2 | t3 | ... | tN

x1 | y0 | t1 | t2 | t3 | ... | tN

x1 | y1 | t1 | t2 | t3 | ... | tN

x1 | y2 | t1 | t2 | t3 | ... | tN

...

xM | yN | t1 | t2 | t3 | ... | tN

where each row is one grid point (x, y) and the columns t1, t2, t3, ..., tN contain the field value at that point for each time step.

The tensor data is then stored as HDF5 files in the [format used by The Well](https://polymathic-ai.org/the_well/data_format/).

Which fields and parameters are stored depends on the simulation.
If the geometry is randomized (e.g. different obstacles for fluid flow), it is treated as part of the initial conditions.

In [1]:
import re
from pathlib import Path
from typing import Dict, Tuple, Any, Optional

import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt

def search_comsol_data(data_path: Path, params: list[str], fields: list[str], seed_name: Optional[str]=None) -> Dict[Tuple[int, int, float], Dict[str, Any]]:
    """
    Find the exported CSV files of every field and group them by parameter combination.

    Each entry of ``fields`` names a subdirectory of ``data_path``. Every ``*.csv``
    file found in such a subdirectory is assigned to that field. The parameter values
    are parsed from the file stem, which must contain ``<param>_<number>`` for every
    entry of ``params``.

    Parameters
    ----------
    data_path : Path
        Base path containing one subdirectory per field.
    params : list[str]
        List of parameter names to extract from the filenames (e.g. strucID, p_cap, theta, reynolds).
        Values of parameters whose name ends in ``theta`` are converted from radians
        to degrees and rounded to two decimals; all other values are parsed as floats.
    fields : list[str]
        List of field names (subdirs in folder), e.g. vel_x, vel_y, pressure.
        A missing subdirectory only produces a printed warning.
    seed_name : str, optional
        Name of the random seed parameter, if present. It is left out of the
        grouping key so that all seeds of one parameter combination are aggregated.

    Returns
    -------
    Dict[tuple, Dict[str, Any]]
        Maps the tuple of parameter values (in the order of ``params``, without
        ``seed_name``) to a dictionary with one entry per field. With ``seed_name``
        the entry is ``{seed_value: file_path}``; without it the entry is the file
        path itself. Fields for which no file was found keep an empty dict.

    Raises
    ------
    ValueError
        If a file name does not contain one of the requested parameters.
    """
    data = {}

    def extract_params(filename: str) -> dict:
        """Parse the value of every requested parameter from ``filename``."""
        values = {}
        for param in params:
            match = re.search(rf"{param}_(\d+\.?\d*)", filename)
            if match is None:
                raise ValueError(
                    f"Parameter '{param}' not found in file name '{filename}'"
                )
            value = float(match.group(1))
            if param.endswith('theta'):
                # Angle parameters are converted from radians to degrees
                value = round(np.rad2deg(value), 2)
            values[param] = value
        return values

    for field in fields:
        field_dir = data_path / field
        if not field_dir.exists():
            print(f"Warning: Directory {field} not found")
            continue

        for file in sorted(field_dir.glob("*.csv")):
            param_values = extract_params(file.stem)

            # The seed is excluded from the key so all seeds share one entry
            param_key = tuple(param_values[p] for p in params if p != seed_name)
            entry = data.setdefault(param_key, {name: {} for name in fields})

            # The file belongs to the field whose subdirectory is being scanned.
            # Matching field names against the full path would also assign the file
            # to any other field whose name happens to occur somewhere in the path.
            if seed_name is not None:
                entry[field][param_values[seed_name]] = file
            else:
                entry[field] = file

    return data


def load_comsol_csv(file_path: Path) -> tuple[np.ndarray, dict]:
    """
    Load a COMSOL CSV export in point-list layout and reshape it to (time, x, y).

    The file is read with ``;`` as separator, skipping the first 9 lines. Column 0
    is the x coordinate, column 1 the y coordinate and every further column holds
    the values of one time step. Rows are sorted by x, then y, before reshaping.

    Parameters
    ----------
    file_path : Path
        Path of the CSV file.

    Returns
    -------
    tuple[np.ndarray, dict]
        The float32 data array of shape (time, x, y) and a metadata dictionary with
        the unique ``x_coords`` and ``y_coords`` and ``time_steps`` (the indices
        ``0 .. n_timesteps - 1``).

    Raises
    ------
    ValueError
        If the number of rows differs from ``len(x_coords) * len(y_coords)``,
        i.e. the points do not form a full rectangular grid.
    """
    # Read CSV and sort by x then y coordinates
    df = pd.read_csv(file_path, sep=";", dtype=np.float32, header=None, skiprows=9)
    df = df.sort_values(by=[df.columns[0], df.columns[1]]).reset_index(drop=True)

    # Get coordinate information
    x_coords = df[0].unique()
    y_coords = df[1].unique()

    # Calculate expected grid size and validate
    grid_size = len(x_coords) * len(y_coords)
    if len(df) != grid_size:
        # Build one message string; separate arguments would render as a tuple
        raise ValueError(
            f"Data grid mismatch: {len(df)} points vs expected {grid_size} "
            f"({len(x_coords)}x{len(y_coords)}). "
            f"Occurred in file {file_path}"
        )

    # After sorting, rows are ordered x-major, so they reshape directly to
    # (x, y, time); the time axis is then moved to the front.
    time_steps = df.columns[2:]
    time_data = df[time_steps].values
    data = time_data.reshape(len(x_coords), len(y_coords), len(time_steps))
    data = data.transpose(2, 0, 1)

    metadata = {
        "x_coords": x_coords,
        "y_coords": y_coords,
        "time_steps": np.arange(len(time_steps)),
    }

    return data, metadata

def load_comsol_csv_grid(file_path: Path) -> tuple[np.ndarray, dict]:
    """
    Load a COMSOL CSV export in grid layout and reshape it to (time, x, y).

    Expected file structure, as read by this function:

    * the lines with index 9 and 10 hold the x and y coordinates,
    * every line containing ``% Data`` starts a block; the block's values begin
      two lines after that marker and run up to the next marker (or end of file),
    * each block is one time step.

    The delimiter (``;`` or ``,``) is detected from the x-coordinate line.

    Parameters
    ----------
    file_path : Path
        Path of the CSV file.

    Returns
    -------
    tuple[np.ndarray, dict]
        The float32 data array, with the last two axes of the stacked blocks
        swapped, and a metadata dictionary with ``x_coords``, ``y_coords`` and
        ``time_steps`` (the indices ``0 .. n_blocks - 1``).

    Raises
    ------
    ValueError
        If the file contains no ``% Data`` block.
    """
    with open(file_path) as f:
        lines = f.readlines()

    # Detect the delimiter from the data itself instead of guessing it from the
    # file path, so the export settings of a field do not have to be hard-coded.
    delim = ";" if ";" in lines[9] else ","
    x_coords = np.loadtxt([lines[9]], delimiter=delim, dtype=np.float32)
    y_coords = np.loadtxt([lines[10]], delimiter=delim, dtype=np.float32)

    # Line indices of the block markers, plus the end of file as last boundary
    block_starts = [i for i, line in enumerate(lines) if "% Data" in line]
    if not block_starts:
        raise ValueError(f"No '% Data' block found in file {file_path}")
    block_starts.append(len(lines))

    # +2 skips the marker line and the line after it, which holds no values
    blocks = [
        lines[start + 2:end]
        for start, end in zip(block_starts, block_starts[1:])
    ]

    time_data = np.array(
        [np.loadtxt(block, delimiter=delim, dtype=np.float32) for block in blocks]
    )
    # Swap the two spatial axes of every block
    # NOTE(review): assumes block rows run along y and columns along x, so the
    # result is (time, x, y) - confirm against the COMSOL export settings
    time_data = np.transpose(time_data, (0, 2, 1))

    metadata = {
        "x_coords": x_coords,
        "y_coords": y_coords,
        "time_steps": np.arange(len(blocks)),
    }

    return time_data, metadata

In [2]:
def create_hdf5_dataset_porous_media(output_path: Path, param_key: tuple, data_dict: dict):
    """
    Write one parameter combination of the porous-media two-phase flow data to HDF5.

    Creates ``porous_twophase_flow_p_cap_<p_cap>_theta_<theta>.hdf5`` in
    ``output_path`` (opened in mode ``"w"``) with the groups ``dimensions``,
    ``boundary_conditions``, ``scalars``, ``t0_fields``, ``t1_fields`` and
    ``t2_fields``.

    Parameters
    ----------
    output_path : Path
        Directory in which the file is created.
    param_key : tuple
        ``(p_cap, theta)`` of this parameter combination.
    data_dict : dict
        Must provide the keys ``n_trajectories``, ``x_coords``, ``y_coords``,
        ``time_steps``, ``pressure``, ``phase_boundary`` and ``velocity``.
    """
    p_cap, theta = param_key
    filename = f"porous_twophase_flow_p_cap_{p_cap}_theta_{theta}.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["p_cap", "theta"]
        f.attrs["p_cap"] = p_cap
        f.attrs["theta"] = theta
        f.attrs["dataset_name"] = "COMSOL_TwoPhaseFlow"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = data_dict["n_trajectories"]

        # Axes are taken from the already aggregated data
        x_coords = data_dict["x_coords"]
        y_coords = data_dict["y_coords"]
        time_steps = data_dict["time_steps"]

        # Dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_steps)
        time_dset.attrs["sample_varying"] = False

        for axis_name, coords in (("x", x_coords), ("y", y_coords)):
            axis_dset = dims.create_dataset(axis_name, data=coords)
            axis_dset.attrs["sample_varying"] = False
            axis_dset.attrs["time_varying"] = False

        # Boundary conditions group: one subgroup per axis
        bc = f.create_group("boundary_conditions")
        for group_name, axis_name, bc_type, coords in (
            ("x_open", "x", "open", x_coords),
            ("y_wall", "y", "wall", y_coords),
        ):
            bc_group = bc.create_group(group_name)
            bc_group.attrs["associated_dims"] = [axis_name]
            bc_group.attrs["associated_fields"] = []
            bc_group.attrs["bc_type"] = bc_type
            bc_group.attrs["sample_varying"] = False
            bc_group.attrs["time_varying"] = False

            # The mask marks the first and last point of the axis.
            # Builtin bool is used because np.bool is missing in NumPy 1.24-1.26.
            mask = np.zeros_like(coords, dtype=bool)
            mask[0] = True
            mask[-1] = True
            bc_group.create_dataset("mask", data=mask)
            bc_group.create_dataset("values", data=np.zeros_like(coords))

        # Scalars group: the simulation parameters
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["p_cap", "theta"]
        for scalar_name, value in (("p_cap", p_cap), ("theta", theta)):
            scalar_dset = scalars.create_dataset(scalar_name, data=value)
            scalar_dset.attrs["sample_varying"] = False
            scalar_dset.attrs["time_varying"] = False

        # t0_fields group: scalar fields
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure", "phase_boundary"]
        for field_name in ("pressure", "phase_boundary"):
            field_dset = t0_fields.create_dataset(field_name, data=data_dict[field_name])
            field_dset.attrs["dim_varying"] = [True, True]
            field_dset.attrs["sample_varying"] = True
            field_dset.attrs["time_varying"] = True

        # t1_fields group: the velocity vector field
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=data_dict["velocity"]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []


def create_hdf5_dataset_cylinder_wall_flow(output_path: Path, param_key: tuple, data_dict: dict):
    """
    Write one parameter combination of the cylinder wall flow data to HDF5.

    Creates ``cylinder_wall_flow_Re_<Re>_cyl_dia_<cyl_dia>.hdf5`` in
    ``output_path`` (opened in mode ``"w"``) with the groups ``dimensions``,
    ``boundary_conditions``, ``scalars``, ``t0_fields``, ``t1_fields`` and
    ``t2_fields``.

    Parameters
    ----------
    output_path : Path
        Directory in which the file is created.
    param_key : tuple
        ``(Re, cyl_dia)`` of this parameter combination.
    data_dict : dict
        Must provide the keys ``n_trajectories``, ``x_coords``, ``y_coords``,
        ``time_steps``, ``pressure`` and ``velocity``.
    """
    Re, cyl_dia = param_key
    filename = f"cylinder_wall_flow_Re_{Re}_cyl_dia_{cyl_dia}.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["Re", "cyl_dia"]
        f.attrs["Re"] = Re
        f.attrs["cyl_dia"] = cyl_dia
        f.attrs["dataset_name"] = "COMSOL_CylinderWallFlow"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = data_dict["n_trajectories"]

        # Axes are taken from the already aggregated data
        x_coords = data_dict["x_coords"]
        y_coords = data_dict["y_coords"]
        time_steps = data_dict["time_steps"]

        # Dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_steps)
        time_dset.attrs["sample_varying"] = False

        for axis_name, coords in (("x", x_coords), ("y", y_coords)):
            axis_dset = dims.create_dataset(axis_name, data=coords)
            axis_dset.attrs["sample_varying"] = False
            axis_dset.attrs["time_varying"] = False

        # Boundary conditions group: one subgroup per axis
        bc = f.create_group("boundary_conditions")
        for group_name, axis_name, bc_type, coords in (
            ("x_open", "x", "open", x_coords),
            ("y_wall", "y", "wall", y_coords),
        ):
            bc_group = bc.create_group(group_name)
            bc_group.attrs["associated_dims"] = [axis_name]
            bc_group.attrs["associated_fields"] = []
            bc_group.attrs["bc_type"] = bc_type
            bc_group.attrs["sample_varying"] = False
            bc_group.attrs["time_varying"] = False

            # The mask marks the first and last point of the axis.
            # Builtin bool is used because np.bool is missing in NumPy 1.24-1.26.
            mask = np.zeros_like(coords, dtype=bool)
            mask[0] = True
            mask[-1] = True
            bc_group.create_dataset("mask", data=mask)
            bc_group.create_dataset("values", data=np.zeros_like(coords))

        # Scalars group: the simulation parameters
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["Re", "cyl_dia"]
        for scalar_name, value in (("Re", Re), ("cyl_dia", cyl_dia)):
            scalar_dset = scalars.create_dataset(scalar_name, data=value)
            scalar_dset.attrs["sample_varying"] = False
            scalar_dset.attrs["time_varying"] = False

        # t0_fields group: the pressure field
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure"]
        pressure_dset = t0_fields.create_dataset(
            "pressure", data=data_dict["pressure"]
        )
        pressure_dset.attrs["dim_varying"] = [True, True]
        pressure_dset.attrs["sample_varying"] = True
        pressure_dset.attrs["time_varying"] = True

        # t1_fields group: the velocity vector field
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=data_dict["velocity"]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []

def create_hdf5_dataset_cylinder_sym_flow(output_path: Path, param_key: tuple, data_dict: dict):
    """
    Write one parameter combination of the cylinder symmetric flow data to HDF5.

    Creates ``cylinder_sym_flow_Re_<Re>_cyl_dia_<cyl_dia>.hdf5`` in
    ``output_path`` (opened in mode ``"w"``) with the groups ``dimensions``,
    ``boundary_conditions``, ``scalars``, ``t0_fields``, ``t1_fields`` and
    ``t2_fields``.

    Parameters
    ----------
    output_path : Path
        Directory in which the file is created.
    param_key : tuple
        ``(Re, cyl_dia)`` of this parameter combination.
    data_dict : dict
        Must provide the keys ``n_trajectories``, ``x_coords``, ``y_coords``,
        ``time_steps``, ``pressure`` and ``velocity``.
    """
    Re, cyl_dia = param_key
    filename = f"cylinder_sym_flow_Re_{Re}_cyl_dia_{cyl_dia}.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["Re", "cyl_dia"]
        f.attrs["Re"] = Re
        f.attrs["cyl_dia"] = cyl_dia
        f.attrs["dataset_name"] = "COMSOL_CylinderSymFlow"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = data_dict["n_trajectories"]

        # Axes are taken from the already aggregated data
        x_coords = data_dict["x_coords"]
        y_coords = data_dict["y_coords"]
        time_steps = data_dict["time_steps"]

        # Dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_steps)
        time_dset.attrs["sample_varying"] = False

        for axis_name, coords in (("x", x_coords), ("y", y_coords)):
            axis_dset = dims.create_dataset(axis_name, data=coords)
            axis_dset.attrs["sample_varying"] = False
            axis_dset.attrs["time_varying"] = False

        # Boundary conditions group: one subgroup per axis
        # NOTE(review): the y boundary is written as "wall" although this is the
        # symmetric-flow variant - confirm this is intended
        bc = f.create_group("boundary_conditions")
        for group_name, axis_name, bc_type, coords in (
            ("x_open", "x", "open", x_coords),
            ("y_wall", "y", "wall", y_coords),
        ):
            bc_group = bc.create_group(group_name)
            bc_group.attrs["associated_dims"] = [axis_name]
            bc_group.attrs["associated_fields"] = []
            bc_group.attrs["bc_type"] = bc_type
            bc_group.attrs["sample_varying"] = False
            bc_group.attrs["time_varying"] = False

            # The mask marks the first and last point of the axis.
            # Builtin bool is used because np.bool is missing in NumPy 1.24-1.26.
            mask = np.zeros_like(coords, dtype=bool)
            mask[0] = True
            mask[-1] = True
            bc_group.create_dataset("mask", data=mask)
            bc_group.create_dataset("values", data=np.zeros_like(coords))

        # Scalars group: the simulation parameters
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["Re", "cyl_dia"]
        for scalar_name, value in (("Re", Re), ("cyl_dia", cyl_dia)):
            scalar_dset = scalars.create_dataset(scalar_name, data=value)
            scalar_dset.attrs["sample_varying"] = False
            scalar_dset.attrs["time_varying"] = False

        # t0_fields group: the pressure field
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure"]
        pressure_dset = t0_fields.create_dataset(
            "pressure", data=data_dict["pressure"]
        )
        pressure_dset.attrs["dim_varying"] = [True, True]
        pressure_dset.attrs["sample_varying"] = True
        pressure_dset.attrs["time_varying"] = True

        # t1_fields group: the velocity vector field
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=data_dict["velocity"]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []

def create_hdf5_dataset_object_periodic_flow(output_path: Path, param_key: tuple, data_dict: dict, split: int = 0):
    """
    Write one parameter combination of the periodic object flow data to HDF5.

    Creates ``object_perio_flow_vel_<vel>_<split>.hdf5`` in ``output_path``
    (opened in mode ``"w"``) with the groups ``dimensions``,
    ``boundary_conditions``, ``scalars``, ``t0_fields``, ``t1_fields`` and
    ``t2_fields``.

    Parameters
    ----------
    output_path : Path
        Directory in which the file is created.
    param_key : tuple
        Tuple whose first entry is the inlet velocity.
    data_dict : dict
        Must provide the keys ``n_trajectories``, ``x_coords``, ``y_coords``,
        ``time_steps``, ``pressure`` and ``velocity``.
    split : int, optional
        Number inserted into the file name; it is not used otherwise.
    """
    vel = param_key[0]
    filename = f"object_perio_flow_vel_{vel}_{split}.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["inlet_vel"]
        f.attrs["inlet_vel"] = vel
        f.attrs["dataset_name"] = "COMSOL_ObjectPeriodicFlowWater"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = data_dict["n_trajectories"]

        # Axes are taken from the already aggregated data
        x_coords = data_dict["x_coords"]
        y_coords = data_dict["y_coords"]
        time_steps = data_dict["time_steps"]

        # Dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_steps)
        time_dset.attrs["sample_varying"] = False

        for axis_name, coords in (("x", x_coords), ("y", y_coords)):
            axis_dset = dims.create_dataset(axis_name, data=coords)
            axis_dset.attrs["sample_varying"] = False
            axis_dset.attrs["time_varying"] = False

        # Boundary conditions group: one subgroup per axis
        bc = f.create_group("boundary_conditions")
        for group_name, axis_name, bc_type, coords in (
            ("x_open", "x", "open", x_coords),
            ("y_periodic", "y", "periodic", y_coords),
        ):
            bc_group = bc.create_group(group_name)
            bc_group.attrs["associated_dims"] = [axis_name]
            bc_group.attrs["associated_fields"] = []
            bc_group.attrs["bc_type"] = bc_type
            bc_group.attrs["sample_varying"] = False
            bc_group.attrs["time_varying"] = False

            # The mask marks the first and last point of the axis.
            # Builtin bool is used because np.bool is missing in NumPy 1.24-1.26.
            mask = np.zeros_like(coords, dtype=bool)
            mask[0] = True
            mask[-1] = True
            bc_group.create_dataset("mask", data=mask)
            bc_group.create_dataset("values", data=np.zeros_like(coords))

        # Scalars group: the dataset name must match the declared field name
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["vel_in"]

        vel_dset = scalars.create_dataset("vel_in", data=vel)
        vel_dset.attrs["sample_varying"] = False
        vel_dset.attrs["time_varying"] = False

        # t0_fields group: the pressure field
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure"]
        pressure_dset = t0_fields.create_dataset(
            "pressure", data=data_dict["pressure"]
        )
        pressure_dset.attrs["dim_varying"] = [True, True]
        pressure_dset.attrs["sample_varying"] = True
        pressure_dset.attrs["time_varying"] = True

        # t1_fields group: the velocity vector field
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=data_dict["velocity"]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []


def create_hdf5_dataset_object_sym_flow(output_path: Path, param_key: tuple, data_dict: dict, split: int = 0):
    """
    Write one parameter combination of the symmetric object flow data to HDF5.

    Creates ``object_sym_flow_vel_<vel>_<split>_water.hdf5`` in ``output_path``
    (opened in mode ``"w"``) with the groups ``dimensions``,
    ``boundary_conditions``, ``scalars``, ``t0_fields``, ``t1_fields`` and
    ``t2_fields``.

    Parameters
    ----------
    output_path : Path
        Directory in which the file is created.
    param_key : tuple
        Tuple whose first entry is the inlet velocity.
    data_dict : dict
        Must provide the keys ``n_trajectories``, ``x_coords``, ``y_coords``,
        ``time_steps``, ``pressure`` and ``velocity``.
    split : int, optional
        Number inserted into the file name; it is not used otherwise.
    """
    vel = param_key[0]
    filename = f"object_sym_flow_vel_{vel}_{split}_water.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["inlet_vel"]
        f.attrs["inlet_vel"] = vel
        f.attrs["dataset_name"] = "COMSOL_ObjectSymmetricFlowWater"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = data_dict["n_trajectories"]

        # Axes are taken from the already aggregated data
        x_coords = data_dict["x_coords"]
        y_coords = data_dict["y_coords"]
        time_steps = data_dict["time_steps"]

        # Dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_steps)
        time_dset.attrs["sample_varying"] = False

        for axis_name, coords in (("x", x_coords), ("y", y_coords)):
            axis_dset = dims.create_dataset(axis_name, data=coords)
            axis_dset.attrs["sample_varying"] = False
            axis_dset.attrs["time_varying"] = False

        # Boundary conditions group: one subgroup per axis
        bc = f.create_group("boundary_conditions")
        for group_name, axis_name, bc_type, coords in (
            ("x_open", "x", "open", x_coords),
            ("y_symmetric", "y", "symmetric", y_coords),
        ):
            bc_group = bc.create_group(group_name)
            bc_group.attrs["associated_dims"] = [axis_name]
            bc_group.attrs["associated_fields"] = []
            bc_group.attrs["bc_type"] = bc_type
            bc_group.attrs["sample_varying"] = False
            bc_group.attrs["time_varying"] = False

            # The mask marks the first and last point of the axis.
            # Builtin bool is used because np.bool is missing in NumPy 1.24-1.26.
            mask = np.zeros_like(coords, dtype=bool)
            mask[0] = True
            mask[-1] = True
            bc_group.create_dataset("mask", data=mask)
            bc_group.create_dataset("values", data=np.zeros_like(coords))

        # Scalars group: the inlet velocity
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["vel_in"]

        vel_dset = scalars.create_dataset("vel_in", data=vel)
        vel_dset.attrs["sample_varying"] = False
        vel_dset.attrs["time_varying"] = False

        # t0_fields group: the pressure field
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure"]
        pressure_dset = t0_fields.create_dataset(
            "pressure", data=data_dict["pressure"]
        )
        pressure_dset.attrs["dim_varying"] = [True, True]
        pressure_dset.attrs["sample_varying"] = True
        pressure_dset.attrs["time_varying"] = True

        # t1_fields group: the velocity vector field
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=data_dict["velocity"]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []

def create_hdf5_dataset_heat_flow(output_path: Path, param_key: tuple, data_dict: dict):
    """
    Write one parameter combination of the heated flow data to HDF5.

    Creates ``heated_flow_vel_<vel>_r_<radius_heater>_dT_<dT>.hdf5`` in
    ``output_path`` (opened in mode ``"w"``) with the groups ``dimensions``,
    ``boundary_conditions``, ``scalars``, ``t0_fields``, ``t1_fields`` and
    ``t2_fields``.

    Parameters
    ----------
    output_path : Path
        Directory in which the file is created.
    param_key : tuple
        ``(vel, radius_heater, dT)`` of this parameter combination.
    data_dict : dict
        Must provide the keys ``n_trajectories``, ``x_coords``, ``y_coords``,
        ``time_steps``, ``pressure``, ``rho``, ``temp`` and ``velocity``.
        ``rho`` is stored as field ``density`` and ``temp`` as ``temperature``.
    """
    vel, radius_heater, dT = param_key

    filename = f"heated_flow_vel_{vel}_r_{radius_heater}_dT_{dT}.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["inlet_vel", "radius_heater","dT"]
        f.attrs["inlet_vel"] = vel
        f.attrs["radius_heater"] = radius_heater
        f.attrs["dT"] = dT
        f.attrs["dataset_name"] = "COMSOL_HeatedFlow_Air"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = data_dict["n_trajectories"]

        # Axes are taken from the already aggregated data
        x_coords = data_dict["x_coords"]
        y_coords = data_dict["y_coords"]
        time_steps = data_dict["time_steps"]

        # Dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_steps)
        time_dset.attrs["sample_varying"] = False

        for axis_name, coords in (("x", x_coords), ("y", y_coords)):
            axis_dset = dims.create_dataset(axis_name, data=coords)
            axis_dset.attrs["sample_varying"] = False
            axis_dset.attrs["time_varying"] = False

        # Boundary conditions group: one subgroup per axis
        bc = f.create_group("boundary_conditions")
        for group_name, axis_name, bc_type, coords in (
            ("x_open", "x", "open", x_coords),
            ("y_wall", "y", "wall", y_coords),
        ):
            bc_group = bc.create_group(group_name)
            bc_group.attrs["associated_dims"] = [axis_name]
            bc_group.attrs["associated_fields"] = []
            bc_group.attrs["bc_type"] = bc_type
            bc_group.attrs["sample_varying"] = False
            bc_group.attrs["time_varying"] = False

            # The mask marks the first and last point of the axis.
            # Builtin bool is used because np.bool is missing in NumPy 1.24-1.26.
            mask = np.zeros_like(coords, dtype=bool)
            mask[0] = True
            mask[-1] = True
            bc_group.create_dataset("mask", data=mask)
            bc_group.create_dataset("values", data=np.zeros_like(coords))

        # Scalars group: the simulation parameters
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["inlet_vel", "radius_heater","dT"]
        for scalar_name, value in (
            ("inlet_vel", vel),
            ("radius_heater", radius_heater),
            ("dT", dT),
        ):
            scalar_dset = scalars.create_dataset(scalar_name, data=value)
            scalar_dset.attrs["sample_varying"] = False
            scalar_dset.attrs["time_varying"] = False

        # t0_fields group: scalar fields, stored under their full names
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure","density","temperature"]
        for field_name, source_key in (
            ("pressure", "pressure"),
            ("density", "rho"),
            ("temperature", "temp"),
        ):
            field_dset = t0_fields.create_dataset(field_name, data=data_dict[source_key])
            field_dset.attrs["dim_varying"] = [True, True]
            field_dset.attrs["sample_varying"] = True
            field_dset.attrs["time_varying"] = True

        # t1_fields group: the velocity vector field
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=data_dict["velocity"]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []



In [None]:
# Convert the raw COMSOL "heated flow" sweep into one HDF5 file per parameter
# combination (or several files per combination when `split` > 1).
raw_data_path = Path(r"C:\Users\zsa8rk\Coding\MetaPARC\data\raw_comsol\heated_flow")
data_path = Path("C:/Users/zsa8rk/Coding/MetaPARC/data/datasets/heated_object_pipe_flow_air/data")
data_path.mkdir(parents=True, exist_ok=True)
param_names = ["vel_in","r_heat","dT", "strucID"]
seed_name = "strucID"
fields = ["vel_x", "vel_y", "pressure", "rho", "temp"]

# Array-valued entries of `data` once vel_x / vel_y have been merged into
# "velocity". Every one of them has the trajectory axis first and therefore
# has to be sliced when a parameter combination is split into several files.
velocity_components = ("vel_x", "vel_y")
array_features = [name for name in fields if name not in velocity_components] + ["velocity"]

# if larger than 1, split one hdf5 file into multiple
split = 0

data_files: dict[tuple[int, int], dict[str, dict[int, Path]]] = search_comsol_data(raw_data_path, param_names, fields, seed_name)

#remove all files containing these parameters
# remove_params0 = [(300, i) for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]]
# remove_params1 = [(100, i) for i in [0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]]
# remove_params2 = [(200, i) for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]]
# remove_params3 = [(20,i) for i in [0.1,0.2,0.15,0.25,0.45,0.3, 0.4, 0.5, 0.6, 0.7, 0.8]]
# remove_params = remove_params1 + remove_params2 + remove_params3 + remove_params0
# for params_key in remove_params:
#     data_files.pop(params_key, None)


for params_key, features in data_files.items():
    print("Loading data for", params_key)
    data = {"params": params_key}
    for feature_name, trajectories in features.items():
        print("\tLoading", feature_name)
        trajectory_data = []
        for file_path in trajectories.values():
            feature_array, metadata = load_comsol_csv_grid(file_path)
            trajectory_data.append(feature_array)

        trajectory_data = np.stack(trajectory_data, axis=0) # (n_trajectories, n_timesteps, x, y)
        data[feature_name] = trajectory_data

    # Grid/time metadata is taken from the last file that was loaded.
    # NOTE(review): this assumes every file of one parameter combination
    # shares the same grid and time steps - confirm against the COMSOL export.
    data["n_trajectories"] = trajectory_data.shape[0]
    data["x_coords"] = metadata["x_coords"]
    data["y_coords"] = metadata["y_coords"]
    data["time_steps"] = metadata["time_steps"]

    # join vel_x and vel_y into one array with the component as last axis
    data["velocity"] = np.stack([data["vel_x"], data["vel_y"]], axis=-1)
    # remove the separate vel_x and vel_y arrays
    del data["vel_x"]
    del data["vel_y"]

    if split > 1:
        num_traj = data["n_trajectories"] // split
        for i in range(split):
            part_data = {
                "params": data["params"],
                "x_coords": data["x_coords"],
                "y_coords": data["y_coords"],
                "time_steps": data["time_steps"],
            }

            # Trajectory range of this part
            start_idx = i * num_traj

            # For the last split, take all remaining trajectories
            if i == split - 1:
                end_idx = data["n_trajectories"]
            else:
                end_idx = (i + 1) * num_traj
            part_data["n_trajectories"] = end_idx - start_idx

            # Slice *every* array feature. The writer reads "rho" and "temp"
            # in addition to "pressure" and "velocity", so copying only the
            # latter two makes the split path fail with a KeyError.
            for feature_name in array_features:
                part_data[feature_name] = data[feature_name][start_idx:end_idx]

            # Create HDF5 file for this part
            # NOTE(review): every part is written with the same params_key;
            # confirm that create_hdf5_dataset_heat_flow produces a distinct
            # file name per part, otherwise later parts overwrite earlier ones.
            create_hdf5_dataset_heat_flow(data_path, params_key, part_data)
            print(f"Created {params_key} part {i+1}/{split} hdf5 file")

    else:
        create_hdf5_dataset_heat_flow(data_path, params_key, data)
        print(f"Created {params_key} hdf5 file")

    # data_path.mkdir(parents=True, exist_ok=True)
    # np.save(data_path / f"p_cap_{params_key[0]}_theta_{params_key[1]}_pressure.npy", data["pressure"])
    # np.save(data_path / f"p_cap_{params_key[0]}_theta_{params_key[1]}_phase_boundary.npy", data["phase_boundary"])
    # np.save(data_path / f"p_cap_{params_key[0]}_theta_{params_key[1]}_velocity.npy", data["velocity"])


In [28]:
# # Plot 2 data timesteps to check if correct for each feature

# with h5py.File(data_path / "porous_twophase_flow_p_cap_20000.0_theta_90.0.hdf5", "r") as f:
#     pressure = f["t0_fields/pressure"][:]
#     phase_boundary = f["t0_fields/phase_boundary"][:]
#     velocity = f["t1_fields/velocity"][:]

# vel_mag = np.linalg.norm(velocity, axis=-1)

# traj_id = 0

# fig, axs = plt.subplots(2, 3)
# axs[0, 0].imshow(pressure[traj_id,100, :, :])
# axs[0, 1].imshow(phase_boundary[traj_id,100, :, :])
# axs[1, 0].imshow(vel_mag[traj_id,100, :, :])

# axs[0, 2].imshow(pressure[traj_id,300, :, :])
# axs[1, 1].imshow(phase_boundary[traj_id,300, :, :])
# axs[1, 2].imshow(vel_mag[traj_id,300, :, :])

# plt.show()


In [None]:
import shutil
import random
from pathlib import Path

def split_datasets(
    data_path: Path,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: Optional[int] = None,
):
    """Split hdf5 files into train/valid/test directories.

    The ``*.hdf5`` files found directly in ``data_path`` are shuffled and
    moved into the subdirectories ``train``, ``valid`` and ``test`` (created
    if missing). The train and validation counts are
    ``int(n_files * ratio)``; the test set receives all remaining files.

    Parameters
    ----------
    data_path : Path
        Path to directory containing hdf5 files
    train_ratio : float, optional
        Ratio of files to use for training, by default 0.8
    val_ratio : float, optional
        Ratio of files to use for validation, by default 0.1
    test_ratio : float, optional
        Ratio of files to use for testing, by default 0.1. It is only used
        as a consistency check: a warning is issued if the three ratios do
        not sum to 1, because the test set always gets the remainder.
    seed : int, optional
        Seed for a private random generator, making the split reproducible.
        By default (None) the global ``random`` state is used.
    """
    import warnings

    # The test share is implied by the other two ratios; tell the caller when
    # the requested test_ratio cannot be honoured instead of ignoring it.
    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > 1e-6:
        warnings.warn(
            f"train/val/test ratios sum to {ratio_sum} instead of 1; "
            "the test set receives all files left after train and validation",
            stacklevel=2,
        )

    # Create subdirectories
    train_dir = data_path / "train"
    val_dir = data_path / "valid"
    test_dir = data_path / "test"

    for subdir in (train_dir, val_dir, test_dir):
        subdir.mkdir(exist_ok=True)

    # Sort first: glob order depends on the file system, which would make a
    # seeded shuffle differ between machines.
    hdf5_files = sorted(data_path.glob("*.hdf5"))

    # Shuffle files
    if seed is None:
        random.shuffle(hdf5_files)
    else:
        random.Random(seed).shuffle(hdf5_files)

    # Calculate split indices
    n_files = len(hdf5_files)
    n_train = int(n_files * train_ratio)
    n_val = int(n_files * val_ratio)

    # Split files
    train_files = hdf5_files[:n_train]
    val_files = hdf5_files[n_train:n_train + n_val]
    test_files = hdf5_files[n_train + n_val:]

    # Move files to respective directories
    for target_dir, files in (
        (train_dir, train_files),
        (val_dir, val_files),
        (test_dir, test_files),
    ):
        for file in files:
            shutil.move(file, target_dir / file.name)

    print(f"Split {n_files} files into:")
    print(f"Train: {len(train_files)} files")
    print(f"Validation: {len(val_files)} files")
    print(f"Test: {len(test_files)} files")

# Distribute the generated hdf5 files over the train/valid/test subfolders
# of the dataset directory, using the default split ratios.
split_datasets(data_path=data_path)
