# Convert COMSOL data to PARC data

This notebook converts COMSOL simulation data into tensors suitable for deep learning.
It aggregates the data from a parametric sweep into tensors of shape (N_timesteps, X, Y, N_features).

The input data consists of semicolon-separated CSV files without a header row, with the following columns:
x0 | y0 | t1 | t2 | t3 | ... | tN
x0 | y1 | t1 | t2 | t3 | ... | tN
x0 | y2 | t1 | t2 | t3 | ... | tN
...

where each row is one grid point (x, y) and t1, t2, t3, ... are the field values at the successive time steps.

The output data is then stored as HDF5 files that follow the structure of The Well format.
In our case, theta and p_cap are simulation parameters, while the strucID determines the starting condition,
i.e. each strucID corresponds to one trajectory.

In [4]:
import re
from pathlib import Path
from typing import Dict, Tuple, Any

import pandas as pd
import numpy as np
import h5py

def search_comsol_data(data_path: Path, params: list[str]) -> Dict[Tuple[int, int], Dict[str, Any]]:
    """
    Find the COMSOL CSV exports below ``data_path`` and group them by parameter combination.

    The ``velocities``, ``pressure`` and ``phase_boundary`` subdirectories are scanned
    for ``*.csv`` files. Three parameters are parsed from every file stem; files are
    grouped by the second and third parameter and indexed by the first one inside
    each group.

    Args:
        data_path (Path): Base path containing the subdirectories.

        params (list[str]): Names of the three parameters encoded in the file names as
            ``<name>_<value>``. The first two must be followed by an integer, the third
            by a non-negative decimal number that is interpreted as radians.

    Returns:
        Dict: ``{(param2, param3_deg): {"vel_u": {param1: path}, "vel_v": {...},
        "pressure": {...}, "phase_boundary": {...}}}``, where ``param3_deg`` is the
        third parameter converted to degrees and rounded to an integer. Every inner
        dictionary is ordered by ascending ``param1``.

    Raises:
        ValueError: If a CSV file name does not contain all three parameters.
    """
    # Define the subdirectories to search
    subdirs = ["velocities", "pressure", "phase_boundary"]
    # Categories are tested in this order; the first one found in the name wins
    categories = ("vel_u", "vel_v", "pressure", "phase_boundary")
    data = {}

    # Function to extract parameters from filename
    def extract_params(filename: str, params: list[str]) -> tuple:
        patterns = (
            rf"{params[0]}_(\d+)",
            rf"{params[1]}_(\d+)",
            rf"{params[2]}_(\d+\.?\d*)",
        )
        matches = [re.search(pattern, filename) for pattern in patterns]
        missing = [name for name, match in zip(params, matches) if match is None]
        if missing:
            # Fail with a readable message instead of AttributeError on None.group
            raise ValueError(
                f"Cannot extract parameter(s) {missing} from file name '{filename}'"
            )
        param1, param2, param3 = (match.group(1) for match in matches)

        # Transform the third parameter (theta) from radians to degrees and
        # round it to an integer so it can serve as a stable dictionary key
        theta_deg = int(np.round(np.rad2deg(float(param3))))

        return (int(param1), int(param2), theta_deg)

    # Search through each subdirectory
    for subdir in subdirs:
        subdir_path = data_path / subdir
        if not subdir_path.exists():
            print(f"Warning: Directory {subdir} not found")
            continue

        # Sorted so that the result does not depend on the file system's listing order
        for file in sorted(subdir_path.glob("*.csv")):
            param1, param2, param3 = extract_params(file.stem, params)

            # Create parameter combination key, only include p_cap and theta
            # strucID is not included since we want to aggregate over all strucIDs
            param_key = (param2, param3)
            group = data.setdefault(param_key, {name: {} for name in categories})

            # Classify using only "<subdir>/<file name>". Matching against the full
            # path would misclassify every file whenever data_path itself happens
            # to contain one of the category names.
            relative_name = f"{subdir}/{file.name}"
            for category in categories:
                if category in relative_name:
                    group[category][param1] = file
                    break

    # Order every category by param1 (strucID) so that all features of a parameter
    # combination list their trajectories in the same order
    for group in data.values():
        for category in categories:
            group[category] = dict(sorted(group[category].items()))

    return data


def load_comsol_csv(file_path: Path) -> np.ndarray:
    """
    Read one COMSOL CSV export and return its values as a (time, x, y) array.

    The file is parsed as a header-less, semicolon-separated table of float32
    values. The first two columns are treated as the x and y coordinates of a
    grid point; every further column holds the values of one time step.

    Raises:
        ValueError: If the number of rows differs from the product of the
            number of distinct x and y coordinates.
    """
    table = pd.read_csv(file_path, sep=";", dtype=np.float32, header=None)

    # Order rows by x first, then y, so that a plain reshape yields an (x, y) grid
    x_column, y_column = table.columns[0], table.columns[1]
    table = table.sort_values(by=[x_column, y_column]).reset_index(drop=True)

    # Number of distinct coordinates along each axis
    n_x = len(table[0].unique())
    n_y = len(table[1].unique())

    # Every (x, y) combination is expected to appear exactly once
    n_points = len(table)
    expected_points = n_x * n_y
    if n_points != expected_points:
        raise ValueError(
            f"Data grid mismatch: {n_points} points vs expected {expected_points} ({n_x}x{n_y})"
        )

    # All columns after the two coordinate columns are time steps
    step_columns = table.columns[2:]
    values = table[step_columns].values

    # Rows are x-major, so reshape to (x, y, time) and move time to the front: (time, x, y)
    cube = values.reshape(n_x, n_y, len(step_columns))
    return np.transpose(cube, (2, 0, 1))

In [5]:
def create_hdf5_dataset(output_path: Path, param_key: tuple, data_dict: dict):
    """
    Create an HDF5 file for one (p_cap, theta) combination.

    The file is written to ``output_path`` as
    ``porous_twophase_flow_p_cap_<p_cap>_theta_<theta>.hdf5`` and contains the groups
    ``dimensions``, ``boundary_conditions``, ``scalars``, ``t0_fields`` (pressure),
    ``t1_fields`` (velocity) and an empty ``t2_fields``.

    Args:
        output_path (Path): Existing directory the file is written to.
        param_key (tuple): ``(p_cap, theta)`` of the simulation.
        data_dict (dict): Must contain ``data_dict["files"]``, a mapping with the
            keys ``"pressure"``, ``"vel_u"`` and ``"vel_v"`` whose values are the
            CSV paths of a single trajectory.
    """

    def _load_with_coords(file_path: Path):
        """Return (x_coords, y_coords, time_coords, field) for one COMSOL CSV export."""
        # load_comsol_csv returns only the field values as a (time, x, y) array,
        # so the coordinates have to be read from the first two columns separately
        field = load_comsol_csv(file_path)
        coords = pd.read_csv(
            file_path, sep=";", dtype=np.float32, header=None, usecols=[0, 1]
        )
        # np.unique returns sorted values, which matches the x-then-y ordering
        # that load_comsol_csv applies before reshaping
        x_coords = np.unique(coords[0].to_numpy())
        y_coords = np.unique(coords[1].to_numpy())
        # The header-less CSV carries no time stamps, so the step index is stored.
        # NOTE(review): replace with the physical output times if they are known.
        time_coords = np.arange(field.shape[0], dtype=np.float32)
        return x_coords, y_coords, time_coords, field

    p_cap, theta = param_key
    filename = f"porous_twophase_flow_p_cap_{p_cap}_theta_{theta}.hdf5"

    with h5py.File(output_path / filename, "w") as f:
        # Root attributes
        f.attrs["simulation_parameters"] = ["p_cap", "theta"]
        f.attrs["p_cap"] = p_cap
        f.attrs["theta"] = theta
        f.attrs["dataset_name"] = "COMSOL_TwoPhaseFlow"
        f.attrs["grid_type"] = "cartesian"
        f.attrs["n_spatial_dims"] = 2
        f.attrs["n_trajectories"] = 1  # Number of strucIDs

        # Load data from first file to get dimensions
        first_file = next(iter(data_dict["files"].values()))
        x_coords, y_coords, time_coords, _ = _load_with_coords(first_file)

        # Create dimensions group
        dims = f.create_group("dimensions")
        dims.attrs["spatial_dims"] = ["x", "y"]

        time_dset = dims.create_dataset("time", data=time_coords)
        time_dset.attrs["sample_varying"] = False

        x_dset = dims.create_dataset("x", data=x_coords)
        x_dset.attrs["sample_varying"] = False
        x_dset.attrs["time_varying"] = False

        y_dset = dims.create_dataset("y", data=y_coords)
        y_dset.attrs["sample_varying"] = False
        y_dset.attrs["time_varying"] = False

        # Create boundary conditions group
        bc = f.create_group("boundary_conditions")
        x_bc = bc.create_group("X_boundary")
        x_bc.attrs["associated_dims"] = ["x"]
        x_bc.attrs["associated_fields"] = []
        x_bc.attrs["bc_type"] = "wall"
        x_bc.attrs["sample_varying"] = False
        x_bc.attrs["time_varying"] = False

        # Create mask dataset: only the first and last x position belong to the
        # wall boundary declared above
        mask = np.zeros_like(x_coords, dtype=bool)
        mask[0] = mask[-1] = True
        x_bc.create_dataset("mask", data=mask)
        x_bc.create_dataset("values", data=np.array([]))

        # Create scalars group
        scalars = f.create_group("scalars")
        scalars.attrs["field_names"] = ["p_cap", "theta"]

        p_cap_dset = scalars.create_dataset("p_cap", data=p_cap)
        p_cap_dset.attrs["sample_varying"] = False
        p_cap_dset.attrs["time_varying"] = False

        theta_dset = scalars.create_dataset("theta", data=theta)
        theta_dset.attrs["sample_varying"] = False
        theta_dset.attrs["time_varying"] = False

        # Create t0_fields group for pressure
        t0_fields = f.create_group("t0_fields")
        t0_fields.attrs["field_names"] = ["pressure"]

        # Load and store pressure field; the new leading axis is the single trajectory
        pressure_data = load_comsol_csv(data_dict["files"]["pressure"])
        pressure_dset = t0_fields.create_dataset(
            "pressure", data=pressure_data[np.newaxis, ...]
        )
        pressure_dset.attrs["dim_varying"] = [True, True]
        pressure_dset.attrs["sample_varying"] = True
        pressure_dset.attrs["time_varying"] = True

        # Create t1_fields group for velocities
        t1_fields = f.create_group("t1_fields")
        t1_fields.attrs["field_names"] = ["velocity"]

        # Load velocity components
        vel_u = load_comsol_csv(data_dict["files"]["vel_u"])
        vel_v = load_comsol_csv(data_dict["files"]["vel_v"])

        # Combine velocity components along a trailing axis and store
        velocity = np.stack([vel_u, vel_v], axis=-1)
        velocity_dset = t1_fields.create_dataset(
            "velocity", data=velocity[np.newaxis, ...]
        )
        velocity_dset.attrs["dim_varying"] = [True, True]
        velocity_dset.attrs["sample_varying"] = True
        velocity_dset.attrs["time_varying"] = True

        # Create empty t2_fields group
        t2_fields = f.create_group("t2_fields")
        t2_fields.attrs["field_names"] = []


In [None]:
# Load the data
raw_data_path = Path("/home/flwi01/Coding/MetaPARC/data/raw_comsol")
data_path = Path("/home/flwi01/Coding/MetaPARC/data/tasks/porous_twophase_flow")
data_path.mkdir(parents=True, exist_ok=True)
param_names = ["strucID", "p_cap", "theta"]

data_files: dict[tuple[int, int], dict[str, dict[int, Path]]] = search_comsol_data(raw_data_path, param_names)

# Build one dictionary of stacked arrays per (p_cap, theta) combination
for params_key, features in data_files.items():
    data = {}
    data["params"] = params_key

    # Every feature must provide the same strucIDs, otherwise trajectory i of one
    # feature would belong to a different structure than trajectory i of another
    trajectory_ids = sorted(features["vel_u"])
    if not trajectory_ids:
        raise ValueError(f"No vel_u files found for parameters {params_key}")

    for feature_name, trajectories in features.items():
        if sorted(trajectories) != trajectory_ids:
            raise ValueError(
                f"Feature '{feature_name}' has strucIDs {sorted(trajectories)} "
                f"but vel_u has {trajectory_ids} for parameters {params_key}"
            )
        # Stack in ascending strucID order rather than dict insertion order
        trajectory_data = [
            load_comsol_csv(trajectories[trajectory_id])
            for trajectory_id in trajectory_ids
        ]
        data[feature_name] = np.stack(trajectory_data, axis=0)  # (n_trajectories, n_timesteps, x, y)

    # Add metadata; all features share this shape when the grids agree.
    # Note that "x_coords" / "y_coords" hold the number of grid points per axis.
    n_trajectories, n_timesteps, n_x, n_y = data["phase_boundary"].shape
    data["n_trajectories"] = n_trajectories
    data["n_timesteps"] = n_timesteps
    data["x_coords"] = n_x
    data["y_coords"] = n_y

    print(data["vel_u"].shape)
    print(data["vel_v"].shape)
    print(data["pressure"].shape)
    print(data["phase_boundary"].shape)

    # join vel_u and vel_v along a trailing component axis
    data["velocity"] = np.stack([data["vel_u"], data["vel_v"]], axis=-1)
    # remove vel_u and vel_v
    del data["vel_u"]
    del data["vel_v"]

    # NOTE: nothing is written to disk yet. params_key is (p_cap, theta), so a
    # file name would have to be built from indices 0 and 1:
    # np.save(data_path / f"p_cap_{params_key[0]}_theta_{params_key[1]}.npy", data)


(2, 401, 256, 128)
(2, 401, 256, 128)
(2, 401, 256, 128)
(2, 401, 256, 128)
(2, 401, 256, 128)
(2, 401, 256, 128)
(2, 401, 256, 128)
(2, 401, 256, 128)


In [7]:
import matplotlib.pyplot as plt

# Visual sanity check: show one feature channel at four time steps in a 2x2 grid.
# NOTE(review): no visible cell writes this .npy file - confirm where it is
# expected to come from before running this cell.
data_files = np.load(data_path / "strucID_0_p_cap_0_theta_110.npy")

# Index of the channel to display along the last axis
# (presumably the array is laid out as (timestep, x, y, feature) - TODO confirm)
feature = 3

fig, axs = plt.subplots(2, 2)
# axs.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right
for ax, timestep in zip(axs.flat, (0, 100, 200, 300)):
    ax.imshow(data_files[timestep, :, :, feature])

plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/home/flwi01/Coding/MetaPARC/data/tasks/porous_twophase_flow/strucID_0_p_cap_0_theta_110.npy'