# Convert COMSOL data to PARC data

This notebook converts COMSOL simulation data into tensors suitable for deep learning.
For each parameter combination of a parametric sweep, it aggregates the exported fields into one tensor of shape (N_features, N_timesteps, X, Y).

The input data is a semicolon-separated CSV file without a header row. Each row describes one grid point: its x and y coordinates followed by one value per time step:
x0 | y0 | t1 | t2 | t3 | ... | tN
x0 | y1 | t1 | t2 | t3 | ... | tN
x0 | y2 | t1 | t2 | t3 | ... | tN
...

where t1, t2, t3, ..., tN are the field values at the successive time steps.

In [56]:
import re
from pathlib import Path
from typing import Dict, Tuple, Any

import pandas as pd
import numpy as np

def search_comsol_data(data_path: Path, params: list[str]) -> Dict[Tuple[int, int, float], Dict[str, Any]]:
    """
    Search the velocities, pressure, and phase_boundary folders and group the CSV
    files found there by the parameter combination encoded in each filename.

    Each parameter is expected in the filename stem as ``<name>_<value>``. The first
    two parameters are parsed as integers, the third as a float. CSV files whose
    name does not contain all three parameters are reported and skipped.

    Args:
        data_path (Path): Base path containing the subdirectories

        params (list[str]): Names of the three parameters to extract from the
            filenames. The names are matched literally.

    Returns:
        Dict: Dictionary keyed by the ``(param1, param2, param3)`` tuple. Each value
        stores the three parameter values under their names and a ``"files"`` dict
        that maps a field name ("vel_u", "vel_v", "pressure", "phase_boundary") to
        the path of the matching CSV file.
    """
    # Define the subdirectories to search
    subdirs = ["velocities", "pressure", "phase_boundary"]
    # Field names, tested in this order; the first one found in the path wins
    categories = ["vel_u", "vel_v", "pressure", "phase_boundary"]
    data = {}

    # Function to extract parameters from filename
    def extract_params(filename: str, params: list[str]) -> tuple:
        # re.escape keeps regex metacharacters in a parameter name literal
        patterns = (
            rf"{re.escape(params[0])}_(\d+)",
            rf"{re.escape(params[1])}_(\d+)",
            rf"{re.escape(params[2])}_(\d+\.?\d*)",
        )
        matches = [re.search(pattern, filename) for pattern in patterns]
        missing = [name for name, match in zip(params, matches) if match is None]
        if missing:
            raise ValueError(f"parameters {missing} not found in filename '{filename}'")
        return (
            int(matches[0].group(1)),
            int(matches[1].group(1)),
            float(matches[2].group(1)),
        )

    # Search through each subdirectory
    for subdir in subdirs:
        subdir_path = data_path / subdir
        if not subdir_path.exists():
            print(f"Warning: Directory {subdir} not found")
            continue

        # Sorted so the result (including the insertion order of the "files" dict)
        # does not depend on the order in which the filesystem lists the directory
        for file in sorted(subdir_path.glob("*.csv")):
            # Extract parameters; a stray CSV must not abort the whole search
            try:
                param_key = extract_params(file.stem, params)
            except ValueError as err:
                print(f"Warning: Skipping {file.name}: {err}")
                continue

            param1, param2, param3 = param_key

            # Initialize dict for this parameter combination if not exists
            if param_key not in data:
                data[param_key] = {
                    f"{params[0]}": param1,
                    f"{params[1]}": param2,
                    f"{params[2]}": param3,
                    "files": {},
                }

            # Classify using only the part of the path below data_path
            # (subdirectory + filename), so that the name of the base directory
            # can never be mistaken for a field name
            relative_name = file.relative_to(data_path).as_posix()
            for category in categories:
                if category in relative_name:
                    data[param_key]["files"][category] = file
                    break

    return data


def load_comsol_csv(file_path: Path) -> np.ndarray:
    """
    Load COMSOL CSV data and reshape it into (time, x, y) format.

    The file is read as a semicolon-separated table without a header row. The first
    two columns are the x and y coordinates of a grid point; every remaining column
    holds the value at one time step.

    Args:
        file_path (Path): Path of the CSV file to load

    Returns:
        np.ndarray: float32 array of shape (N_timesteps, N_x, N_y), with x and y
        in ascending coordinate order

    Raises:
        ValueError: If the rows do not form a complete rectangular grid, i.e. the
            number of rows differs from N_x * N_y or a coordinate pair occurs
            more than once
    """
    # Read CSV and sort by x then y coordinates
    df = pd.read_csv(file_path, sep=";", dtype=np.float32, header=None)
    x_col, y_col = df.columns[0], df.columns[1]
    df = df.sort_values(by=[x_col, y_col]).reset_index(drop=True)

    # Get coordinate information
    x_coords = df[x_col].unique()
    y_coords = df[y_col].unique()

    # Calculate expected grid size and validate
    grid_size = len(x_coords) * len(y_coords)
    if len(df) != grid_size:
        raise ValueError(
            f"Data grid mismatch: {len(df)} points vs expected {grid_size} ({len(x_coords)}x{len(y_coords)})"
        )

    # The row count alone cannot detect a duplicated point that offsets a missing
    # one. With the count matching and every (x, y) pair unique, the grid is complete.
    n_duplicates = int(df.duplicated(subset=[x_col, y_col]).sum())
    if n_duplicates:
        raise ValueError(
            f"Data grid mismatch: {n_duplicates} duplicated (x, y) points in {file_path}"
        )

    # Reshape directly to (x, y, time) then transpose to (time, x, y)
    time_steps = df.columns[2:]
    time_data = df[time_steps].values
    data = time_data.reshape(len(x_coords), len(y_coords), len(time_steps))

    return data.transpose(2, 0, 1)

In [57]:
# Load the data
data_path = Path("/home/flwi01/Coding/MetaPARC/data/raw_comsol")
data = search_comsol_data(data_path, ["strucID", "p_cap", "theta"])

# Fixed order of the feature axis. Taking the order from the "files" dict instead
# would tie the channel order to the order in which the files were discovered.
field_order = ("vel_u", "vel_v", "pressure", "phase_boundary")

for data_key, data_value in data.items():
    print("Loading data for", data_key)
    files = data_value["files"]

    available_fields = [field for field in field_order if field in files]
    missing_fields = [field for field in field_order if field not in files]
    if missing_fields:
        print(f"Warning: {data_key} is missing fields {missing_fields}")
    if not available_fields:
        continue

    # Stack the fields along a new leading axis -> (N_features, N_timesteps, X, Y).
    # np.stack raises if the fields do not all share the same shape.
    data_arrays = np.stack([load_comsol_csv(files[field]) for field in available_fields])

    print(data_arrays.shape)


Loading data for (0, 0, 1.91986217719376)
(4, 401, 256, 128)
Loading data for (0, 0, 1.5707963267949)
(4, 401, 256, 128)


In [58]:
# Plot the data to check if correct
