# Analyze the datasets

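This notebook analyzes the datasets and generates statistics about them.
Most importantly, it computes the number of samples (trajectories) in each dataset; it also reports each dataset's size on disk and its number of time steps and grid points in x and y.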


In [3]:
# Import the necessary libraries
from pathlib import Path
import h5py


In [14]:
# Find all datasets
data_dir = Path(r"C:\Users\zsa8rk\Coding\MetaPARC\data\datasets")

# Every immediate sub-directory of data_dir is treated as one dataset.
# Sorted so the report order does not depend on the order in which the
# file system happens to list the entries. The loop variable is not called
# `dir`, which would shadow the builtin for the rest of the notebook session.
datasets = sorted(entry for entry in data_dir.iterdir() if entry.is_dir())

# Collect the statistics of each dataset: size on disk, number of
# trajectories, and the lengths of the time, x and y dimensions.

dataset_stats = {}

for dataset in datasets:
    dataset_name = dataset.name

    # Total size in bytes of every regular file below the dataset directory
    dataset_size = sum(
        path.stat().st_size for path in dataset.glob("**/*") if path.is_file()
    )

    # Find all HDF5 files in the dataset. glob() yields them in arbitrary
    # order, so sort to make "the last file" below a reproducible choice.
    h5_files = sorted(dataset.glob("**/*.hdf5"))

    # Each file stores its trajectory count in the file-level attribute
    # "n_trajectories"; the dataset total is the sum over all files.
    n_traj = 0
    for h5_file in h5_files:
        with h5py.File(h5_file, "r") as f:
            n_traj += int(f.attrs["n_trajectories"])

    # Read the dimension lengths from the last file only.
    # NOTE(review): this assumes all files of a dataset share the same
    # time/x/y lengths -- that is not checked here.
    # A directory without any HDF5 file keeps None instead of raising
    # IndexError on h5_files[-1].
    n_timesteps = n_x = n_y = None
    if h5_files:
        with h5py.File(h5_files[-1], "r") as f:
            # "time", "x" and "y" are read from the "dimensions" entry
            dims = f["dimensions"]
            n_timesteps = len(dims["time"])
            n_x = len(dims["x"])
            n_y = len(dims["y"])

    dataset_stats[dataset_name] = {
        # Bytes divided by 1024**3 for readability (label kept as "GB")
        "size": f"{dataset_size / (1024**3):.2f} GB",
        "n_traj": n_traj,
        "n_timesteps": n_timesteps,
        "n_x": n_x,
        "n_y": n_y,
    }

# Report: a header line with the dataset name, one indented "key: value"
# line per statistic, then a blank line separating it from the next dataset.
for name, entries in dataset_stats.items():
    lines = [f"{name}:"]
    lines.extend(f"  {label}: {val}" for label, val in entries.items())
    print("\n".join(lines))
    print()







cylinder_pipe_flow_water:
  size: 87.41 GB
  n_traj: 363
  n_timesteps: 501
  n_x: 336
  n_y: 128

cylinder_sym_flow_water:
  size: 78.99 GB
  n_traj: 328
  n_timesteps: 501
  n_x: 336
  n_y: 128

heated_object_pipe_flow_air:
  size: 48.62 GB
  n_traj: 159
  n_timesteps: 501
  n_x: 256
  n_y: 128

object_periodic_flow_water:
  size: 33.03 GB
  n_traj: 180
  n_timesteps: 501
  n_x: 256
  n_y: 128

object_sym_flow_air:
  size: 46.97 GB
  n_traj: 256
  n_timesteps: 501
  n_x: 256
  n_y: 128

object_sym_flow_water:
  size: 25.50 GB
  n_traj: 139
  n_timesteps: 501
  n_x: 256
  n_y: 128

rayleigh_benard:
  size: 341.80 GB
  n_traj: 1750
  n_timesteps: 200
  n_x: 512
  n_y: 128

shear_flow:
  size: 546.88 GB
  n_traj: 1120
  n_timesteps: 200
  n_x: 256
  n_y: 512

turbulent_radiative_layer_2D:
  size: 6.66 GB
  n_traj: 90
  n_timesteps: 101
  n_x: 128
  n_y: 384

