In [1]:
import h5py
import numpy as np
from tqdm import tqdm
from scipy import ndimage
import os

# 📁 Path to the HDF5 files uploaded to Kaggle
base_path = "/kaggle/input/hdf5-chunk1"  # Make sure the dataset name is correct

# 📁 Writable output directory on Kaggle
output_dir = "/kaggle/working"

dataset_types = ['train', 'val', 'test']

# Extensions tried, in order, when locating each input file. The script
# originally hard-coded ".h5py", while the recorded run of this cell read
# ".hdf5" files, so both spellings are accepted.
input_extensions = (".h5py", ".hdf5")

# Re-write every split into a gzip-compressed, chunked HDF5 file, copying each
# top-level group and the datasets it contains.
for dataset_type in dataset_types:
    stem = f"gas_and_brake_{dataset_type}_comma_chunk_1_w_imgs"
    candidates = [os.path.join(base_path, stem + ext) for ext in input_extensions]
    # Use the first candidate that exists; fall back to the first spelling so
    # the "not found" message below still names a concrete path.
    input_path = next((p for p in candidates if os.path.exists(p)), candidates[0])
    output_path = os.path.join(output_dir, f"filtered_chunk1_{dataset_type}.hdf5")

    print(f"🔄 Processing {dataset_type} from: {input_path}")

    if not os.path.exists(input_path):
        print(f"❌ File not found: {input_path}")
        continue

    with h5py.File(input_path, "r") as h5_file, h5py.File(output_path, "w") as h_out:
        keys = list(h5_file.keys())

        for key in tqdm(keys, desc=f"Processing {dataset_type}"):
            group_in = h5_file[key]

            # NOTE: sample filtering is currently disabled, so despite the
            # "filtered_" output name every group is copied. The disabled logic
            # skipped groups without a 'desired_dist' dataset and dropped groups
            # where more than 20% of the median-filtered (size=12)
            # 'desired_dist' values were zero (poorly informative samples).

            group_out = h_out.create_group(key)

            for col in group_in.keys():
                source = group_in[col]

                if not source.shape:
                    # Scalar (shape ()) and empty (shape None) datasets reject
                    # chunk/filter options in h5py, so copy them as-is.
                    group_out.create_dataset(col, data=source[()])
                    continue

                group_out.create_dataset(
                    col,
                    data=source,
                    compression='gzip',
                    compression_opts=6,
                    chunks=True
                )

    print(f"✅ Salvato in: {output_path}\n")

🔄 Processing train from: /kaggle/input/hdf5-chunk1/gas_and_brake_train_comma_chunk_1_w_imgs.hdf5


Processing train: 100%|██████████| 70/70 [13:40<00:00, 11.72s/it]


✅ Salvato in: /kaggle/working/filtered_chunk1_train.hdf5

🔄 Processing val from: /kaggle/input/hdf5-chunk1/gas_and_brake_val_comma_chunk_1_w_imgs.hdf5


Processing val: 100%|██████████| 12/12 [02:16<00:00, 11.36s/it]


✅ Salvato in: /kaggle/working/filtered_chunk1_val.hdf5

🔄 Processing test from: /kaggle/input/hdf5-chunk1/gas_and_brake_test_comma_chunk_1_w_imgs.hdf5


Processing test: 100%|██████████| 9/9 [01:45<00:00, 11.75s/it]

✅ Salvato in: /kaggle/working/filtered_chunk1_test.hdf5




