# V2 Dataset Format

# This document describes the V2 dataset format. This format is used by the updated Dataloader for increased speed, especially on large datasets.

# TL;DR
Files on disk now contain 1000 samples, not 1.

Files on disk are now tensors, not numpy arrays.

Files on disk now have shape [1000, 1, 128, 88], not [128, 88].

Datasets now have a minimum size of 1000 samples, and the total number of samples must be a multiple of 1000?

# CHANGES FROM V1
- Instead of each sample being saved on disk as a single numpy array, samples are now saved in 'bundles' of 1000 samples each. This allows for faster loading of the dataset by reducing load calls from disk by 1000x; the number of load calls was previously a bottleneck.

- Data is now saved to disk as tensors instead of numpy arrays, removing the need for conversion to tensors in the dataloader (to be confirmed).

- Channel dim is already included in the data. This means that each sample is now of shape (C, X, Y) instead of (X, Y), which removes this step from the dataloader.

- Sparse??

- NPZ support??



In [1]:

import os
import numpy as np
import torch
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor

# User Inputs
bundle_size = 10000    # Samples per bundle file: 1K for V2 Spec, 10K for V3
dataset_full_filepath = r"N:\Yr 3 Project Datasets\PDT 10K\\Data\\"
dataset_output_filepath = r"N:\Yr 3 Project Datasets\\V3_PDT_10K\\Data\\"

# Get a list of all the files in the dataset.
# This is done BEFORE the output directory is created, so that a missing or
# unreadable input directory fails without leaving an empty output directory
# behind (which would make the next run abort with "already exists").
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the assignment of samples to bundles reproducible between runs.
dataset_files = sorted(os.listdir(dataset_full_filepath))

# Create the output directory; refuse to run if it already exists so that
# bundles from a previous run are never mixed with or overwritten by this one.
try:
    os.makedirs(dataset_output_filepath)
except FileExistsError:
    raise Exception("Output directory already exists, please delete it before running this script") from None

# Create batches of 'bundle_size' file paths and add each batch to a list.
# The final batch is shorter when the file count is not a multiple of bundle_size.
file_paths_on_disk = [dataset_files[i:i+bundle_size] for i in range(0, len(dataset_files), bundle_size)]

def process_bundle(bundle_idx, batch_of_file_paths_on_disk):
    """Pack one batch of per-sample numpy files into a single saved tensor bundle.

    Args:
        bundle_idx: Index of this bundle; it is embedded in the output file name.
        batch_of_file_paths_on_disk: File names, relative to
            ``dataset_full_filepath``, of the arrays that make up this bundle.

    The resulting tensor is written to ``dataset_output_filepath`` as
    ``test_<bundle_idx>.pt``; nothing is returned.
    """
    # Read every sample of the batch from the input directory
    samples = []
    for sample_name in batch_of_file_paths_on_disk:
        sample_path = os.path.join(dataset_full_filepath, sample_name)
        samples.append(np.load(sample_path))

    # Stack the samples along a new leading axis and convert to a float64 tensor
    stacked = np.stack(samples)
    bundle = torch.tensor(stacked, dtype=torch.float64)

    # Insert the channel dim directly after the leading (sample) axis
    bundle = torch.unsqueeze(bundle, 1)

    # Build the output location and write the bundle to disk
    destination = os.path.join(dataset_output_filepath, f'test_{bundle_idx}.pt')
    torch.save(bundle, destination)

# Use ThreadPoolExecutor to parallelize the processing.
# Leaving the `with` block shuts the executor down and waits for every worker
# to finish, so no explicit executor.shutdown() call is needed afterwards.
with ThreadPoolExecutor() as executor:
    # executor.map accepts one iterable per positional argument of
    # process_bundle: the bundle indices and the matching batches of file names.
    bundle_results = executor.map(process_bundle, range(len(file_paths_on_disk)), file_paths_on_disk)

    # Consuming the iterator drives the progress bar and re-raises the first
    # exception raised inside a worker, if any.
    list(tqdm(bundle_results, total=len(file_paths_on_disk), desc="Processing bundles", colour="yellow"))

print("All bundles processed and saved successfully")

Processing bundles:   0%|          | 0/1 [00:00<?, ?it/s]

All bundles processed and saved successfully


FileNotFoundError: [Errno 2] No such file or directory: 'N:\\Yr 3 Project Datasets\\\\SPEEDTESTDEL1_[V3]_RDT_50K\\Data\\\\test_50.pt'