# V2 Dataset Format

# This document describes the V2 dataset format. This format is used by the updated Dataloader for increased speed, especially on large datasets.

# TL;DR
Files on disk now contain 1000 samples each, not 1

Files on disk are now tensors not numpy arrays

Files on disk now have shape [1000, 1, 128, 88] not [128, 88]

Datasets now have a minimum size of 1000 samples, and the total number of samples must be a multiple of 1000 (to be confirmed).

# CHANGES FROM V1
- Instead of each file being saved on disk as a single numpy array, files are now saved in 'bundles' of 1000 samples each. This allows for faster loading of the dataset by reducing load calls from disk by 1000x which was a previous bottleneck.

- Data is now saved to disk as tensors instead of numpy arrays, removing the need for conversion to tensors in the dataloader (to be confirmed).

- The channel dim is already included in the data. This means that each sample is now of shape (C, X, Y) instead of (X, Y), so a bundle on disk has shape (N, C, X, Y), e.g. [1000, 1, 128, 88]. This removes the channel-insertion step from the dataloader.

- Sparse??

- NPZ support??



In [18]:


#%% - User Inputs
# Number of input files packed into each output bundle file.
bundle_size = 1000    # 1K for V2 Spec
# Folder that is listed for input files; every entry in it is loaded with np.load().
# NOTE: in a raw string "\\" is two literal backslashes, so these paths contain
# doubled separators; the trailing separator is relied on when the file name
# is appended to the input folder.
dataset_full_filepath = r"N:\Yr 3 Project Datasets\RDT 1M\RDT 1M\\Data\\"
# Folder the bundled tensor files are written to; the script refuses to run if it already exists.
dataset_output_filepath = r"A:\[V2]RDT 1M\Data\\"

#%%
import os
import numpy as np
import torch    
from tqdm.auto import tqdm







# Both user-supplied paths already end in the "Data" folder, so nothing is
# appended to them here.

# Create the output directory, refusing to reuse one that already exists.
# Creating it directly and handling the failure avoids the gap between a
# separate os.path.exists() check and the os.makedirs() call.
try:
    os.makedirs(dataset_output_filepath)
except FileExistsError:
    raise FileExistsError("Output directory already exists, please delete it before running this script") from None

# Get a list of all the files in the dataset.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the contents of every bundle reproducible from run to run.
# NOTE: the sort is lexicographic on the file name ("10.npy" sorts before "2.npy").
# NOTE(review): every directory entry is treated as a sample file - confirm the
# folder contains nothing else.
dataset_files = sorted(os.listdir(dataset_full_filepath))

# Split the file names into consecutive batches of 'bundle_size'. The final
# batch is shorter when the file count is not a multiple of 'bundle_size'.
file_paths_on_disk = [dataset_files[i:i+bundle_size] for i in range(0, len(dataset_files), bundle_size)]

# Convert each batch of input files into a single bundled tensor file.
for bundle_idx, batch_of_file_paths_on_disk in enumerate(tqdm(file_paths_on_disk, desc="Processing bundles", colour="yellow")):

    # Load every file of this batch and stack the arrays along a new leading
    # axis, so the bundle's first dimension indexes the samples in the batch.
    # os.path.join is used (rather than string '+') so the input folder works
    # with or without a trailing path separator.
    data_bundle = np.stack(
        [np.load(os.path.join(dataset_full_filepath, file_path)) for file_path in batch_of_file_paths_on_disk],
        axis=0,
    )

    # Convert the stacked array to a float64 tensor.
    data_bundle = torch.tensor(data_bundle, dtype=torch.float64)

    # Insert the channel dimension directly after the sample dimension.
    data_bundle = data_bundle.unsqueeze(1)

    # Output files are numbered by bundle index.
    file_name = f'test_{bundle_idx}.pt'

    # Save the bundle to the output directory.
    torch.save(data_bundle, os.path.join(dataset_output_filepath, file_name))





Processing bundles: 100%|[33m██████████[0m| 1000/1000 [4:58:28<00:00, 17.91s/it] 
