In [None]:
#default_exp dataset

# Compile Dataset

> Processes all included sources and compiles them into a single dataset file `data/dataset.zip`.

In [None]:
import os
import shutil
import pandas as pd

In [None]:
#export

def sync_audio_files(source_dir: str, target_dir: str):
    """Copy each ``.wav`` file from ``source_dir`` that ``target_dir`` lacks.

    Only the direct entries of ``source_dir`` are looked at. A file whose name
    is already present in ``target_dir`` is skipped, so existing targets are
    never overwritten. Copies are made with ``shutil.copy2``.
    """
    wav_names = [name for name in os.listdir(source_dir) if name.endswith(".wav")]
    for wav_name in wav_names:
        destination = os.path.join(target_dir, wav_name)
        if os.path.exists(destination):
            # Already synced (or a name clash): leave the existing file alone.
            continue
        shutil.copy2(os.path.join(source_dir, wav_name), destination)

In [None]:
#export

DATA_PATH = "data/"

class Dataset:
    """Compiles several pre-processed sources into one packaged dataset.

    Every source is read from ``<data_path>/intermediate/<source>/``, which must
    hold a ``metadata.csv`` alongside its ``.wav`` files. Compiling copies the
    audio into ``<data_path>/dataset/audio/``, writes the combined metadata to
    ``<data_path>/dataset/metadata.csv`` and zips the ``dataset`` directory into
    ``<data_path>/dataset.zip``.
    """

    def __init__(self, sources: list[str], data_path: str = DATA_PATH):
        """Store the source names and derive all input/output paths.

        Args:
            sources: Names of the source directories under ``intermediate/``.
            data_path: Root directory that contains ``original/``,
                ``intermediate/`` and the generated ``dataset/``.
        """
        self.sources = sources

        self.data_path = data_path

        self.original_path = os.path.join(data_path, "original/")
        self.intermediate_path = os.path.join(data_path, "intermediate/")

        self.output_path = os.path.join(data_path, "dataset/")
        self.audio_output_path = os.path.join(self.output_path, "audio/")
        self.metadata_output_path = os.path.join(self.output_path, "metadata.csv")

    def _prepare_output(self):
        """Create the audio output directory (and its parents) if missing."""
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.audio_output_path, exist_ok=True)

    def _compile_source(self, source: str):
        """Copy one source's audio into the dataset and return its metadata.

        Args:
            source: Directory name of the source under ``intermediate/``.

        Returns:
            The source's ``metadata.csv`` loaded as a ``pandas.DataFrame``.
        """
        source_path = os.path.join(self.intermediate_path, source)
        sync_audio_files(source_path, self.audio_output_path)
        return pd.read_csv(os.path.join(source_path, "metadata.csv"))

    def _package_data(self):
        """Zip the output directory into ``dataset.zip`` inside ``data_path``.

        Uses zip's file-sync mode (``-FS``), so an existing archive is brought
        in line with the directory and only changed files are re-compressed.
        Packaging problems are reported as ``RuntimeWarning`` rather than
        raised, so ``compile`` still returns the metadata it already saved.
        """
        # Imported locally so the method works even where only this cell is exported.
        import subprocess
        import warnings

        working_dir = self.data_path or os.curdir
        # zip runs inside ``working_dir``, so the directory to archive has to be
        # expressed relative to it. Passing ``self.output_path`` unchanged would
        # resolve to ``<data_path>/<data_path>/dataset/`` for a relative root.
        archive_root = os.path.relpath(self.output_path, working_dir)
        command = ["zip", "-qq", "-FSr", "dataset.zip", archive_root]

        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            completed = subprocess.run(command, cwd=working_dir, check=False)
        except OSError as error:
            warnings.warn(
                "Could not run zip to package the dataset: {}".format(error),
                RuntimeWarning,
            )
            return

        # Exit status 12 is zip's "nothing to do": the archive is already in sync.
        if completed.returncode not in (0, 12):
            warnings.warn(
                "zip exited with status {} while packaging {}".format(
                    completed.returncode, self.output_path
                ),
                RuntimeWarning,
            )

    def compile(self) -> pd.DataFrame:
        """Compiles a dataset and returns the newly created metadata (already saved).

        Returns:
            The metadata of all sources stacked in ``self.sources`` order. Row
            indexes are kept as loaded per source; the saved CSV has no index.
        """
        self._prepare_output()

        source_frames = [self._compile_source(source) for source in self.sources]
        # ``DataFrame.append`` was removed in pandas 2.0; one ``concat`` over the
        # collected frames is the supported equivalent and avoids re-copying
        # the accumulated frame for every source.
        if source_frames:
            dataset_metadata = pd.concat(source_frames)
        else:
            # ``pd.concat`` rejects an empty list; keep the empty-frame result.
            dataset_metadata = pd.DataFrame()
        dataset_metadata.to_csv(self.metadata_output_path, index=False)

        self._package_data()

        return dataset_metadata

In [None]:
#export

def load_version(version_path: str = "version"):
    """Return the integer stored in the file at ``version_path``.

    The whole file is read and handed to ``int``, so any error from opening
    the file or from parsing its contents propagates to the caller.
    """
    with open(version_path, "r") as version_file:
        raw_version = version_file.read()
    return int(raw_version)

In [None]:
#export

def bump_version(version_path: str = "version"):
    """Increase the integer stored in ``version_path`` by one and report it.

    Args:
        version_path: Path of the text file that holds the version number.

    Raises:
        Whatever ``load_version`` raises when the file is missing or does not
        contain an integer; in that case the file is left unmodified.
    """
    # Read BEFORE opening for writing: mode "w" truncates the file as soon as
    # it is opened, so reading afterwards would always see an empty file.
    version = load_version(version_path)
    new_version = version + 1
    with open(version_path, "w") as f:
        f.write(str(new_version))
    print("Version moved from {} to {}".format(version, new_version))

## Set up the environment and load version

In [None]:
# Source names for the dataset; when passed to `Dataset`, each one is looked up
# as a directory under the intermediate data folder.
sources = [
    "space_divers_mini",
    "footsteps_one_ppsfx_004",
    "footsteps_two_ppsfx_008",
    "edward_v1.1",
]

## Copy all audio data to dataset

Let's confirm that no files failed to copy.  
If any did, it could be due to:  
* Genuine failure to copy  
* Some files in the target folder need deleting  
  * Please delete them manually; there is no code for this yet
* Hash conflict (same file from different sources)  
  * In this case, we must debug the sources and make sure there are no duplicates

In [None]:
# TODO: re-enable this check — AUDIO_PATH and dataset_metadata are not defined in any cell above.
#assert len(os.listdir(AUDIO_PATH)) == len(dataset_metadata)