In [1]:
from pathlib import Path
import re
import time

In [2]:
audios_dir : Path = Path("/datasets/AudioSet/dvc-audioset/audioset")

In [3]:
def filepath_to_info(filename):
    # Existing pattern to extract YTID without changing it
    ytid_pattern = r'^.*/([A-Za-z0-9_-]+?)(?=_[0-9]+(?:\.[0-9]+)?(?:[-_][0-9]+(?:\.[0-9]+)?)?\.wav$)'
    ytid_match = re.match(ytid_pattern, filename)
    
    if ytid_match:
        ytid = ytid_match.group(1)
        
        # Pattern to extract start_seconds
        # Looks for an underscore followed by digits, possibly with a decimal
        start_pattern = r'_(\d+(?:\.\d+)?)'
        start_match = re.search(start_pattern, filename)
        
        if start_match:
            start_seconds_str = start_match.group(1)
        else:
            print(f"Failed to extract start seconds from filename: {filename}")
            return None
        
        # Pattern to extract end_seconds
        # Looks for an underscore or hyphen followed by digits, possibly with a decimal, before '.wav'
        end_pattern = r'[-_](\d+(?:\.\d+)?)\.wav$'
        end_match = re.search(end_pattern, filename)
        
        if end_match:
            end_seconds_str = end_match.group(1)
        else:
            print(f"Failed to extract end seconds from filename: {filename}")
            return None
        
        try:
            # Convert the extracted strings to integers (after converting to float to handle decimals)
            start_seconds =float(start_seconds_str)
            end_seconds = float(end_seconds_str)
            return ytid, start_seconds, end_seconds
        except ValueError:
            print(f"Error converting times in file {filename}")
            return None
    else:
        print(f"Failed to parse YTID from filename: {filename}")
        return None

In [4]:
start : float = time.time()
slice_dict : dict[str, dict[tuple, str]] = {}
num_files : int = 0
for file in audios_dir.glob("*/*.wav"):
    ytid, start_seconds, end_seconds = filepath_to_info(str(file))

    if ytid not in slice_dict:
        slice_dict[ytid] = {}
    
    slice_tuple : tuple = (start_seconds, end_seconds)
    
    if slice_tuple not in slice_dict[ytid]:
        slice_dict[ytid][slice_tuple] = file
    # print(f"{}")
    num_files+=1

end : float = time.time()

print(f"Duration is {end - start}")
print(f"{num_files=}")

Duration is 13.34546422958374
num_files=1265753


In [15]:
len(slice_dict.keys())


1262888

In [16]:
slice_dict

{'qdEsV8nK7Yo': {(0.0,
   10.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/qdEsV8nK7Yo_0_10.wav')},
 'PoIt8NVmg2M': {(30.0,
   40.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/PoIt8NVmg2M_30_40.wav')},
 'wjlv5YG6XSM': {(20.0,
   30.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/wjlv5YG6XSM_20_30.wav')},
 '-5rnGk2e1ao': {(0.0,
   10.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/-5rnGk2e1ao_0.0-10.0.wav')},
 'uq_Rn_6FVRM': {(6.0,
   170.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/uq_Rn_6FVRM_160_170.wav')},
 'gPoZZLAnz7Q': {(50.0,
   60.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/gPoZZLAnz7Q_50_60.wav')},
 't1M765YoIQE': {(220.0,
   230.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/t1M765YoIQE_220_230.wav')},
 '-9P1IlNEgsM': {(0.0,
   8.0): PosixPath('/datasets/AudioSet/dvc-audioset/audioset/Bark/-9P1IlNEgsM_0.0-8.0.wav')},
 'uz7S8q-Dq3Q': {(0.0,
   7.0): PosixPath('/datasets/Audio