## Filter Dataset based on duration

In [1]:
import pandas as pd
from multiprocessing import Pool
from glob import glob
from pydub import AudioSegment
import os

## Filter dataset based on duration
### Start with v1
# One row per training clip matched by the v1 glob pattern; surrounding
# whitespace is stripped from every path.
audios = pd.DataFrame({'path': glob("v1/*/v1a/train/*.wav")})
audios['path'] = audios['path'].map(str.strip)

def get_duration(x):
    """Return the duration, in seconds, of the audio file at path ``x``."""
    segment = AudioSegment.from_file(x)
    return segment.duration_seconds

## Using multiprocessing (worker processes, not threads)
# Use half of the available cores, but never fewer than one worker:
# os.cpu_count() may return None, and 1 // 2 == 0 would make Pool raise.
n_workers = max(1, (os.cpu_count() or 1) // 2)
with Pool(n_workers) as p:
    # imap yields results in input order, so durations line up with audios['path'].
    audios['duration'] = list(p.imap(get_duration, audios['path']))

In [2]:
# Order clips from longest to shortest and renumber the rows from zero.
audios = audios.sort_values('duration', ascending=False).reset_index(drop=True)

# The first 90% of the sorted rows become the training set, the rest validation.
# NOTE(review): because the rows are sorted by duration, the validation set
# holds only the shortest clips — confirm this is intended.
validation_index = int(0.9 * len(audios))
train = audios.iloc[:validation_index]
val = audios.iloc[validation_index:]

# Write one path per line, without a header row or index column.
train['path'].to_csv("train.data.txt", index=False, header=False)
val['path'].to_csv("validation.data.txt", index=False, header=False)

## Profiling dataset loading

In [3]:
import torchaudio, torch, os
from transformers import Wav2Vec2FeatureExtractor
from safetensors.torch import save_file, load_file
# Name of the pretrained checkpoint whose feature-extractor configuration is loaded below.
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
# Module-level extractor: process_audio reads its sampling rate and calls it on each waveform.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)

def process_audio(audio_path):
    """Load one audio file and turn it into feature-extractor outputs.

    The file is read with torchaudio, resampled to the sampling rate of the
    module-level ``feature_extractor``, and its first channel is passed
    through that extractor, truncated to at most 20 seconds of samples.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        dict: ``"input_values"`` mapped to the first row of the extractor
        output, plus ``"attention_mask"`` when the extractor returns one.
    """
    waveform, source_rate = torchaudio.load(audio_path)
    target_rate = feature_extractor.sampling_rate
    waveform = torchaudio.functional.resample(waveform, source_rate, target_rate)
    inputs = feature_extractor(
        waveform[0],  # only the first channel is used
        sampling_rate=target_rate,
        max_length=20 * target_rate,  # 20 seconds expressed in samples
        return_tensors="pt",
        truncation=True,
    )
    batch = {
        "input_values": inputs.input_values[0],
    }
    # Attribute access on the extractor output raises AttributeError when the
    # key is absent (extractors configured without an attention mask), so the
    # optional key is looked up with .get() instead.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        batch["attention_mask"] = attention_mask[0]

    return batch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
%%time
# Time feature extraction for a single training clip.
batch = process_audio("v1/Assamese/v1a/train/2533274790396330.wav")

CPU times: user 415 ms, sys: 19.7 ms, total: 434 ms
Wall time: 138 ms


In [9]:
from safetensors.torch import load_file, save_file
# Store the extracted tensors in a .safetensors file in the same directory as the source clip.
save_file(batch, "v1/Assamese/v1a/train/2533274790396330.safetensors")

In [10]:
%%time
# Time reading the saved tensors back, for comparison with the process_audio timing.
batch = load_file("v1/Assamese/v1a/train/2533274790396330.safetensors")

CPU times: user 6.49 ms, sys: 0 ns, total: 6.49 ms
Wall time: 52.1 ms
