In [2]:
import os
from glob import glob
import ntpath
import shutil

import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
import numpy as np

In [3]:
def preprocess_audio(wave_paths, sample_rate, sample_buffer, out_path):
    """Load each audio file and write fixed-length clips derived from it.

    Files with fewer than ``sample_buffer`` samples are written twice: once
    zero-padded (``pad_short``) and once looped (``loop_short``). Every other
    file is split into overlapping windows by ``chunk_long``.

    Args:
        wave_paths: Iterable of audio file paths to process.
        sample_rate: Rate passed to ``librosa.load`` as ``sr`` and used when
            writing the output files.
        sample_buffer: Target clip length in samples.
        out_path: Output prefix that the clip file names are appended to
            (the helpers build ``out_path + name``, so include the trailing
            separator).
    """
    import warnings  # local import: only needed for the empty-file notice

    for filename in tqdm(wave_paths):
        audio, _ = librosa.load(filename, sr=sample_rate)

        # Use the exact integer sample count. Deriving it from
        # duration * sample_rate round-trips through float and can land a
        # hair above or below the true length.
        samples_total = len(audio)
        if samples_total == 0:
            # An empty file cannot be looped (division by zero) and a padded
            # copy would be pure silence, so skip it instead of crashing.
            warnings.warn(f'Skipping empty audio file: {filename}')
            continue

        # ntpath handles both '/' and '\\' separators; splitext drops the
        # extension whatever its length.
        name = ntpath.splitext(ntpath.basename(filename))[0]

        if samples_total < sample_buffer:
            pad_short(audio, sample_rate, sample_buffer, samples_total, out_path, name)
            loop_short(audio, sample_rate, sample_buffer, samples_total, out_path, name)
        else:
            # Everything that is not shorter than one window gets chunked.
            # A plain ``else`` avoids depending on a module-level
            # ``chunk_length`` and cannot let a file fall between the two
            # comparisons.
            chunk_long(audio, sample_rate, sample_buffer, samples_total, out_path, name)

            
def loop_short(audio, sample_rate, sample_buffer, samples_total, out_path, name):
    """Repeat a short clip until it fills ``sample_buffer`` samples and write it.

    The clip is concatenated with itself enough times to reach the target
    length, cut to ``int(sample_buffer)`` samples and written to
    ``{out_path}{name}_loop.wav``.

    Args:
        audio: Sample array of the clip to repeat.
        sample_rate: Sample rate passed to ``sf.write``.
        sample_buffer: Target length in samples.
        samples_total: Length of ``audio`` in samples as supplied by the caller.
        out_path: Output prefix the file name is appended to.
        name: Base name of the source file.
    """
    # Whole repetitions that fit, plus one more if a partial repetition is
    # needed to reach the target length.
    full_copies = int(sample_buffer / samples_total)
    needs_partial = sample_buffer % samples_total > 0
    repeats = full_copies + needs_partial

    # At least one copy is always kept, then the excess is trimmed off.
    looped = np.concatenate([audio] * max(repeats, 1))
    trimmed = looped[: int(sample_buffer)]

    sf.write(f'{out_path+name}_loop.wav', trimmed, sample_rate)

        
def pad_short(audio, sample_rate, sample_buffer, samples_total, out_path, name):
    """Zero-pad a short clip to ``int(sample_buffer)`` samples and write it.

    The result is written to ``{out_path}{name}_padded.wav``.

    Args:
        audio: Sample array of the clip; assumed 1-D (mono), since the
            padding is applied as ``(0, pad)``.
        sample_rate: Sample rate passed to ``sf.write``.
        sample_buffer: Target length in samples.
        samples_total: Accepted for signature compatibility. The pad width
            is derived from ``len(audio)`` instead, because a float
            ``samples_total`` that lands a hair above the true length would
            truncate the pad and leave the file one sample short.
        out_path: Output prefix the file name is appended to.
        name: Base name of the source file.

    Raises:
        ValueError: From ``np.pad`` if ``audio`` is already longer than
            ``sample_buffer`` (negative pad width).
    """
    # Same target length that loop_short trims to, so both variants of a
    # short clip come out with an identical number of samples.
    pad = int(sample_buffer) - len(audio)
    wave = np.pad(audio, (0, pad))

    sf.write(f'{out_path+name}_padded.wav', wave, sample_rate)
        
        
def chunk_long(audio, sample_rate, sample_buffer, samples_total, out_path, name):
    """Split a long clip into overlapping windows and write each one.

    Full windows are written as ``{out_path}{name}_chunk{n}.wav``. When fewer
    than ``sample_buffer`` samples remain, the tail is completed with samples
    from the start of the clip and written as ``{out_path}{name}_wrap{n}.wav``,
    but only if the tail is longer than ``min_samples``. ``n`` is a single
    counter shared by both kinds of file.

    Note: ``overlap_samples`` and ``min_samples`` are not parameters; they are
    read from module scope and must be defined before this is called.

    Args:
        audio: Sample array of the clip.
        sample_rate: Sample rate passed to ``sf.write``.
        sample_buffer: Window length in samples.
        samples_total: Length of ``audio`` in samples as supplied by the caller.
        out_path: Output prefix the file name is appended to.
        name: Base name of the source file.
    """
    def next_start(position):
        # Hop forward by one window minus the overlap.
        return int(position + sample_buffer - overlap_samples)

    start = 0
    index = 1
    while start < samples_total:
        if (samples_total - start) >= sample_buffer:
            # A whole window fits: write it unchanged and hop forward.
            window = audio[start: int(start + sample_buffer)]
            sf.write(f'{out_path+name}_chunk{index}.wav', window, sample_rate)
            start = next_start(start)
            index += 1

        # Re-evaluated after the hop above, so a tail can be handled in the
        # same iteration as the last full window.
        remaining = samples_total - start
        if remaining >= sample_buffer:
            continue

        if remaining > min_samples:
            # Complete the tail with samples wrapped around from the start.
            missing = int(sample_buffer - remaining)
            head = audio[0: int(missing)]
            tail = audio[start: int(start + sample_buffer)]
            sf.write(f'{out_path+name}_wrap{index}.wav',
                     np.concatenate([tail, head]), sample_rate)
            index += 1
        start = next_start(start)

In [None]:
windows = [(5.5, 2.75), (6, 3), (6.5, 3.25), (7, 3.5)]  # (chunk length, chunk overlap) in seconds
sample_rate = 44100
min_length = 1.25  # seconds; end-of-file tails no longer than this are not written by chunk_long
dsets = ['train', 'val', 'test']


def _source_name(chunk_file):
    """Return the source recording name encoded in a chunk file name, or None.

    Recognises the names the preprocessing helpers write:
    ``<name>_padded.wav``, ``<name>_loop.wav``, ``<name>_chunk<N>.wav`` and
    ``<name>_wrap<N>.wav``.
    """
    stem, ext = os.path.splitext(chunk_file)
    source, sep, tag = stem.rpartition('_')
    if ext != '.wav' or not sep:
        return None
    if tag in ('padded', 'loop'):
        return source
    for prefix in ('chunk', 'wrap'):
        if tag.startswith(prefix) and tag[len(prefix):].isdigit():
            return source
    return None


# The source metadata does not depend on the window, so read it once up front
# (this also fails fast if the file is missing, before any audio is processed).
df = pd.read_csv('../../data/metadata.csv')
df = df[['file_name', 'unique_file', 'path', 'label', 'subset']]

# Base name of every source recording and the output subset it belongs to.
source_names = [ntpath.splitext(ntpath.basename(path))[0] for path in df['path']]
source_subsets = ['val' if subset == 'validation' else subset for subset in df['subset']]

for chunk_length, chunk_overlap in windows:
    # calculate global variables
    # NOTE: overlap_samples and min_samples are read from module scope by
    # chunk_long, so these names must stay as they are.
    sample_buffer = chunk_length * sample_rate         # number of samples per chunk
    overlap_samples = chunk_overlap * sample_rate      # overlap of chunks in samples
    min_samples = min_length * sample_rate             # minimum end samples

    outdir = f'../../data/production_data/{str(chunk_length).replace(".", "-")}s_crop/'
    # exist_ok=False on purpose: chunks left over from an earlier run would
    # otherwise end up in the metadata written below.
    os.makedirs(outdir, exist_ok=False)

    chunks_by_subset = {}
    for ds in dsets:
        os.makedirs(f'{outdir}/{ds}', exist_ok=False)
        paths = glob(f'../../data/{ds}/*.wav')
        out_path = f'{outdir}/{ds}/'
        preprocess_audio(paths, sample_rate, sample_buffer, out_path)

        # Group the written chunks by their exact source name. A prefix glob
        # such as '<name>*.wav' would also pick up chunks of any recording
        # whose name merely starts with <name> (e.g. 'dog1' vs 'dog10').
        by_source = {}
        for chunk_file in sorted(os.listdir(out_path)):
            source = _source_name(chunk_file)
            if source is not None:
                by_source.setdefault(source, []).append(out_path + chunk_file)
        chunks_by_subset[ds] = by_source

    # One list of chunk paths per metadata row (empty if nothing was written).
    chunk_lists = [
        chunks_by_subset.get(subset, {}).get(name, [])
        for name, subset in zip(source_names, source_subsets)
    ]

    # Repeat every metadata row once per chunk, then point 'path' at the chunk.
    row_positions = np.repeat(np.arange(len(df)), [len(chunks) for chunks in chunk_lists])
    pp_df = (
        df.iloc[row_positions]
        .reset_index(drop=True)
        .assign(path=[chunk for chunks in chunk_lists for chunk in chunks])
    )
    pp_df.to_csv(f'{outdir}/metadata.csv', index=False)