In [None]:
import os
import librosa
import numpy as np
import noisereduce as nr
import soundfile as sf
from scipy.signal import butter, sosfilt
from tqdm import tqdm
from multiprocessing import Pool
import time
from pydub import AudioSegment


def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono float waveform sampled at ``target_sr``.

    Files whose name ends in ``.mp3`` (case-insensitive) are decoded with
    pydub, down-mixed to one channel, scaled to floats and, if the decoded
    frame rate differs from ``target_sr``, resampled with librosa. Any other
    file is loaded directly by ``librosa.load`` with ``sr=target_sr`` and
    ``mono=True``.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate, in Hz, of the returned waveform.

    Returns:
        A ``(samples, sample_rate)`` tuple on success, or ``(None, None)``
        if loading fails for any reason (the error is printed, not raised,
        so one bad file does not abort a batch run).
    """
    try:
        if not file_path.lower().endswith(".mp3"):
            return librosa.load(file_path, sr=target_sr, mono=True)

        segment = AudioSegment.from_file(file_path, format="mp3").set_channels(1)
        # Scale by the integer full-scale value of the decoded sample width
        # (bytes per sample) instead of assuming 16-bit PCM; for 16-bit
        # audio this is still 32768.
        full_scale = float(1 << (8 * segment.sample_width - 1))
        samples = (
            np.array(segment.get_array_of_samples()).astype(np.float32) / full_scale
        )
        source_sr = segment.frame_rate
        if source_sr != target_sr:
            samples = librosa.resample(
                samples, orig_sr=source_sr, target_sr=target_sr
            )
        return samples, target_sr
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job.
        print(f"Error loading {file_path}: {e}")
        return None, None


def normalize_volume(samples):
    """Return ``samples`` rescaled by ``librosa.util.normalize``.

    The call uses librosa's default arguments, so the kind of normalization
    applied is whatever that function does by default.
    """
    normalized = librosa.util.normalize(samples)
    return normalized


def reduce_noise(samples, sr):
    """Run ``noisereduce`` over ``samples`` and return the denoised signal.

    Args:
        samples: Waveform to denoise.
        sr: Sample rate of ``samples`` in Hz.

    ``prop_decrease`` is fixed at 0.5 rather than the library default.
    """
    settings = {"y": samples, "sr": sr, "prop_decrease": 0.5}
    return nr.reduce_noise(**settings)


def bandpass_filter(samples, sr, low=80, high=8000):
    """Apply a 10th-order Butterworth band-pass filter to ``samples``.

    Args:
        samples: Waveform to filter.
        sr: Sample rate of ``samples`` in Hz.
        low: Lower band edge in Hz.
        high: Upper band edge in Hz. If it is at or above the Nyquist
            frequency (``sr / 2``) it is capped at 99% of Nyquist.

    Returns:
        The filtered signal as returned by ``scipy.signal.sosfilt``.

    Raises:
        ValueError: Propagated from ``scipy.signal.butter`` when the edges
            are still invalid (e.g. ``low <= 0`` or ``low`` not below the
            effective upper edge).
    """
    nyquist = sr / 2
    # butter() requires 0 < Wn < fs/2 strictly. The default high=8000 equals
    # Nyquist at a 16 kHz sample rate, which would otherwise raise for every
    # call, so pull an out-of-range upper edge just inside the valid band.
    upper = high if high < nyquist else 0.99 * nyquist
    sos = butter(10, [low, upper], btype="band", fs=sr, output="sos")
    return sosfilt(sos, samples)


def remove_silence_from_array(y, sr, silence_thresh=-35, min_silence_len=300):
    """Cut silent stretches out of ``y`` and return the remaining audio.

    Non-silent intervals are found with ``librosa.effects.split``; the
    negated ``silence_thresh`` is passed as its ``top_db`` argument.
    Pauses between consecutive non-silent intervals are removed only when
    they last at least ``min_silence_len`` milliseconds; shorter pauses are
    kept. Audio before the first and after the last non-silent interval is
    always dropped.

    Args:
        y: Waveform to trim.
        sr: Sample rate of ``y`` in Hz, used to convert ``min_silence_len``
            to samples.
        silence_thresh: Threshold in dB (negative); negated for ``top_db``.
        min_silence_len: Minimum pause length, in milliseconds, that is
            removed. ``0`` removes every detected pause.

    Returns:
        The concatenated retained audio, or ``y`` unchanged when no
        non-silent interval is detected.
    """
    intervals = librosa.effects.split(
        y, top_db=-silence_thresh, frame_length=2048, hop_length=512
    )
    if not intervals.size:
        print("No speech detected")
        return y

    # Bridge pauses shorter than the minimum so they stay in the output.
    min_gap = int(sr * min_silence_len / 1000)
    merged = [[intervals[0][0], intervals[0][1]]]
    for start, end in intervals[1:]:
        if start - merged[-1][1] < min_gap:
            merged[-1][1] = end
        else:
            merged.append([start, end])

    return np.concatenate([y[start:end] for start, end in merged])


def preprocess_audio(file_path, output_path):
    """Run the full cleaning pipeline on one file and write the result.

    Steps, in order: load, noise reduction, volume normalization, band-pass
    filtering, silence removal, write with ``soundfile``. If
    ``output_path`` already exists the file is skipped, which makes an
    interrupted batch resumable.

    The result is first written to a temporary sibling of ``output_path``
    (same extension, so ``soundfile`` infers the same format) and then moved
    into place with ``os.replace``. A failed or interrupted write therefore
    never leaves a partial file that a later run would skip as "already
    processed".

    Args:
        file_path: Path of the input audio file.
        output_path: Path of the processed file to create.

    Returns:
        None. Errors are printed rather than raised so that one bad file
        does not stop a batch run.
    """
    if os.path.exists(output_path):
        print(f"Skipping {file_path}: already processed")
        return
    print(f"Processing: {file_path}")
    start_time = time.time()

    root, ext = os.path.splitext(output_path)
    tmp_path = f"{root}.part{ext}"
    try:
        y, sr = load_audio(file_path)
        if y is None or sr is None:
            return

        y = reduce_noise(y, sr)
        y = normalize_volume(y)
        # NOTE(review): filtering after normalization can change the peak
        # level again; confirm this ordering is intended.
        y = bandpass_filter(y, sr)
        y = remove_silence_from_array(y, sr)

        sf.write(tmp_path, y, sr)
        os.replace(tmp_path, output_path)
        print(f"Saved to: {output_path} in {time.time() - start_time:.2f}s")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
    finally:
        # Drop any partial temp file; after a successful replace (or when
        # nothing was written) it does not exist and the error is ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def preprocess_audio_wrapper(args):
    """Unpack an ``(input_path, output_path)`` pair and process it.

    Exists so a single-argument callable can be handed to the pool's
    ``imap_unordered`` in ``preprocess_folder``.
    """
    source, destination = args
    preprocess_audio(source, destination)


def _processed_name(filename):
    """Return the output file name for ``filename`` (final ``.mp3`` -> ``.wav``)."""
    if filename.lower().endswith(".mp3"):
        # Swap only the real extension, whatever its case; str.replace would
        # miss ".MP3" and could also rewrite ".mp3" inside the stem.
        return os.path.splitext(filename)[0] + ".wav"
    return filename


def preprocess_folder(folder_path="data", output_folder="processed"):
    """Preprocess every ``.wav``/``.mp3`` file in a folder using a process pool.

    Files directly inside ``folder_path`` whose names end in ``.wav`` or
    ``.mp3`` (case-insensitive) are processed by ``preprocess_audio`` and
    written to ``output_folder``. MP3 inputs get a ``.wav`` output name;
    other names are kept unchanged. Progress is shown with ``tqdm``.

    Args:
        folder_path: Directory containing the input audio files.
        output_folder: Directory for processed files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    files = [f for f in os.listdir(folder_path) if f.lower().endswith((".wav", ".mp3"))]
    print(f"Found {len(files)} files")

    tasks = [
        (
            os.path.join(folder_path, f),
            os.path.join(output_folder, _processed_name(f)),
        )
        for f in files
    ]

    # Leave one core free, but never ask for fewer than one worker:
    # cpu_count() can return 1 (-> 0 workers, ValueError) or None (TypeError).
    workers = max(1, (os.cpu_count() or 1) - 1)
    with Pool(processes=workers) as pool:
        for _ in tqdm(
            pool.imap_unordered(preprocess_audio_wrapper, tasks),
            total=len(tasks),
            desc="Processing files",
        ):
            pass


# Run the batch only when executed as a script; the guard keeps worker
# processes that re-import this module from starting the batch again.
if __name__ == "__main__":
    preprocess_folder(folder_path="data", output_folder="processed")

Found 209791 files


Processing files:   0%|          | 0/209791 [00:00<?, ?it/s]