In [11]:
import os
import librosa
import numpy as np
import noisereduce as nr
import soundfile as sf
from pydub import AudioSegment
from pydub.silence import split_on_silence
from joblib import Parallel, delayed
import multiprocessing


def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono float waveform at ``target_sr``.

    Files whose name ends in ``.mp3`` (case-insensitive) are decoded with
    pydub, down-mixed to one channel, scaled to floats and resampled with
    librosa. Every other file is handed to ``librosa.load`` with
    ``sr=target_sr``.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate (Hz) requested for the returned waveform.

    Returns:
        A ``(samples, sample_rate)`` tuple.
    """
    if file_path.lower().endswith(".mp3"):
        audio = AudioSegment.from_file(file_path, format="mp3").set_channels(1)
        # Scale by the segment's actual integer range rather than assuming
        # 16-bit samples: full scale is 2 ** (bits - 1), i.e. 32768 for the
        # usual 2-byte sample width.
        full_scale = float(1 << (8 * audio.sample_width - 1))
        samples = (
            np.array(audio.get_array_of_samples()).astype(np.float32) / full_scale
        )
        y = librosa.resample(
            samples, orig_sr=audio.frame_rate, target_sr=target_sr
        )
        return y, target_sr

    y, sr = librosa.load(file_path, sr=target_sr)
    return y, sr


def normalize_volume(samples):
    """Return ``samples`` normalised by ``librosa.util.normalize``.

    The call uses librosa's default arguments; the input is passed through
    unchanged otherwise.
    """
    normalized = librosa.util.normalize(samples)
    return normalized


def reduce_noise(samples, sr):
    """Return ``samples`` after noise reduction with ``noisereduce``.

    Args:
        samples: Waveform to clean.
        sr: Sample rate of ``samples``, forwarded to ``nr.reduce_noise``.
    """
    denoised = nr.reduce_noise(y=samples, sr=sr)
    return denoised


def bandpass_filter(samples, sr, low=80, high=8000):
    """Zero every STFT bin outside ``[low, high]`` Hz and resynthesise.

    Args:
        samples: Input waveform.
        sr: Sample rate of ``samples`` in Hz.
        low: Lowest frequency (Hz) to keep, inclusive.
        high: Highest frequency (Hz) to keep, inclusive.

    Returns:
        The filtered waveform, with the same number of samples as the input.
    """
    # Same value as librosa's default; naming it keeps the STFT and the
    # frequency axis in agreement if either call is ever changed.
    n_fft = 2048

    spectrum = librosa.stft(samples, n_fft=n_fft)
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    keep = (freqs >= low) & (freqs <= high)
    spectrum[~keep, :] = 0

    # Without ``length`` the inverse transform is not guaranteed to return
    # exactly as many samples as were passed in.
    return librosa.istft(spectrum, length=len(samples))


def remove_silence_from_array(y, sr, silence_thresh=-35, min_silence_len=300):
    """Drop silent stretches from a waveform using pydub.

    The waveform is round-tripped through a temporary 16-bit WAV file so
    that pydub's ``split_on_silence`` can operate on it; the non-silent
    chunks are then concatenated back into one array.

    Args:
        y: Input waveform.
        sr: Sample rate of ``y`` in Hz.
        silence_thresh: Forwarded to ``split_on_silence`` (pydub documents
            this as a level in dBFS).
        min_silence_len: Forwarded to ``split_on_silence`` (pydub documents
            this as a duration in milliseconds).

    Returns:
        A float32 array containing the concatenated non-silent chunks, or
        ``y`` unchanged when no chunk is found.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    # A private temporary directory avoids littering the working directory
    # and is removed even if writing, decoding or splitting raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, "silence_input.wav")
        # Pin the subtype so the /32768 scaling below always matches.
        sf.write(temp_path, y, sr, subtype="PCM_16")
        sound = AudioSegment.from_wav(temp_path)

    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=100,
    )

    if not chunks:
        print("No speech detected")
        return y

    combined = AudioSegment.empty()
    for chunk in chunks:
        combined += chunk

    samples = np.array(combined.get_array_of_samples()).astype(np.float32) / 32768.0
    return samples


def preprocess_audio(file_path, output_path):
    """Clean one audio file and write the result to ``output_path``.

    Stages, in order: load, noise reduction, volume normalisation,
    band-pass filtering, silence removal, write. Any exception raised by a
    stage is caught and reported on stdout so that one bad file does not
    propagate an error to the caller.

    Args:
        file_path: Path of the input audio file.
        output_path: Path the processed audio is written to.
    """
    print(f"Processing: {file_path}")
    try:
        waveform, rate = load_audio(file_path)
        denoised = reduce_noise(waveform, rate)
        leveled = normalize_volume(denoised)
        filtered = bandpass_filter(leveled, rate)
        trimmed = remove_silence_from_array(filtered, rate)
        sf.write(output_path, trimmed, rate)
        print(f"Saved to: {output_path}")
    except Exception as exc:
        print(f"Error processing {file_path}: {exc}")


def process_file(filename, folder_path, output_folder):
    """Preprocess one directory entry if it is a ``.wav`` or ``.mp3`` file.

    Entries with any other extension are ignored. The output keeps the
    input's base name and always carries a ``.wav`` extension.

    Args:
        filename: Name of the entry inside ``folder_path``.
        folder_path: Directory containing the input file.
        output_folder: Directory the processed file is written to.
    """
    if not filename.lower().endswith((".wav", ".mp3")):
        return

    # splitext swaps only the final extension and is case-insensitive by
    # construction, so "clip.MP3" becomes "clip.wav" and a ".mp3" elsewhere
    # in the name is left alone.
    stem, _ = os.path.splitext(filename)
    input_path = os.path.join(folder_path, filename)
    output_path = os.path.join(output_folder, stem + ".wav")
    preprocess_audio(input_path, output_path)


def preprocess_folder(folder_path="data", output_folder="processed"):
    """Preprocess every entry of ``folder_path`` in parallel.

    Each directory entry is dispatched to ``process_file`` through joblib,
    with one job per CPU core reported by ``multiprocessing.cpu_count()``.

    Args:
        folder_path: Directory whose entries are processed.
        output_folder: Directory for the results; created if missing.
    """
    # exist_ok avoids the check-then-create race when several runs (or
    # several folders sharing a parent) create the directory concurrently.
    os.makedirs(output_folder, exist_ok=True)

    files = os.listdir(folder_path)
    print(f"Found files in {folder_path}: {files}")

    # Use all available CPU cores
    num_cores = multiprocessing.cpu_count()
    Parallel(n_jobs=num_cores, verbose=10)(
        delayed(process_file)(filename, folder_path, output_folder)
        for filename in files
    )


if __name__ == "__main__":
    # (input directory, output directory) pairs to process; the disabled
    # batches are kept here so they can be re-enabled by uncommenting.
    folders = [
        # ("./data_batches/Female_Fifties", "./processed/Female_Fifties"),
        # ("./data_batches/Female_Twenties", "./processed/Female_Twenties"),
        # ("./data_batches/Male_Fifties", "./processed/Male_Fifties"),
        ("./data_batches/Male_Twenties", "./processed/Male_Twenties"),
    ]

    # Batches run one after another; parallelism happens inside each batch.
    for input_folder, output_folder in folders:
        preprocess_folder(folder_path=input_folder, output_folder=output_folder)

Found files in ./data_batches/Male_Twenties: ['.gitkeep', 'common_voice_en_100307.mp3', 'common_voice_en_100319.mp3', 'common_voice_en_100320.mp3', 'common_voice_en_100336.mp3', 'common_voice_en_100342.mp3', 'common_voice_en_100377.mp3', 'common_voice_en_100379.mp3', 'common_voice_en_100380.mp3', 'common_voice_en_100384.mp3', 'common_voice_en_100396.mp3', 'common_voice_en_100398.mp3', 'common_voice_en_100400.mp3', 'common_voice_en_100406.mp3', 'common_voice_en_100422.mp3', 'common_voice_en_100450.mp3', 'common_voice_en_100461.mp3', 'common_voice_en_100462.mp3', 'common_voice_en_100467.mp3', 'common_voice_en_100470.mp3', 'common_voice_en_100482.mp3', 'common_voice_en_100493.mp3', 'common_voice_en_100504.mp3', 'common_voice_en_10058.mp3', 'common_voice_en_10059.mp3', 'common_voice_en_100706.mp3', 'common_voice_en_100717.mp3', 'common_voice_en_100723.mp3', 'common_voice_en_100731.mp3', 'common_voice_en_100733.mp3', 'common_voice_en_101145.mp3', 'common_voice_en_101146.mp3', 'common_voice_

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   13.1s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   13.2s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:   14.0s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   14.9s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   15.9s
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:   16.6s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   17.5s
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:   18.7s
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:   19.6s
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:   20.7s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:   22.0s
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed:   23.2s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   24.5s
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:   26.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   27.4s
[Parallel(

KeyboardInterrupt: 