In [3]:
import os
import librosa
import soundfile as sf
import numpy as np
import random
from tqdm import tqdm
import json

# --- Input locations (raw recordings) ---
CLEAN_RAW = "data/clean"
NOISE_RAW = "data/noise"

# --- Output locations (everything the pipeline writes) ---
CLEAN_FIXED = "preprocessed/clean_fixed"
NOISE_FIXED = "preprocessed/noise_fixed"
MIXED_OUT = "preprocessed/mixed"
PAIRS_OUT = "preprocessed/pairs"
FRAMES_OUT = "preprocessed/frames"

# Make sure every output directory exists; directories already present are left alone.
for out_dir in (CLEAN_FIXED, NOISE_FIXED, MIXED_OUT, PAIRS_OUT, FRAMES_OUT):
    os.makedirs(out_dir, exist_ok=True)

# --- Audio parameters ---
TARGET_SR = 16000             # sample rate (Hz) used when loading and writing audio
TARGET_LEN = 4 * TARGET_SR    # samples per clip: 4 seconds at TARGET_SR
FRAME_SIZE = 16384            # samples per frame in the framing step


In [4]:
def load_audio(path, sr=TARGET_SR):
    """Read an audio file as a mono signal at sample rate ``sr``.

    Args:
        path: Location of the audio file to read.
        sr: Sample rate passed to ``librosa.load`` (defaults to ``TARGET_SR``).

    Returns:
        The samples returned by ``librosa.load``; the sample rate it also
        returns is discarded.
    """
    samples = librosa.load(path, sr=sr, mono=True)[0]
    return samples

def normalize(audio):
    """Peak-normalize a signal so its largest absolute sample equals 1.

    Args:
        audio: Array of samples.

    Returns:
        ``audio`` divided by its absolute peak. When the peak is not
        strictly positive (e.g. an all-zero signal) the input is returned
        unchanged to avoid dividing by zero.
    """
    max_abs = np.abs(audio).max()
    if max_abs > 0:
        return audio / max_abs
    return audio

def trim_silence(audio):
    """Trim silence from a signal with ``librosa.effects.trim``.

    The trim threshold is fixed at ``top_db=40``.

    Args:
        audio: Array of samples.

    Returns:
        The trimmed signal; the index interval that ``librosa.effects.trim``
        also returns is discarded.
    """
    return librosa.effects.trim(audio, top_db=40)[0]

def fix_length(audio, target_len=TARGET_LEN):
    """Force a signal to exactly ``target_len`` samples.

    Signals longer than ``target_len`` are truncated to their first
    ``target_len`` samples. Shorter signals are repeated end-to-end until
    they are long enough and then truncated.

    Args:
        audio: 1-D sequence of samples.
        target_len: Desired number of samples (defaults to ``TARGET_LEN``).

    Returns:
        An array of length ``target_len``. An empty input cannot be looped,
        so it yields ``target_len`` zeros instead of raising
        ``ZeroDivisionError``.
    """
    n_samples = len(audio)
    if n_samples == 0:
        # Nothing to repeat: fall back to silence of the requested length.
        return np.zeros(target_len, dtype=np.asarray(audio).dtype)
    if n_samples < target_len:
        # Smallest repeat count whose tiled length reaches target_len.
        repeats = int(np.ceil(target_len / n_samples))
        audio = np.tile(audio, repeats)
    return audio[:target_len]

def random_crop(audio, target_len=TARGET_LEN):
    """Take a random window of ``target_len`` samples from a signal.

    Args:
        audio: Sequence of samples.
        target_len: Window length in samples (defaults to ``TARGET_LEN``).

    Returns:
        A slice of ``audio`` of length ``target_len`` starting at a random
        offset. Signals that are not longer than ``target_len`` are handed
        to ``fix_length`` instead.
    """
    surplus = len(audio) - target_len
    if surplus > 0:
        # randint is inclusive on both ends, so the last valid offset is `surplus`.
        offset = random.randint(0, surplus)
        return audio[offset:offset + target_len]
    return fix_length(audio, target_len)


In [5]:
# Standardise every clean recording: trim silence, fix the length, then
# peak-normalise, and write the result under the same file name.
clean_files = sorted(os.listdir(CLEAN_RAW))

for clean_name in tqdm(clean_files):
    raw = load_audio(os.path.join(CLEAN_RAW, clean_name))
    fixed = normalize(fix_length(trim_silence(raw)))

    out_path = os.path.join(CLEAN_FIXED, clean_name)
    sf.write(out_path, fixed, TARGET_SR)


100%|██████████| 1000/1000 [01:15<00:00, 13.23it/s]


In [6]:
# Collect every noise recording below NOISE_RAW (searched recursively).
# The extension check is case-insensitive so files such as "X.WAV" are kept.
noise_list = []

for root, dirs, files in os.walk(NOISE_RAW):
    for f in files:
        if f.lower().endswith((".wav", ".mp3")):
            noise_list.append(os.path.join(root, f))

# os.walk order is filesystem-dependent; sort for a stable processing order.
noise_list.sort()

print("Total noise files found:", len(noise_list))

for npath in tqdm(noise_list):
    audio = load_audio(npath)
    audio = normalize(audio)
    audio = trim_silence(audio)
    audio = fix_length(audio)  # Each noise also becomes exactly 4 sec

    # Build the flat output name from the path relative to NOISE_RAW rather
    # than from the basename alone, so two files with the same name in
    # different sub-folders do not overwrite each other in NOISE_FIXED.
    rel_path = os.path.relpath(npath, NOISE_RAW)
    outname = rel_path.replace(os.sep, "_").replace(" ", "_")
    sf.write(os.path.join(NOISE_FIXED, outname), audio, TARGET_SR)


Total noise files found: 260


100%|██████████| 260/260 [00:01<00:00, 140.34it/s]


In [7]:
def apply_snr(clean, noise, snr_db):
    """Mix ``noise`` into ``clean`` at a requested signal-to-noise ratio.

    The noise is multiplied by a gain chosen so that
    ``mean(clean**2) / mean((gain * noise)**2) == 10 ** (snr_db / 10)``.

    Args:
        clean: Array of clean samples.
        noise: Array of noise samples, added element-wise to ``clean``.
        snr_db: Target signal-to-noise ratio in decibels.

    Returns:
        ``clean + gain * noise``. If the noise has zero power (all samples
        are zero) no gain can reach the target ratio; a gain of 0 is used so
        the clean signal is returned unchanged instead of an array of NaNs.
    """
    clean_power = np.mean(clean**2)
    noise_power = np.mean(noise**2)

    target_noise_power = clean_power / (10**(snr_db / 10))

    if noise_power > 0:
        scaling = np.sqrt(target_noise_power / noise_power)
    else:
        # Silent noise: dividing by zero power would turn the mixture into NaN.
        scaling = 0.0

    return clean + noise * scaling


In [8]:
# For every fixed-length clean clip, create two noisy versions (A and B),
# each using a different noise file and its own random SNR in [0, 20] dB.
# NOTE(review): `random` is not seeded anywhere in view, so noise/SNR
# choices differ between runs; metadata.json records what was used.
clean_files = sorted(os.listdir(CLEAN_FIXED))
noise_files = sorted(os.listdir(NOISE_FIXED))

metadata = {}

for fname in tqdm(clean_files):
    clean_path = os.path.join(CLEAN_FIXED, fname)
    clean = load_audio(clean_path)

    # pick two different noises
    n1, n2 = random.sample(noise_files, 2)

    noiseA = load_audio(os.path.join(NOISE_FIXED, n1))
    noiseB = load_audio(os.path.join(NOISE_FIXED, n2))

    snrA = random.randint(0, 20)
    snrB = random.randint(0, 20)

    noisyA = apply_snr(clean, noiseA, snrA)
    noisyB = apply_snr(clean, noiseB, snrB)

    # Strip only the trailing extension; str.replace would also remove
    # ".wav" occurring in the middle of a name and ignore other extensions.
    base = os.path.splitext(fname)[0]

    sf.write(f"{MIXED_OUT}/{base}_A.wav", noisyA, TARGET_SR)
    sf.write(f"{MIXED_OUT}/{base}_B.wav", noisyB, TARGET_SR)

    np.save(f"{PAIRS_OUT}/{base}_A.npy", noisyA)
    np.save(f"{PAIRS_OUT}/{base}_B.npy", noisyB)

    metadata[base] = {
        "clean": fname,
        "noise_A": n1,
        "noise_B": n2,
        "snr_A": snrA,
        "snr_B": snrB
    }

# Use a context manager so the file is flushed and closed deterministically.
with open("preprocessed/metadata.json", "w") as meta_file:
    json.dump(metadata, meta_file, indent=4)


100%|██████████| 1000/1000 [00:07<00:00, 129.33it/s]


In [9]:
# Cut every saved noisy signal into non-overlapping frames of FRAME_SIZE
# samples; a trailing remainder shorter than FRAME_SIZE is discarded.
# The directory listing is sorted so files are processed in a stable order.
for npy_file in tqdm(sorted(os.listdir(PAIRS_OUT))):
    if not npy_file.endswith(".npy"):
        continue

    arr = np.load(os.path.join(PAIRS_OUT, npy_file))

    # Strip only the trailing ".npy" extension.
    base = os.path.splitext(npy_file)[0]

    # The "+ 1" makes the stop bound inclusive of the last valid start offset,
    # so a signal whose length is an exact multiple of FRAME_SIZE keeps its
    # final full frame (range() excludes its stop value).
    for i in range(0, len(arr) - FRAME_SIZE + 1, FRAME_SIZE):
        frame = arr[i:i + FRAME_SIZE]
        # The suffix is the frame's starting sample offset, not a frame index.
        outname = f"{base}_frame{i}.npy"
        np.save(os.path.join(FRAMES_OUT, outname), frame)


100%|██████████| 2000/2000 [00:04<00:00, 460.02it/s]
