In [12]:
import os
import numpy as np
import librosa
import soundfile as sf
from tqdm import tqdm

In [13]:
# ================= CONFIG =================
SAMPLE_RATE = 16000          # standard for speech
N_FFT = 1024
HOP_LENGTH = 256
WIN_LENGTH = 1024
WINDOW = "hann"

CLIP_DURATION = 4.0          # seconds
NUM_SAMPLES = int(SAMPLE_RATE * CLIP_DURATION)

# ================= PATHS =================
DATASET_DIR = "data"
CLEAN_DIR = os.path.join(DATASET_DIR, "clean")
NOISE_DIR = os.path.join(DATASET_DIR, "noise")

OUT_DIR = "processed_data"
os.makedirs(OUT_DIR, exist_ok=True)

In [14]:
def load_audio(path, sr=SAMPLE_RATE):
    audio, _ = librosa.load(path, sr=sr, mono=True)
    return audio

def fix_length(audio, length=NUM_SAMPLES):
    if len(audio) > length:
        return audio[:length]
    elif len(audio) < length:
        return np.pad(audio, (0, length - len(audio)))
    return audio

def compute_stft(audio):
    stft = librosa.stft(
        audio,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        win_length=WIN_LENGTH,
        window=WINDOW
    )
    magnitude = np.abs(stft)
    return magnitude

def collect_files_recursive(rootdir, extensions):
    files = []
    extensions = tuple(ext.lower() for ext in extensions)

    for root, _, filenames in os.walk(rootdir):
        for fname in filenames:
            if fname.lower().endswith(extensions):
                files.append(os.path.join(root, fname))

    return sorted(files)

def add_noise(clean, noise, snr_db=5):
    clean_power = np.mean(clean ** 2)
    noise_power = np.mean(noise ** 2)

    snr = 10 ** (snr_db / 10)
    scale = np.sqrt(clean_power / (snr * noise_power))

    noisy = clean + scale * noise
    return noisy


In [15]:
clean_files = collect_files_recursive(CLEAN_DIR, (".wav", ".mp3"))
noise_files = collect_files_recursive(NOISE_DIR, (".wav",))

print(f"Clean files: {len(clean_files)}")
print(f"Noise files: {len(noise_files)}")

Clean files: 1000
Noise files: 260


In [16]:
K = 3                         # noisy variants per clean
SNR_LIST = [0, 5, 10]         # optional but recommended

X_noisy = []
Y_clean = []

for clean_path in clean_files:
    # Load clean once
    clean_audio = load_audio(clean_path)
    clean_audio = fix_length(clean_audio)

    # Precompute clean STFT once
    clean_mag = compute_stft(clean_audio)

    for i in range(K):
        # Random noise every time
        noise_path = np.random.choice(noise_files)
        noise_audio = load_audio(noise_path)
        noise_audio = fix_length(noise_audio)

        snr = SNR_LIST[i] if i < len(SNR_LIST) else 5

        noisy_audio = add_noise(clean_audio, noise_audio, snr_db=snr)
        noisy_mag = compute_stft(noisy_audio)

        X_noisy.append(noisy_mag)
        Y_clean.append(clean_mag)

    
    # STFT
    clean_mag = compute_stft(clean_audio)
    noisy_mag = compute_stft(noisy_audio)

    # Store
    X_noisy.append(noisy_mag)
    Y_clean.append(clean_mag)

In [17]:
X_noisy = np.array(X_noisy, dtype=np.float32)
Y_clean = np.array(Y_clean, dtype=np.float32)

print("Noisy shape :", X_noisy.shape)
print("Clean shape :", Y_clean.shape)

Noisy shape : (4000, 513, 251)
Clean shape : (4000, 513, 251)


In [19]:
X_noisy = X_noisy[:, np.newaxis, :, :]
Y_clean = Y_clean[:, np.newaxis, :, :]

print("CNN input shape:", X_noisy.shape)

CNN input shape: (4000, 1, 1, 513, 251)


In [20]:
np.save(os.path.join(OUT_DIR, "X_noisy.npy"), X_noisy)
np.save(os.path.join(OUT_DIR, "Y_clean.npy"), Y_clean)

print("Saved preprocessed data.")

Saved preprocessed data.


In [22]:
data = np.load('processed_data/X_noisy.npy')

In [23]:
stft_sample = data[0]

In [24]:
magnitude = np.abs(stft_sample)
stft_db = 20 * np.log10(magnitude + 1e-6) # Add small epsilon to avoid log(0)

In [26]:
import matplotlib.pyplot as plt

In [28]:
plt.figure(figsize=(10, 4))
stft_db = librosa.amplitude_to_db(stft_mag, ref=np.max)
plt.imshow(stft_db, aspect='auto', origin='lower', cmap='magma')
plt.title('STFT Magnitude (dB) - First Audio Clip')
plt.ylabel('Frequency Bin')
plt.xlabel('Time Frame')
plt.colorbar(format='%+2.0f dB')
plt.tight_layout()
plt.show()

NameError: name 'stft_mag' is not defined

<Figure size 1000x400 with 0 Axes>