In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spcup-noisedataset/noise-free-sound-0162.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0177.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0161.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0336.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0034.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0261.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0223.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0014.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0354.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0141.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0249.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0520.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0080.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0526.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0216.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0079.wav
/kaggle/input/spcup-noisedataset/noise-free-sound-0517.w

In [None]:
### 1. INSTALLATIONS ###
!pip install pyroomacoustics librosa pystoi pesq


### 2. IMPORTS ###
import pyroomacoustics as pra
import librosa
import numpy as np
import os
import glob
from scipy.signal import stft, istft
import warnings
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, ReLU, Flatten, Dense
import random
from scipy.linalg import eigh
from pesq import pesq
from pystoi import stoi
import soundfile as sf

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
print("Libraries imported.")


### 3. DATA SETUP (MODIFIED FOR KAGGLE) ###

# Download and extract LibriSpeech (requires Internet to be ON in Kaggle)
print("Downloading LibriSpeech test-clean (OpenSLR 12)...")
!wget -q https://www.openslr.org/resources/12/test-clean.tar.gz

print("Extracting LibriSpeech...")
# Extract to the current working directory. The '>' redirects verbose output
!tar -xzvf test-clean.tar.gz > /dev/null
print("Download and extraction complete.")

# --- Define File Paths ---
# The data was extracted to /kaggle/working/LibriSpeech/
SPEECH_DIR = "/kaggle/working/LibriSpeech/test-clean/"

# ‼️ IMPORTANT: KAGGLE DATASET PATH FOR NOISE ‼️
# 1. Zip your local "Noise" folder.
# 2. Create a new Kaggle Dataset and upload that zip.
# 3. Add that dataset to this Kaggle notebook.
# 4. Update the path below to match your dataset name.
#
# Example: If you named your dataset 'my-noise-files' and it contains the 'Noise' folder:
# NOISE_DIR = "/kaggle/input/my-noise-files/Noise/"
#
# --- UPDATE THIS LINE ---
NOISE_DIR = "/kaggle/input/spcup-noisedataset" # <-- ‼️ UPDATE ME

# --- Find Audio Files ---
print("Locating audio files...")

# Find speech files
speech_files = glob.glob(os.path.join(SPEECH_DIR, "**", "*.flac"), recursive=True)

# Find noise files
extensions_to_check = ["*.wav", "*.mp3", "*.flac", "*.m4a"]
noise_files = []
for ext in extensions_to_check:
    noise_files.extend(glob.glob(os.path.join(NOISE_DIR, "**", ext), recursive=True))

# --- Final Check ---
if not speech_files:
    print(f"❌ ERROR: No speech files found in {SPEECH_DIR}. Check path.")
elif not noise_files:
    print(f"❌ ERROR: No noise files found in {NOISE_DIR}. Check your Kaggle dataset path.")
else:
    print(f"✅ Found {len(speech_files)} speech files.")
    print(f"✅ Found {len(noise_files)} noise files.")
    print("\nReady to proceed.")


### 4. STFT AND SIGNAL PARAMETERS ###

# --- STFT Parameters from Heymann et al. paper ---
fs_16k = 16000     # Sampling rate
N_FFT = 1024       # Frame size
HOP_LENGTH = 256
WIN_LENGTH = 1024

# --- Parameters for Data/Model Shape ---
N_BINS = (N_FFT // 2) + 1  # 513
N_MASKS = 2
WINDOW_SIZE = 11


### 5. CORE FUNCTIONS (SIMULATION, MASKING, DATA PREP) ###

def get_stft(signal):
    """Calculates the STFT of a signal."""
    _, _, Zxx = stft(signal,
                       fs=fs_16k,
                       nperseg=WIN_LENGTH,
                       noverlap=WIN_LENGTH - HOP_LENGTH,
                       nfft=N_FFT)
    return Zxx

def get_power(signal):
    """Calculates the average power of a signal."""
    return np.mean(signal**2)

def set_power(signal, ref_power):
    """Scales a signal to a desired power."""
    current_power = get_power(signal)
    if current_power == 0:
        return signal
    scale_factor = np.sqrt(ref_power / current_power)
    return signal * scale_factor

def add_wgn(signal, snr_db):
    """Adds White Gaussian Noise to a signal at a specific SNR."""
    signal_power = get_power(signal)
    noise_power = signal_power / (10**(snr_db / 10))

    # Generate noise
    wgn = np.random.normal(0, np.sqrt(noise_power), signal.shape)

    # Return the noisy signal and the noise itself
    return signal + wgn, wgn

def generate_anechoic_sample(speech_file, noise_file):
    """
    Generates one training sample for the anechoic case (Task 1).
    Returns the mixed STFT (input) and the ideal masks (target).
    """

    # === 1. Define Simulation Parameters (from SP Cup Doc) ===
    room_dim = [4.9, 4.9, 4.9]
    mic_pos_1 = [2.41, 2.45, 1.5]
    mic_pos_2 = [2.49, 2.45, 1.5]
    mic_locations = np.c_[mic_pos_1, mic_pos_2]
    speech_pos = [2.45, 3.45, 1.5]
    noise_pos = [3.22, 3.06, 1.5]

    # === 2. Load and Resample Audio ===
    try:
        speech_audio, _ = librosa.load(speech_file, sr=fs_16k, mono=True)
        noise_audio, _ = librosa.load(noise_file, sr=fs_16k, mono=True)
    except Exception as e:
        print(f"Error loading audio: {e}")
        return None, None

    # Truncate to the shorter audio length
    min_len = min(len(speech_audio), len(noise_audio))
    speech_audio = speech_audio[:min_len]
    noise_audio = noise_audio[:min_len]

    if min_len < WIN_LENGTH:
        return None, None # Skip short files

    # === 3. Run Simulation (Speech Only) ===
    room_speech = pra.ShoeBox(room_dim, fs=fs_16k, max_order=0)
    room_speech.add_microphone_array(mic_locations)
    room_speech.add_source(speech_pos, signal=speech_audio)
    room_speech.simulate()
    X_mic = room_speech.mic_array.signals.T

    # === 4. Run Simulation (Noise Only) ===
    room_noise = pra.ShoeBox(room_dim, fs=fs_16k, max_order=0)
    room_noise.add_microphone_array(mic_locations)
    room_noise.add_source(noise_pos, signal=noise_audio)
    room_noise.simulate()
    I_mic = room_noise.mic_array.signals.T

    # === 5. Truncate and Align ===
    X_mic = X_mic[:min_len, :]
    I_mic = I_mic[:min_len, :]

    # === 6. Mix Signals (0dB SIR, 5dB SNR) ===
    target_power = get_power(X_mic[:, 0])
    I_mic_scaled = set_power(I_mic, target_power)
    mixture_si = X_mic + I_mic_scaled
    mixture_final, wgn = add_wgn(mixture_si, snr_db=5.0)

    # === 7. Generate Masks (Heymann et al. method) ===
    N_total = I_mic_scaled + wgn
    X_stft = get_stft(X_mic[:, 0])
    N_stft = get_stft(N_total[:, 0])
    X_power = np.abs(X_stft)**2
    N_power = np.abs(N_stft)**2
    IBM_X = (X_power > N_power).astype(float)
    IBM_N = (N_power >= X_power).astype(float)

    # Get the STFT of the final mixed signal (for network input)
    mixture_stft_ch0 = get_stft(mixture_final[:, 0])
    mixture_stft_ch1 = get_stft(mixture_final[:, 1])

    # Stack the magnitude of the 2-channel STFTs
    network_input = np.stack([
        np.abs(mixture_stft_ch0),
        np.abs(mixture_stft_ch1)
    ], axis=-1)

    # Stack the target masks
    target_masks = np.stack([IBM_X, IBM_N], axis=-1)

    return network_input, target_masks

def create_windowed_dataset(network_input, target_masks, window_size=WINDOW_SIZE):
    """
    Converts full-length spectrograms into windowed patches
    for the CNN/FF networks.
    """
    # Pad the input so we can get a window centered on the first/last frames
    padding = window_size // 2

    # Pad the time axis (axis=0)
    input_padded = np.pad(
        network_input,
        ((padding, padding), (0, 0), (0, 0)),
        mode='reflect'
    )

    # Use a built-in Keras utility to do the sliding window
    X_patches = tf.image.extract_patches(
        images=np.expand_dims(input_padded, axis=0), # Add batch dim
        sizes=[1, window_size, N_BINS, 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID'
    ).numpy()

    # Squeeze and reshape
    X_patches = np.squeeze(X_patches, axis=(0, 2))
    n_frames = network_input.shape[0]
    n_channels = network_input.shape[2]
    X_patches = X_patches.reshape((n_frames, window_size, N_BINS, n_channels))

    # --- Create the target (y)
    # Reshape from (513, n_frames, 2) to (n_frames, 513, 2)
    y_swapped = np.swapaxes(target_masks, 0, 1)

    # Reshape to (n_frames, 513 * 2) -> (n_patches, 1026)
    y_masks = y_swapped.reshape((n_frames, N_BINS * N_MASKS))

    return X_patches, y_masks

def generate_data_batch(num_samples, speech_list, noise_list):
    """
    Generates a batch of (X, y) data by randomly pairing
    speech and noise files from the provided lists.
    """
    all_X_data = []
    all_y_data = []

    if not speech_list or not noise_list:
        print("❌ Error: Received empty speech_list or noise_list.")
        return None, None

    for i in range(num_samples):
        speech_file = random.choice(speech_list)
        noise_file = random.choice(noise_list)

        network_input, target_masks = generate_anechoic_sample(speech_file, noise_file)
        if network_input is None:
            continue # Skip this sample

        inp_transposed = np.transpose(network_input, (1, 0, 2))
        X_ch0, y_ch0 = create_windowed_dataset(inp_transposed[..., 0:1], target_masks)
        X_ch1, y_ch1 = create_windowed_dataset(inp_transposed[..., 1:2], target_masks)

        all_X_data.append(X_ch0)
        all_X_data.append(X_ch1)
        all_y_data.append(y_ch0)
        all_y_data.append(y_ch1)

    if not all_X_data:
        print("❌ Error: No data was generated.")
        return None, None

    X_batch = np.concatenate(all_X_data, axis=0)
    y_batch = np.concatenate(all_y_data, axis=0)

    return X_batch, y_batch

def reconstruct_masks(patched_masks, n_frames):
    """
    Converts the model's patch-based output back into
    full-length spectrogram masks.
    """
    # 1. Reshape to (n_frames, 513, 2)
    full_masks = patched_masks.reshape((n_frames, N_BINS, N_MASKS))

    # 2. Swap axes to (513, n_frames, 2)
    full_masks = np.transpose(full_masks, (1, 0, 2))

    # 3. Separate the two masks
    predicted_mask_X = full_masks[..., 0]
    predicted_mask_N = full_masks[..., 1]

    return predicted_mask_X, predicted_mask_N

def gev_beamformer(mixture_stft_2ch, mask_X, mask_N):
    """
    Applies the GEV beamformer using the predicted masks.
    """
    n_bins, n_frames, n_ch = mixture_stft_2ch.shape
    enhanced_stft = np.zeros((n_bins, n_frames), dtype=np.complex128)
    epsilon = 1e-8 # Add a small value to prevent division by zero

    # Process each frequency bin independently
    for f in range(n_bins):
        Y = mixture_stft_2ch[f, :, :].T  # Now (2, n_frames)
        m_x = mask_X[f, :]
        m_n = mask_N[f, :]

        # Calculate PSD matrices
        Phi_XX = (Y * m_x) @ Y.conj().T + epsilon * np.eye(n_ch)
        Phi_NN = (Y * m_n) @ Y.conj().T + epsilon * np.eye(n_ch)

        # Solve GEV problem
        try:
            eigenvalues, eigenvectors = eigh(Phi_XX, Phi_NN)
            filter_f = eigenvectors[:, -1] # Filter is eigenvector for largest eigenvalue
            enhanced_stft[f, :] = filter_f.conj().T @ Y
        except np.linalg.LinAlgError:
            enhanced_stft[f, :] = Y[0] # Fallback if GEV fails

    return enhanced_stft

def enhance_audio_file(model, speech_file, noise_file):
    """
    Runs the full simulation and enhancement pipeline for evaluation.
    """
    # --- 1. Simulate a Test Sample ---
    # A. Load audio
    speech_audio, _ = librosa.load(speech_file, sr=fs_16k, mono=True)
    noise_audio, _ = librosa.load(noise_file, sr=fs_16k, mono=True)
    min_len = min(len(speech_audio), len(noise_audio))
    speech_audio = speech_audio[:min_len]
    noise_audio = noise_audio[:min_len]

    # B. Set up room
    room_dim = [4.9, 4.9, 4.9]
    mic_locations = np.c_[[2.41, 2.45, 1.5], [2.49, 2.45, 1.5]]
    speech_pos = [2.45, 3.45, 1.5]
    noise_pos = [3.22, 3.06, 1.5]

    # C. Simulate Speech
    room_speech = pra.ShoeBox(room_dim, fs=fs_16k, max_order=0)
    room_speech.add_microphone_array(mic_locations)
    room_speech.add_source(speech_pos, signal=speech_audio)
    room_speech.simulate()
    X_mic = room_speech.mic_array.signals.T[:min_len, :]

    # D. Simulate Noise
    room_noise = pra.ShoeBox(room_dim, fs=fs_16k, max_order=0)
    room_noise.add_microphone_array(mic_locations)
    room_noise.add_source(noise_pos, signal=noise_audio)
    room_noise.simulate()
    I_mic = room_noise.mic_array.signals.T[:min_len, :]

    # E. Mix signals
    target_power = get_power(X_mic[:, 0])
    I_mic_scaled = set_power(I_mic, target_power)
    mixture_si = X_mic + I_mic_scaled
    mixture_final, wgn = add_wgn(mixture_si, snr_db=5.0)

    # --- 2. Get STFTs ---
    mixture_stft_ch0 = get_stft(mixture_final[:, 0])
    mixture_stft_ch1 = get_stft(mixture_final[:, 1])
    mixture_stft_2ch = np.stack([mixture_stft_ch0, mixture_stft_ch1], axis=-1)
    network_input = np.stack([np.abs(mixture_stft_ch0), np.abs(mixture_stft_ch1)], axis=-1)
    n_frames = network_input.shape[1]

    # --- 3. Prepare Patches for CNN ---
    inp_transposed = np.transpose(network_input, (1, 0, 2))
    # Create dummy masks (not used)
    dummy_masks = np.zeros((N_BINS, n_frames, N_MASKS)) 
    X_ch0, _ = create_windowed_dataset(inp_transposed[..., 0:1], dummy_masks)
    X_ch1, _ = create_windowed_dataset(inp_transposed[..., 1:2], dummy_masks)

    # --- 4. Predict Masks ---
    pred_masks_ch0 = model.predict(X_ch0, verbose=0)
    pred_masks_ch1 = model.predict(X_ch1, verbose=0)
    avg_patched_masks = (pred_masks_ch0 + pred_masks_ch1) / 2.0

    # --- 5. Reconstruct Masks ---
    pred_mask_X, pred_mask_N = reconstruct_masks(avg_patched_masks, n_frames)

    # --- 6. Apply GEV Beamformer ---
    enhanced_stft = gev_beamformer(mixture_stft_2ch, pred_mask_X, pred_mask_N)

    # --- 7. Reconstruct Audio ---
    _, enhanced_audio = istft(
        enhanced_stft,
        fs=fs_16k,
        nperseg=WIN_LENGTH,
        noverlap=WIN_LENGTH - HOP_LENGTH,
        nfft=N_FFT
    )
    
    clean_speech_audio = X_mic[:, 0]
    min_len_out = min(len(enhanced_audio), len(clean_speech_audio))
    
    return enhanced_audio[:min_len_out], clean_speech_audio[:min_len_out]

def calculate_snr(signal, noise):
    """Calculates the Signal-to-Noise Ratio (SNR) in decibels (dB)."""
    signal_power = np.mean(signal**2)
    noise_power = np.mean(noise**2)
    if noise_power == 0:
        return float('inf')
    snr_db = 10 * np.log10(signal_power / noise_power)
    return snr_db


### 6. MODEL DEFINITION ###

def build_cnn_model(input_shape=(WINDOW_SIZE, N_BINS, 1)):
    """
    Builds a Keras CNN model based on the intent of the Heymann et al. paper.
    Input shape is (frames, freqs, 1)
    """
    inputs = Input(shape=input_shape)
    # --- Conv Block 1 ---
    x = Conv2D(filters=32, kernel_size=(3, 3), padding='same')(inputs)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    # --- Conv Block 2 ---
    x = Conv2D(filters=32, kernel_size=(3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    # --- Flatten & Dense ---
    x = Flatten()(x)
    x = Dense(513)(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    # --- Output Layer ---
    # Output is 1026 (513 bins * 2 masks)
    outputs = Dense(N_BINS * N_MASKS, activation='sigmoid')(x) 
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model


### 7. MAIN EXECUTION SCRIPT ###

# Check if data files were found before proceeding
if speech_files and noise_files:

    # --- 7a. Define Model ---
    print("\n--- Building Model ---")
    cnn_model = build_cnn_model(input_shape=(WINDOW_SIZE, N_BINS, 1))
    cnn_model.summary()

    # --- 7b. Generate Fixed Validation Set ---
    VAL_SAMPLES_FIXED = 50 # Reduced from 100 for speed in Kaggle
    print(f"\n--- Generating FIXED validation set with {VAL_SAMPLES_FIXED} samples... ---")
    
    X_val_fixed, y_val_fixed = generate_data_batch(
        VAL_SAMPLES_FIXED, speech_files, noise_files
    )

    if X_val_fixed is not None:
        print(f"✅ Fixed validation set created.")
        print(f"   X_val_fixed shape: {X_val_fixed.shape}")
        print(f"   y_val_fixed shape: {y_val_fixed.shape}")

        # --- 7c. Generate Static Training Set ---
        N_TRAINING_SAMPLES = 200 # Reduced from 500 for speed in Kaggle
        print(f"\n--- Generating STATIC training set with {N_TRAINING_SAMPLES} samples... ---")
        
        X_train_static, y_train_static = generate_data_batch(
            N_TRAINING_SAMPLES, speech_files, noise_files
        )

        if X_train_static is not None:
            print(f"\n✅ Static Training Set created.")
            print(f"   X_train_static shape: {X_train_static.shape}")
            print(f"   y_train_static shape: {y_train_static.shape}")

            # --- 7d. Train the Model ---
            EPOCHS = 30
            BATCH_SIZE = 32
            print(f"\n--- Starting training for {EPOCHS} epochs... ---")
            
            history = cnn_model.fit(
                X_train_static,
                y_train_static,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                validation_data=(X_val_fixed, y_val_fixed),
                shuffle=True
            )
            print("\n✅✅✅ Static Training complete! ✅✅✅")

            # --- 7e. Evaluate the Model ---
            print("\n--- Running Final Evaluation ---")
            test_speech_file = random.choice(speech_files)
            test_noise_file = random.choice(noise_files)

            print("Running full pipeline on new test files:")
            print(f"   Speech: {os.path.basename(test_speech_file)}")
            print(f"   Noise:  {os.path.basename(test_noise_file)}")

            enhanced_audio, clean_audio = enhance_audio_file(
                cnn_model,
                test_speech_file,
                test_noise_file
            )

            print("\n--- Evaluation Metrics ---")
            stoi_score = stoi(clean_audio, enhanced_audio, fs_16k)
            print(f"STOI Score: {stoi_score:.4f} (Higher is better, 0 to 1)")
            
            pesq_score = pesq(fs_16k, clean_audio, enhanced_audio, 'nb')
            print(f"PESQ Score: {pesq_score:.4f} (Higher is better, -0.5 to 4.5)")

            # Save files to /kaggle/working/
            sf.write("enhanced_output.wav", enhanced_audio, fs_16k)
            sf.write("original_clean.wav", clean_audio, fs_16k)
            print("\nSaved 'enhanced_output.wav' and 'original_clean.wav' to /kaggle/working/")

        else:
            print("❌ Error creating training set. Skipping training.")
    else:
        print("❌ Error creating validation set. Skipping training.")

    # --- 7f. SNR Function Example ---
    print("\n--- SNR Calculation Example ---")
    t = np.linspace(0, 1, 1000)
    signal = np.sin(2 * np.pi * 5 * t)
    noise = np.random.normal(0, 0.5, 1000)
    snr_value = calculate_snr(signal, noise)
    print(f"Signal Power: {np.mean(signal**2):.4f}")
    print(f"Noise Power:  {np.mean(noise**2):.4f}")
    print(f"SNR: {snr_value:.2f} dB")

else:
    print("\n‼️ Skipping script execution because data files were not found.")
    print("Please check your SPEECH_DIR and NOISE_DIR paths.")

Collecting pyroomacoustics
  Downloading pyroomacoustics-0.8.6.tar.gz (35.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.1/35.1 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pystoi
  Downloading pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Building wheels for collected packages: pyroomacoustics, pesq
  Building wheel for pyroomacoustics (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyroomacoustics: filename=pyroomacoustics-0.8.6-cp311-cp311-linux_x86_64.whl size=46975507 sha256=195ef2e10cedbbabcc50cc092a6256e92af25edb8d92ee18751a5e222fbf0e2a
  Stored in directory: /root/.cache/

2025-11-06 17:48:23.745917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762451304.169288      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762451304.338619      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Libraries imported.
Downloading LibriSpeech test-clean (OpenSLR 12)...
Extracting LibriSpeech...
Download and extraction complete.
Locating audio files...
✅ Found 2620 speech files.
✅ Found 617 noise files.

Ready to proceed.

--- Building Model ---


I0000 00:00:1762451345.769394      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1762451345.770090      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5



--- Generating FIXED validation set with 50 samples... ---
✅ Fixed validation set created.
   X_val_fixed shape: (38238, 11, 513, 1)
   y_val_fixed shape: (38238, 1026)

--- Generating STATIC training set with 200 samples... ---

✅ Static Training Set created.
   X_train_static shape: (132776, 11, 513, 1)
   y_train_static shape: (132776, 1026)

--- Starting training for 30 epochs... ---
Epoch 1/30


I0000 00:00:1762451406.573055     168 service.cc:148] XLA service 0x7c3f1000f8d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1762451406.575858     168 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1762451406.575880     168 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1762451407.139876     168 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   5/4150[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:09[0m 31ms/step - loss: 0.6606

I0000 00:00:1762451411.421027     168 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 34ms/step - loss: 0.1804 - val_loss: 0.6033
Epoch 2/30
[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 33ms/step - loss: 0.1306 - val_loss: 1.3110
Epoch 3/30
[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 33ms/step - loss: 0.1123 - val_loss: 0.8449
Epoch 4/30
[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 33ms/step - loss: 0.0965 - val_loss: 3.4788
Epoch 5/30
[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 33ms/step - loss: 0.0822 - val_loss: 3.3339
Epoch 6/30
[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 33ms/step - loss: 0.0695 - val_loss: 0.3362
Epoch 7/30
[1m4150/4150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 33ms/step - loss: 0.0595 - val_loss: 6.9587
Epoch 8/30
[1m4003/4150[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m4s[0m 32ms/step - loss: 0.0516