In [12]:
!pip install -q tensorflow==2.11.* tensorflow_io==0.28.* tensorflow-hub librosa soundfile numpy matplotlib

^C


  You can safely remove it manually.
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\aurel\\miniconda3\\envs\\sentinelai\\Lib\\site-packages\\matplotlib\\_c_internal_utils.cp310-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



## Inference Notebook

In [31]:
import tensorflow as tf
import tensorflow_hub as hub
import librosa
import numpy as np
import os
import soundfile as sf # Often used by librosa under the hood, good to have

# Print the TensorFlow and librosa versions that were actually imported by this kernel.
print(f"TensorFlow Version: {tf.__version__}")
print(f"Librosa Version: {librosa.__version__}")

TensorFlow Version: 2.19.0
Librosa Version: 0.11.0


In [32]:
# %% [markdown]
# ## Cell 2: Audio Loading Function
#
# Define the function to load and preprocess audio files consistently with the training process.

# %%
def load_wav_16k_mono(filename):
    """
    Load an audio file as a 16 kHz, mono, float32 waveform.

    Args:
        filename (str, bytes, os.PathLike or tf.Tensor): Path to the audio file.
            Tensor-like inputs (anything exposing ``.numpy()``) are unwrapped first,
            and byte strings are decoded as UTF-8.

    Returns:
        np.ndarray or None: The waveform as a float32 NumPy array, or None if the
        path does not point to an existing file or loading fails. A message is
        printed in either failure case.

    Raises:
        TypeError: If ``filename`` is not a path-like value (str, bytes, os.PathLike)
            after unwrapping.
    """
    # Unwrap tensor-like inputs by duck typing so plain paths never depend on TensorFlow.
    raw_path = filename.numpy() if hasattr(filename, "numpy") else filename
    if isinstance(raw_path, bytes):
        path = raw_path.decode('utf-8')
    else:
        path = os.fspath(raw_path)

    # isfile (rather than exists) so a directory is reported here instead of
    # failing later inside the audio decoder.
    if not os.path.isfile(path):
        print(f"Error: Audio file not found at {path}")
        return None

    try:
        # librosa resamples to `sr` and down-mixes to mono; the returned sample
        # rate is therefore always 16000 and is not needed.
        wav, _ = librosa.load(path, sr=16000, mono=True)
        # Ensure float32 dtype, which is expected by YAMNet/TF models
        return wav.astype(np.float32)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller handle None.
        print(f"Error loading or processing file {path}: {e}")
        return None

# Quick test of the function (optional)
# Create a dummy wav file for testing if needed
# sample_rate = 44100; duration = 1; freq = 440
# t = np.linspace(0., duration, int(sample_rate * duration))
# amplitude = np.iinfo(np.int16).max * 0.5
# data = (amplitude * np.sin(2. * np.pi * freq * t)).astype(np.int16)
# sf.write('dummy_audio.wav', data, sample_rate)
# test_load = load_wav_16k_mono('dummy_audio.wav')
# if test_load is not None:
#    print(f"Dummy audio loaded successfully, shape: {test_load.shape}, dtype: {test_load.dtype}")
#    os.remove('dummy_audio.wav') # Clean up dummy file
# else:
#    print("Dummy audio loading failed.")

In [34]:
# %% [markdown]
# ## Cell 3: Load Pre-trained Models (Keras 3 Compatible)
#
# Load the YAMNet model from TensorFlow Hub and your custom-trained scream detector model saved in TensorFlow SavedModel format.
# This version uses `TFSMLayer` for compatibility with Keras 3 loading SavedModel directories.
#
# **Important:** This cell assumes your trained model is saved as a SavedModel directory named `human_scream_detector`, located in the working directory from which this notebook is run (the path below is relative).

# %%
import tensorflow as tf
import tensorflow_hub as hub
import os

# Model locations: the TF Hub handle for YAMNet and the local SavedModel directory.
yamnet_handle = 'https://tfhub.dev/google/yamnet/1'
saved_model_path = 'human_scream_detector' # <-- Make sure this path is correct!

# Both handles start as None; a failed load simply leaves the default in place.
yamnet_model_inf = scream_detector_model = None

# --- YAMNet from TensorFlow Hub ---
print("Loading YAMNet model from TensorFlow Hub...")
try:
    yamnet_model_inf = hub.load(yamnet_handle)
    print("YAMNet model loaded successfully.")
except Exception as e:
    print(f"Error loading YAMNet model: {e}")

# --- Custom scream detector, loaded through TFSMLayer ---
print(f"\nLoading custom scream detector model from: {saved_model_path} (using TFSMLayer for Keras 3)")
if not os.path.exists(saved_model_path):
    print(f"ERROR: Saved model directory not found at '{saved_model_path}'.")
    print("Please ensure the path is correct and the model was saved previously.")
else:
    try:
        # Expose the SavedModel's inference function as a Keras layer;
        # 'serving_default' is the typical endpoint for models saved from Keras.
        scream_detector_layer = tf.keras.layers.TFSMLayer(
            saved_model_path,
            call_endpoint='serving_default',
        )

        # Wrap the layer in a Sequential model so `.predict()` is available.
        # The explicit Input mirrors the original model's
        # Input(shape=(1024,), name='yamnet_embedding').
        scream_detector_model = tf.keras.Sequential([
            tf.keras.layers.Input(
                shape=(1024,),
                dtype=tf.float32,
                name='yamnet_embedding_input',
            ),
            scream_detector_layer,
        ])

        print("Custom scream detector model loaded successfully via TFSMLayer and wrapped.")
        # Optional: Display model architecture
        # scream_detector_model.summary()
    except Exception as e:
        print(f"Error loading custom model from {saved_model_path} using TFSMLayer: {e}")

# Final status: flag any model that is still None so later cells are not run blindly.
if yamnet_model_inf is None:
    print("\nWARNING: YAMNet model failed to load.")
if scream_detector_model is None:
    print("\nWARNING: Custom scream detector model failed to load.")

Loading YAMNet model from TensorFlow Hub...
YAMNet model loaded successfully.

Loading custom scream detector model from: human_scream_detector (using TFSMLayer for Keras 3)
Custom scream detector model loaded successfully via TFSMLayer and wrapped.


In [35]:
# %% [markdown]
# ## Cell 4: Define the Inference Function
#
# This function takes a path to a WAV file and uses the loaded models to predict whether it contains a scream.

# %%
def predict_scream(wav_file_path, yamnet_model, classifier_model, threshold=0.5):
    """
    Analyzes a WAV file to predict if it contains a human scream.

    The clip is loaded with ``load_wav_16k_mono``, passed through YAMNet, the
    per-frame embeddings are averaged into a single clip-level vector, and that
    vector is scored by the classifier.

    Args:
        wav_file_path (str): The full path to the input WAV audio file.
        yamnet_model: The loaded YAMNet model instance from TensorFlow Hub.
        classifier_model: Your loaded custom Keras classifier model. Its
            ``predict`` may return either an array or a dict of arrays (as a
            TFSMLayer-wrapped SavedModel does).
        threshold (float): The probability threshold to classify as 'Scream'.
                           Defaults to 0.5.

    Returns:
        tuple: ``(label, probability)`` where
               - label (str) is 'Scream', 'Non-Scream', or an 'Error: ...' string
                 describing which step failed.
               - probability (float) is the predicted scream probability. It is
                 0.0 whenever an error label is returned, and also when YAMNet
                 yields no embeddings (reported as 'Non-Scream').
    """
    # Pre-requisite checks
    if yamnet_model is None or classifier_model is None:
        print("Error: One or both models are not loaded. Cannot perform prediction.")
        return "Error: Model not loaded", 0.0

    # 1. Load and preprocess audio
    waveform_np = load_wav_16k_mono(wav_file_path)

    if waveform_np is None: # Loading failed in the helper function
        return "Error: Audio Load Failed", 0.0
    if waveform_np.size == 0: # File decoded but contained no samples
        print(f"Warning: Audio file {wav_file_path} resulted in empty data.")
        return "Error: Empty Audio Data", 0.0

    # Convert NumPy array to TensorFlow tensor
    waveform = tf.constant(waveform_np, dtype=tf.float32)

    # 2. Extract YAMNet embeddings
    try:
        # YAMNet returns scores, embeddings, and log_mel_spectrogram
        _, embeddings, _ = yamnet_model(waveform)
        # embeddings shape is (N, 1024), where N is the number of frames
    except Exception as e:
        print(f"Error getting YAMNet embeddings for {wav_file_path}: {e}")
        return "Error: YAMNet Failed", 0.0

    if tf.size(embeddings) == 0:
        print(f"Warning: YAMNet produced empty embeddings for {wav_file_path}. Might be too short?")
        # Very short files are treated as non-scream with zero probability.
        return "Non-Scream", 0.0

    # 3. Aggregate embeddings (calculate clip-level embedding)
    #    Using reduce_mean, consistent with the training preprocessing
    clip_embedding = tf.reduce_mean(embeddings, axis=0) # Shape -> (1024,)

    # 4. Prepare for classifier (add batch dimension)
    model_input = tf.expand_dims(clip_embedding, axis=0) # Shape -> (1, 1024)

    # 5. Make prediction using the custom classifier
    try:
        raw_output = classifier_model.predict(model_input, verbose=0)
        # A SavedModel wrapped in TFSMLayer returns {output_name: array} instead
        # of a bare array; indexing that dict with [0, 0] would raise, so unwrap
        # the first output before reading the scalar.
        if isinstance(raw_output, dict):
            raw_output = next(iter(raw_output.values()))
        probability = float(np.asarray(raw_output).reshape(-1)[0])
    except Exception as e:
        print(f"Error during classifier prediction for {wav_file_path}: {e}")
        return "Error: Classifier Failed", 0.0

    # 6. Determine label based on the threshold
    prediction_label = "Scream" if probability >= threshold else "Non-Scream"

    return prediction_label, probability

print("Inference function 'predict_scream' defined.")

Inference function 'predict_scream' defined.


In [40]:
# %% [markdown]
# ## Cell 5: Define the Inference Function (with Debugging)
#
# This cell redefines `predict_scream`, replacing the version from the previous cell. The function takes a path to a WAV file and uses the loaded models to predict whether it contains a scream, printing extra diagnostic output at each step for debugging.

# %%
import tensorflow as tf # Ensure TF is imported if running this cell independently
import numpy as np # Ensure numpy is imported
import os # Ensure os is imported

def predict_scream(wav_file_path, yamnet_model, classifier_model, threshold=0.5):
    """
    Analyzes a WAV file to predict if it contains a human scream.
    Prints a debug line at every stage of the pipeline.

    Args:
        wav_file_path (str): The full path to the input WAV audio file.
        yamnet_model: The loaded YAMNet model instance from TensorFlow Hub.
        classifier_model: Your loaded custom Keras classifier model (wrapped TFSMLayer).
            Its ``predict`` may return a dict of arrays (any output name) or a
            bare array.
        threshold (float): The probability threshold to classify as 'Scream'. Defaults to 0.5.

    Returns:
        tuple: A tuple containing:
               - str: The prediction label ('Scream', 'Non-Scream', or 'Error:...').
               - float: The predicted probability (0.0 to 1.0); 0.0 is returned
                 alongside every 'Error:...' label.
    """
    print(f"--- Debug: Starting prediction for {wav_file_path} ---")

    # Pre-requisite checks
    if yamnet_model is None or classifier_model is None:
        print("Debug: Error - One or both models are not loaded.")
        return "Error: Model not loaded", 0.0

    # 1. Load and preprocess audio
    print("Debug: Loading audio...")
    waveform_np = load_wav_16k_mono(wav_file_path) # Assumes load_wav_16k_mono is defined (Cell 2)

    if waveform_np is None:
        print("Debug: Error - Audio loading returned None.")
        return "Error: Audio Load Failed", 0.0
    if waveform_np.size == 0:
        print("Debug: Error - Audio data is empty after loading.")
        return "Error: Empty Audio Data", 0.0
    print(f"Debug: Audio loaded, shape: {waveform_np.shape}, dtype: {waveform_np.dtype}")

    # Convert NumPy array to TensorFlow tensor
    waveform = tf.constant(waveform_np, dtype=tf.float32)

    # 2. Extract YAMNet embeddings
    print("Debug: Extracting YAMNet embeddings...")
    try:
        # YAMNet returns (scores, embeddings, spectrogram); only embeddings are used.
        _, embeddings, _ = yamnet_model(waveform)
        print(f"Debug: YAMNet embeddings extracted, shape: {embeddings.shape}")
        # Warn about NaNs/Infs but keep going so the rest of the pipeline can be inspected.
        if tf.reduce_any(tf.math.is_nan(embeddings)) or tf.reduce_any(tf.math.is_inf(embeddings)):
            print("Debug: WARNING - NaNs or Infs found in raw YAMNet embeddings!")
    except Exception as e:
        print(f"Debug: Error during YAMNet embedding extraction: {e}")
        return "Error: YAMNet Failed", 0.0

    if tf.size(embeddings) == 0:
        print("Debug: Error - YAMNet produced empty embeddings. Audio might be too short.")
        return "Error: Empty Embeddings", 0.0

    # 3. Aggregate embeddings (mean over frames -> one clip-level vector)
    print("Debug: Aggregating embeddings...")
    clip_embedding = tf.reduce_mean(embeddings, axis=0)
    print(f"Debug: Clip embedding calculated, shape: {clip_embedding.shape}")
    if tf.reduce_any(tf.math.is_nan(clip_embedding)) or tf.reduce_any(tf.math.is_inf(clip_embedding)):
        # Only a warning: invalid embeddings are still passed to the classifier.
        print("Debug: WARNING - NaNs or Infs found in aggregated clip embedding!")

    # 4. Prepare for classifier (add batch dimension)
    model_input = tf.expand_dims(clip_embedding, axis=0)
    print(f"Debug: Prepared model input, shape: {model_input.shape}, dtype: {model_input.dtype}")

    # 5. Make prediction using the custom classifier
    print("Debug: Making prediction with classifier...")
    try:
        # Inspect the raw output *before* extracting the scalar.
        raw_output = classifier_model.predict(model_input, verbose=0)
        print(f"    Debug: Raw classifier output: {raw_output}")
        print(f"    Debug: Type of raw output: {type(raw_output)}")
        if hasattr(raw_output, 'shape'):
            print(f"    Debug: Shape of raw output: {raw_output.shape}")

        # A TFSMLayer-wrapped model returns {output_name: array}. The output name
        # (e.g. 'dense_2') is an auto-generated layer name that changes whenever the
        # model is rebuilt, so take the first output instead of hard-coding the key.
        output_array = raw_output
        if isinstance(raw_output, dict):
            output_array = next(iter(raw_output.values()))
        # Flatten so (1, 1), (1,) and scalar outputs are all handled.
        probability = float(np.asarray(output_array).reshape(-1)[0])

        print(f"Debug: Probability extracted: {probability}")

    except Exception as e:
        print("!!! Debug: Error during classifier prediction or result extraction !!!")
        print(f"    Exception Type: {type(e)}")
        print(f"    Exception Args: {e.args}")
        print(f"    Full Exception: {e}")
        # Also print the input shape just before prediction attempt
        print(f"    Input shape to classifier was: {model_input.shape}")
        return "Error: Classifier Failed", 0.0

    # 6. Determine label based on the threshold
    prediction_label = "Scream" if probability >= threshold else "Non-Scream"
    print(f"Debug: Final Label: {prediction_label}, Probability: {probability}")
    print(f"--- Debug: Prediction finished for {wav_file_path} ---")

    return prediction_label, probability

print("Inference function 'predict_scream' (with debugging) defined.")

Inference function 'predict_scream' (with debugging) defined.


In [42]:
# Example call to test the function: classify a sample clip with the loaded models.
# The returned (label, probability) tuple is the cell's displayed result.
predict_scream(
    wav_file_path='screaming.wav',
    yamnet_model=yamnet_model_inf,
    classifier_model=scream_detector_model,
    threshold=0.5,
)

--- Debug: Starting prediction for screaming.wav ---
Debug: Loading audio...
Debug: Audio loaded, shape: (160000,), dtype: float32
Debug: Extracting YAMNet embeddings...
Debug: YAMNet embeddings extracted, shape: (20, 1024)
Debug: Aggregating embeddings...
Debug: Clip embedding calculated, shape: (1024,)
Debug: Prepared model input, shape: (1, 1024), dtype: <dtype: 'float32'>
Debug: Making prediction with classifier...
    Debug: Raw classifier output: {'dense_2': array([[0.7935498]], dtype=float32)}
    Debug: Type of raw output: <class 'dict'>
Debug: Probability extracted: 0.7935497760772705
Debug: Final Label: Scream, Probability: 0.7935497760772705
--- Debug: Prediction finished for screaming.wav ---


('Scream', 0.7935497760772705)