In [1]:
# Mount Google Drive at /content/drive; the audio and model paths used in later
# cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Audio preprocessing

In [2]:
# Convert a .aac file to a 16 kHz mono .wav file
from pydub import AudioSegment

def convert_aac_to_wav(input_path, output_path, target_sr=16000):
    """
    Convert a .aac audio file to a mono .wav file sampled at ``target_sr`` Hz.

    Args:
        input_path (str): Path to the input .aac file.
        output_path (str): Path where the output .wav file is written.
        target_sr (int): Target sample rate in Hz (default 16,000).

    Returns:
        str: ``output_path``, returned once the WAV file has been written.
    """

    # Decode the input as AAC.
    audio = AudioSegment.from_file(input_path, format="aac")

    # Resample to the target rate and down-mix to a single channel.
    audio = audio.set_frame_rate(target_sr).set_channels(1)

    # export() hands back the open output file object; close it explicitly so
    # the data is flushed and the handle is not left for the garbage collector.
    exported = audio.export(output_path, format="wav")
    exported.close()

    return output_path


In [5]:
# Convert the recorded .aac clip on Drive to a WAV file (16 kHz mono by the
# converter's defaults); the preprocessing cell later reads this same WAV path.
# NOTE(review): the source file name appears to contain an invisible character
# between "pm" and ".aac" — keep the literal byte-for-byte if that matches the
# file on Drive; confirm.
acc_file = "/content/drive/MyDrive/31 Jul, 12.42 pm​.aac"
wav_file = "/content/drive/MyDrive/CNN_test_file1.wav"
converted = convert_aac_to_wav(acc_file, wav_file)
print(f"Saved WAV at: {converted}")


Saved WAV at: /content/drive/MyDrive/CNN_test_file1.wav


In [6]:
# Extracting the mel-spectrogram
import librosa
import numpy as np

def extract_fixed_length_melspectrogram(
        file_path,
        sr=16000,
        n_mels=40,
        n_fft=512,
        hop_length=256,
        duration=2.0  # seconds
    ):
    """
    Load an audio file, force it to a fixed duration, and return its
    mel-spectrogram on a decibel scale.

    Args:
        file_path (str): Path to the WAV audio file.
        sr (int): Sampling rate the audio is loaded at.
        n_mels (int): Number of mel bands.
        n_fft (int): FFT window size.
        hop_length (int): Samples between successive frames.
        duration (float): Clip length in seconds; shorter audio is
            zero-padded at the end, longer audio is cut.

    Returns:
        np.ndarray: float32 mel-spectrogram of shape (n_mels, time_frames),
        in dB relative to its own maximum.
    """

    target_len = int(sr * duration)

    # Read at most `duration` seconds, as mono, at the requested rate.
    waveform, _ = librosa.load(file_path, sr=sr, mono=True, duration=duration)

    # Bring the waveform to exactly `target_len` samples.
    shortfall = target_len - len(waveform)
    if shortfall > 0:
        # Too short: append zeros (silence) at the end.
        waveform = np.pad(waveform, (0, shortfall), mode='constant')
    else:
        # Long enough: keep only the leading `target_len` samples.
        waveform = waveform[:target_len]

    # Power mel-spectrogram.
    power_spec = librosa.feature.melspectrogram(
        y=waveform,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        power=2.0
    )

    # Log-compress to dB, referenced to the spectrogram's own peak.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)


In [7]:
import numpy as np

def crop_or_pad_melspectrogram(mel_spec, target_shape=(40, 63), pad_value=0.0):
    """
    Crop or pad a mel-spectrogram to the target shape.

    Rows/columns beyond the target are dropped from the end; missing rows/columns
    are appended at the end and filled with ``pad_value``.

    Args:
        mel_spec (array-like): 2-D mel-spectrogram with shape (n_mels, time_frames).
        target_shape (tuple): Desired shape (n_mels, time_frames), e.g. (40, 63).
        pad_value (float): Fill value for padded cells. Defaults to 0.0, which is
            what this function has always used. Note that for a dB spectrogram
            normalised with ``ref=np.max`` a value of 0 is the peak level, not
            silence; pass e.g. ``mel_spec.min()`` to pad with the quietest level.
            Only change it if the model was trained with the same padding.

    Returns:
        np.ndarray: Array of shape ``target_shape`` with the input's dtype.

    Raises:
        ValueError: If ``mel_spec`` is not 2-D or a target dimension is negative.
    """

    mel_spec = np.asarray(mel_spec)
    if mel_spec.ndim != 2:
        raise ValueError(
            f"mel_spec must be 2-D (n_mels, time_frames), got shape {mel_spec.shape}"
        )

    target_mels, target_frames = target_shape
    if target_mels < 0 or target_frames < 0:
        raise ValueError(f"target_shape must be non-negative, got {target_shape}")

    # Crop both axes at once; slicing past the end is a no-op for short axes.
    cropped = mel_spec[:target_mels, :target_frames]

    # Whatever is still missing after cropping gets appended as padding.
    pad_mels = target_mels - cropped.shape[0]
    pad_frames = target_frames - cropped.shape[1]
    if pad_mels == 0 and pad_frames == 0:
        return cropped

    return np.pad(
        cropped,
        ((0, pad_mels), (0, pad_frames)),
        mode='constant',
        constant_values=pad_value,
    )


In [9]:
from tensorflow.keras.models import load_model

In [10]:
# Load the first trained Keras classifier from its .h5 file on Drive; the
# prediction cell for this model calls `model.predict`.
model = load_model('/content/drive/MyDrive/cnn_audio_classifier_approach (1).h5')



In [11]:
# Turn one WAV file into the fixed-size (40, 63) mel-spectrogram fed to the model.
# NOTE(review): with the extractor's defaults (2.0 s at 16 kHz, hop_length=256) the
# spectrogram presumably has about 126 frames, so the crop to 63 would keep only
# roughly the first second — confirm this matches how the model was trained.
file_path = "/content/drive/MyDrive/CNN_test_file1.wav"

mel_spec = crop_or_pad_melspectrogram(
    extract_fixed_length_melspectrogram(file_path),
    target_shape=(40, 63),
)


In [12]:
# ---- Model input: add a leading batch axis and a trailing channel axis ----
input_data = mel_spec[np.newaxis, ..., np.newaxis]  # (40, 63) -> (1, 40, 63, 1)


## Testing cnn_audio_classifier_approach (1).h5

In [14]:
# Score the prepared input with `model` and turn the score into a class label.
class_labels = ['non-emergency', 'emergency']  # position 0 / position 1
threshold = 0.5

prediction = model.predict(input_data)
print("Raw prediction:", prediction)

# The first output value at or above the threshold means class 1, otherwise
# class 0; the result doubles as the index into class_labels.
predicted_class = 1 if prediction[0][0] >= threshold else 0

print("Predicted class:", predicted_class)
print("Predicted label:", class_labels[predicted_class])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Raw prediction: [[0.21432546]]
Predicted class: 0
Predicted label: non-emergency


## Testing cnn_with_preprocessing_2.h5

In [16]:
# The saved model contains custom layers, so load_model needs a mapping from
# each layer's name to its Python class.
# NOTE(review): none of these classes is defined or imported in the visible
# cells (this cell's recorded output is a NameError on STFTLayer) — define or
# import them before running this cell.
custom_objects = {
    'STFTLayer': STFTLayer,
    'MelSpectrogramLayer': MelSpectrogramLayer,
    'LogScaleLayer': LogScaleLayer,
    'ExpandDimsLayer': ExpandDimsLayer,
    'CroppingTimeLayer': CroppingTimeLayer,
    'TransposeLayer': TransposeLayer
}

# Bind the result to `preprocessed_model`, the name the prediction cell below
# calls; this also leaves `model` pointing at the first classifier.
# NOTE(review): this is a relative path, unlike the Drive paths used for the
# other files — confirm the .h5 file is in the working directory.
preprocessed_model = load_model("cnn_with_preprocessing_2.h5", custom_objects=custom_objects)


NameError: name 'STFTLayer' is not defined

In [None]:
# Predict with the second model (cnn_with_preprocessing_2.h5) and map the score to a label.
# NOTE(review): `preprocessed_model` must already be bound to that loaded model when
# this cell runs — confirm the loading cell assigns this exact name.
# NOTE(review): that model is loaded with STFT / mel-spectrogram custom layers, so it
# presumably expects raw audio rather than the precomputed mel-spectrogram held in
# `input_data` — check preprocessed_model.input_shape before trusting this output.
prediction = preprocessed_model.predict(input_data)
print("Raw prediction:", prediction)
# Decision boundary applied to the first output value.
threshold = 0.5
# The boolean comparison becomes 0 or 1, which is also the index into class_labels.
predicted_class = int(prediction[0][0] >= threshold)

# Optional: Human-readable label
class_labels = ['non-emergency', 'emergency']
print("Predicted class:", predicted_class)
print("Predicted label:", class_labels[predicted_class])