# 🚨 Model_V6_Hybrid: Siren Detection with CNN + LSTM

In [1]:

import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, LSTM, Reshape, concatenate
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import pyaudio
from tensorflow.keras.models import load_model
import time

In [3]:
def extract_mfcc(file_path, max_pad_len=173, sr=22050, n_mfcc=40):
    """Load an audio file and return a fixed-width, standardized MFCC matrix.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.
        max_pad_len: Number of frames (columns) in the result. Shorter clips
            are zero-padded on the right, longer clips are truncated.
        sr: Sample rate requested from ``librosa.load``.
        n_mfcc: Number of MFCC coefficients (rows) to compute.

    Returns:
        Array of shape ``(n_mfcc, max_pad_len)``, centered on its global mean
        and scaled by its global standard deviation. The statistics are taken
        over the whole padded/truncated matrix, padding included.
    """
    y, sr = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Force a fixed number of frames so every clip stacks into one array.
    n_frames = mfcc.shape[1]
    if n_frames < max_pad_len:
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, max_pad_len - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad_len]

    centered = mfcc - np.mean(mfcc)
    spread = np.std(mfcc)
    # A constant matrix has zero spread; dividing would turn every value into
    # NaN, so only center it in that case.
    if spread > 0:
        return centered / spread
    return centered

# Build the feature matrix X and label vector y from the two class folders.
data = []
labels = []

# Raw string: the backslashes of the Windows path must not be parsed as escape
# sequences. Set SIREN_DATASET_DIR to point at a dataset stored elsewhere.
base_path = os.environ.get(
    "SIREN_DATASET_DIR",
    r"E:\Campus\Semester\FYP\siren_detection_project\dataset",
)

# Folder position defines the label: 0 = non_siren, 1 = siren.
for label, folder in enumerate(["non_siren", "siren"]):
    folder_path = os.path.join(base_path, folder)
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # (and therefore the seeded train/test split) is the same on every run.
    for file in sorted(os.listdir(folder_path)):
        try:
            mfcc = extract_mfcc(os.path.join(folder_path, file))
        except Exception as e:
            # Best effort: report unreadable files and keep going.
            print("Error loading:", file, e)
            continue
        data.append(mfcc)
        labels.append(label)

X = np.array(data)[..., np.newaxis]  # Add channel dimension
y = np.array(labels)


  base_path = "E:\Campus\Semester\FYP\siren_detection_project\dataset"
  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


In [4]:
# Hold out 20% of the clips; stratify on the labels so both splits keep the
# same class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# One MFCC matrix per clip: (coefficients, frames, channels).
input_shape = (40, 173, 1)
inputs = Input(shape=input_shape)

# CNN branch: two conv/pool/dropout stages, each halving both spatial axes.
x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.3)(x)

x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.3)(x)

# Prepare for LSTM: keep axis 1 of the feature map as the sequence axis and
# flatten the rest into the feature vector of each step. The step count is read
# from the tensor's static shape (10 for the input shape above) so it stays
# correct if input_shape or the pooling changes.
# NOTE(review): axis 1 descends from the 40-coefficient MFCC axis, not from the
# 173-frame time axis, so the LSTM scans across coefficients — confirm intended.
x = Reshape((x.shape[1], -1))(x)

# LSTM branch: summarize the sequence into a single 64-dim vector.
x = LSTM(64)(x)

# Fully connected head with a sigmoid output for the binary siren decision.
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(1, activation='sigmoid')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [6]:
# Train for 25 epochs in mini-batches of 16. The held-out split is passed as
# validation data, so the val_* metrics are computed on X_test / y_test.
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=25,
    validation_data=(X_test, y_test),
)

Epoch 1/25
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.7412 - loss: 0.4864 - val_accuracy: 0.9662 - val_loss: 0.1336
Epoch 2/25
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 49ms/step - accuracy: 0.9596 - loss: 0.1346 - val_accuracy: 0.9122 - val_loss: 0.2614
Epoch 3/25
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 49ms/step - accuracy: 0.9596 - loss: 0.1760 - val_accuracy: 0.9595 - val_loss: 0.1181
Epoch 4/25
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.9611 - loss: 0.1460 - val_accuracy: 0.9392 - val_loss: 0.1896
Epoch 5/25
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.9859 - loss: 0.0552 - val_accuracy: 0.9662 - val_loss: 0.1045
Epoch 6/25
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9812 - loss: 0.0652 - val_accuracy: 0.9730 - val_loss: 0.0932
Epoch 7/25
[1m37/37[0m [32m━━━━

In [7]:
# Score the trained network on the held-out split, then persist it to disk.
model.evaluate(x=X_test, y=y_test)
model.save(filepath="siren_detection_model.h5")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9846 - loss: 0.0802




In [None]:
# Restore the trained network from disk.
model = load_model("siren_detection_model.h5")  # Replace with your actual model file path

# Microphone capture settings
RATE = 22050              # samples per second (same rate the training clips are loaded at)
CHUNK = RATE              # frames per read -> 1 second of audio
CHANNELS = 1              # single channel
FORMAT = pyaudio.paInt16  # 16-bit integer samples

# Feature extraction function
def extract_features_from_audio(audio, sr=22050, n_mfcc=40, max_len=173):
    """Turn a raw audio buffer into one model-ready MFCC tensor.

    The steps mirror the training-time ``extract_mfcc``: the MFCC matrix is
    first padded/truncated to ``max_len`` frames and only then standardized.
    The mean and standard deviation therefore include the padding, exactly as
    they did for the features the model was trained on.

    Args:
        audio: 1-D float array of samples.
        sr: Sample rate of ``audio``.
        n_mfcc: Number of MFCC coefficients (rows).
        max_len: Number of frames (columns) in the result.

    Returns:
        Array of shape ``(n_mfcc, max_len, 1)``.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Fix the frame count before normalizing (same order as training).
    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]

    centered = mfcc - np.mean(mfcc)
    spread = np.std(mfcc)
    # A constant matrix has zero spread; dividing would produce NaN features.
    if spread > 0:
        centered = centered / spread
    return centered[..., np.newaxis]  # Add channel dimension

# Open the microphone input stream using the capture settings defined earlier.
p = pyaudio.PyAudio()
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

print("🎤 Listening... Press Ctrl+C to stop")

# Live detection loop: read one chunk, classify it, report, repeat until Ctrl+C.
try:
    while True:
        # Input overflows are tolerated instead of raised.
        audio_data = stream.read(CHUNK, exception_on_overflow=False)
        audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)

        # Normalize to unit peak. An all-zero buffer (e.g. a muted mic) has a
        # peak of 0; dividing would fill the chunk with NaN, so leave it as is.
        peak = np.max(np.abs(audio_np))
        if peak > 0:
            audio_np = audio_np / peak

        # Extract features and predict
        features = extract_features_from_audio(audio_np)
        features = np.expand_dims(features, axis=0)  # Add batch dimension
        prediction = model.predict(features)[0][0]

        if prediction > 0.5:
            print("🚨 SIREN DETECTED!")
        else:
            print("✅ No Siren")

        time.sleep(0.5)  # Adjust speed as needed

except KeyboardInterrupt:
    print("\n🛑 Stopped by user.")
finally:
    # Release the audio device however the loop ends, not only on Ctrl+C;
    # otherwise an unexpected error leaves the stream open.
    stream.stop_stream()
    stream.close()
    p.terminate()



🎤 Listening... Press Ctrl+C to stop
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
✅ No Siren
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
🚨 SIREN DETECTED!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[