In [88]:
import librosa
import numpy as np
import tensorflow as tf
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
import scipy.fftpack as fftpack
import scipy.signal as signal
import soundfile as sf

In [55]:
def load_audio(audio_file_path, sr=44100):
    """Load an audio file with librosa, resampled to ``sr`` Hz.

    Parameters
    ----------
    audio_file_path : str or path-like
        Location of the audio file to read.
    sr : int or None, optional
        Target sampling rate handed to ``librosa.load``. Defaults to 44100,
        the rate this function always used before the parameter existed, so
        existing one-argument calls behave exactly as before. librosa
        documents ``sr=None`` as "keep the file's native rate".

    Returns
    -------
    tuple
        ``(audio, sr)`` as produced by ``librosa.load``: the sample array and
        the sampling rate that applies to it.
    """
    # Rebind ``sr`` to the rate librosa reports, so a ``None`` request still
    # yields the real rate for downstream feature extraction.
    audio, sr = librosa.load(audio_file_path, sr=sr)
    return audio, sr

def load_model(model_path):
    """Return the Keras model stored at ``model_path``.

    Thin pass-through to ``tf.keras.models.load_model``.

    Note: this notebook's import cell also imports ``load_model`` from
    ``tensorflow.keras.models``, so the name refers to whichever of the two
    bindings was executed last.
    """
    return tf.keras.models.load_model(model_path)

In [56]:
def extract_features(audio, sr):
    """
    Summarise an audio signal as one flat feature vector.

    Four librosa features are computed (MFCC, Chroma STFT, Chroma CQT,
    Chroma CENS). Each feature matrix is averaged over axis 1 and the
    resulting per-row means are joined in that order.

    Returns a 1-D array (not a 3-D tensor); for the clip processed in this
    notebook it holds 49 values, the first 13 being the MFCC means.
    """
    feature_matrices = (
        # Feature 1: MFCC, 13 coefficients
        librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13),
        # Feature 2: Chroma STFT
        librosa.feature.chroma_stft(y=audio, sr=sr),
        # Feature 3: Chroma CQT, 24 bins per octave over 7 octaves
        librosa.feature.chroma_cqt(y=audio, sr=sr, bins_per_octave=24, n_octaves=7),
        # Feature 4: Chroma CENS
        librosa.feature.chroma_cens(y=audio, sr=sr),
    )

    # Collapse axis 1 of every matrix, then lay the means end to end.
    row_means = [matrix.mean(axis=1) for matrix in feature_matrices]
    return np.hstack(row_means)


In [96]:
# Run the first two pipeline stages (load, then feature extraction) on one clip.
# NOTE(review): the file name resembles the UrbanSound8K pattern
# <fsID>-<classID>-<occurrenceID>-<sliceID>; if that holds, this clip is
# labelled class 0, which is worth comparing with the class predicted further
# down. TODO confirm the data source.
audio_file_path = "178686-0-0-34.wav"
audio,sr=load_audio(audio_file_path)
features = extract_features(audio,sr)
# Bare expression so the notebook displays the extracted feature vector.
features

array([-4.5140222e+02,  1.9458733e+02,  3.2716966e-01,  2.0734904e+00,
        1.5886181e+01,  7.7973294e+00, -6.3988641e-02,  7.4882021e+00,
        3.2788143e+00,  2.5555868e+00,  1.5345799e+00,  5.0446253e+00,
       -8.0071753e-01,  6.5317470e-01,  6.3637036e-01,  6.8927950e-01,
        6.6140944e-01,  6.6034132e-01,  6.9679976e-01,  6.7176622e-01,
        6.5601957e-01,  6.7960477e-01,  7.3309726e-01,  7.1521008e-01,
        6.8094528e-01,  6.2673324e-01,  9.1154474e-01,  6.4522040e-01,
        6.7497098e-01,  6.3973063e-01,  6.3435423e-01,  5.1672471e-01,
        5.8784765e-01,  6.1207128e-01,  5.2384591e-01,  5.3879994e-01,
        5.1614672e-01,  3.0679300e-01,  4.1880953e-01,  2.9451969e-01,
        2.7207908e-01,  2.9391468e-01,  2.6854828e-01,  2.0644854e-01,
        2.9108116e-01,  2.6673645e-01,  2.2177199e-01,  2.4278879e-01,
        2.3804136e-01], dtype=float32)

In [None]:
# Turn the feature vector into a single column and min-max scale it to [0, 1].
# In this layout the scaler sees the 49 values as 49 samples of ONE feature, so
# the clip's smallest value maps to 0 and its largest to 1 (see the output
# below, where almost everything else lands near 0.70).
# NOTE(review): the scaler is fitted on the very clip being classified. This
# notebook does not show how the training features were scaled -- confirm that
# training used the same per-clip scaling; otherwise load and reuse the scaler
# that was fitted at training time.
features = np.reshape(features, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit(features).transform(features)

In [98]:
scaled_features

array([[0.        ],
       [0.99999994],
       [0.69928277],
       [0.7019861 ],
       [0.7233683 ],
       [0.71084666],
       [0.69867724],
       [0.71036816],
       [0.70385194],
       [0.7027324 ],
       [0.70115185],
       [0.70658547],
       [0.69753677],
       [0.69978744],
       [0.6997614 ],
       [0.69984335],
       [0.6998002 ],
       [0.6997985 ],
       [0.69985497],
       [0.6998162 ],
       [0.69979185],
       [0.6998283 ],
       [0.6999112 ],
       [0.69988346],
       [0.6998304 ],
       [0.6997465 ],
       [0.7001874 ],
       [0.6997751 ],
       [0.6998212 ],
       [0.69976664],
       [0.6997583 ],
       [0.6995762 ],
       [0.6996863 ],
       [0.6997238 ],
       [0.6995872 ],
       [0.69961035],
       [0.6995753 ],
       [0.69925123],
       [0.6994246 ],
       [0.6992322 ],
       [0.6991975 ],
       [0.69923127],
       [0.69919205],
       [0.6990959 ],
       [0.6992269 ],
       [0.69918925],
       [0.6991196 ],
       [0.699

In [99]:
# Arrange the scaled values as a 4-D batch of (1, 49, 1) samples; -1 lets NumPy
# infer the batch dimension (1 for a single 49-value clip).
# assumes the CNN expects input shaped (batch, 1, 49, 1) -- TODO confirm against the model
scaled_features = np.reshape(scaled_features, (-1, 1, 49, 1))

In [103]:
# Load the trained CNN from disk and classify the prepared sample.
# `load_model` is bound twice in this notebook: it is imported from
# tensorflow.keras.models and also redefined as a local wrapper around
# tf.keras.models.load_model, so either binding ends up in the Keras loader.
model = load_model("trained_cnn_model.h5")
predictions = model.predict(scaled_features)
# np.argmax without an axis indexes the flattened array, so the printed number
# is a class index only while `predictions` holds a single row.
print(np.argmax(predictions))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
2


In [104]:
# Step 1: Translate the winning output index into a class name.
# The list position is the class index, so the order must not change.
noise_classes = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]

# Flat argmax over the prediction array (a single row in this notebook).
predicted_index = np.argmax(predictions)
predicted_noise_type = noise_classes[predicted_index]
print(f"Predicted Noise Type: {predicted_noise_type}")

# Step 2: Look up the power threshold assigned to the predicted class.
noise_thresholds = {
    "air_conditioner": 0.05, "car_horn": 0.1,
    "children_playing": 0.15, "dog_bark": 0.2,
    "drilling": 0.25, "engine_idling": 0.1,
    "gun_shot": 0.3, "jackhammer": 0.35,
    "siren": 0.2, "street_music": 0.15,
}

threshold = noise_thresholds[predicted_noise_type]
print(f"Selected Threshold for {predicted_noise_type}: {threshold}")

# Step 3: Whole-signal FFT and its per-bin power.
# NOTE(review): fftpack.fft is unnormalised, so `signal_power` grows with the
# clip length, yet the thresholds above are compared against it as absolute
# numbers -- confirm this scale is intended (a threshold relative to the peak
# or mean power would not depend on clip length).
signal_fft = fftpack.fft(audio)
signal_power = np.square(np.abs(signal_fft))

# 1.0 where a bin's power exceeds the threshold, 0.0 elsewhere.
mask = (signal_power > threshold).astype(float)

# Zero the sub-threshold bins and return to the time domain; only the real
# part of the inverse transform is kept.
reduced_fft = np.multiply(signal_fft, mask)
denoised_audio = fftpack.ifft(reduced_fft).real

# Step 4: Write the result next to the notebook at the original sampling rate.
sf.write("denoised_output.wav", denoised_audio, sr)
print("Denoised audio saved as 'denoised_output.wav'.")

Predicted Noise Type: children_playing
Selected Threshold for children_playing: 0.15
Denoised audio saved as 'denoised_output.wav'.
