In [7]:
import os
import librosa
import numpy as np
import pandas as pd
import IPython.display as ipd
import librosa.display
from tqdm import tqdm

In [8]:
import sys
print(sys.executable)

D:\TECH\jarvic\newenv\Scripts\python.exe


In [48]:
def extract_features_fixed(file_path):
    # Load audio
    audio, sr = librosa.load(file_path, res_type='kaiser_fast')
    
    # FIX: If the audio is shorter than n_fft (2048), pad it with zeros
    if len(audio) < 2048:
        # Pad with zeros to at least 2048 samples
        audio = np.pad(audio, (0, 2048 - len(audio)), mode='constant')
    
    # Now extract spectrogram or MFCC safely
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512)
    return mel

In [50]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

def prepare_data_hybrid(metadata_path, audio_dir, n_mels=128, duration=4):
    X, y = [], []
    metadata = pd.read_csv(metadata_path)
    sr = 22050
    input_length = sr * duration

    for index, row in tqdm(metadata.iterrows(), total=metadata.shape[0]):
        file_path = os.path.join(audio_dir, f"fold{row['fold']}", row['slice_file_name'])
        
        try:
            # 1. Load audio
            audio, _ = librosa.load(file_path, sr=sr, res_type='kaiser_fast')
            
            # 2. FIX: Padding short signals to prevent n_fft error and maintain shape
            if len(audio) < input_length:
                audio = np.pad(audio, (0, input_length - len(audio)), mode='constant')
            else:
                audio = audio[:input_length]
            
            # 3. Extract Log-Mel Spectrogram (2D Feature)
            # n_fft=2048 is safe now because signal is at least 4 seconds
            mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, n_fft=2048)
            log_mel = librosa.power_to_db(mel, ref=np.max)
            
            # Normalize (Min-Max)
            log_mel = (log_mel - log_mel.min()) / (log_mel.max() - log_mel.min() + 1e-6)
            
            X.append(log_mel)
            y.append(row['classID'])
            
        except Exception as e:
            continue

    # Convert to 4D array: (Samples, Mels, Time, 1)
    X = np.array(X)[..., np.newaxis]
    y = np.array(y)
    return X, y

# EXECUTE DATA PREP
# Update these paths to your D: drive paths
meta_path = r'D:\TECH\New folder iiser\New folder\UrbanSound8K\UrbanSound8K\metadata\UrbanSound8K.csv'
audio_path = r'D:\TECH\New folder iiser\New folder\UrbanSound8K\UrbanSound8K\audio'

X, y = prepare_data_hybrid(meta_path, audio_path)
print(f"Final Shapes: X={X.shape}, y={y.shape}") # Should be (8732, 128, 173, 1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8732/8732 [06:28<00:00, 22.50it/s]


Final Shapes: X=(8732, 128, 173, 1), y=(8732,)


In [45]:
X.size

193361408

In [51]:
y


array([3, 2, 2, ..., 1, 1, 1], shape=(8732,))

In [52]:
# import librosa
# import numpy as np

# # Load audio
# y, sr = librosa.load(r"D:\TECH\New folder iiser\New folder\UrbanSound8K\UrbanSound8K\audio\fold4\7389-1-0-0.wav", sr=16000)

# # Generate mel-spectrogram
# S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)

# # Convert to log scale (optional, common for ML)
# S_db = librosa.power_to_db(S, ref=np.max)

# # Inspect shape
# print(S_db.shape)  # (128, N_frames)


In [53]:
import tensorflow as tf
from tensorflow.keras import layers, models

def build_hybrid_model(input_shape=(128, 173, 1), num_classes=10):
    inputs = layers.Input(shape=input_shape)

    # --- CNN BLOCK (Spatial Feature extraction) ---
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.BatchNormalization()(x)

    # --- RESHAPE FOR TRANSFORMER ---
    # Convert height/width into a sequence of feature vectors
    _, h, w, c = x.shape
    x = layers.Reshape((w, h * c))(x) 
    x = layers.Dense(128, activation='relu')(x)

    # --- TRANSFORMER BLOCK (Temporal relationships) ---
    attention = layers.MultiHeadAttention(num_heads=4, key_dim=128)(x, x)
    x = layers.Add()([x, attention]) 
    x = layers.LayerNormalization()(x)
    
    # Feed-forward
    ff = layers.Dense(256, activation='relu')(x)
    ff = layers.Dense(128)(ff)
    x = layers.Add()([x, ff])
    x = layers.LayerNormalization()(x)

    # --- CLASSIFIER ---
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# model = build_hybrid_model()

In [54]:
model = build_hybrid_model(input_shape=(128, 173, 1), num_classes=10)

In [55]:
from sklearn.model_selection import train_test_split

# 1. Split (Perfectly synchronized)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Callbacks for better results
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 3. Train
history = model.fit(
    X_train, y_train, 
    validation_data=(X_test, y_test), 
    epochs=50, 
    batch_size=32, 
    callbacks=[early_stop]
)

Epoch 1/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 186ms/step - accuracy: 0.3592 - loss: 1.7915 - val_accuracy: 0.1162 - val_loss: 5.3136
Epoch 2/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 183ms/step - accuracy: 0.5525 - loss: 1.2616 - val_accuracy: 0.1259 - val_loss: 4.6159
Epoch 3/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 186ms/step - accuracy: 0.6527 - loss: 1.0079 - val_accuracy: 0.4848 - val_loss: 1.6398
Epoch 4/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 201ms/step - accuracy: 0.7127 - loss: 0.8478 - val_accuracy: 0.4820 - val_loss: 1.9318
Epoch 5/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 202ms/step - accuracy: 0.7506 - loss: 0.7405 - val_accuracy: 0.4127 - val_loss: 2.1706
Epoch 6/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 200ms/step - accuracy: 0.8107 - loss: 0.5881 - val_accuracy: 0.6852 - val_loss: 0.9560
Epoch 7/50

In [56]:
# Save the entire model (Architecture + Weights + Optimizer state)
model.save(r'D:\TECH\New folder iiser\my_hybrid_model.keras')

In [61]:
import librosa
import numpy as np

def prepare_single_prediction(file_path):
    # 1. Load the audio (standardized to 4s and 22050Hz)
    audio, sr = librosa.load(file_path, sr=22050, duration=4)
    
    # 2. Ensure length is consistent (Padding)
    if len(audio) < sr * 4:
        audio = np.pad(audio, (0, sr * 4 - len(audio)))
    else:
        audio = audio[:sr * 4]
        
    # 3. Extract Log-Mel Spectrogram (Matching your training shape)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    log_mel = librosa.power_to_db(mel, ref=np.max)
    
    # 4. Normalize (0 to 1)
    log_mel = (log_mel - log_mel.min()) / (log_mel.max() - log_mel.min() + 1e-6)
    
    # 5. Add "Batch" and "Channel" dimensions
    # Shape changes from (128, 173) to (1, 128, 173, 1)
    return log_mel[np.newaxis, ..., np.newaxis]

In [67]:
# 1. Define the path
file_path = r"D:\TECH\New folder iiser\New folder\UrbanSound8K\UrbanSound8K\audio\fold8\54383-0-0-5.wav"

# 2. Transform the file into model-ready data
some_new_audio_data = prepare_single_prediction(file_path)

# 3. Predict!
prediction = loaded_model.predict(some_new_audio_data)

# 4. Interpret the result
classes = ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 
           'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']

# The model returns a list of probabilities; np.argmax picks the highest one
predicted_index = np.argmax(prediction)
print(f"Predicted Sound: {classes[predicted_index]}")
print(f"Confidence: {prediction[0][predicted_index] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Predicted Sound: air_conditioner
Confidence: 50.67%
