In [1]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

# Paths
metadata_path = "../data/noise_data.csv"  # Metadata file
audio_dir = "../data/noise_data_audio"    # Folder containing subfolders with audio files

# Load metadata
metadata = pd.read_csv(metadata_path)

# Map sound classes to Safe(0) and Unsafe(1)
# Example: quiet sounds like "air_conditioner" = Safe (0), loud like "jackhammer" = Unsafe (1)
safe_classes = ['air_conditioner', 'engine_idling', 'children_playing']
metadata['label'] = metadata['class'].apply(lambda x: 0 if x in safe_classes else 1)

X, y = [], []

# Extract MFCC features
for index, row in metadata.iterrows():
    file_path = os.path.join(audio_dir, f"fold{row['fold']}", row['slice_file_name'])
    try:
        # Load audio
        y_audio, sr = librosa.load(file_path, duration=5.0)
        # Extract MFCCs
        mfcc = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=40)
        mfcc_scaled = np.mean(mfcc.T, axis=0)
        X.append(mfcc_scaled)
        y.append(row['label'])
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Convert to arrays
X = np.array(X)
y = np.array(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train RandomForest
print("Training Random Forest Model...")
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model
model_path = "../models/noise_model.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(model, f)

print(f"✅ Noise Pollution Model Trained and Saved at {model_path}")




Training Random Forest Model...
✅ Noise Pollution Model Trained and Saved at ../models/noise_model.pkl
