In [None]:
import os
import csv
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, ReLU
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from PIL import Image
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import Input, Add
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Directory structure for storing audio data, spectrograms, and training/validation datasets.
def setup_directories(base_dir):
    subdirs = {
        # Serves as Base directory for all data
        "base_dir": base_dir,

        # These are the Raw audio directories
        "animal_dir": os.path.join(base_dir, "Animal Sounds"),
        "env_dir": os.path.join(base_dir, "Environment"),

        # Spectrogram storage
        "spectrogram_dir": os.path.join(base_dir, "Spectrograms"),
        "animal_spectrogram_dir": os.path.join(base_dir, "Spectrograms", "Animal Sounds"),
        "env_spectrogram_dir": os.path.join(base_dir, "Spectrograms", "Environment"),
        "mixed_spectrogram_dir": os.path.join(base_dir, "Spectrograms", "Mixed Sounds"),

        # The Train and validation directories
        "train_dir": os.path.join(base_dir, "Spectrograms", "train"),
        "val_dir": os.path.join(base_dir, "Spectrograms", "validation"),
        "train_animal_dir": os.path.join(base_dir, "Spectrograms", "train", "Animal Sounds"),
        "train_env_dir": os.path.join(base_dir, "Spectrograms", "train", "Environment"),
        "val_animal_dir": os.path.join(base_dir, "Spectrograms", "validation", "Animal Sounds"),
        "val_env_dir": os.path.join(base_dir, "Spectrograms", "validation", "Environment"),
    }

    return create_directories(base_dir, subdirs)


def create_directories(base_dir, subdirs):
    directories = {key: os.path.join(base_dir, path) for key, path in subdirs.items()}

    for path in directories.values():
        os.makedirs(path, exist_ok=True)
        print(f"Directory created (or already exists): {path}")

    return directories


# Base directory
base_dir = r"C:\Users\riley\Documents"

# Setup directories
directories = setup_directories(base_dir)

# Processing paths for specific audio categories
animal_dir = directories["animal_dir"]
env_dir = directories["env_dir"]
animal_spectrogram_dir = directories["animal_spectrogram_dir"]
env_spectrogram_dir = directories["env_spectrogram_dir"]
mixed_spectrogram_dir = directories["mixed_spectrogram_dir"]
train_dir = directories["train_dir"]
val_dir = directories["val_dir"]

# Processing paths for training and validation subcategories
train_animal_dir = directories["train_animal_dir"]
train_env_dir = directories["train_env_dir"]
val_animal_dir = directories["val_animal_dir"]
val_env_dir = directories["val_env_dir"]


# Bandpass filter to retain frequencies between low and high
def apply_bandpass(y, sr, low=500, high=8000):
    sos = scipy.signal.butter(10, [low, high], btype='band', fs=sr, output='sos')
    return scipy.signal.sosfilt(sos, y)


# Normalize the audio signal to range between -1 and 1
def normalize(y):
    return y / np.max(np.abs(y))


# Remove silent sections from the audio signal based on amplitude threshold
def remove_silence(y, sr):
    intervals = librosa.effects.split(y, top_db=20)
    return np.concatenate([y[start:end] for start, end in intervals])


# Convert an audio signal to a mel spectrogram, save as an image, and resize
def saved_spectrogram(y, sr, output_path, size=(128, 128)):
     # Generates mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     # Convert power spectrogram to decibel (logarithmic scale)
    S_dB = librosa.power_to_db(S, ref=np.max)
    # Plot and saves the spectrogram
    plt.figure(figsize=(6, 6))  # Turn off the axis in order for clean image
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='magma')  # Displays spectrogram
    plt.axis('off')  # Close the plot to free memory
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)  # Resizes and convert the image to RGB format
    plt.close()
    Image.open(output_path).convert('RGB').resize(size).save(output_path)


# Augmentation setup
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5), #Gaussian noise is added to the audio
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5), # Stretches or compresses time and makes sure it maintains pitch 
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),  #pitch is shifted up by up to +/- 4 semitones
    Shift(min_shift=-0.5, max_shift=0.5, p=0.5) #audio is shifted in time by a fraction of the total duration
])

# Process audio files to generate spectrograms
def processing_audio(input_dir, output_dir, augmentations):
    os.makedirs(output_dir, exist_ok=True)
    files = [f for f in os.listdir(input_dir) if f.endswith('.wav')] # Retrieve all .wav files in the input directory
    for file in tqdm(files, desc="Processing audio files", unit="file"):  # Loop through each audio file
        try:
            for i in range(5):  # Generate 5 augmented versions per file
                output_path = os.path.join(output_dir, f"{os.path.splitext(file)[0]}_aug_{i}.png")
                if os.path.exists(output_path):
                    continue # Skip processing if the file already exists
                file_path = os.path.join(input_dir, file)
                y, sr = librosa.load(file_path, sr=None)
                y_aug = augmentations(samples=y, sample_rate=sr)  # Load with the original sampling rate
                y_cleaned = remove_silence(normalize(apply_bandpass(y_aug, sr)), sr) # Apply audio augmentations
                saved_spectrogram(y_cleaned, sr, output_path) # Generate and save the spectrogram
        except Exception as e:
            print(f"Error processing {file}: {e}")  # Notify any errors during processing


# Generate mixed spectrograms by combining animal and environmental sounds
def generate_mixed_sounds(animal_dir, env_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    animal_files = [f for f in os.listdir(animal_dir) if f.endswith('.wav')]
    env_files = [f for f in os.listdir(env_dir) if f.endswith('.wav')]

    for animal_file in tqdm(animal_files, desc="Generating mixed sounds", unit="file"):
        try:
            animal_path = os.path.join(animal_dir, animal_file)
            env_file = np.random.choice(env_files)
            env_path = os.path.join(env_dir, env_file)

            y_animal, sr_animal = librosa.load(animal_path, sr=None)
            y_env, sr_env = librosa.load(env_path, sr=None)

            if sr_animal != sr_env:
                y_env = librosa.resample(y_env, orig_sr=sr_env, target_sr=sr_animal)
                sr_env = sr_animal

            min_len = min(len(y_animal), len(y_env))
            y_animal, y_env = y_animal[:min_len], y_env[:min_len]
            y_mixed = normalize(y_animal + y_env)

            output_path = os.path.join(output_dir, f"{os.path.splitext(animal_file)[0]}_mixed.png")
            saved_spectrogram(y_mixed, sr_animal, output_path)
        except Exception as e:
            print(f"Error generating mixed sound for {animal_file}: {e}")


# Distribute mixed spectrograms into Animal and Environment categories
def mixed_spectrograms(mixed_dir, target_dirs):
     # retrieve list of all mixed spectrogram files in the directory
    mixed_files = [f for f in os.listdir(mixed_dir) if f.endswith('.png')]
    # Iterate through each mixed spectrogram file
    for file in mixed_files:
        target_dir = np.random.choice(target_dirs)  # Randomly choose a target directory
        # Construct full source and destination file paths
        src = os.path.join(mixed_dir, file)
        dest = os.path.join(target_dir, file)
         # Copys the file from the mixed directory to the target directory
        shutil.copy(src, dest)
     


# Process the animal, environmental, and mixed audio files to generate spectrograms
processing_audio(animal_dir, animal_spectrogram_dir, augment)
processing_audio(env_dir, env_spectrogram_dir, augment)
generate_mixed_sounds(animal_dir, env_dir, mixed_spectrogram_dir)

# Integrates the mixed spectrograms into the training and validation datasets
mixed_spectrograms(
    mixed_spectrogram_dir,
    [directories["train_animal_dir"], directories["train_env_dir"]]
)
mixed_spectrograms(
    mixed_spectrogram_dir,
    [directories["val_animal_dir"], directories["val_env_dir"]]
)

# Split data into training and validation sets
def split_data(input_dir, output_dirs, test_size=0.2):
     # Retrieve the list of spectrogram files in the input directory
    files = [f for f in os.listdir(input_dir) if f.endswith('.png')]
    train_files, val_files = train_test_split(files, test_size=test_size, random_state=42)

    for files, out_dir in zip([train_files, val_files], output_dirs):
        os.makedirs(out_dir, exist_ok=True) # Ensure the directory exists
        for file in files:
            src = os.path.join(input_dir, file)
            dest = os.path.join(out_dir, file)
            shutil.copy(src, dest)


# Run the updated split_data function for animal and environmental spectrograms
split_data(animal_spectrogram_dir, [directories["train_animal_dir"], directories["val_animal_dir"]])
split_data(env_spectrogram_dir, [directories["train_env_dir"], directories["val_env_dir"]])

# Data Generators
# Training data generator with real-time data augmentation
train_generator = ImageDataGenerator(
    rescale=1.0 / 255.0,  # Normalizes the pixel values to the range [0, 1]
    rotation_range=45,  # Random rotation in the range [-30, 30] degrees
    width_shift_range=0.5,  # Random horizontal shift by up to 30% of the image width
    height_shift_range=0.5,  # Random vertical shift by up to 30% of the image height
    zoom_range=0.5,  # Random zoom by up to 30%
    horizontal_flip=True,  # Random horizontal flipping of images
    vertical_flip=True,  # Vertical flipping
    shear_range=0.2,  # Shear transformations
    fill_mode='nearest'  # Fills empty pixels created by transformations
).flow_from_directory(
    train_dir,  # Directory containing training data
    target_size=(128, 128),  # Resizes the images to 128x128
    batch_size=16,  # Generates batches of 16 images
    class_mode='categorical',  # Multi-class classification
    classes=['Animal Sounds', 'Environment']  # Only two classes
)

# Validation data generator - only rescaling
val_gen = ImageDataGenerator(
    rescale=1.0 / 255.0  # Normalizes pixel values to the range [0, 1]
).flow_from_directory(
    val_dir,  # Directory containing validation data
    target_size=(128, 128),  # Resizes images to 128x128
    batch_size=16,  # Generates batches of 16 images
    class_mode='categorical',  # Multi-class classification
    classes=['Animal Sounds', 'Environment']  # Only two classes
)


# CNN Model Definition
def cnn_model(input_shape=(128, 128, 3), l2_strength=0.001, dropout_rate=0.4):
    # Input layer
    inputs = Input(shape=input_shape)

    # First convolutional block
    x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_strength))(inputs)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = MaxPooling2D((2, 2))(x)
    # Residual connection for the first block (apply MaxPooling2D to match spatial dimensions)
    residual_1 = Conv2D(32, (1, 1), padding='same', kernel_initializer='he_normal')(inputs)
    residual_1 = MaxPooling2D((2, 2))(residual_1)
    x = Add()([x, residual_1])

    # Second convolutional block
    x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_strength))(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = MaxPooling2D((2, 2))(x)
    # Residual connection for the second block
    residual_2 = Conv2D(64, (1, 1), padding='same', kernel_initializer='he_normal')(residual_1)
    residual_2 = MaxPooling2D((2, 2))(residual_2)
    x = Add()([x, residual_2])

    # Third convolutional block
    x = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_strength))(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = MaxPooling2D((2, 2))(x)
    # Residual connection for the third block
    residual_3 = Conv2D(128, (1, 1), padding='same', kernel_initializer='he_normal')(residual_2)
    residual_3 = MaxPooling2D((2, 2))(residual_3)
    x = Add()([x, residual_3])

    # Fourth convolutional block
    x = Conv2D(256, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_strength))(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = MaxPooling2D((2, 2))(x)
    # Residual connection for the fourth block
    residual_4 = Conv2D(256, (1, 1), padding='same', kernel_initializer='he_normal')(residual_3)
    residual_4 = MaxPooling2D((2, 2))(residual_4)
    x = Add()([x, residual_4])

    # Global Average Pooling
    x = GlobalAveragePooling2D()(x)

    # Fully connected layer
    x = Dense(128, kernel_initializer='he_normal', kernel_regularizer=l2(l2_strength))(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Dropout(dropout_rate)(x)

    # Output layer
    outputs = Dense(2, activation='softmax')(x)  # Adjust output to 2 classes (Animal, Environment)

    # Model instantiation
    model = Model(inputs=inputs, outputs=outputs)
    return model


# CNN model
model = cnn_model()

# Compile model
model.compile(
    optimizer=Adam(learning_rate=0.0001),  # Adam optimizer with low learning rate
    loss='categorical_crossentropy',  # Loss function for multi-class classification
    metrics=['accuracy']  # Evaluation metric
)

# Early stopping to prevent overfitting
stop_early = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Train the model
model.fit(
    train_generator,  # Training data generator
    validation_data=val_gen,  # Validation data generator
    epochs=20,  # Maximum number of epochs for training
    callbacks=[stop_early]  # Early stopping callback
)

# generate classifcation rerport
def gen_classification_report(model, data_generator):
    # Reset the generator
    data_generator.reset()
    
    # Predict on the data generator
    print("Generating predictions...")
    y_pred_probs = model.predict(data_generator, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class indices
    
    # True labels
    y_true = data_generator.classes
    
    # Class labels
    class_labels = list(data_generator.class_indices.keys())
    
    # Generate classification report
    print("Classification Report:")
    report = classification_report(y_true, y_pred, target_names=class_labels)
    print(report)

    # display confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels)
    disp.plot(cmap='viridis', values_format='d')
    plt.title("Confusion Matrix")
    plt.show()


gen_classification_report(model, val_gen)


# Path to save the model
documents_path = os.path.expanduser(r"C:\Users\riley\Documents")  
model_save_path = os.path.join(documents_path, "audio_model.h5")

# Save the trained model to the Documents directory
model.save(model_save_path)  # Saves the model in HDF5 format
print(f"Model saved as '{model_save_path}'")





Directory created (or already exists): C:\Users\riley\Documents
Directory created (or already exists): C:\Users\riley\Documents\Animal Sounds
Directory created (or already exists): C:\Users\riley\Documents\Environment
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\Animal Sounds
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\Environment
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\Mixed Sounds
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\train
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\validation
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\train\Animal Sounds
Directory created (or already exists): C:\Users\riley\Documents\Spectrograms\train\Environment
Directory created (or already exists): C:\Users\riley\Documents\Spectrog

Processing audio files: 100%|██████████████████████████████████████████████████████| 32/32 [00:00<00:00, 3210.57file/s]
Processing audio files: 100%|██████████████████████████████████████████████████████| 33/33 [00:00<00:00, 3001.26file/s]
Generating mixed sounds:  28%|███████████████▊                                        | 9/32 [00:05<00:13,  1.71file/s]

In [18]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from tensorflow.keras.models import Model

#load the trained model
def load_trained_model(model_path):

    model = load_model(model_path)
    print("Model loaded successfully.")
    return model


#generate spectrograms from audio files
def test_spectrograms(input_folder, output_folder):
    def save_spectrogram(y, sr, output_path):
        plt.figure(figsize=(2, 2)) #figure size 
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128) #Compute mel spectrogram
        S_DB = librosa.power_to_db(S, ref=np.max) #decibel scale
        librosa.display.specshow(S_DB, sr=sr, cmap='inferno') #Display 
        plt.axis('off') #Hide axes 
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0) # Save spectrogram
        plt.close() # Close the plot 

    os.makedirs(output_folder, exist_ok=True) # Ensure the output folder exists
    files = [f for f in os.listdir(input_folder) if f.endswith('.wav')] # Get all .wav files

    for file in tqdm(files, desc="Processing test audio files"): # Show progress
        file_path = os.path.join(input_folder, file)   # Load audio file
        output_path = os.path.join(output_folder, f"{os.path.splitext(file)[0]}.png")
        if not os.path.exists(output_path): # Skip if the spectrogram already exists
            try:
                y, sr = librosa.load(file_path, sr=None) 
                save_spectrogram(y, sr, output_path) # Generate and save spectrogram
            except Exception as e:
                print(f"Error processing {file}: {e}")  # Handle errors

# predict classes based on spectrograms
def predict(model, spectrogram_folder, ground_truth_csv, class_names):
    
    def load_spectrogram(file_path):
        img = load_img(file_path, target_size=(128, 128)) # Resize the image
        img_array = img_to_array(img) / 255.0 # Normalize pixel values
        return np.expand_dims(img_array, axis=0)  # Add batch dimension

    test_files = [f for f in os.listdir(spectrogram_folder) if f.endswith('.png')]  # Get all spectrogram images
    ground_truth = pd.read_csv(ground_truth_csv) # Load ground truth labels


    actual_labels = [] #store class labels
    predicted_labels = [] #predcit class labels 

    for file in test_files:
        file_path = os.path.join(spectrogram_folder, file)
        try:
            spectrogram = load_spectrogram(file_path)  # Load and preprocess spectrogram
            prediction = model.predict(spectrogram) # Predicts class probabilities
            predicted_class = np.argmax(prediction, axis=1)[0] # predicted class index
            confidence = np.max(prediction) #confidence score

            ground_truth_row = ground_truth[ground_truth['file_name'] == file] # Finds ground truth label
            if not ground_truth_row.empty:
                actual_class = ground_truth_row['ground_truth'].values[0]
                actual_labels.append(actual_class)
                predicted_labels.append(class_names[predicted_class])

                if class_names[predicted_class] == actual_class:
                    print(f"Correct: {file} -> {class_names[predicted_class]} ({confidence:.2f} confidence)")
                else:
                    print(f"Incorrect: {file} -> Predicted: {class_names[predicted_class]} ({confidence:.2f} confidence), "
                          f"Actual: {actual_class}")
            else:
                print(f"Ground truth not found for {file}. Skipping.")
        except Exception as e:
            print(f"Error predicting {file}: {e}")
            
     # Calculate and display accuracy 
    if actual_labels:
        accuracy = accuracy_score(actual_labels, predicted_labels) * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
    else:
        print("Ground truth not available!")



# Paths
test_audio_folder = r"C:\Users\riley\Documents\Test Sounds"  # Path to test audio files
spectrogram_folder = r"C:\Users\riley\Documents\Spectrograms\test"  # Path to save spectrograms
model_path = r"C:\Users\riley\Documents\audio_model.h5"  # Path to the trained model
ground_truth_csv = r"C:\Users\riley\Documents\ground_truth_1.csv"  # Path to ground truth CSV
class_names = {0: "Animal Sounds", 1: "Environment Sounds"}  # Class index-to-name mapping

# Load the trained model
model = load_trained_model(model_path)

# Generate spectrograms from test audio files
test_spectrograms(test_audio_folder, spectrogram_folder)

# Predict and evaluate
predict(model, spectrogram_folder, ground_truth_csv, class_names)





Model loaded successfully.


Processing test audio files: 100%|██████████████████████████████████████████████████| 37/37 [00:00<00:00, 18538.91it/s]


Incorrect: alligator hiss.png -> Predicted: Environment Sounds (0.85 confidence), Actual: Animal Sounds
Incorrect: baby pelican.png -> Predicted: Environment Sounds (0.67 confidence), Actual: Animal Sounds
Incorrect: baby woodstorks.png -> Predicted: Environment Sounds (0.84 confidence), Actual: Animal Sounds
Incorrect: chirpy grey bird.png -> Predicted: Environment Sounds (0.89 confidence), Actual: Animal Sounds
Incorrect: cribbler tree peeling.png -> Predicted: Animal Sounds (0.63 confidence), Actual: Environment Sounds
Correct: daintree bat.png -> Animal Sounds (0.94 confidence)
Incorrect: falling branches.png -> Predicted: Animal Sounds (0.98 confidence), Actual: Environment Sounds
Correct: Giant Banjo Frogs.png -> Animal Sounds (0.78 confidence)
Incorrect: Glider Squirrel.png -> Predicted: Environment Sounds (0.80 confidence), Actual: Animal Sounds
Correct: Golden Bowerbird.png -> Animal Sounds (0.72 confidence)
Correct: goldfinch.png -> Animal Sounds (0.66 confidence)
Correct: he