<a href="https://colab.research.google.com/github/AceCentre/SoundSwitch/blob/main/SoundDetect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Positive Clips**
The positive_clips should contain audio samples of the sound you are interested in detecting. For example, if you are building a system to detect the sound of a specific word being spoken, your positive clips would contain various instances of that word being spoken by different people, in different tones, and possibly with background noise.

**Negative Clips**
The negative_clips should contain audio samples that are representative of the types of sounds that the system will encounter but should not react to. This could include background noise, other words being spoken, or any other sounds that are not the target sound. These clips are used to test the system's ability to correctly identify non-target sounds as negative.

**Template Clips**
The templates are pre-recorded audio clips that are used as a basis for comparison with incoming audio data. These should be the clearest examples of the sound you are trying to detect. In this notebook, they are loaded from files named Heather1.wav and Heather2.wav. The Mel spectrograms of these templates are computed and stored in S1 and S2.

In [None]:
!pip install fastdtw

import numpy as np
import librosa
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import time
import os
from sklearn.metrics.pairwise import euclidean_distances

Collecting fastdtw
  Downloading fastdtw-0.3.4.tar.gz (133 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/133.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m133.1/133.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fastdtw
  Building wheel for fastdtw (setup.py) ... [?25l[?25hdone
  Created wheel for fastdtw: filename=fastdtw-0.3.4-cp310-cp310-linux_x86_64.whl size=512378 sha256=ceeb4d66d18d5df175954551e4d66a32301e9a6488a8d4d2347341f7ef621704
  Stored in directory: /root/.cache/pip/wheels/73/c8/f7/c25448dab74c3acf4848bc25d513c736bb93910277e1528ef4
Successfully built fastdtw
Installing collected packages: fastdtw
Successfully installed fastdtw-0.3.4


In [None]:
#Load clips positive and negative

def load_clips(folder, sr=44100):
    """Load every WAV file found directly inside *folder*.

    Args:
        folder: Path of the directory to scan. Sub-directories are not
            searched.
        sr: Sample rate handed to ``librosa.load``. Defaults to 44100, the
            value used everywhere else in this notebook.

    Returns:
        A list holding, for each ``.wav`` file, the audio data returned by
        ``librosa.load`` (its sample-rate return value is discarded). Clips
        are ordered by file name.
    """
    clips = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the clip
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(folder)):
        # Compare case-insensitively so files such as "CLIP.WAV" are not
        # silently skipped.
        if not filename.lower().endswith(".wav"):
            continue
        audio, _ = librosa.load(os.path.join(folder, filename), sr=sr)
        clips.append(audio)
    return clips

# Mount Google Drive in the Colab runtime, then read both evaluation sets
# from the shared sample folder.
from google.colab import drive

drive.mount('/content/drive')

_SAMPLES_ROOT = "/content/drive/My Drive/SoundDetectSamples"
negative_clips = load_clips(f"{_SAMPLES_ROOT}/Background Clips")
positive_clips = load_clips(f"{_SAMPLES_ROOT}/Positive Clips")

Mounted at /content/drive


In [None]:
def test_method(method, positive_clips, background_noises, *args):
    """Run a per-clip detector over both clip sets and report the outcome.

    Args:
        method: Callable invoked as ``method(clip, *args)``. A truthy return
            value is counted as "target sound detected".
        positive_clips: Clips on which a detection is the correct answer.
        background_noises: Clips on which a detection is a false alarm.
        *args: Extra positional arguments forwarded to every ``method`` call.

    Returns:
        A dict with the keys ``true_positives``, ``false_negatives``,
        ``true_negatives`` and ``false_positives``, i.e. the same layout the
        ``*_detect`` functions in this notebook return, so it can be passed
        to ``evaluate_results``. The counts and the elapsed time are also
        printed.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Positive clips are evaluated first, then the background clips.
    positive_hits = [bool(method(clip, *args)) for clip in positive_clips]
    noise_hits = [bool(method(noise, *args)) for noise in background_noises]

    elapsed_time = time.perf_counter() - start_time

    true_positives = sum(positive_hits)
    false_negatives = len(positive_hits) - true_positives
    false_positives = sum(noise_hits)
    true_negatives = len(noise_hits) - false_positives

    print(f"Method: {method.__name__}")
    print(f"True Positives: {true_positives}")
    print(f"False Negatives: {false_negatives}")
    print(f"True Negatives: {true_negatives}")
    print(f"False Positives: {false_positives}")
    print(f"Time taken: {elapsed_time} seconds")

    return {
        'true_positives': true_positives,
        'false_negatives': false_negatives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
    }


# Despite its name this is a benchmarking helper, not a test case; opt out of
# pytest collection so the "test_" prefix does not make pytest try to run it.
test_method.__test__ = False


The first function uses Mel spectrograms for feature extraction and Dynamic Time Warping (DTW) for comparing the features. The primary steps are:

Compute the Mel spectrogram of the audio clip.
Use DTW to find the minimum distance between the Mel spectrogram of the audio clip and the Mel spectrograms of the templates.
Compare the minimum distance to a threshold to make a classification decision.

In [None]:
def melspectrogram_detect(positive_clips, background_noises, templates, sr=44100, threshold=1000):
    """Classify clips by DTW distance between Mel spectrograms and templates.

    A clip counts as a detection when the smallest ``fastdtw`` distance
    between its Mel spectrogram and any template is below ``threshold``.

    Args:
        positive_clips: Audio signals on which a detection is correct.
        background_noises: Audio signals on which a detection is a false
            alarm.
        templates: Template Mel spectrograms to compare against. They are
            expected to be computed with the same ``n_mels=128`` used here.
        sr: Sample rate passed to ``librosa.feature.melspectrogram``.
        threshold: Upper bound (exclusive) on the DTW distance for a clip to
            be reported as a detection.

    Returns:
        A dict with the integer counts ``true_positives``,
        ``false_negatives``, ``true_negatives`` and ``false_positives``.
    """
    results = {
        'true_positives': 0,
        'false_negatives': 0,
        'true_negatives': 0,
        'false_positives': 0
    }

    def detect_sound(audio_signal):
        """Return whether the closest template is nearer than ``threshold``."""
        S = librosa.feature.melspectrogram(y=audio_signal, sr=sr, n_mels=128)
        # Both spectrograms are transposed so fastdtw aligns them along their
        # second axis, comparing one column of Mel bands at a time.
        # With no templates the distance stays infinite: nothing is detected.
        min_distance = min(
            (fastdtw(S.T, template.T, dist=euclidean)[0] for template in templates),
            default=float('inf'),
        )
        return min_distance < threshold

    # Test on positive clips
    for clip in positive_clips:
        if detect_sound(clip):
            results['true_positives'] += 1
        else:
            results['false_negatives'] += 1

    # Test on background noise clips. Iterate the argument, not the
    # notebook-level negative_clips global, so the caller's data is used.
    for noise in background_noises:
        if detect_sound(noise):
            results['false_positives'] += 1
        else:
            results['true_negatives'] += 1

    return results

The second function uses MFCCs (Mel-Frequency Cepstral Coefficients), which are often used in speech and audio processing to capture the timbral texture of the audio. Here we use MFCCs instead of Mel spectrograms for feature extraction, and a sum of Euclidean distances between frames instead of DTW for comparison. This should provide a different perspective on the performance of sound detection techniques.

In [None]:
def mfcc_detect(positive_clips, background_noises, templates, sr=44100, threshold=1000):
    """Classify clips by comparing their MFCCs against template MFCCs.

    For each clip, 13 MFCCs are computed and compared with every template
    whose second dimension (``shape[1]``) equals the clip's. The score for a
    template is the sum of the full matrix returned by
    ``euclidean_distances`` for the transposed feature arrays. A clip is a
    detection when the smallest score is below ``threshold``.

    Args:
        positive_clips: Audio signals on which a detection is correct.
        background_noises: Audio signals on which a detection is a false
            alarm.
        templates: Template feature arrays to compare against.
        sr: Sample rate passed to ``librosa.feature.mfcc``.
        threshold: Upper bound (exclusive) on the score for a detection.

    Returns:
        A dict with the integer counts ``true_positives``,
        ``false_negatives``, ``true_negatives`` and ``false_positives``.
    """
    def is_match(signal):
        """Return whether any comparable template scores below ``threshold``."""
        features = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
        best = float('inf')
        for reference in templates:
            # Templates with a different shape[1] are ignored; if every
            # template is ignored, best stays infinite and the clip is
            # reported as not detected.
            if features.shape[1] == reference.shape[1]:
                pairwise = euclidean_distances(features.T, reference.T)
                best = min(best, np.sum(pairwise))
        return best < threshold

    tally = {
        'true_positives': 0,
        'false_negatives': 0,
        'true_negatives': 0,
        'false_positives': 0
    }

    # Positive clips: a match is a hit, anything else is a miss.
    for clip in positive_clips:
        outcome = 'true_positives' if is_match(clip) else 'false_negatives'
        tally[outcome] += 1

    # Background clips: a match is a false alarm.
    for noise in background_noises:
        outcome = 'false_positives' if is_match(noise) else 'true_negatives'
        tally[outcome] += 1

    return tally


In this version, we use MFCCs as features for an SVM classifier. We train the classifier using the mean MFCCs across time for each clip, labeling positive clips as 1 and negative clips as 0. After training, we test the classifier on both positive and negative clips and update the results dictionary accordingly. Note that the classifier is tested on the same clips it was trained on, so the reported metrics are optimistic.

In [1]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def svm_detect(positive_clips, background_noises, sr=44100):
    """Train an SVM on per-clip mean MFCCs and score it on the same clips.

    Each clip is reduced to the mean of its 13 MFCCs along ``axis=1``.
    Positive clips are labelled 1 and background clips 0. The classifier is
    evaluated on the clips it was trained on, so the counts describe the
    training fit rather than performance on unseen audio.

    Args:
        positive_clips: Audio signals containing the target sound.
        background_noises: Audio signals that do not contain it.
        sr: Sample rate passed to ``librosa.feature.mfcc``.

    Returns:
        A dict with the integer counts ``true_positives``,
        ``false_negatives``, ``true_negatives`` and ``false_positives``.

    Raises:
        ValueError: If either clip collection is empty, because a classifier
            cannot be trained without examples of both classes.
    """
    positive_clips = list(positive_clips)
    background_noises = list(background_noises)
    # Fail early with a clear message instead of an error raised from deep
    # inside scikit-learn's fit().
    if not positive_clips or not background_noises:
        raise ValueError(
            "svm_detect needs at least one positive clip and one background clip"
        )

    def clip_features(clip):
        """Return the clip's MFCCs averaged along axis 1."""
        return librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=13).mean(axis=1)

    # Extract the features once; they are reused for training and scoring.
    positive_features = [clip_features(clip) for clip in positive_clips]
    background_features = [clip_features(noise) for noise in background_noises]

    X_train = positive_features + background_features
    y_train = [1] * len(positive_features) + [0] * len(background_features)

    # Train the SVM classifier
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)

    # Predict each group in a single batch call.
    positive_predictions = clf.predict(positive_features)
    background_predictions = clf.predict(background_features)

    true_positives = int((positive_predictions == 1).sum())
    true_negatives = int((background_predictions == 0).sum())

    return {
        'true_positives': true_positives,
        'false_negatives': len(positive_features) - true_positives,
        'true_negatives': true_negatives,
        'false_positives': len(background_features) - true_negatives,
    }

In [3]:
# Read the two pre-recorded template clips at 44.1 kHz and turn each one into
# a 128-band Mel spectrogram for the DTW-based detector.
_TEMPLATE_ROOT = "/content/drive/My Drive/SoundDetectSamples"
audio1, sr = librosa.load(f"{_TEMPLATE_ROOT}/Heather1.wav", sr=44100)
audio2, _ = librosa.load(f"{_TEMPLATE_ROOT}/Heather2.wav", sr=44100)

S1, S2 = (
    librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=128)
    for signal in (audio1, audio2)
)

templates = [S1, S2]

def evaluate_results(results):
    """Print precision, recall, F1 score and accuracy for a results dict.

    Args:
        results: Mapping with the keys ``true_positives``,
            ``false_positives``, ``false_negatives`` and ``true_negatives``.

    Returns:
        None. The four metrics are printed with four decimal places. Any
        metric whose denominator is zero is reported as 0.0.
    """
    def ratio(numerator, denominator):
        # An undefined metric (zero denominator) is reported as 0.0.
        try:
            return numerator / denominator
        except ZeroDivisionError:
            return 0.0

    tp, fp, fn, tn = (
        results[key]
        for key in (
            'true_positives',
            'false_positives',
            'false_negatives',
            'true_negatives',
        )
    )

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1_score = ratio(2 * (precision * recall), precision + recall)
    accuracy = ratio(tp + tn, tp + tn + fp + fn)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    print(f"Accuracy: {accuracy:.4f}")

# Run every detector over the same positive and negative clip sets.
mel_results = melspectrogram_detect(positive_clips, negative_clips, templates)

# The MFCC detector must be compared against MFCC templates, not the Mel
# spectrograms in `templates`. audio1, audio2 and sr are still in scope from
# the template loading at the top of this cell, so the files are not re-read.
M1 = librosa.feature.mfcc(y=audio1, sr=sr, n_mfcc=13)
M2 = librosa.feature.mfcc(y=audio2, sr=sr, n_mfcc=13)
mfcc_templates = [M1, M2]

mfcc_results = mfcc_detect(positive_clips, negative_clips, mfcc_templates)
# Keep the original (misspelled) name available for any later cell using it.
mcfc_results = mfcc_results
svm_results = svm_detect(positive_clips, negative_clips)

# Evaluate and print metrics for each method
print("Metrics for Mel Spectrogram Detection:")
evaluate_results(mel_results)

print("\nMetrics for MFCC Detection:")
evaluate_results(mfcc_results)

print("\nMetrics for SVM Detection:")
evaluate_results(svm_results)

NameError: ignored