# Integrate YAMNet with any model

This code demonstrates the integration of the YAMNet sound event detection model with a custom, optimized audio classification model. By combining the capabilities of both, this integration forms the foundation of advanced sound event detection, paving the way for more precise audio classification tasks.

## **How it Works**:

1. **Load YAMNet Model**: Initializes the YAMNet model, which is a pre-trained deep net that predicts 521 audio event classes based on the AudioSet-YouTube corpus.

2. **Process Audio**: The provided audio file (`test.m4a`) is read and segmented into 1-second chunks.

3. **YAMNet Predictions**: Each chunk of audio data is passed through the YAMNet model. If the highest probability class from YAMNet is 'Animal' and the confidence is above 0.3, this chunk is further processed.

4. **Integrated Custom Model Prediction**: If YAMNet detects the presence of an 'Animal' sound, the chunk is sent to the custom model for a more detailed classification.

5. **Dataframe for Results**: A dataframe is maintained to store the results, capturing start time, end time, YAMNet prediction label and probability, and the custom model's prediction label and probability.

6. **Visualization**: The Mel spectrogram of the processed audio segment is displayed in real-time.

## **Usage**:

1. Ensure that you have all the necessary dependencies and modules imported.
2. Place your audio file (`test.m4a`) in the appropriate directory.
3. Run the script.
4. Inspect the dataframe `df` for the results. This dataframe captures the segment's start and end times, the label predicted by YAMNet, its confidence, and the label and confidence from the custom optimized model (if YAMNet detected an 'Animal').

## **Author**:
Rohit Dhanda

## **Note**:
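The script calls `predict_on_audio(binary_file.read())` to obtain predictions from your custom model. That function is not part of the YAMNet cell: it is defined in the "Optimised model" cell further down, so run that cell first (or provide your own implementation elsewhere in your project).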

---

I hope this documentation helps explain the function and use of your integrated code. If any modifications are needed, please let me know!

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import yamnet.params as params
import yamnet.yamnet as yamnet_model
import librosa
import tempfile
from collections import defaultdict


# Load the pre-trained YAMNet model and its class-name table.
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet/yamnet.h5')
yamnet_classes = yamnet_model.class_names('yamnet/yamnet_class_map.csv')

frame_len = int(params.SAMPLE_RATE * 1)  # number of samples in one second

# Read the whole audio file, resampled to the rate YAMNet expects.
filename = 'test.m4a'
data, sr = librosa.load(filename, sr=params.SAMPLE_RATE)

# Split the audio into consecutive 1-second chunks.
# Fixed-stride slicing is used instead of np.array_split(data, n): array_split
# cuts the signal into n *equal* parts, so chunks become longer than one second
# whenever the length is not an exact multiple of frame_len (making the
# start/end times below wrong), and it raises ValueError when the clip is
# shorter than one second (n == 0).
chunks = [data[offset:offset + frame_len] for offset in range(0, len(data), frame_len)]

# One dict per chunk; the DataFrame is built once after the loop because
# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop).
result_columns = ['start_time', 'end_time', 'yamnet_label', 'yamnet_probability',
                  'your_model_label', 'your_model_probability']
rows = []

plt.ion()
for cnt, frame_data in enumerate(chunks):
    print(len(frame_data))

    # Only the final chunk can be short; zero-pad it so every window handed to
    # the models is exactly one second long. Its end_time is therefore nominal.
    if len(frame_data) < frame_len:
        padding = frame_len - len(frame_data)
        frame_data = np.pad(frame_data, (0, padding), 'constant')

    start_time = cnt
    end_time = cnt + 1

    # model prediction (scores are averaged over the frames YAMNet produced)
    scores, melspec = yamnet.predict(np.reshape(frame_data, [1, -1]), steps=1)
    yamnet_prediction = np.mean(scores, axis=0)

    # visualize input audio
    plt.imshow(melspec.T, cmap='jet', aspect='auto', origin='lower')
    plt.pause(0.001)
    plt.show()

    top5_i = np.argsort(yamnet_prediction)[::-1][:5]
    top_label = yamnet_classes[top5_i[0]]
    top_probability = yamnet_prediction[top5_i[0]]

    your_model_label = None
    your_model_probability = None

    # If the top prediction is a confident 'Animal', send the chunk to the custom model.
    if top_label == 'Animal' and top_probability > 0.3:
        # The custom model consumes encoded audio bytes, so round-trip through a WAV file.
        # NOTE: re-opening a NamedTemporaryFile by name while it is still open
        # is not supported on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
            sf.write(temp_audio_file.name, frame_data, params.SAMPLE_RATE)
            with open(temp_audio_file.name, 'rb') as binary_file:
                ranked = predict_on_audio(binary_file.read())

        # predict_on_audio returns a ranked list of (label, score) pairs; keep
        # the best one. A plain (label, score) pair is accepted as well so a
        # user-supplied implementation with that contract keeps working.
        best = ranked[0] if isinstance(ranked[0], (tuple, list)) else ranked
        your_model_label, your_model_probability = best

    rows.append({
        'start_time': start_time,
        'end_time': end_time,
        'yamnet_label': top_label,
        'yamnet_probability': top_probability,
        'your_model_label': your_model_label,
        'your_model_probability': your_model_probability,
    })

# Dataframe holding one row per 1-second chunk
df = pd.DataFrame(rows, columns=result_columns)

# print the DataFrame
print(df)

## Optimised model
For this we changed `load_random_subsection` and had to make some changes to `combined_pipeline`.

In [None]:
import numpy as np
import tensorflow as tf
import librosa
import base64
import io



# Make sure to use the correct configuration.
# Every key below is read by combined_pipeline.
config = {
    # --- audio decoding / clip length ---
    'AUDIO_SAMPLE_RATE': 48000,   # rate passed to librosa.load and melspectrogram
    'AUDIO_CLIP_DURATION': 1,     # clip length in seconds
    # --- mel-spectrogram parameters ---
    'AUDIO_NFFT': 2048,           # n_fft
    'AUDIO_STRIDE': 200,          # hop_length
    'AUDIO_MELS': 260,            # n_mels
    'AUDIO_FMIN': 20,             # fmin
    'AUDIO_FMAX': 13000,          # fmax
    'AUDIO_WINDOW': None,         # win_length (None leaves librosa's default)
    'AUDIO_TOP_DB': 80,           # top_db for power_to_db
    # --- model input image ---
    'MODEL_INPUT_IMAGE_CHANNELS': 3,
    'MODEL_INPUT_IMAGE_WIDTH': 260,
    'MODEL_INPUT_IMAGE_HEIGHT': 260,
}

# Labels paired positionally with the model's output vector.
class_names = [
    'Aegotheles Cristatus',
    'Alauda Arvensis',
    'Caligavis Chrysops',
    'Capra Hircus',
    'Cervus Unicolour',
    'Colluricincla Harmonica',
    'Corvus Coronoides',
    'Dama Dama',
    'Eopsaltria Australis',
    'Felis Catus',
    'Pachycephala Rufiventris',
    'Ptilotula Penicillata',
    'Rattus Norvegicus',
    'Strepera Graculina',
    'Sus Scrofa',
]

# Load the model
model = tf.keras.models.load_model('models/echo_model/2/')

# Define the preprocessing steps as functions.



########################################################################################
# This function is adapted from generic_engine_pipeline.ipynb
# TODO: need to create a pipeline library and link same code into engine
########################################################################################
def combined_pipeline(config, audio_clip):
    """Turn encoded audio bytes into a normalised mel-spectrogram image.

    Args:
        config: Mapping providing the ``AUDIO_*`` and ``MODEL_INPUT_IMAGE_*``
            settings read below (sample rate, clip duration, FFT/mel
            parameters and the model input size).
        audio_clip: Encoded audio file contents as ``bytes`` (for example the
            bytes of a WAV file). A file path is not accepted here because the
            data is wrapped in ``io.BytesIO`` before decoding.

    Returns:
        A tuple ``(image, audio_clip, sample_rate)``: the resized spectrogram
        tensor rescaled to the range [0, 1], the float32 audio samples that
        were analysed, and the sample rate reported by ``librosa.load``.
    """
    # Decode from memory so the chunk bytes produced by the YAMNet loop can be used directly.
    file = io.BytesIO(audio_clip)
    audio_clip, sample_rate = librosa.load(file, sr=config['AUDIO_SAMPLE_RATE'])

    # Keep the right channel only.
    # NOTE: librosa.load defaults to mono=True, so a 2-D array is only seen if
    # the load call above is changed to mono=False.
    if audio_clip.ndim == 2 and audio_clip.shape[0] == 2:
        audio_clip = audio_clip[1, :]

    # cast to float32 type
    audio_clip = audio_clip.astype(np.float32)

    # Trim or zero-pad to exactly AUDIO_CLIP_DURATION seconds.
    audio_clip = load_random_subsection(audio_clip, duration_secs=config['AUDIO_CLIP_DURATION'])

    # librosa only accepts numpy arrays; the helper may hand back a TensorFlow
    # tensor when it had to pad, which would make melspectrogram raise.
    audio_clip = np.asarray(audio_clip, dtype=np.float32)

    # Compute the mel-spectrogram
    image = librosa.feature.melspectrogram(
        y=audio_clip,
        sr=config['AUDIO_SAMPLE_RATE'],
        n_fft=config['AUDIO_NFFT'],
        hop_length=config['AUDIO_STRIDE'],
        n_mels=config['AUDIO_MELS'],
        fmin=config['AUDIO_FMIN'],
        fmax=config['AUDIO_FMAX'],
        win_length=config['AUDIO_WINDOW'])

    # Convert the power spectrogram to decibel scale
    image = librosa.power_to_db(
        image,
        top_db=config['AUDIO_TOP_DB'],
        ref=1.0)

    # Number of spectrogram frames expected for one clip
    expected_clip_samples = int(config['AUDIO_CLIP_DURATION'] * config['AUDIO_SAMPLE_RATE'] / config['AUDIO_STRIDE'])

    # Put time on the first axis and drop surplus frames so the shape is exact
    image = np.moveaxis(image, 1, 0)
    image = image[0:expected_clip_samples, :]

    # Add a trailing channel axis ...
    image = tf.expand_dims(image, -1)

    # ... and replicate it, since most pre-trained classifiers expect 3 colour channels
    image = tf.repeat(image, config['MODEL_INPUT_IMAGE_CHANNELS'], axis=2)

    # Fail early if the spectrogram does not have the expected shape
    image = tf.ensure_shape(image, [expected_clip_samples, config['AUDIO_MELS'], config['MODEL_INPUT_IMAGE_CHANNELS']])

    # A high quality LANCZOS5 filter is applied to resize the image to the model input size.
    # NOTE: tf.image.resize takes size as (height, width); the order used here
    # is only harmless while width and height are equal (both 260 in the
    # module-level config).
    image = tf.image.resize(image, (config['MODEL_INPUT_IMAGE_WIDTH'], config['MODEL_INPUT_IMAGE_HEIGHT']),
                            method=tf.image.ResizeMethod.LANCZOS5)

    # Rescale to range [0, 1]; the small epsilon avoids division by zero for a constant image
    image = image - tf.reduce_min(image)
    image = image / (tf.reduce_max(image) + 0.0000001)

    return image, audio_clip, sample_rate



 ########################################################################################
    # Function to predict class and probability given a prediction
    ########################################################################################
def predict_class(predictions):
    """Return the most likely class name and its softmax probability in percent.

    NOTE: this file defines ``predict_class`` a second time further down; the
    later definition is the one bound to the name once the whole cell has run.

    Args:
        predictions: Model output for a single clip, either a 1-D vector of
            per-class scores or a batch of one with shape (1, n_classes).

    Returns:
        A tuple ``(predicted_class, predicted_probability)`` where the
        probability is a percentage rounded to 2 decimal places.
    """
    # Flatten a (1, n_classes) batch so the class index is valid for both the
    # argmax and the softmax lookup below.
    scores = tf.squeeze(predictions)

    # Index of the class with the highest score
    predicted_index = int(tf.argmax(scores).numpy())

    # This is a module-level function, so the label list is the module-level
    # `class_names` (there is no `self` here).
    predicted_class = class_names[predicted_index]

    # Softmax probability of the selected class, as a percentage
    predicted_probability = 100.0 * tf.nn.softmax(scores)[predicted_index].numpy()
    predicted_probability = round(predicted_probability, 2)
    return predicted_class, predicted_probability

# Encode binary audio data as base64 text.
def audio_to_string(audio_binary):
    """Return the bytes in *audio_binary* as a base64-encoded ``str``."""
    return base64.b64encode(audio_binary).decode('utf-8')


########################################################################################
# Decode a base64 text string back into binary audio data.
########################################################################################
def string_to_audio(audio_string):
    """Return the binary data encoded by the base64 string *audio_string*."""
    encoded = audio_string.encode('utf-8')
    return base64.decodebytes(encoded)
    
def predict_class(predictions):
    """Pick the top-scoring class from a batch-of-one prediction.

    Args:
        predictions: Model output indexed as ``[0, class_index]``.

    Returns:
        A tuple of the class name and its softmax probability expressed as a
        percentage rounded to 2 decimal places.
    """
    best = int(tf.argmax(tf.squeeze(predictions)).numpy())
    label = class_names[best]
    percent = 100.0 * tf.nn.softmax(predictions)[0, best].numpy()
    return label, round(percent, 2)






def load_random_subsection(sample, duration_secs, sample_rate=None):
    """Trim or zero-pad *sample* to exactly ``duration_secs`` seconds.

    Despite the name, no random offset is chosen: the clip is cut from the
    start, or padded with trailing zeros when it is too short.

    Args:
        sample: 1-D array of audio samples.
        duration_secs: Desired clip length in seconds.
        sample_rate: Samples per second. Defaults to the module-level
            ``config['AUDIO_SAMPLE_RATE']`` when omitted.

    Returns:
        A numpy array with ``int(duration_secs * sample_rate)`` samples and
        the same dtype as the input. A numpy array is returned on both paths
        so the result can be passed straight to librosa, which rejects
        TensorFlow tensors.
    """
    if sample_rate is None:
        sample_rate = config['AUDIO_SAMPLE_RATE']

    sample = np.asarray(sample)

    # int() keeps the slice bound valid when duration_secs is a float
    target_samples = int(duration_secs * sample_rate)
    missing = target_samples - sample.shape[0]

    # Long enough already: keep the leading target_samples samples
    if missing <= 0:
        return sample[:target_samples]

    # Too short: append silence up to the target length
    silence = np.zeros(missing, dtype=sample.dtype)
    return np.concatenate([sample, silence], axis=0)







def predict_on_audio(audio_binary):
    """Classify encoded audio bytes with the custom model.

    Args:
        audio_binary: Encoded audio file contents, forwarded to
            ``combined_pipeline`` together with the module-level ``config``.

    Returns:
        A list holding the two highest-scoring ``(class_name, score)`` pairs,
        best first. Scores are the raw values returned by ``model.predict``.
    """
    # Only the spectrogram image is needed; the clip and sample rate are ignored.
    spectrogram, _clip, _rate = combined_pipeline(config, audio_binary)

    # The model expects a batch axis in front.
    batch = tf.expand_dims(spectrogram, 0)

    # The first row of the output is the score vector for our single clip.
    scores = model.predict(batch)[0]

    # Rank the labels by score, highest first, and keep the top two.
    ranked = sorted(zip(class_names, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:2]





