In [2]:
import numpy as np
import tensorflow as tf
import librosa
import base64
import io



# Make sure to use the correct configuration
config = {
    'AUDIO_SAMPLE_RATE': 48000,
    'AUDIO_CLIP_DURATION': 5,
    'AUDIO_NFFT': 2048,
    'AUDIO_STRIDE': 200,
    'AUDIO_MELS': 260,
    'AUDIO_FMIN': 20,
    'AUDIO_FMAX': 13000,
    'AUDIO_WINDOW': None,
    'AUDIO_TOP_DB': 80,
    'MODEL_INPUT_IMAGE_CHANNELS': 3,
    'MODEL_INPUT_IMAGE_WIDTH': 260,
    'MODEL_INPUT_IMAGE_HEIGHT': 260
}


class_names= ['Aegotheles cristatus owlet-nightjar', 'Alauda arvensis European Skylark', 'Caligavis chrysops Yellow-faced honeyeater', 'Capra hircus Feral goat', 'Cervus unicolour Sambar deer', 'Colluricincla harmonica Grey shrikethrush', 'Corvus coronoides Australian raven',
              'Dama dama Fallow Deer', 'Eopsaltria australis Eastern yellow robin', 'Felis Catus Cat', 'Pachycephala rufiventris Rufous whistler', 'Ptilotula penicillata White-plumed honeyeater', 'Rattus norvegicus Brown rat', 'Strepera graculina Pied currawong', 'sus scrofa Wild pig']

# Load the model
model = tf.keras.models.load_model('models/echo_model/1/')

# Define the preprocessing steps as functions.



#####################################################################################
    # this function is adapted from generic_engine_pipeline.ipynb
    # TODO: need to create a pipeline library and link same code into engine
    ########################################################################################
def combined_pipeline(config, audio_clip):

    # Load the audio data with librosa(works only while give direct audio to it)
    #audio_clip, sample_rate = librosa.load(audio_clip, sr=config['AUDIO_SAMPLE_RATE'])
    
    #to use it with yamnet
    file = io.BytesIO(audio_clip)
    audio_clip, sample_rate = librosa.load(file, sr=config['AUDIO_SAMPLE_RATE'])
        
    # keep right channel only
    if audio_clip.ndim == 2 and audio_clip.shape[0] == 2:
        audio_clip = audio_clip[1, :]
        
    # cast to float32 type
    audio_clip = audio_clip.astype(np.float32)
        
    # analyse a random 5 second subsection
    audio_clip = load_random_subsection(audio_clip, duration_secs=config['AUDIO_CLIP_DURATION'])

    # Compute the mel-spectrogram
    image = librosa.feature.melspectrogram(
        y=audio_clip, 
        sr=config['AUDIO_SAMPLE_RATE'], 
        n_fft=config['AUDIO_NFFT'], 
        hop_length=config['AUDIO_STRIDE'], 
        n_mels=config['AUDIO_MELS'],
        fmin=config['AUDIO_FMIN'],
        fmax=config['AUDIO_FMAX'],
        win_length=config['AUDIO_WINDOW'])

    # Optionally convert the mel-spectrogram to decibel scale
    image = librosa.power_to_db(
        image, 
        top_db=config['AUDIO_TOP_DB'], 
        ref=1.0)
        
    # Calculate the expected number of samples in a clip
    expected_clip_samples = int(config['AUDIO_CLIP_DURATION'] * config['AUDIO_SAMPLE_RATE'] / config['AUDIO_STRIDE'])
        
    # swap axis and clip to expected samples to avoid rounding errors
    image = np.moveaxis(image, 1, 0)
    image = image[0:expected_clip_samples,:]
        
    # reshape into standard 3 channels to add the color channel
    image = tf.expand_dims(image, -1)
        
    # most pre-trained model classifer model expects 3 color channels
    image = tf.repeat(image, config['MODEL_INPUT_IMAGE_CHANNELS'], axis=2)
        
    # calculate the image shape and ensure it is correct
    expected_clip_samples = int(config['AUDIO_CLIP_DURATION'] * config['AUDIO_SAMPLE_RATE'] / config['AUDIO_STRIDE'])
    image = tf.ensure_shape(image, [expected_clip_samples, config['AUDIO_MELS'], config['MODEL_INPUT_IMAGE_CHANNELS']])
        
    # note here a high quality LANCZOS5 is applied to resize the image to match model image input size
    image = tf.image.resize(image, (config['MODEL_INPUT_IMAGE_WIDTH'], config['MODEL_INPUT_IMAGE_HEIGHT']), 
                            method=tf.image.ResizeMethod.LANCZOS5)


    # rescale to range [0,1]
    image = image - tf.reduce_min(image) 
    image = image / (tf.reduce_max(image)+0.0000001)
        
    return image, audio_clip, sample_rate



 ########################################################################################
    # Function to predict class and probability given a prediction
    ########################################################################################
def predict_class( predictions):
    # Get the index of the class with the highest predicted probability
    predicted_index = int(tf.argmax(tf.squeeze(predictions)).numpy())
    print(predicted_index, type(predicted_index))

    # Get the class name using the predicted index
    predicted_class = self.class_names[predicted_index]
    # Calculate the predicted probability for the selected class
    predicted_probability = 100.0 * tf.nn.softmax(predictions)[predicted_index].numpy()
    # Round the probability to 2 decimal places
    predicted_probability = round(predicted_probability, 2)
    return predicted_class, predicted_probability

# this method takes in binary audio data and encodes to string
def audio_to_string( audio_binary):
    base64_encoded_data = base64.b64encode(audio_binary)
    base64_message = base64_encoded_data.decode('utf-8')
    return base64_message    


########################################################################################
    # this method takes in string and ecodes to audio binary data
    ########################################################################################
def string_to_audio( audio_string):
    base64_img_bytes = audio_string.encode('utf-8')
    decoded_data = base64.decodebytes(base64_img_bytes)
    return decoded_data
    
def predict_class(predictions):
    predicted_index = int(tf.argmax(tf.squeeze(predictions)).numpy())
    predicted_class = class_names[predicted_index]
    predicted_probability = 100.0 * tf.nn.softmax(predictions)[0, predicted_index].numpy()
    predicted_probability = round(predicted_probability, 2)
    return predicted_class, predicted_probability



def load_random_subsection(audio_clip, duration_secs):
    clip_length = len(audio_clip)
    subsection_length = duration_secs * config['AUDIO_SAMPLE_RATE']
    
    if clip_length > subsection_length:
        start_idx = np.random.randint(0, clip_length - subsection_length)
        return audio_clip[start_idx:start_idx+subsection_length]
    elif clip_length < subsection_length:
        padding = np.zeros(int(subsection_length - clip_length))
        return np.concatenate((audio_clip, padding))
    else:
        return audio_clip











def predict_on_audio(audio_binary):
    # Preprocess the audio to be suitable for your model
    image, audio_clip, sample_rate = combined_pipeline(config, audio_binary)
    
    # Add a dimension to match the model's input shape
    image = tf.expand_dims(image, 0)
    
    # Make the prediction
    predictions_array = model.predict(image)[0]  # Assuming the model returns 2D array, take the first element
    
    # Pair the class names with the predictions
    paired_predictions = list(zip(class_names, predictions_array))
    
    # Sort the paired predictions based on probability
    sorted_predictions = sorted(paired_predictions, key=lambda x: x[1], reverse=True)
    
    return sorted_predictions[:3]







In [4]:
pip install numpy==1.24


Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import pandas as pd
import numpy as np
import soundfile as sf
import yamnet.params as params
import yamnet.yamnet as yamnet_model
import librosa
import tempfile

# Load YAMNet model
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet/yamnet.h5')
yamnet_classes = yamnet_model.class_names('yamnet/yamnet_class_map.csv')

frame_len = int(params.SAMPLE_RATE * 1)  # 1sec

# Read the whole audio file
filename = 'test.m4a'
data, sr = librosa.load(filename, sr=params.SAMPLE_RATE)

# Split the audio data into 1 second chunks
chunks = np.array_split(data, len(data) // frame_len)

df_rows = []

for cnt, frame_data in enumerate(chunks):
    # Get YAMNet predictions
    scores, _ = yamnet.predict(np.reshape(frame_data, [1, -1]), steps=1)
    yamnet_prediction = np.mean(scores, axis=0)
    top5_i = np.argsort(yamnet_prediction)[::-1][:5]
    
    # Basic structure for dataframe row
    df_row = {
        'start_time': cnt,
        'end_time': cnt + 1,
        'yamnet_label_1': yamnet_classes[top5_i[0]],
        'yamnet_probability_1': yamnet_prediction[top5_i[0]],
        'yamnet_label_2': yamnet_classes[top5_i[1]],
        'yamnet_probability_2': yamnet_prediction[top5_i[1]],
        'yamnet_label_3': yamnet_classes[top5_i[2]],
        'yamnet_probability_3': yamnet_prediction[top5_i[2]],
        'your_model_label_1': None,
        'your_model_probability_1': None,
        'your_model_label_2': None,
        'your_model_probability_2': None,
        'your_model_label_3': None,
        'your_model_probability_3': None
    }

    # Check if the YAMNet classification triggers the other model
    if (yamnet_classes[top5_i[0]] in ['Animal', 'Bird'] and yamnet_prediction[top5_i[0]] > 0.2) or (yamnet_classes[top5_i[1]] in ['Animal', 'Bird'] and yamnet_prediction[top5_i[1]] > 0.2):

        # Extract segment data for your model
        segment_data = data[cnt*frame_len : (cnt+1)*frame_len]

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
            sf.write(temp_audio_file.name, segment_data, params.SAMPLE_RATE)
            with open(temp_audio_file.name, 'rb') as binary_file:
                top3_predictions = predict_on_audio(binary_file.read())

        for i, pred in enumerate(top3_predictions):
            df_row[f'your_model_label_{i+1}'] = pred[0] if len(pred) > 0 else None
            df_row[f'your_model_probability_{i+1}'] = pred[1] if len(pred) > 1 else None

    df_rows.append(df_row)

df = pd.DataFrame(df_rows)
print(df)


  data, sr = librosa.load(filename, sr=params.SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
2023-09-06 17:50:44.082292: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-09-06 17:50:44.529288: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


    start_time  end_time                            yamnet_label_1  \
0            0         1                                   Silence   
1            1         2                                    Speech   
2            2         3                                    Speech   
3            3         4                                    Speech   
4            4         5                                    Speech   
5            5         6                                    Animal   
6            6         7                                      Crow   
7            7         8                                       Cat   
8            8         9                               Alarm clock   
9            9        10                                    Buzzer   
10          10        11                                    Rumble   
11          11        12                       Rodents, rats, mice   
12          12        13                                    Typing   
13          13      

In [6]:
df

Unnamed: 0,start_time,end_time,yamnet_label_1,yamnet_probability_1,yamnet_label_2,yamnet_probability_2,yamnet_label_3,yamnet_probability_3,your_model_label_1,your_model_probability_1,your_model_label_2,your_model_probability_2,your_model_label_3,your_model_probability_3
0,0,1,Silence,1.0,Speech,8.8e-05,Music,1e-06,,,,,,
1,1,2,Speech,0.961608,"Inside, small room",0.050261,Telephone,0.002443,,,,,,
2,2,3,Speech,0.988999,Typing,0.027424,"Inside, small room",0.019422,,,,,,
3,3,4,Speech,0.9547,"Inside, small room",0.014928,Tools,0.011103,,,,,,
4,4,5,Speech,0.948507,Clicking,0.030073,"Inside, small room",0.008835,,,,,,
5,5,6,Animal,0.968866,"Livestock, farm animals, working animals",0.939845,Fowl,0.924176,Felis Catus Cat,24.068342,Cervus unicolour Sambar deer,9.757881,Capra hircus Feral goat,6.327871
6,6,7,Crow,0.170514,Animal,0.118891,Speech,0.07944,,,,,,
7,7,8,Cat,0.589326,Meow,0.475042,"Domestic animals, pets",0.396556,,,,,,
8,8,9,Alarm clock,0.373726,"Inside, small room",0.236411,"Telephone dialing, DTMF",0.224682,,,,,,
9,9,10,Buzzer,0.319221,"Beep, bleep",0.264754,Car alarm,0.226553,,,,,,
