In [20]:
import os
import pickle
import tempfile

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import tensorflow_hub as hub
from tensorflow.keras.models import load_model

import yamnet.params as params
import yamnet.yamnet as yamnet_model

# Load the label metadata and the models that the functions below read as
# module-level globals.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load these .pkl files if they come from a trusted source.
with open('yamnet/class_names.pkl', 'rb') as f:
    class_names = pickle.load(f)

# NOTE(review): presumably a fitted label encoder matching the classifier's
# output classes — confirm against the training code.
with open('yamnet/label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Local YAMNet frames model (built from the vendored `yamnet` package) with
# weights read from disk, plus the class-name table for its outputs.
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet/yamnet.h5')
yamnet_classes = yamnet_model.class_names('yamnet/yamnet_class_map.csv')
# Keras classifier restored from an .h5 checkpoint.
model = load_model('models/echo_model/2/model_2_79.h5')

# Load YAMNet from TensorFlow Hub.
# CAUTION: this rebinds `yamnet_model`. Up to this point the name referred to
# the imported `yamnet.yamnet` module (used just above); from here on it is the
# Hub model object, so the module's helpers are no longer reachable through it.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

def load_audio_file(file_path):
    """Decode an audio file at 16 kHz and return it as a one-item batch.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        A NumPy array whose single leading entry is the decoded waveform.
    """
    # The sample rate returned by librosa is not needed: it is fixed by `sr`.
    waveform, _ = librosa.load(file_path, sr=16000)
    return np.stack([waveform])

def extract_features(model, X):
    """Turn each waveform in ``X`` into one averaged embedding vector.

    Args:
        model: Callable that maps a waveform to a
            ``(scores, embeddings, spectrogram)`` triple, where ``embeddings``
            exposes a ``.numpy()`` method.
        X: Iterable of waveforms.

    Returns:
        A NumPy array holding, for every waveform, the mean of its embeddings
        taken over axis 0.
    """
    def _mean_embedding(waveform):
        # Only the embeddings are used; scores and spectrogram are discarded.
        _scores, frame_embeddings, _spectrogram = model(waveform)
        return frame_embeddings.numpy().mean(axis=0)

    return np.array([_mean_embedding(waveform) for waveform in X])

def predict_on_audio(binary_audio):
    """Classify an in-memory audio clip and return its most probable classes.

    The bytes are written to a temporary ``.wav``-suffixed file so that
    ``load_audio_file`` can decode them, then embedded with the Hub YAMNet
    model (``yamnet_model``) and scored by the Keras classifier (``model``).

    Args:
        binary_audio: Raw bytes of an audio file.

    Returns:
        A list of ``(label, probability)`` tuples ordered from most to least
        probable. It holds two entries, or fewer if the classifier has fewer
        than two outputs. Labels are looked up in ``class_names``.
    """
    # delete=False lets the file be reopened by name on every platform; the
    # handle is closed before decoding and the file is removed afterwards so
    # repeated calls do not accumulate temp files.
    temp_audio_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        with temp_audio_file:
            temp_audio_file.write(binary_audio)
        X_new = load_audio_file(temp_audio_file.name)
    finally:
        os.unlink(temp_audio_file.name)

    X_new_features = extract_features(yamnet_model, X_new)

    # Only one clip is submitted, so row 0 holds its class probabilities.
    probabilities = model.predict(X_new_features)[0]

    # argsort is ascending: keep the last two indices and flip them so the
    # most probable class comes first.
    top_indices = np.argsort(probabilities)[-2:][::-1]

    return [(class_names[idx], probabilities[idx]) for idx in top_indices]

def _segment_to_row(buffer, sr, start_time, end_time):
    """Classify one run of consecutive animal-sound chunks and build its row."""
    # Every buffered chunk has shape (1, n_samples); joining along axis 1 and
    # taking row 0 restores a single 1-D waveform for the whole run.
    segment_data = np.concatenate(buffer, axis=1)[0]

    # predict_on_audio expects file bytes, so the segment is round-tripped
    # through a WAV file. The file is removed afterwards to avoid leaking one
    # temp file per detected segment.
    temp_audio_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        temp_audio_file.close()
        sf.write(temp_audio_file.name, segment_data, sr)
        with open(temp_audio_file.name, 'rb') as binary_file:
            top2_predictions = predict_on_audio(binary_file.read())
    finally:
        os.unlink(temp_audio_file.name)

    df_row = {'start_time': start_time, 'end_time': end_time}
    for i, pred in enumerate(top2_predictions[:2]):
        df_row[f'echonet_label_{i+1}'] = pred[0]
        df_row[f'echonet_confidence_{i+1}'] = pred[1]
    return df_row


def sound_event_detection(filepath):
    """Find animal-sound segments in an audio file and classify each one.

    The recording is decoded at 16 kHz and cut into one-second chunks (the
    final chunk may be shorter). Each chunk is screened with the local YAMNet
    model (``yamnet``): it counts as an animal chunk when either of its two
    top-scoring YAMNet classes is in the animal list. Consecutive animal
    chunks are merged into one segment, which is then passed to
    ``predict_on_audio``.

    Args:
        filepath: Path of the audio file to analyse.

    Returns:
        A ``pandas.DataFrame`` with one row per segment and the columns
        ``start_time``, ``end_time``, ``echonet_label_1``,
        ``echonet_confidence_1``, ``echonet_label_2`` and
        ``echonet_confidence_2``. ``start_time`` is the index of the first
        chunk of the segment and ``end_time`` the index of the first chunk
        after it; with one-second chunks these are seconds from the start of
        the file. The frame is empty, with the same columns, when nothing is
        detected.
    """
    data, sr = librosa.load(filepath, sr=16000)

    frame_len = int(sr * 1)
    # Stepping by frame_len yields every full chunk plus a shorter trailing
    # chunk when the length is not a multiple of frame_len.
    chunks = [data[offset:offset + frame_len] for offset in range(0, len(data), frame_len)]

    # NOTE(review): membership is tested by exact string equality against
    # `yamnet_classes`; confirm every entry's spelling and case against
    # yamnet_class_map.csv (e.g. 'crow' is lower-case unlike the others).
    animal_related_classes = frozenset([
        'Dog', 'Cat', 'Bird', 'Animal', 'Birdsong', 'Canidae', 'Feline', 'Livestock',
        'Rodents, Mice', 'Wild animals', 'Pets', 'Frogs', 'Insect', 'Snake',
        'Domestic animals, pets', 'crow'
    ])

    result_columns = [
        'start_time', 'end_time',
        'echonet_label_1', 'echonet_confidence_1',
        'echonet_label_2', 'echonet_confidence_2',
    ]

    df_rows = []
    buffer = []
    start_time = None
    for cnt, frame_data in enumerate(chunks):
        # The model is called with a (1, n_samples) batch.
        frame_data = np.reshape(frame_data, (1, -1))
        outputs = yamnet(frame_data)
        # Assumes outputs[0] is a (frames, classes) score matrix — TODO confirm.
        frame_scores = np.asarray(outputs[0])

        if frame_scores.shape[0] == 0:
            # A chunk too short to produce any frame would make np.mean emit
            # "mean of empty slice" warnings and rank NaN scores arbitrarily;
            # treat it as containing no animal sound instead.
            is_animal = False
        else:
            yamnet_prediction = np.mean(frame_scores, axis=0)
            top2_i = np.argsort(yamnet_prediction)[::-1][:2]
            is_animal = any(yamnet_classes[idx] in animal_related_classes for idx in top2_i)

        if is_animal:
            if start_time is None:
                start_time = cnt
            buffer.append(frame_data)
        elif start_time is not None:
            # The run of animal chunks just ended: classify and record it.
            df_rows.append(_segment_to_row(buffer, sr, start_time, cnt))
            buffer = []
            start_time = None

    # A run that is still open when the audio ends is flushed here.
    if start_time is not None:
        df_rows.append(_segment_to_row(buffer, sr, start_time, len(chunks)))

    return pd.DataFrame(df_rows, columns=result_columns)


# Run the detector on a sample recording (the path is relative to the
# notebook's working directory) and print the resulting table as plain text.
filename = 'test4.m4a'
df = sound_event_detection(filename)
print(df)


  data, sr = librosa.load(filepath, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
2023-09-13 20:20:41.348009: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-09-13 20:20:41.637401: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


   start_time  end_time     echonet_label_1  echonet_confidence_1  \
0           4         6         Felis Catus              0.669528   
1           8        11         Felis Catus              0.963906   
2          17        18  Strepera Graculina              0.500429   
3          29        30           Dama Dama              0.718673   

      echonet_label_2  echonet_confidence_2  
0  Strepera Graculina              0.170342  
1          Sus_Scrofa              0.029968  
2         Felis Catus              0.174560  
3         Felis Catus              0.191606  


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [21]:
# Show the detection table returned by sound_event_detection in the previous cell.
df

Unnamed: 0,start_time,end_time,echonet_label_1,echonet_confidence_1,echonet_label_2,echonet_confidence_2
0,4,6,Felis Catus,0.669528,Strepera Graculina,0.170342
1,8,11,Felis Catus,0.963906,Sus_Scrofa,0.029968
2,17,18,Strepera Graculina,0.500429,Felis Catus,0.17456
3,29,30,Dama Dama,0.718673,Felis Catus,0.191606
