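The function takes the directory path of an audio file and the file's name, and predicts the presence of 11 musical instruments in that file. It does so by calculating MFCC features on 3-second segments of the audio, passing them to one pre-trained model per instrument, and comparing each model's predictions to the predefined per-instrument thresholds stored in the dictionary `bound_instruments`.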

In [14]:
def _window_offsets(duration_seconds, window_seconds):
    """Return the start times (in seconds) of the analysis windows of a clip.

    Windows are laid out back to back from 0; the final window is anchored to
    the end of the clip (``duration_seconds - window_seconds``), so it may
    overlap the previous window but always spans the full window length.
    """
    import math

    n_windows = math.ceil(duration_seconds / window_seconds)
    offsets = [index * window_seconds for index in range(n_windows - 1)]
    offsets.append(duration_seconds - window_seconds)
    return offsets


def audio_predict(path, wav_file, model_dir=None):
    """Predict which of 11 musical instruments are present in an audio file.

    The clip is cut into 3-second windows, MFCC features are computed once per
    window, and each per-instrument Keras model scores all windows in a single
    batch. An instrument is reported as present when at least half of the
    windows score at or above that instrument's threshold (ties count as
    present).

    Parameters
    ----------
    path : str
        Directory that contains the audio file.
    wav_file : str
        File name of the audio file inside ``path``.
    model_dir : str, optional
        Directory that contains the ``.h5`` model files. When omitted, the
        model file names are passed to Keras as-is, i.e. they are resolved
        relative to the current working directory.

    Returns
    -------
    dict
        Maps each instrument code ("cel", "pia", "tru", "vio", "voi", "flu",
        "org", "sax", "gel", "gac", "cla") to 1 (present) or 0 (absent).

    Raises
    ------
    ValueError
        If the clip is shorter than one 3-second window, if a window does not
        yield the MFCC shape fed to the models (20 x 130), or if a model does
        not return exactly one score per window.
    """
    import os

    import numpy as np
    import librosa
    from tensorflow import keras

    # Carried over unchanged from the original implementation: Intel OpenMP
    # switch that tolerates a duplicate OpenMP runtime instead of aborting.
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

    window_seconds = 3.0
    n_coef = 20
    hop_length = 1024
    n_frames = 130  # MFCC time frames per window in the tensor given to the models

    # The two groups differ only in the naming scheme of their model files.
    instruments = ["cel", "pia", "tru", "vio", "voi"]
    instruments_validation = ["flu", "org", "sax", "gel", "gac", "cla"]
    model_files = [(name, name + "best_model.h5") for name in instruments]
    model_files += [
        (name, name + "_best_model_vali_test_AUGMENTATED_FINAL_test2.h5")
        for name in instruments_validation
    ]

    # Per-instrument decision thresholds applied to the model scores.
    bound_instruments = {
        "cel": 0.9,
        "cla": 0.7,
        "gac": 0.35,
        "pia": 0.3,
        "tru": 0.6,
        "vio": 0.5,
        "voi": 0.25,
        "flu": 0.7,
        "org": 0.8,
        "gel": 0.5,
        "sax": 0.5,
    }

    file_path = os.path.join(path, wav_file)

    try:
        duration_seconds = librosa.get_duration(path=file_path)
    except TypeError:
        # Older librosa releases only accept the ``filename`` keyword.
        duration_seconds = librosa.get_duration(filename=file_path)

    if duration_seconds < window_seconds:
        # The end-anchored last window would otherwise start at a negative offset.
        raise ValueError(
            "audio file %r lasts %.3f s; at least %.1f s are required"
            % (file_path, duration_seconds, window_seconds)
        )

    offsets = _window_offsets(duration_seconds, window_seconds)

    # The features do not depend on the model, so compute them once per window
    # instead of once per (model, window) pair.
    window_features = []
    for offset in offsets:
        y, sr = librosa.load(
            file_path, mono=True, sr=None, offset=offset, duration=window_seconds
        )
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_coef, hop_length=hop_length)
        if mfcc.shape != (n_coef, n_frames):
            raise ValueError(
                "window at %.3f s produced MFCC shape %r, expected %r"
                % (offset, mfcc.shape, (n_coef, n_frames))
            )
        window_features.append(mfcc)

    # Shape (n_windows, n_coef, n_frames, 1): one sample per window.
    batch = np.stack(window_features)[..., np.newaxis]
    n_intervals = len(offsets)

    dict_instruments = {}

    for instrument_of_interest, model_name in model_files:

        print(instrument_of_interest)

        if model_dir is not None:
            model_name = os.path.join(model_dir, model_name)

        model_temp = keras.models.load_model(model_name)

        # One batched call per model; flatten to one float64 score per window.
        pred_labels_temp = np.asarray(
            model_temp.predict(batch), dtype=np.float64
        ).reshape(-1)
        if pred_labels_temp.size != n_intervals:
            raise ValueError(
                "model %r returned %d scores for %d windows"
                % (model_name, pred_labels_temp.size, n_intervals)
            )

        print(pred_labels_temp)

        boundary = bound_instruments[instrument_of_interest]

        # Majority vote over the windows; a tie counts as "present".
        ones = int(np.count_nonzero(pred_labels_temp >= boundary))
        zeros = n_intervals - ones
        dict_instruments[instrument_of_interest] = 1 if ones >= zeros else 0

    return dict_instruments

Example of function usage 

In [45]:
# audio_predict takes the directory and the file name as two separate arguments.
# NOTE(review): the file name was missing from this call; "track_15.wav" is the
# file loaded in the playback cell of this notebook - confirm it is the intended clip.
dictionary = audio_predict(r"path", "track_15.wav")

cel
[0.11985229 0.23253794 0.10250711]
pia
[0.02992112 0.01351372 0.02340058]
tru
[0.06106376 0.37360007 0.30046079]
vio
[0.36096177 0.07457328 0.15797785]
voi
[0.88162661 0.25828743 0.03875243]
flu
[6.33203263e-06 2.95645499e-04 7.42302200e-06]
org
[6.18082995e-05 8.02815266e-05 7.70691186e-05]
sax
[1.16704927e-04 7.51517321e-11 2.79153989e-09]
gel
[0.9978444  0.19593173 0.07699127]
gac
[7.58910730e-08 7.89115147e-05 3.91848971e-07]
cla
[8.94495097e-05 2.47120298e-03 4.05156839e-04]


In [46]:
# Show the per-instrument presence flags (1 = present, 0 = absent).
print(dictionary)

{'cel': 0, 'pia': 0, 'tru': 0, 'vio': 0, 'voi': 1, 'flu': 0, 'org': 0, 'sax': 0, 'gel': 0, 'gac': 0, 'cla': 0}


In [44]:
import librosa as lr
import librosa.display
from IPython.display import Audio

# Decode the clip as mono at its native sampling rate (sr=None), then embed
# an audio player for it as the cell's output.
y, sr = librosa.load("track_15.wav", sr=None, mono=True)
Audio(y, rate=sr)