#### This notebook walks you through a demo of audio melody extraction. The deep BiLSTM model that was saved at the end of the deep-learning experiments notebook is loaded here and used to extract the melody. Run the whole notebook from the beginning to see the demo.

#### I created this notebook on Google Colab. If you run into any problem running it, please upload it to Google Colab together with the necessary files (the LibFMP library, the saved model, and data files such as the audio clips); it will work there.

In [0]:
## install the packages this demo needs; if you already have these packages, don't run this block
## llvmlite is pinned to 0.31.0 and installed (and imported) before librosa
## NOTE(review): presumably the pin avoids a version conflict in librosa's dependencies -- TODO confirm
!pip install llvmlite==0.31.0
import llvmlite
## librosa loads the audio; soundfile is installed alongside it and imported in the next cell
!pip install librosa
!pip install soundfile

Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1


In [0]:
import numpy as np
import librosa
import soundfile

In [0]:
import pickle
## load the saved model
## NOTE: pickle.load can execute arbitrary code while unpickling -- only load a
## model file that you created yourself or that comes from a source you fully trust
## the file is opened in a `with` block so the handle is closed right after loading
with open('deep_bilstm_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

Using TensorFlow backend.


In [0]:
import importlib
## type the package name, e.g. LibFMP
## surrounding spaces and quotes are stripped, so 'LibFMP' or "LibFMP" also work
moduleName = input('Enter name:').strip().strip('\'"').strip()
## import the package by name; the module object is the last expression, so it is shown as the cell output
importlib.import_module(moduleName)

Enter name:LibFMP


<module 'LibFMP' (namespace)>

In [0]:
from LibFMP.C8.S2_Salience import compute_salience_rep as salience
## to compute salience representation 
from LibFMP.C8.S2_F0 import sonify_trajectory_with_sinusoid as sonify
## to convert predicted melody trajectory into audible sound

In [0]:
def return_melody(wavfile, model):
    """Predict the melody (one frequency per frame) of an audio file.

    The audio is loaded with librosa, turned into a salience representation,
    cut into fixed-length chunks, classified frame by frame with ``model``,
    and the predicted class indices are converted back to frequencies.

    Parameters
    ----------
    wavfile : str
        Path of the audio file, passed unchanged to ``librosa.load``.
    model : object
        Trained model exposing ``predict(X)``. It receives an array of shape
        (chunks, 200, frequency_bins) and must return per-frame class scores
        of shape (chunks, 200, classes).

    Returns
    -------
    numpy.ndarray
        1-D float array with one predicted frequency in hertz per salience
        frame (empty if the salience representation has no frames).
    """
    chunk_size = 200          ## timesteps per chunk. model trained that way
    resolution_cents = 10     ## width of one frequency bin / one class, in cents
    f_min_hertz = 8.175799    ## reference frequency, i.e. 0 cents

    x, sr = librosa.load(wavfile)
    ## N=882=window, H=441=hop, same as MIR1K specifications
    ## only the salience matrix Z (frequency bins x timesteps) is needed here
    Z, _, _ = salience(x, sr, N=882, H=441, R=resolution_cents, gamma=0,
                       F_min=f_min_hertz, F_max=1760)

    num_bins, timesteps = Z.shape
    if timesteps == 0:
        ## nothing to predict; avoids handing the model an empty, shapeless batch
        return np.empty(0)

    ## zero-pad the time axis up to a whole number of chunks,
    ## e.g. 580 timesteps -> 600 timesteps -> 3 chunks
    chunks = -(-timesteps // chunk_size)  ## ceil(timesteps / chunk_size)
    Z_padded = np.zeros((num_bins, chunks * chunk_size))
    Z_padded[:, :timesteps] = Z

    ## split the time axis into chunks: (bins, chunks*200) -> (bins, chunks, 200)
    ## then reorder to what the RNN expects: (samples, timesteps, features)
    ## with F_min = 8.175799 hz (0 cents), F_max = 1760 hz (9300 cents) and
    ## 10 cent bins there are 931 features, e.g. 3 chunks -> (3, 200, 931)
    X_reshaped = np.ascontiguousarray(
        Z_padded.reshape(num_bins, chunks, chunk_size).transpose(1, 2, 0)
    )

    y_pred = model.predict(X_reshaped)  ## the softmax probabilities

    ## most probable class per timestep: (chunks, 200, classes) -> (chunks, 200)
    ## then chain the chunks in time order and drop the padded timesteps
    y_labels = np.argmax(y_pred, axis=-1)
    melody_pred_aligned = y_labels.reshape(-1)[:timesteps].astype(float)

    ## class index -> cents above F_min (back to normal resolution)
    prediction = melody_pred_aligned * resolution_cents

    ## conversion of cents to hertz (F_min is the reference, same value as given to salience())
    ## NOTE(review): class 0 therefore maps to F_min itself; whether the model uses
    ## class 0 to mean "no melody" cannot be told from this notebook -- confirm
    prediction_hertz = f_min_hertz * np.exp(np.log(2) * prediction / 1200)

    return prediction_hertz

In [0]:
audiofile = 'mir1k_16wavfiles/abjones_1_01.wav'  
## audio file used for the demo; you can put whatever audio you want here
## the path is relative, so the file (or its folder, as here) presumably has to sit
## next to this notebook / in the working directory -- verify for your setup

In [0]:
## original audio: samples and sampling rate, needed for playback and for the sonification length
x, sr = librosa.load(audiofile)

## one predicted frequency (hertz) per frame from the trained model
predicted_melody = return_melody(audiofile, loaded_model)

## trajectory with two columns: time stamp in seconds, frequency in hertz
## frames are H(hop) = 441 samples apart = 0.02 sec, since sr = 22050
traj = np.column_stack((0.02 * np.arange(len(predicted_melody)), predicted_melody))

## turn the trajectory into an audio signal with as many samples as the original
x_soni = sonify(traj, len(x))

In [0]:
## hear the original audiofile
## x and sr are the samples and sampling rate returned by librosa.load(audiofile)

import IPython.display
## the Audio object is the last expression of the cell, so it is shown as the cell output
IPython.display.Audio(data=x, rate=sr)

In [0]:
## hear the melody predicted by trained deep bi-lstm model
## x_soni is the sonified predicted trajectory; it is played at the same rate sr as the original
## relies on `import IPython.display` having been run in the previous cell

IPython.display.Audio(data=x_soni, rate=sr)