In [9]:
import os
import tensorflow as tf
from tensorflow import keras

%matplotlib inline
from IPython.display import Audio
from ipywidgets import interactive

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
import random, time, os
from sklearn.model_selection import train_test_split

from scipy.io import wavfile
import librosa
import tqdm
from tqdm import tqdm

sns.set()

# Audio / framing defaults.
# NOTE(review): units are not stated anywhere in the notebook — presumably
# Hz, seconds and seconds respectively; confirm before relying on them.
SAMPLE_RATE = 16000
T_MAX = 5
DT = 0.02

# Vocabulary of the classifier: maps each spoken word to its integer class id.
# The id of a word is simply its position in the ordered tuple below, so the
# order of this tuple must not be changed.
words = {
    label: class_id
    for class_id, label in enumerate((
        'right', 'five', 'zero', 'cat', 'yes',
        'six', 'down', 'house', 'sheila', 'three',
        'off', 'left', 'bed', 'happy', 'eight',
        'bird', 'nine', 'tree', 'one', 'no',
        'go', 'on', 'stop', 'seven', 'dog',
        'four', 'wow', 'up', 'two', 'marvin',
    ))
}

In [3]:
def load_audio(audio_path):
    """Read an audio file with librosa, without resampling.

    Parameters
    ----------
    audio_path : path of the audio file, handed unchanged to ``librosa.load``.

    Returns
    -------
    tuple
        ``(samples, sampling_rate)`` exactly as produced by ``librosa.load``.
        ``sr=None`` is passed so librosa keeps the file's own sampling rate
        instead of resampling to its default.
    """
    samples, native_rate = librosa.load(audio_path, sr=None)
    return samples, native_rate


def displaylabel(prediction):
    """Translate a predicted class id back into its word.

    Scans the module-level ``words`` mapping (word -> class id) and returns
    the first word whose id equals ``prediction``.

    Raises
    ------
    IndexError
        If no entry of ``words`` has ``prediction`` as its id.
    """
    matching_labels = []
    for label, class_id in words.items():
        if class_id == prediction:
            matching_labels.append(label)
    # Indexing (rather than a default) keeps the IndexError on unknown ids.
    return matching_labels[0]

In [4]:
def logMelSpectrogram(audio, fe, dt):
    """Compute log-scaled, mel-weighted STFT magnitudes of a signal.

    Parameters
    ----------
    audio : signal passed to ``librosa.stft``.
    fe : sampling rate of ``audio``; forwarded as ``sr`` to the mel filter bank.
    dt : frame duration; ``int(dt * fe)`` samples are used both as the FFT
        size and as the hop length, so consecutive frames do not overlap.

    Returns
    -------
    numpy.ndarray
        ``log(|STFT|.T . mel.T + 1e-6)``. The last axis has as many entries as
        the STFT has frequency bins, because that count is also used as the
        number of mel bands.
    """
    frame_len = int(dt * fe)

    # Magnitude of the STFT, transposed before any further use.
    magnitudes = np.abs(
        librosa.stft(audio, n_fft=frame_len, hop_length=frame_len, center=True)
    ).T
    n_bins = magnitudes.shape[-1]

    # Mel filter bank with one band per STFT bin, transposed so it can be
    # contracted with the last axis of `magnitudes`.
    # NOTE(review): the bank is built for n_fft = frame_len + 1 while the STFT
    # uses frame_len — kept as-is because the saved model presumably depends on
    # these exact features; confirm it is intentional. Asking for this many mel
    # bands is presumably what triggers the "Empty filters detected" warning
    # shown in the notebook output.
    mel_basis = librosa.filters.mel(sr=fe, n_fft=frame_len + 1, n_mels=n_bins).T

    # Project the magnitudes onto the mel bands (contract one axis).
    projected = np.tensordot(magnitudes, mel_basis, 1)

    # The small offset keeps the logarithm finite where the projection is zero.
    return np.log(projected + 1e-6)


# Chargement du modèle

In [12]:
# Restore the saved Keras model stored at this .h5 path and bind it to `model2`.
model2 = tf.keras.models.load_model('model/g-sw/g_singleword.h5')

In [13]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
# of the loaded model.
model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 93, 64)            51584     
_________________________________________________________________
dropout (Dropout)            (None, 93, 64)            0         
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 93, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 85, 128)           41088     
_________________________________________________________________
dropout_1 (Dropout)          (None, 85, 128)           0         
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 85, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 77, 256)           1

# Chargement des poids

In [14]:
# Load weights from a separate checkpoint into the already-loaded model.
# NOTE(review): `load_model` presumably restored weights from the .h5 file
# already — confirm this checkpoint is meant to override them.
model2.load_weights('model/g-sw/g_singleword_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f87cb0bde90>

# Prédiction

In [15]:
# Load one recorded example together with its sampling rate.
dataT, feT = load_audio('datasets/g-sw/marvin/a879a2c3_nohash_0.wav')
dt = 0.02
T_max = 2

# Force the signal to exactly T_max seconds' worth of samples so that the
# spectrogram computed below always has the same shape.
target_len = T_max * feT
if len(dataT) < target_len:
    # Shorter than T_max: append zeros (silence) up to the target length.
    padding = np.zeros(int(target_len - len(dataT)))
    dataT = np.concatenate([dataT, padding])
else:
    # At least T_max long: keep only the first T_max seconds.
    dataT = dataT[:int(target_len)]

# Log-mel features of the fixed-length signal.
spectre_audio = logMelSpectrogram(dataT, feT, dt)

  "Empty filters detected in mel frequency basis. "


In [16]:
# Embed an audio player for the fixed-length signal so it can be checked by ear.
Audio(dataT, rate = feT)

In [19]:
# Add a leading batch axis: the array becomes (1, 101, 161).
# NOTE(review): 101 x 161 is hard-coded — presumably the spectrogram shape
# obtained for T_max = 2 and dt = 0.02 at a 16 kHz sampling rate; any other
# input size makes this reshape fail. Confirm against the model's input shape.
spectre_audio = spectre_audio.reshape(1,101,161)

In [21]:
# Run the loaded network on the single-example batch and take the index of the
# highest score as the predicted class id.
# The model loaded in this notebook is bound to `model2`; the previous
# `model.predict(...)` referred to a name that is never defined here and
# raised NameError on a fresh kernel (Restart & Run All).
prediction = model2.predict(spectre_audio).argmax()

# Last expression of the cell: display the word matching the predicted id.
displaylabel(prediction)

'marvin'