In [9]:
import os
import tensorflow as tf
from tensorflow import keras

%matplotlib inline
from IPython.display import Audio
from ipywidgets import interactive

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
import random, time, os
from sklearn.model_selection import train_test_split

from scipy.io import wavfile
import librosa
import tqdm
from tqdm import tqdm
import warnings
from pathlib import Path
# Apply seaborn's styling to matplotlib figures (no plots are drawn in the cells shown here).
sns.set()

# --- Audio configuration shared by the preprocessing helpers ---
SAMPLE_RATE = 16000   # target sampling rate, in Hz
T_MAX = 5             # maximum duration of an audio file, in seconds
DT = 0.02             # spectrogram frame duration, in seconds

warnings.filterwarnings('ignore')

params = dict(
    max_audio_length=T_MAX,
    sampling_rate=SAMPLE_RATE,
)

# Word -> class index mapping: each word's position in the tuple is its index.
words = {
    word: index
    for index, word in enumerate((
        'neuf',
        'Hey',
        'oui',
        'Firefox',
        'trois',
        'sept',
        'zéro',
        'non',
        'six',
        'huit',
        'quatre',
        'cinq',
        'un',
        'deux',
    ))
}


In [4]:
def load_audio(audio_path):
    """Read an audio file with librosa without resampling it.

    Parameters
    ----------
    audio_path : str or path-like
        Location of the audio file.

    Returns
    -------
    tuple
        ``(signal, sampling_rate)`` exactly as returned by ``librosa.load``
        called with ``sr=None`` (i.e. the file's own sampling rate is kept).
    """
    signal, native_rate = librosa.load(audio_path, sr=None)
    return signal, native_rate


def displaylabel(prediction):
    """Return the word whose class index equals ``prediction``.

    The lookup scans the module-level ``words`` mapping (word -> index) and
    returns the first word mapped to ``prediction``.

    Raises
    ------
    IndexError
        If no word is mapped to ``prediction``.
    """
    matching_words = []
    for word, class_index in words.items():
        if class_index == prediction:
            matching_words.append(word)
    # Indexing an empty list is what raises IndexError for unknown indices.
    return matching_words[0]

In [5]:
def logMelSpectrogram(audio, fe, dt):
    """Compute the log of a mel-weighted magnitude spectrogram.

    Parameters
    ----------
    audio : np.ndarray
        Audio signal passed to ``librosa.stft``.
    fe : int
        Sampling rate of ``audio``.
    dt : float
        Frame duration in seconds; ``int(dt * fe)`` samples are used both as
        the FFT size and as the hop length, so consecutive frames do not
        overlap.

    Returns
    -------
    np.ndarray
        ``log(mel_spectrogram + 1e-6)``; the first axis follows the STFT
        frames and the last axis the mel bands.
    """
    frame_len = int(dt * fe)

    # Magnitude STFT, transposed so the frequency bins end up on the last axis.
    magnitude = np.abs(
        librosa.stft(
            audio,
            n_fft=frame_len,
            hop_length=frame_len,
            center=True,
        )
    ).T
    n_bins = magnitude.shape[-1]

    # Mel filter bank with as many mel bands as there are frequency bins.
    # NOTE(review): the bank is built with n_fft = frame_len + 1 whereas the
    # STFT uses frame_len. Left untouched because the saved model presumably
    # expects features computed exactly this way — confirm before changing.
    mel_weights = librosa.filters.mel(
        sr=fe,
        n_fft=frame_len + 1,
        n_mels=n_bins,
    ).T

    # Project every frame onto the mel bands.
    mel_spectrogram = np.tensordot(magnitude, mel_weights, 1)

    # The small offset keeps the logarithm finite for empty bands.
    return np.log(mel_spectrogram + 1e-6)

In [6]:
def load_rawdata(X_path, dt=0.02, T_max=2):
    """Load audio files as mono signals resampled to the configured rate.

    Parameters
    ----------
    X_path : sequence of str or path-like
        Paths of the audio files to load.
    dt : float, optional
        Unused; kept so existing calls keep working.
    T_max : int, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    list of np.ndarray
        One signal per file found on disk, in input order, resampled to
        ``params['sampling_rate']``. Paths that do not exist are skipped
        (and reported through ``tqdm.write``), so the result can be shorter
        than ``X_path``.
    """
    target_sr = params['sampling_rate']
    all_data = []
    for audio_path in tqdm(X_path):
        if not Path(audio_path).exists():
            # Report the skip instead of dropping the file silently: the
            # output would otherwise no longer line up with X_path.
            tqdm.write(f"load_rawdata: skipping missing file {audio_path}")
            continue
        # Load at the file's own rate, then resample explicitly.
        signal, native_sr = librosa.load(audio_path, mono=True, sr=None)
        # Rates are passed by keyword: newer librosa releases reject them as
        # positional arguments, while older releases accept both forms.
        signal = librosa.resample(signal, orig_sr=native_sr, target_sr=target_sr)
        all_data.append(signal)
    return all_data


def ProcessRawData(raw_data):
    """Bring every signal to ``max_audio_length * sampling_rate`` samples.

    Signals at least that long are truncated to their first samples; shorter
    ones are right-padded with zeros. Both values are read from the
    module-level ``params`` dict.

    Parameters
    ----------
    raw_data : sequence of np.ndarray
        Audio signals of arbitrary length.

    Returns
    -------
    list of np.ndarray
        The fixed-length signals, in input order.
    """
    n_target = params['max_audio_length'] * params['sampling_rate']
    fixed_length = []
    with tqdm(total=len(raw_data)) as progress:
        for signal in raw_data:
            progress.update(1)
            if len(signal) < n_target:
                # Too short: append zeros up to the target length.
                # np.zeros defaults to float64, so padded signals take the
                # promoted dtype of the concatenation.
                padding = np.zeros(int(n_target - len(signal)))
                signal = np.concatenate([signal, padding])
            else:
                # Long enough: keep only the leading samples.
                signal = signal[:int(n_target)]
            fixed_length.append(signal)
    return fixed_length


def Convert2logMelSpectrogram(X_data):
    """Apply ``logMelSpectrogram`` to every signal and stack the results.

    The sampling rate comes from ``params['sampling_rate']`` and the frame
    duration from the module-level ``DT``.

    Parameters
    ----------
    X_data : sequence of np.ndarray
        Audio signals to convert.

    Returns
    -------
    np.ndarray
        ``np.array`` built from the per-signal spectrograms, in input order.
    """
    rate = params['sampling_rate']
    spectrograms = []
    with tqdm(total=len(X_data)) as progress:
        for signal in X_data:
            progress.update(1)
            spectrograms.append(logMelSpectrogram(signal, rate, DT))
    return np.array(spectrograms)
    

# Chargement du modèle

In [10]:
# Load the saved Keras model from its HDF5 file.
model2 = keras.models.load_model('model/cv-sw/cv_singleword.h5')

In [11]:
# Print the layer-by-layer description of the loaded model.
model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 243, 64)           51584     
_________________________________________________________________
dropout (Dropout)            (None, 243, 64)           0         
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 243, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 235, 128)          41088     
_________________________________________________________________
dropout_1 (Dropout)          (None, 235, 128)          0         
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 235, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 227, 256)          1

# Chargement des poids

In [12]:
# Load the weights stored under this path into model2; the returned status object is echoed as the cell output.
model2.load_weights('model/cv-sw/cv_singleword_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fee99e95850>

# Prédiction

In [13]:
# Load the sample clip at its own sampling rate for playback.
dataT, feT = load_audio('datasets/cv-sw/common_voice_fr_22157149.mp3')
dt, T_max = 0.02, 5

# Bring the signal to exactly T_max seconds so it has the expected length:
# shorter clips are right-padded with zeros, longer ones keep only their
# first T_max seconds.
if len(dataT) < T_max*feT:
    dataT = np.concatenate([dataT, np.zeros(int(T_max*feT - len(dataT)))])
else:
    dataT = dataT[:int(T_max*feT)]

# Last expression of the cell: renders the audio player.
Audio(dataT, rate=feT)

In [14]:
# Paths of the clips to classify (a single file here).
a_path = ['datasets/cv-sw/common_voice_fr_22157149.mp3']

In [15]:
# Echo the first (and only) path as a sanity check.
a_path[0] 

'datasets/cv-sw/common_voice_fr_22157149.mp3'

In [16]:
# Read the listed files and resample them to the configured sampling rate.
a_data = load_rawdata(np.asarray(a_path))

100%|██████████| 1/1 [00:00<00:00,  1.52it/s]


In [17]:
# Truncate or zero-pad every signal to the fixed length set in params.
a_data = ProcessRawData(a_data)

100%|██████████| 1/1 [00:00<00:00, 2474.52it/s]


In [18]:
# Turn the list of equal-length signals into a single NumPy array.
a_data = np.array(a_data)

In [19]:
# Compute one log-mel spectrogram per signal.
a_mel = Convert2logMelSpectrogram(a_data)

100%|██████████| 1/1 [00:00<00:00, 233.32it/s]


In [20]:
# Shape of a single spectrogram (everything after the leading sample axis).
a_mel.shape[1:]

(251, 161)

In [21]:
# Batch of one spectrogram; the explicit shape also fails loudly if the
# features do not have the (251, 161) layout shown in the previous cell output.
spectre_audio = np.reshape(a_mel, (1, 251, 161))

In [22]:
# Confirm the batch shape that is about to be passed to model2.predict.
spectre_audio.shape

(1, 251, 161)

In [23]:
# Index of the highest score over the flattened model output (one sample in
# the batch), then the word mapped to that index.
prediction = np.argmax(model2.predict(spectre_audio))
displaylabel(prediction)

'Firefox'