In [1]:
import tensorflow as tf
import tensorflow_io as tfio
from Module_reco_voc import recuperation_transcription
from Module_reco_voc_2 import Fabriquer_ou_restorer_model
import pandas as pd
import numpy as np
import keras

In [2]:
# Build the dataframe describing the .flac recordings found under train-clean-360
# (one row per audio file, judging by the columns displayed further down)
df_full = recuperation_transcription(
    monRepertoire='../sources/librispeech/train-clean-360',
    type_fic_audio='.flac',
)

Nombre de fichier audio enregistrés : 104014


In [3]:
# Preview the first 150 rows of the dataframe
df_full.head(n=150)

Unnamed: 0,id_audio,speaker_id,chapter_id,id_line,chemin,transcription
0,100-121669-0000,100,121669,0000,../sources/librispeech/train-clean-360/100/121...,TOM THE PIPER'S SON
1,100-121669-0001,100,121669,0001,../sources/librispeech/train-clean-360/100/121...,THE PIG WAS EAT AND TOM WAS BEAT AND TOM RAN C...
2,100-121669-0002,100,121669,0002,../sources/librispeech/train-clean-360/100/121...,HE NEVER DID ANY WORK EXCEPT TO PLAY THE PIPES...
3,100-121669-0003,100,121669,0003,../sources/librispeech/train-clean-360/100/121...,BUT HE WAS SO SLY AND CAUTIOUS THAT NO ONE HAD...
4,100-121669-0004,100,121669,0004,../sources/librispeech/train-clean-360/100/121...,AND THEY LIVED ALL ALONE IN A LITTLE HUT AWAY ...
...,...,...,...,...,...,...
28,1001-134707-0028,1001,134707,0028,../sources/librispeech/train-clean-360/1001/13...,THE BANNERS OF THE STATES AND FLAGS OF EVERY L...
29,1001-134707-0029,1001,134707,0029,../sources/librispeech/train-clean-360/1001/13...,SHALL ALL THAT FORWARDS PERFECT HUMAN LIFE BE ...
30,1001-134707-0030,1001,134707,0030,../sources/librispeech/train-clean-360/1001/13...,BUT ALL THE WORKMEN OF THE WORLD HERE TO BE RE...
31,1001-134707-0031,1001,134707,0031,../sources/librispeech/train-clean-360/1001/13...,MATERIALS HERE UNDER YOUR EYE SHALL CHANGE THE...


In [4]:
# Keep only the three utterances whose id_audio is listed below
df_predict = df_full[
    df_full['id_audio'].isin(['14-208-0000', '16-122827-0035', '23-124439-0111'])
]
df_predict

Unnamed: 0,id_audio,speaker_id,chapter_id,id_line,chemin,transcription
0,14-208-0000,14,208,0,../sources/librispeech/train-clean-360/14/208/...,CHAPTER ELEVEN THE MORROW BROUGHT A VERY SOBER...
35,16-122827-0035,16,122827,35,../sources/librispeech/train-clean-360/16/1228...,I PUT A SPOONFUL OF SUGAR IN WE ALWAYS DO DON'...
111,23-124439-0111,23,124439,111,../sources/librispeech/train-clean-360/23/1244...,DO I READ YOUR LESSON ARIGHT AH YOU ARE MY FAV...


In [5]:
# Text <-> integer mappings used for the transcriptions

# Accepted characters: lowercase letters, apostrophe and space
caracteres = list("abcdefghijklmnopqrstuvwxyz' ")

# Forward mapping: character -> integer id
char_to_num = keras.layers.StringLookup(vocabulary=caracteres, oov_token="")

# Reverse mapping: integer id -> character, built on the forward layer's vocabulary
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

In [6]:
# Final preprocessing function used to go from a dataframe to a TensorFlow dataset
# It has two parts:
# First: transformation of the audio into a spectrogram
# Second: mapping of the transcription to numbers

# Parameters of the short-time Fourier transform used to obtain the spectrogram
# Window size, in audio samples
frame_length = 512

# Hop, in samples, between the starts of two consecutive windows
frame_step = 128

# Number of samples over which the FFT is applied
fft_length = 512

# To keep the computation fast, an FFT size that is a power of two is recommended
# (2/4/8/16/32/64/128/256/512/1024/2048/4096/...)
# 512 is the value recommended here for speech processing; that is 32 milliseconds
# assuming 16 kHz audio -- TODO confirm the sample rate of the .flac files



def Recup_spectrogramme_transcription(fichier_audio, transcription):
    """Turn one (audio path, transcription) pair into (spectrogram, label ids).

    Args:
        fichier_audio: path of a .flac file, read with ``tf.io.read_file``.
        transcription: text of the utterance.

    Returns:
        A ``(spectrogram, labels)`` tuple: the normalised square-root magnitude
        spectrogram, and the transcription passed through ``char_to_num``.
    """
    # ---- Audio -> spectrogram ---- #
    # Decode the .flac bytes as int16 samples, then squeeze the last axis
    # (presumably the channel axis of mono audio -- TODO confirm)
    samples = tfio.audio.decode_flac(tf.io.read_file(fichier_audio), dtype=tf.int16)
    samples = tf.squeeze(samples, axis=-1)

    # Short-time Fourier transform of the samples converted to float32
    stft = tf.signal.stft(
        tf.cast(samples, tf.float32),
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )

    # Square root of the complex magnitude
    magnitude = tf.math.pow(tf.abs(stft), 0.5)

    # Standardise along axis 1; the small epsilon guards against a zero standard deviation
    centre = tf.math.reduce_mean(magnitude, 1, keepdims=True)
    spread = tf.math.reduce_std(magnitude, 1, keepdims=True)
    spectrogram = (magnitude - centre) / (spread + 1e-10)

    # ---- Transcription -> integer ids ---- #
    # Lowercase the text, split it into UTF-8 characters, then look each character up
    caracteres_split = tf.strings.unicode_split(
        tf.strings.lower(transcription), input_encoding="UTF-8"
    )
    labels = char_to_num(caracteres_split)

    return spectrogram, labels

In [7]:
# Number of examples per padded batch
batch_size = 28

# (audio path, transcription) pairs -> (spectrogram, label ids) -> padded batches.
# Note: .prefetch(buffer_size=tf.data.AUTOTUNE) is left disabled on this pipeline.
predict_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (list(df_predict["chemin"]), list(df_predict["transcription"]))
    )
    .map(Recup_spectrogramme_transcription, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
)
predict_dataset

<PaddedBatchDataset element_spec=(TensorSpec(shape=(None, None, 257), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [8]:
def decode_batch_predictions(pred):
    """Decode a batch of network outputs into text with greedy CTC decoding.

    Args:
        pred: batch of model outputs. Only ``pred.shape[0]`` and
            ``pred.shape[1]`` are read here; ``pred`` itself is forwarded to
            ``keras.backend.ctc_decode``.

    Returns:
        A list of str, one per decoded sequence.
    """
    # Every item is decoded over the full second dimension of the batch
    n_items, n_steps = pred.shape[0], pred.shape[1]
    seq_lengths = np.ones(n_items) * n_steps

    # Greedy search; beam search is the alternative for more complex tasks
    decoded = keras.backend.ctc_decode(pred, input_length=seq_lengths, greedy=True)[0][0]

    # Map ids back to characters and join them into one string per sequence
    return [
        tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")
        for ids in decoded
    ]

In [9]:
def CTCLoss(y_true, y_pred):
    """Training-time CTC loss computed with ``keras.backend.ctc_batch_cost``.

    Args:
        y_true: batch of label ids; its first two dimensions are read as
            (batch, label length).
        y_pred: batch of model outputs; its second dimension is read as the
            input length.

    Returns:
        The value returned by ``keras.backend.ctc_batch_cost``.
    """
    n_batch = tf.cast(tf.shape(y_true)[0], dtype="int64")

    # Column of ones used to repeat the same length for every item of the batch
    per_item = tf.ones(shape=(n_batch, 1), dtype="int64")
    pred_lengths = tf.cast(tf.shape(y_pred)[1], dtype="int64") * per_item
    true_lengths = tf.cast(tf.shape(y_true)[1], dtype="int64") * per_item

    return keras.backend.ctc_batch_cost(y_true, y_pred, pred_lengths, true_lengths)


In [10]:
# Load the model; the output dimension is the size of the character vocabulary
model = Fabriquer_ou_restorer_model(
    output_dim=char_to_num.vocabulary_size(),
    checkpoint_doss='../models',
)

Restoring from ../models/recovocale_final.hdf5


In [11]:
predictions = []
targets = []

for X, y in predict_dataset:
    # Decoded model output for the batch
    predictions.extend(decode_batch_predictions(model.predict(X)))
    # Reference text rebuilt from the label ids
    targets.extend(
        tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
        for label in y
    )

# Print each reference next to its prediction
for target, prediction in zip(targets, predictions):
    print(f"Target    : {target}")
    print(f"Prediction: {prediction}")
    print("-" * 100)

Target    : chapter eleven the morrow brought a very sober looking morning the sun making only a few efforts to appear and catherine augured from it everything most favourable to her wishes
Prediction: chaptor a levin the morow brouht a very solber looking morning the sun making only a few efforts to appea an an hae hoe oe oe toe hoe he he he he oe oe oe es
----------------------------------------------------------------------------------------------------
Target    : i put a spoonful of sugar in we always do don't you like it
Prediction: i put is bootful of shigarind we an was do don't you like it
----------------------------------------------------------------------------------------------------
Target    : do i read your lesson aright ah you are my favourite pupil still it is worth to teach you now that you are willing to understand you have taken the first step to understand
Prediction: to i reanour less in a right ah you are my favourit pupl still it is worst to teach ou now that 