In [1]:
import sys
import os
import pandas as pd
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
import matplotlib.pyplot as plt

from IPython.display import Audio 
from IPython.display import display

sys.path.append('config')
import settings
sys.path.append('Dataset')
from dataset import LjSpeechDataset
sys.path.append('Preprocessing')
from preprocessing import LJSpeechPreprocessor
sys.path.append('Postprocessing')
from postprocessing import LJSpeechPostprocessing

In [2]:
# Restore the trained network from its saved artifact. compile=False is passed,
# so the model is loaded without compiling it — sufficient here because the
# notebook only runs inference (model.predict).
MODEL_DIR = "Artifacts/Models/v1/Model/tf"

model = tf.keras.models.load_model(
    MODEL_DIR,
    compile=False,
)

# Print the layer-by-layer architecture as a sanity check on what was loaded.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 193)]       0         
                                                                 
 reshape (Reshape)           (None, None, 193, 1)      0         
                                                                 
 conv2d (Conv2D)             (None, None, 97, 32)      14432     
                                                                 
 batch_normalization (BatchN  (None, None, 97, 32)     128       
 ormalization)                                                   
                                                                 
 re_lu (ReLU)                (None, None, 97, 32)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, None, 49, 32)      236544    
                                                             

In [3]:
# --- Dataset -----------------------------------------------------------------
# All locations and the batch size come from the shared `settings` module.
speechDataset = LjSpeechDataset(
    data_path=settings.DATA_PATH,
    charlist_file=settings.CHARLIST_PATH,
    batch_size=settings.BATCH_SIZE,
)

# The character list and the wav directory are handed to the preprocessor and
# are reused further down (postprocessor construction, audio playback).
charlist = speechDataset.get_charlist()
wavs_path = speechDataset.get_wavs_path()

# --- Preprocessing -----------------------------------------------------------
preprocessor = LJSpeechPreprocessor(charlist, wavs_path)

# Build the train / validation / test pipelines with that preprocessor.
train_ds, val_ds, test_ds = speechDataset.create_data_pipelines(preprocessor)

# Queried only after the pipelines exist.
# NOTE(review): presumably the test split is decided inside
# create_data_pipelines — keep this call after it unless confirmed otherwise.
test_paths = speechDataset.get_test_paths()

# Lookups between characters and their integer ids, as exposed by the
# preprocessor.
char_to_num = preprocessor.get_char_to_num()
num_to_char = preprocessor.get_num_to_char()

# --- Postprocessing ----------------------------------------------------------
# Turns raw model predictions back into text (see postprocess() calls below).
postprocessor = LJSpeechPostprocessing(num_to_char, charlist)

In [7]:
# Qualitative spot-check: for a handful of test clips, play the audio and print
# the reference transcript next to the model's transcription.
MAX_EXAMPLES = 4  # how many clips to listen to / transcribe

# Materialise the file-name column once instead of on every iteration.
# NOTE(review): pairing test_ds samples with test_paths by position assumes the
# test pipeline yields samples in the same order as test_paths (no shuffling)
# — confirm against LjSpeechDataset.
test_file_names = list(test_paths["file_name"])

for spectrograms, labels in test_ds.take(1):
    for index, spectrogram in enumerate(spectrograms):
        # Stop once MAX_EXAMPLES clips from this batch have been shown.
        if index >= MAX_EXAMPLES:
            break

        file = tf.io.read_file(wavs_path + test_file_names[index] + ".wav")

        # decode_wav returns the samples AND the sample rate stored in the WAV
        # header. Play back at that rate instead of a hard-coded 16000 Hz,
        # otherwise clips recorded at another rate sound slowed down or sped up.
        audio, sample_rate = tf.audio.decode_wav(file)
        audio = audio.numpy()
        # decode_wav yields (samples, channels); Audio expects (channels, samples).
        display(Audio(np.transpose(audio), rate=int(sample_rate.numpy())))

        # Map the integer label sequence back to characters and join to a string.
        label = tf.strings.reduce_join(num_to_char(labels[index])).numpy().decode("utf-8")

        # The model is fed a batch, so add a leading batch axis of size 1.
        spectrogram = tf.expand_dims(spectrogram, axis=0)
        predictions = model.predict(spectrogram, verbose=0)

        # postprocess() is indexed with [0] to get the text for our single sample.
        output_text = postprocessor.postprocess(predictions, use_spell_correction=False)[0]

        print("True label: ", label)
        print("Predicted label: ", output_text)

True label:  at that station the safes were given out heavy with shot not gold the thieves went on to dover and byandby
Predicted label:  at that station the sates we givan out ha ve with shot not god the theves wenton to dover and bab


True label:  no traces of its moat have appeared
Predicted label:  no traes of its mote hafe apeard


True label:  a notorious miser robert smith had recently died in seven dials where he had amassed a considerable fortune
Predicted label:  tha no torius miser brobert smeth had resont la dided in sevendils where he had amast i considerable fortion


True label:  no attempt was made to maintain discipline
Predicted label:  no attempt was made to mantain discipplyn


In [12]:
# Transcribe a single, user-supplied recording that is not part of the dataset.
path = "Data/Test/test1.wav"

# Decode the file for playback. decode_wav returns both the samples and the
# sample rate stored in the WAV header; pass that rate to Audio explicitly
# (as a plain int) instead of assuming 16000 Hz, so playback speed and pitch
# are correct whatever rate the file was recorded at.
file = tf.io.read_file(path)
audio, sample_rate = tf.audio.decode_wav(file)
audio = audio.numpy()
# decode_wav yields (samples, channels); Audio expects (channels, samples).
display(Audio(np.transpose(audio), rate=int(sample_rate.numpy())))

# Run the same preprocessing used for the dataset on this file, then add a
# leading batch axis of size 1 before handing it to the model.
# NOTE(review): the model was presumably trained on audio at one fixed sample
# rate; if this file uses a different rate, transcription quality may suffer —
# confirm whether encode_wav resamples.
spectrogram = preprocessor.encode_wav(path)
spectrogram = tf.expand_dims(spectrogram, axis=0)
predictions = model.predict(spectrogram, verbose=0)

# postprocess() is indexed with [0] to get the text for our single sample.
output_text = postprocessor.postprocess(predictions, use_spell_correction=False)[0]

print("Predicted label: ", output_text)

ValueError: rate must be specified when data is a numpy array or list of audio samples.