In [13]:

import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [101]:
# Load the tab-separated sentence pairs (English in column 0, Spanish in column 1).
path_to_data = r"C:\Users\Chrispdl\Desktop\live speech recognition\spa.txt"
# A context manager guarantees the file handle is released even if reading fails.
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()
# Preview the beginning of the raw text.
print(raw_data[:1000])
# One sentence pair per line; the columns of a line are separated by a tab.
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data]
# Work on a fixed slice of the corpus only.
pairs = pairs[1000:20000]


Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.
Run!	¡Corre!
Run.	Corred.
Who?	¿Quién?
Wow!	¡Órale!
Fire!	¡Fuego!
Fire!	¡Incendio!
Fire!	¡Disparad!
Help!	¡Ayuda!
Help!	¡Socorro! ¡Auxilio!
Help!	¡Auxilio!
Jump!	¡Salta!
Jump.	Salte.
Stop!	¡Parad!
Stop!	¡Para!
Stop!	¡Pare!
Wait!	¡Espera!
Wait.	Esperen.
Go on.	Continúa.
Go on.	Continúe.
Hello!	Hola.
I ran.	Corrí.
I ran.	Corría.
I try.	Lo intento.
I won!	¡He ganado!
Oh no!	¡Oh, no!
Relax.	Tomátelo con soda.
Smile.	Sonríe.
Attack!	¡Al ataque!
Attack!	¡Atacad!
Get up.	Levanta.
Go now.	Ve ahora mismo.
Got it!	¡Lo tengo!
Got it?	¿Lo pillas?
Got it?	¿Entendiste?
He ran.	Él corrió.
Hop in.	Métete adentro.
Hug me.	Abrázame.
I fell.	Me caí.
I know.	Yo lo sé.
I left.	Salí.
I lied.	Mentí.
I lost.	Perdí.
I quit.	Dimito.
I quit.	Renuncié.
I sang.	Canté.
I work.	Estoy trabajando.
I'm 19.	Tengo diecinueve.
I'm up.	Estoy levantado.
Listen.	Escucha.
Listen.	Escuche.
Listen.	Escuchen.
No way!	¡No puede ser!
No way!	De ninguna manera.
No way!	¡De ninguna m

In [102]:
def clean_sentence(sentence):
    """Return ``sentence`` lower-cased with all punctuation removed.

    The characters removed are those in ``string.punctuation`` plus the
    Spanish opening marks "¡" and "¿". Every other character (letters,
    digits, accents, whitespace) is kept as is.
    """
    # Characters to delete from the text
    removable_chars = string.punctuation + "¡" + '¿'
    # A translation table whose third argument lists the characters to drop
    deletion_table = str.maketrans('', '', removable_chars)
    return sentence.lower().translate(deletion_table)



In [103]:
def tokenize(sentences):
    """Fit a new ``Tokenizer`` on ``sentences`` and encode them with it.

    Returns:
        A tuple ``(sequences, tokenizer)``: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    # Build the vocabulary from the given sentences
    fitted_tokenizer.fit_on_texts(sentences)
    # Encode the very same sentences with that vocabulary
    encoded_sentences = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded_sentences, fitted_tokenizer

In [104]:
# Normalise both columns of every pair: column 0 is English, column 1 is Spanish
english_sentences = [clean_sentence(columns[0]) for columns in pairs]
spanish_sentences = [clean_sentence(columns[1]) for columns in pairs]


# Turn each sentence into a list of integer word ids (one tokenizer per language)
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

# Report the longest tokenized sentence of each language
print('Maximum length spanish sentence: {}'.format(max(map(len, spa_text_tokenized))))
print('Maximum length english sentence: {}'.format(max(map(len, eng_text_tokenized))))


# Vocabulary sizes: number of distinct words known to each tokenizer, plus one
spanish_vocab = 1 + len(spa_text_tokenizer.word_index)
english_vocab = 1 + len(eng_text_tokenizer.word_index)
print(f"Spanish vocabulary is of {spanish_vocab} unique words")
print(f"English vocabulary is of {english_vocab} unique words")

Maximum length spanish sentence: 12
Maximum length english sentence: 6
Spanish vocabulary is of 7225 unique words
English vocabulary is of 3800 unique words


From the previous code, the longest Spanish sentence has 12 words and the longest English sentence has 6. This shows an advantage of the encoder-decoder model: the input and output sequences can have different lengths. Otherwise, the English sentences would have to be padded up to 12 words. Consequently, with a seq2seq model we reduce the number of LSTM time steps, which lowers the computational cost and complexity.

In [65]:
# Pad every sentence so that all sentences of a language share that language's maximum length

max_spanish_len = max(len(sequence) for sequence in spa_text_tokenized)
max_english_len = max(len(sequence) for sequence in eng_text_tokenized)

# padding="post" appends the padding after the real tokens.
spa_pad_sentence = pad_sequences(spa_text_tokenized, max_spanish_len, padding="post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding="post")

# The encoder input is deliberately left 2-D, (num_sentences, max_spanish_len):
# that is the shape the model declares with Input(shape=(max_spanish_len,)) and
# the shape used when predicting on a new sentence. Giving it a trailing axis
# would only work through an implicit squeeze inside Keras.
# Only the targets get a trailing axis of size 1 (one word id per time step).
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

## ENCODER

In [69]:

# Encoder input: one row of `max_spanish_len` word ids per sentence
input_sequence = Input(shape=(max_spanish_len,))
# Map every word id to a trainable 128-dimensional vector
embedding_layer = Embedding(input_dim=spanish_vocab, output_dim=128)
embedding = embedding_layer(input_sequence)

The Embedding layer takes two parameters: ‘input_dim’, which is the size of the Spanish vocabulary, and ‘output_dim’, which is the size of the embedding vector.
The higher the output dimension, the more semantic meaning you can extract from each word, but also the more computation and processing time is required. A balance between speed and performance has to be found.

Next we will add the LSTM layer of size 64. Even though each time step of the LSTM outputs a hidden vector, we will focus our attention on the last one, therefore the parameter return_sequences is ‘False’. We will see how the LSTM layer works with return_sequences=True for the decoder.

In [70]:
# Encoder LSTM: return_sequences=False keeps only the output of the last time step
encoder_lstm = LSTM(units=64, return_sequences=False)
encoder = encoder_lstm(embedding)

The output of the encoder layer will be the hidden state of the last time step. We will then need to feed this vector into the decoder. Let’s look more precisely at the decoder part and understand how it works.

The hidden vector is repeated n times, so each time step of the decoder LSTM receives the same vector. To obtain this same vector for every time step we use the RepeatVector layer: as its name implies, its role is to repeat the vector it receives. The only parameter we need to define is n, the number of repetitions. This number is equal to the number of time steps of the decoder part, in other words the maximum English sentence length, 6.

In [71]:
# Repeat the encoder output once per decoder time step (max_english_len times)
repeat_layer = RepeatVector(n=max_english_len)
r_vec = repeat_layer(encoder)


## DECODER

The decoder is also built with an LSTM layer; the difference is the parameter return_sequences, which in this case is ‘True’. What is this parameter for? In the encoder part we expected only one vector, at the last time step, and ignored all the others; here we expect an output vector at every time step so that the Dense layer can make a prediction for each one.

In [73]:
# Decoder LSTM: return_sequences=True yields one output vector per time step
decoder_lstm = LSTM(units=64, return_sequences=True, dropout=0.2)
decoder = decoder_lstm(r_vec)


In [74]:
# Apply the same Dense layer at every time step: one score per English vocabulary entry
word_scorer = Dense(units=english_vocab)
logits = TimeDistributed(word_scorer)(decoder)


In [75]:
# Softmax turns the per-step scores into a distribution over the English vocabulary
word_probabilities = Activation('softmax')(logits)
enc_dec_model = Model(inputs=input_sequence, outputs=word_probabilities)
# Targets are integer word ids, hence the sparse variant of the cross-entropy
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 12)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 12, 128)           924800    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVector  (None, 6, 64)            0         
 )                                                               
                                                                 
 lstm_2 (LSTM)               (None, 6, 64)             33024     
                                                                 
 time_distributed_1 (TimeDis  (None, 6, 3800)          247000    
 tributed)                                                   

In [76]:
# Train on the padded Spanish inputs against the padded English targets
model_results = enc_dec_model.fit(
    x=spa_pad_sentence,
    y=eng_pad_sentence,
    batch_size=30,
    epochs=100,
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [96]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-time-step scores into a space-separated sentence.

    Args:
        logits: 2-D array-like with one row per time step and one column
            per vocabulary index; the highest-scoring column of each row
            is taken as the predicted word index.
        tokenizer: object exposing ``word_index``, a mapping from word to
            integer index.

    Returns:
        The predicted words joined by single spaces. Predicted indices
        with no word in ``tokenizer.word_index`` (such as an index used
        only for padding) and '<empty>' tokens are skipped, so the result
        carries no stray spaces for them.
    """
    # Invert the tokenizer's word -> index mapping, leaving out <empty> tokens
    index_to_words = {
        idx: word
        for word, idx in tokenizer.word_index.items()
        if word != '<empty>'
    }
    # Most likely vocabulary index at every time step
    predicted_ids = np.argmax(logits, 1)
    predicted_words = (index_to_words.get(int(token_id)) for token_id in predicted_ids)
    # Drop missing words instead of joining empty strings for them
    return ' '.join(word for word in predicted_words if word)


# Spot-check the model on one sentence pair from the training data
index = 20
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
# Slice (rather than index) so the model still receives a batch of one sentence
sample_batch = spa_pad_sentence[index:index+1]
sample_scores = enc_dec_model.predict(sample_batch)[0]
print(logits_to_sentence(sample_scores, eng_text_tokenizer))


The english sentence is: hes broke
The spanish sentence is: está sin blanca
The predicted sentence is :
hes broke    


In [78]:
# save the model
path_to_save_model = r"C:\Users\Chrispdl\Desktop\live speech recognition\model.h5"
enc_dec_model.save(path_to_save_model)


In [91]:
# Input a real sentence in Spanish
input_sentence = "Mañana vamos a explorar la ciudad"  # Replace with your own Spanish sentence

# Clean the sentence exactly like the training sentences were cleaned, so that
# words next to "¡" or "¿" still match the tokenizer's vocabulary.
cleaned_input_sentence = clean_sentence(input_sentence)

# Tokenize and pad the input sentence.
# The ids get their own name: `input_sequence` is the Keras Input tensor of the
# model, and rebinding it here would break a re-run of the model-building cell.
input_token_ids = spa_text_tokenizer.texts_to_sequences([cleaned_input_sentence])
input_pad_sequence = pad_sequences(input_token_ids, max_spanish_len, padding="post")

# Use the model to predict the translation ([0] selects the only sentence of the batch)
predicted_logits = enc_dec_model.predict(input_pad_sequence)[0]

# Print the original and predicted sentences
print("Original Sentence (Spanish):", input_sentence)
print("Predicted Sentence (English):", logits_to_sentence(predicted_logits, eng_text_tokenizer))


Original Sentence (Spanish): Mañana vamos a explorar la ciudad
Predicted Sentence (English): come home    
