In [1]:
import tensorflow as tf
from pathlib import Path
url="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
file=tf.keras.utils.get_file("spa.zip",origin=url,extract=True,cache_dir="datasets")
text=(Path(file).with_name("spa_extracted")/"spa-eng"/"spa.txt").read_text(encoding="utf-8")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
[1m2638744/2638744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [2]:
import numpy as np
text=text.replace("¡","").replace("¿","")
pairs=[line.split("\t") for line in text.splitlines()]
np.random.shuffle(pairs)
sentences_en,sentences_es=zip(*pairs)

In [3]:
words_vocabulary_size=14000
sentence_output_length=50
text_vec_en=tf.keras.layers.TextVectorization(words_vocabulary_size,output_sequence_length=sentence_output_length)
text_vec_es=tf.keras.layers.TextVectorization(words_vocabulary_size,output_sequence_length=sentence_output_length)
text_vec_en.adapt(sentences_en)
text_vec_es.adapt([f"startofseq{s}endofseq" for s in sentences_es])

In [4]:
x_train_enc=tf.constant(sentences_en[:100000])
x_val_enc=tf.constant(sentences_en[100000:])

x_train_dec=tf.constant([f"startofseq{s}" for s in sentences_es[:100000]])
x_val_dec=tf.constant([f"startofseq{s}" for s in sentences_es[100000:]])

y_train=text_vec_es([f"{s}endofseq" for s in sentences_es[:100000]])
y_val=text_vec_es([f"{s}endofseq" for s in sentences_es[100000:]])

In [5]:
from tensorflow.keras.layers import Concatenate
embedding_dims=128

encoder_input_layer=tf.keras.layers.Input(shape=[],dtype=tf.string)
decoder_input_layer=tf.keras.layers.Input(shape=[],dtype=tf.string)

en_token_ids=text_vec_en(encoder_input_layer)
es_token_ids=text_vec_en(decoder_input_layer)

Embedding_enc_layer=tf.keras.layers.Embedding(input_dim=words_vocabulary_size,output_dim=embedding_dims,mask_zero=True)
Embedding_dec_layer=tf.keras.layers.Embedding(input_dim=words_vocabulary_size,output_dim=embedding_dims,mask_zero=True)

enc_emb=Embedding_enc_layer(en_token_ids)
dec_emb=Embedding_dec_layer(es_token_ids)

encoder=tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(512,return_sequences=True,return_state=True))
decoder=tf.keras.layers.LSTM(1024,return_sequences=True,return_state=True)

encoder_output,forward_short_h,forward_long_c,backward_short_h,backward_long_c=encoder(enc_emb)
short_h=Concatenate()([forward_short_h,backward_short_h])
long_c=Concatenate()([forward_long_c,backward_long_c])
decoder_output,*decoder_state=decoder(dec_emb,initial_state=[short_h,long_c])

In [6]:
output_layer=tf.keras.layers.Dense(words_vocabulary_size,activation="softmax")
output_probas=output_layer(decoder_output)

model=tf.keras.Model(inputs=[encoder_input_layer,decoder_input_layer],outputs=[output_probas])
model.compile(loss="sparse_categorical_crossentropy",metrics=["Accuracy"],optimizer="nadam")
model.fit((x_train_enc,x_train_dec),y_train,validation_data=((x_val_enc,x_val_dec),y_val),epochs=50)


Epoch 1/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 40ms/step - Accuracy: 0.0288 - loss: 5.3427 - val_Accuracy: 0.0466 - val_loss: 3.8386
Epoch 2/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 40ms/step - Accuracy: 0.0504 - loss: 3.4251 - val_Accuracy: 0.0567 - val_loss: 3.1026
Epoch 3/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 40ms/step - Accuracy: 0.0616 - loss: 2.5249 - val_Accuracy: 0.0622 - val_loss: 2.7380
Epoch 4/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 40ms/step - Accuracy: 0.0714 - loss: 1.8913 - val_Accuracy: 0.0648 - val_loss: 2.5895
Epoch 5/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 40ms/step - Accuracy: 0.0797 - loss: 1.4522 - val_Accuracy: 0.0662 - val_loss: 2.5765
Epoch 6/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 40ms/step - Accuracy: 0.0870 - loss: 1.1338 - val_Accuracy: 0.0669 - val_loss: 2.596

KeyboardInterrupt: 

In [7]:
def translate(en_sentence):
    translation=""
    for word_idx in range(sentence_output_length):
        enc_input=np.array([en_sentence],dtype=object)
        dec_input=np.array([f"startofseq"+translation],dtype=object)
        probas=model.predict((enc_input,dec_input))[0,word_idx]
        word_id=np.argmax(probas)
        word=text_vec_es.get_vocabulary()[word_id]
        if word=="endofseq":
            break
        translation+=" "+word
    return translation.strip()

In [8]:
translate("Today is a beautiful day. I went to the park with my friends, and we had a great time talking and laughing together.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

'hoy es una y que hoy [UNK] y y y y y pienso y y trabajar y y que y y [UNK] [UNK] cuchilloendofseq juntosendofseq largosendofseq largosendofseq largosendofseq semanaendofseq juntosendofseq huevosendofseq huevosendofseq otraendofseq juntosendofseq medianocheendofseq tempranoendofseq medianocheendofseq medianocheendofseq tempranoendofseq tardeendofseq [UNK] centímetros centímetros tardeendofseq tardeendofseq [UNK] semanaendofseq semanaendofseq semanaendofseq semanaendofseq'