In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
import tensorflow as tf
from tensorflow import keras
from path import Path 
import numpy as np

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text()
text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]

np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

In [5]:
vocab_size = 1500
encoder_text_vectorization = keras.layers.TextVectorization(vocab_size, output_sequence_length = 64)
decoder_text_vectorization = keras.layers.TextVectorization(vocab_size, output_sequence_length = 64)

encoder_text_vectorization.adapt(sentences_en)
decoder_text_vectorization.adapt([f"startofseq {sentence} endofseq" for sentence in sentences_es])

I0000 00:00:1745682405.964729      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745682405.965372      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [7]:
X_train_enc = tf.constant(sentences_en[:100_000])
X_valid_enc = tf.constant(sentences_en[100_000:])

X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])

Y_train = decoder_text_vectorization([f"{s} endofseq" for s in sentences_es[:100_000]])
Y_valid = decoder_text_vectorization([f"{s} endofseq" for s in sentences_es[100_000:]])

In [12]:
encoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)

encoder_x = encoder_text_vectorization(encoder_inputs)
decoder_x = decoder_text_vectorization(encoder_inputs)

encoder_x = keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, mask_zero=True)(encoder_x)
decoder_x = keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, mask_zero=True)(decoder_x)

encoder_outputs, *encoder_state = keras.layers.LSTM(512, return_state = True)(encoder_x)


decoder_x = keras.layers.LSTM(512, return_sequences=True)(decoder_x, initial_state=encoder_state)
output = keras.layers.Dense(vocab_size, activation='softmax')(decoder_x)

model = keras.Model(inputs = [encoder_inputs,decoder_inputs], outputs = [output])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [13]:
model.fit((X_train_enc, X_train_dec), Y_train, epochs=10, validation_data=((X_valid_enc, X_valid_dec), Y_valid))

Epoch 1/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 24ms/step - accuracy: 0.8223 - loss: 4.4265 - val_accuracy: 0.9230 - val_loss: 3.1977
Epoch 2/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.9245 - loss: 2.9723 - val_accuracy: 0.9294 - val_loss: 2.5930
Epoch 3/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.9316 - loss: 2.3864 - val_accuracy: 0.9326 - val_loss: 2.3468
Epoch 4/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.9364 - loss: 2.0660 - val_accuracy: 0.9344 - val_loss: 2.2340
Epoch 5/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.9394 - loss: 1.8443 - val_accuracy: 0.9301 - val_loss: 2.1917
Epoch 6/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.9421 - loss: 1.6587 - val_accuracy: 0.9245 - val_loss: 2.1988
Epoc

<keras.src.callbacks.history.History at 0x79648e3b60d0>

In [23]:
max_length = 50
def translate(sentence_en):
    translation = ""
    for word_idx in range(max_length):
        X = tf.constant([sentence_en])  # encoder input
        X_dec = tf.constant(["startofseq " + translation])  # decoder input
        predictions = model.predict((X, X_dec))
        y_proba = predictions[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = decoder_text_vectorization.get_vocabulary()[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [24]:
translate("I like soccer")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


'me gusta el fútbol'