In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [11]:
# Toy English -> Spanish parallel corpus, kept as (source, target) pairs.
phrase_pairs = {
    "hello": "hola",
    "how are you": "como estas",
    "good morning": "buenos dias",
    "thank you": "gracias",
    "good night": "buenas noches",
}
# dicts preserve insertion order, so the pairs come out in the order written above.
data = list(phrase_pairs.items())

In [12]:
# Tokenization
# Split the corpus into one tuple of sentences per language.
english_texts, spanish_texts = zip(*data)

# Source-side vocabulary, learned from the English sentences only.
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_texts)

# Target-side vocabulary, learned from the Spanish sentences only.
spa_tokenizer = Tokenizer()
spa_tokenizer.fit_on_texts(spanish_texts)

In [13]:
# Convert text to sequences
# Each sentence becomes a list of integer word ids from its own language's tokenizer.
eng_sequences, spa_sequences = (
    tokenizer.texts_to_sequences(texts)
    for tokenizer, texts in (
        (eng_tokenizer, english_texts),
        (spa_tokenizer, spanish_texts),
    )
)


In [14]:
# Padding
# Size every sequence to the longest sentence in EITHER language. Measuring only
# the Spanish side makes pad_sequences (default truncating='pre') silently drop
# the leading tokens of longer English inputs, e.g. "how are you" has three
# tokens while no Spanish phrase in the corpus has more than two.
max_length = max(len(seq) for seq in (*eng_sequences, *spa_sequences))
# padding='post' appends zeros after the real tokens.
eng_sequences = pad_sequences(eng_sequences, maxlen=max_length, padding='post')
spa_sequences = pad_sequences(spa_sequences, maxlen=max_length, padding='post')

In [17]:
# Define model
embedding_dim = 64
hidden_units = 128

# Vocabulary sizes: number of known words plus one extra slot for id 0,
# which pad_sequences uses as the padding value.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
spa_vocab_size = len(spa_tokenizer.word_index) + 1

# Encoder: embed the English ids and keep only the final LSTM states
# (the per-step output is discarded).
encoder_inputs = tf.keras.Input(shape=(max_length,))
encoder_embedding = Embedding(
    input_dim=eng_vocab_size,
    output_dim=embedding_dim,
)(encoder_inputs)
encoder_lstm = LSTM(units=hidden_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embed the Spanish ids, start the LSTM from the encoder's final
# states, and emit a softmax over the Spanish vocabulary at every time step.
decoder_inputs = tf.keras.Input(shape=(max_length,))
decoder_embedding = Embedding(
    input_dim=spa_vocab_size,
    output_dim=embedding_dim,
)(decoder_inputs)
decoder_lstm = LSTM(units=hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(units=spa_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [18]:
# Compile model
# Two inputs (English ids, Spanish decoder ids) -> per-step Spanish word distribution.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)
# Sparse loss: the targets are integer word ids, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Prepare decoder targets
# Add a trailing axis of size 1 so the targets are shaped
# (num_sentences, max_length, 1): one integer class id per time step.
target_shape = (-1, max_length, 1)
spa_sequences_output = np.reshape(np.array(spa_sequences), target_shape)

In [20]:
# Train model
# Teacher forcing: at each step the decoder must be fed the PREVIOUS target
# token, not the token it is asked to predict. Feeding the unshifted target as
# input lets the network reach perfect accuracy by copying its input, without
# learning anything from the encoder. Shift the Spanish ids one position to the
# right; the leading 0 (the padding id) doubles as the start-of-sequence marker.
spa_sequences_input = np.zeros_like(np.asarray(spa_sequences))
spa_sequences_input[:, 1:] = np.asarray(spa_sequences)[:, :-1]

model.fit([eng_sequences, spa_sequences_input], spa_sequences_output, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x27b9a2f3910>

In [21]:
# Translation function
def translate(sentence):
    """Translate an English sentence into Spanish by greedy decoding.

    The decoder input starts as all padding (id 0 acts as the start marker).
    After each prediction the chosen word id is written into the next decoder
    position and the model is queried again. The source sentence is never fed
    to the decoder: English ids index a different vocabulary than the Spanish
    decoder embedding. This scheme matches a model trained with decoder inputs
    equal to the target shifted right by one position.

    Args:
        sentence: English text. Words unknown to ``eng_tokenizer`` are dropped
            by the tokenizer; input longer than ``max_length`` tokens is
            truncated by ``pad_sequences``.

    Returns:
        The predicted Spanish words joined by single spaces, or an empty
        string when the sentence contains no known English word.
    """
    token_ids = eng_tokenizer.texts_to_sequences([sentence])
    if not token_ids[0]:
        # Nothing the encoder knows about: there is nothing to translate.
        return ""
    encoder_input = pad_sequences(token_ids, maxlen=max_length, padding='post')

    decoder_input = np.zeros((1, max_length), dtype='int32')
    predicted_words = []
    for step in range(max_length):
        # verbose=0: avoid one progress bar per decoding step.
        prediction = model.predict([encoder_input, decoder_input], verbose=0)
        word_id = int(np.argmax(prediction[0, step]))
        word = spa_tokenizer.index_word.get(word_id)
        if word is None:
            # id 0 (padding) has no word: treat it as end of sentence.
            break
        predicted_words.append(word)
        if step + 1 < max_length:
            # Feed the chosen word back in as the next decoder input.
            decoder_input[0, step + 1] = word_id
    return " ".join(predicted_words)

In [22]:
# Example translation
# Smoke-test the translator on a phrase taken from the training corpus.
example_sentence = "hello"
print("Translation:", translate(example_sentence))

Translation: hola
