In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import numpy as np

In [13]:
# Data
source_texts = ['hello', 'how are you', 'goodbye']
target_texts = ['bonjour', 'comment ça va', 'au revoir']

In [14]:
# Tokenize and pad sequences
source_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(source_texts)
target_tokenizer.fit_on_texts(target_texts)

In [18]:
# Text to sequences
source_sequences = source_tokenizer.texts_to_sequences(source_texts)

source_texts,source_sequences

(['hello', 'how are you', 'goodbye'], [[1], [2, 3, 4], [5]])

In [19]:
# pad sequences
source_sequences = pad_sequences(source_sequences, padding='post')

source_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [5, 0, 0]])

In [35]:
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_sequences = pad_sequences(target_sequences, padding='post')

target_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [5, 6, 0]])

In [36]:
# Reshape target sequences for sparse_categorical_crossentropy
target_sequences = np.expand_dims(target_sequences, -1)

target_sequences

array([[[1],
        [0],
        [0]],

       [[2],
        [3],
        [4]],

       [[5],
        [6],
        [0]]])

In [24]:
# Model
model = Sequential([
    Embedding(input_dim=len(source_tokenizer.word_index) + 1, output_dim=8, input_length=source_sequences.shape[1]),
    SimpleRNN(16, return_sequences=True),
    Dense(len(target_tokenizer.word_index) + 1, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [25]:
# Training
model.fit(source_sequences, target_sequences, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.9369
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 1.9300
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 1.9232
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 1.9164
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 1.9095
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 1.9026
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 1.8956
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 1.8886
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 1.8815
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 1.8743
Epoch 11/10

<keras.src.callbacks.history.History at 0x1b2f997d790>

In [26]:
# Inference function
def translate(text):
    sequence = source_tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=source_sequences.shape[1], padding='post')
    prediction = model.predict(padded_sequence)

    translated_sentence = []
    for word_probs in prediction[0]:
        predicted_word_index = np.argmax(word_probs)
        word = target_tokenizer.index_word.get(predicted_word_index, "")
        if word:
            translated_sentence.append(word)
    return ' '.join(translated_sentence)

In [30]:
# Test translation
print(translate("hello"))
print(translate("how are you"))
print(translate("goodbye"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
bonjour
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
comment ça va
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
comment revoir
