In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import numpy as np

In [2]:
# Data
source_texts = ['hello', 'how are you', 'goodbye']
target_texts = ['bonjour', 'comment ça va', 'au revoir']

In [3]:
# Tokenize and pad sequences
source_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(source_texts)
target_tokenizer.fit_on_texts(target_texts)

In [4]:
# Text to sequences
source_sequences = source_tokenizer.texts_to_sequences(source_texts)

source_texts,source_sequences

(['hello', 'how are you', 'goodbye'], [[1], [2, 3, 4], [5]])

In [5]:
# pad sequences
source_sequences = pad_sequences(source_sequences, padding='post')

source_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [5, 0, 0]])

In [6]:
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_sequences = pad_sequences(target_sequences, padding='post')

target_sequences

array([[1, 0, 0],
       [2, 3, 4],
       [5, 6, 0]])

In [7]:
# Reshape target sequences for sparse_categorical_crossentropy
target_sequences = np.expand_dims(target_sequences, -1)

target_sequences

array([[[1],
        [0],
        [0]],

       [[2],
        [3],
        [4]],

       [[5],
        [6],
        [0]]])

In [8]:
X= source_sequences
X.reshape(X.shape[0], X.shape[1], 1)
rnn_shape = (X.shape[1],1)

In [9]:
# Model
model = Sequential([
    SimpleRNN(16, return_sequences=True, input_shape=rnn_shape),
    Dense(len(target_tokenizer.word_index) + 1, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

  super().__init__(**kwargs)


In [10]:
# Training
model.fit(X, target_sequences, epochs=100)

Epoch 1/100


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 2.0431
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2.0297
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2.0166
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 2.0036
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 1.9908
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 1.9781
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 1.9656
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 1.9534
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 1.9412
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 1.9293
Epoch 11/100
[1m1/1[0

<keras.src.callbacks.history.History at 0x26982ba0800>

In [11]:
# Inference function
def translate(text):
    sequence = source_tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=source_sequences.shape[1], padding='post')
    prediction = model.predict(padded_sequence)

    translated_sentence = []
    for word_probs in prediction[0]:
        predicted_word_index = np.argmax(word_probs)
        word = target_tokenizer.index_word.get(predicted_word_index, "")
        if word:
            translated_sentence.append(word)
    return ' '.join(translated_sentence)

In [12]:
# Test translation
print(translate("hello"))
print(translate("how are you"))
print(translate("goodbye"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
au ça va
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
au revoir


In [13]:
target_tokenizer.index_word.setdefault


<function dict.setdefault(key, default=None, /)>