In [22]:
# Toy English -> French parallel corpus. Each tuple is one aligned sentence
# pair; the two lists derived below keep the same ordering, so index i of
# input_texts is translated by index i of target_texts.
_sentence_pairs = [
    ("hi", "salut"),
    ("how are you", "comment ca va"),
    ("i love machine learning", "j'aime l'apprentissage automatique"),
    ("good morning", "bonjour"),
]

input_texts = [english for english, french in _sentence_pairs]
target_texts = [french for english, french in _sentence_pairs]

In [23]:
# Tokenization (convert words to numbers)

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Wrap every target sentence in <start>/<end> markers so the decoder has an
# explicit first input token and an explicit stop token.
# The startswith guard keeps this cell idempotent: re-running it must not
# stack a second pair of markers onto sentences a previous run already wrapped.
target_texts = [
    txt if txt.startswith("<start> ") else "<start> " + txt + " <end>"
    for txt in target_texts
]

# Tokenizers.
# filters='' turns off the Tokenizer's default punctuation stripping, which
# would otherwise remove the angle brackets from <start>/<end>.
input_tokenizer = Tokenizer(filters='')
target_tokenizer = Tokenizer(filters='')

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

# Convert each sentence into a list of integer word ids
encoder_seq = input_tokenizer.texts_to_sequences(input_texts)
decoder_seq = target_tokenizer.texts_to_sequences(target_texts)

# Pad every sequence (with trailing zeros) to the longest one in its set
max_encoder_len = max(len(seq) for seq in encoder_seq)
max_decoder_len = max(len(seq) for seq in decoder_seq)

encoder_seq = pad_sequences(encoder_seq, maxlen=max_encoder_len, padding='post')
decoder_seq = pad_sequences(decoder_seq, maxlen=max_decoder_len, padding='post')

# +1 because word ids start at 1; id 0 is the value used for padding above
num_encoder_tokens = len(input_tokenizer.word_index) + 1
num_decoder_tokens = len(target_tokenizer.word_index) + 1


In [25]:
# Prepare decoder inputs & targets (teacher forcing)

In [26]:
# Teacher forcing: the decoder is fed each sequence without its last position
# and is trained to emit the same sequence advanced by one timestep.
decoder_target_data = decoder_seq[:, 1:]
decoder_input_data = decoder_seq[:, :-1]

In [27]:
# Encoder

In [28]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

# Width of the embedding vectors and of the LSTM hidden/cell states.
latent_dim = 128

# Variable-length sequence of integer word ids.
encoder_inputs = Input(shape=(None,))

# mask_zero=True marks id 0 (the post-padding value) as "no token". Without
# it the LSTM keeps stepping through the trailing padding, so the final state
# handed to the decoder would depend on how much padding a sentence received.
encoder_embedding = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)

encoder_embedded = encoder_embedding(encoder_inputs)

# return_state=True exposes the final hidden and cell states; the per-step
# output (encoder_outputs) is not used further in this cell.
encoder_lstm = LSTM(latent_dim, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded)

# The sentence summary that is passed to the decoder as its initial state.
encoder_states = [state_h, state_c]


In [29]:
# Decoder

In [30]:
# Variable-length sequence of target word ids (teacher-forced input).
decoder_inputs = Input(shape=(None,))

# mask_zero=True marks id 0 (the post-padding value) as "no token". Keras
# propagates that mask through the LSTM and Dense layers, so padded timesteps
# are skipped instead of being counted in the loss and accuracy.
decoder_embedding = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)

decoder_embedded = decoder_embedding(decoder_inputs)

# return_sequences=True: one output per timestep, needed to predict a word at
# every position. return_state=True is unused during training, but the same
# layer object is reused later by the inference decoder, which needs the
# states.
decoder_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True
)

# Start decoding from the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedded,
    initial_state=encoder_states
)

# Per-timestep probability distribution over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation="softmax")

decoder_outputs = decoder_dense(decoder_outputs)


In [31]:
# Final model

In [32]:
# Training graph: (source ids, shifted target ids) -> per-step word
# probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# The sparse variant of cross-entropy is used because the targets are integer
# word ids rather than one-hot vectors.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.summary()


In [None]:
# Train with teacher forcing: the decoder receives the target sequence shifted
# right and is asked to predict the unshifted one.
model.fit(
    x=[encoder_seq, decoder_input_data],
    y=decoder_target_data,
    epochs=100,
    batch_size=2,
)


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.2083 - loss: 2.3955
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.4167 - loss: 2.3703
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.4167 - loss: 2.3388
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.3750 - loss: 2.3187
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.3750 - loss: 2.2823
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.3750 - loss: 2.2395
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3750 - loss: 2.1801
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.3333 - loss: 2.0928
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [None]:
# Inference

In [None]:
# Stand-alone encoder for inference: source word ids -> [hidden state, cell state].
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

In [None]:
# Decoder inference model

In [None]:
# At inference time the decoder is run step by step, so its LSTM states are
# supplied explicitly as model inputs instead of coming from the encoder graph.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained embedding / LSTM / dense layers so the weights are shared
# with the training model.
decoder_embedded_inf = decoder_embedding(decoder_inputs)

# Inference tensors get their own *_inf names. Rebinding `decoder_outputs`
# here would silently replace the training graph's output tensor, and
# re-running the "final model" cell afterwards would then build from a graph
# that depends on the state inputs above.
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_embedded_inf,
    initial_state=decoder_states_inputs
)

# Updated states, returned so the caller can feed them into the next step.
decoder_states = [state_h_inf, state_c_inf]

decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# (token ids, h, c) -> (word probabilities, new h, new c)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states
)


In [None]:
# Translation function

In [None]:
import numpy as np

# Inverse of the target vocabulary: integer word id -> word string.
reverse_target_index = {
    word_id: word for word, word_id in target_tokenizer.word_index.items()
}

def translate(sentence):
    """Greedily decode a translation of ``sentence``.

    The sentence is tokenized and padded like the training inputs, encoded
    into the LSTM states, and then decoded one word at a time, always picking
    the most probable next word.

    Args:
        sentence: Source-language text. Words the input tokenizer has never
            seen are dropped by ``texts_to_sequences``.

    Returns:
        The decoded words joined by single spaces, without the ``<start>`` /
        ``<end>`` markers. At most ``max_decoder_len`` words are produced.
    """
    seq = input_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_encoder_len, padding="post")

    # verbose=0: predict() otherwise prints a progress bar on every call,
    # which floods the cell output once it runs inside the decoding loop.
    # list(...) so the concatenation below works whether predict() hands the
    # two states back as a list or as a tuple.
    states = list(encoder_model.predict(seq, verbose=0))

    # Decoding starts from a length-1 sequence holding the <start> token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index["<start>"]

    decoded_sentence = []

    # Bounded loop: never run more decoder steps than the longest training
    # target had tokens, even if the model never emits <end>.
    for _ in range(max_decoder_len):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states, verbose=0
        )

        sampled_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_index.get(sampled_index, "")

        # Stop on <end>. Also stop on an id with no word attached (id 0 is the
        # padding value), because appending "" would only add stray spaces to
        # the joined result.
        if sampled_word == "<end>" or not sampled_word:
            break

        decoded_sentence.append(sampled_word)

        # Feed the chosen word and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_index

        states = [h, c]

    return " ".join(decoded_sentence)


In [None]:
# Smoke-test the translator on two sentences from the training corpus.
for sample_sentence in ("hi", "good morning"):
    print(translate(sample_sentence))
