In [None]:
# Preparing data for training
import numpy as np
import os

# Reading data into a list where each element is one line (string)
input_texts = []
target_texts = []
data_path = 'sample_en-fr.txt'

with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        input_texts.append(line.rstrip('\n'))
# input_texts now contains each line as a string
print(f'Read {len(input_texts)} lines from {data_path}')

Read 6 lines from sample_en-fr.txt


In [2]:
input_texts

['hello<EOS>bonjour<EOS>',
 'how are you<EOS>comment ça va<EOS>',
 'i am a student<EOS>je suis étudiant<EOS>',
 'thank you<EOS>merci<EOS>',
 'good morning<EOS>bon matin<EOS>',
 'my friend<EOS>mon ami<EOS>']

In [3]:
eng_texts = [] # Encoder input texts
fra_texts = [] # Decoder target texts also, decoder input texts for teacher forcing

for line in input_texts:
    eng, fra, _ = line.split("<EOS>")
    eng_texts.append(eng)
    fra_texts.append(fra)
eng_texts

['hello',
 'how are you',
 'i am a student',
 'thank you',
 'good morning',
 'my friend']

In [4]:
fra_texts

['bonjour',
 'comment ça va',
 'je suis étudiant',
 'merci',
 'bon matin',
 'mon ami']

In [None]:
# Adding start and end tokens to target texts

encoder_texts = eng_texts
decoder_texts = []

for fra in fra_texts:
    decoder_texts.append("\t" + fra + "\n") # \t is the "start sequence" token, \n is "end sequence" token
decoder_texts

['\tbonjour\n',
 '\tcomment ça va\n',
 '\tje suis étudiant\n',
 '\tmerci\n',
 '\tbon matin\n',
 '\tmon ami\n']

In [6]:
# Building character vocabularies for the encoder and decoder

input_chars = set()
target_chars = set()

for text in encoder_texts:
    for char in text:
        input_chars.add(char)

for text in decoder_texts:
    for char in text:
        target_chars.add(char)

input_chars = sorted(list(input_chars))
target_chars = sorted(list(target_chars))

# All these numbers are for the model srchitectire for input shapes
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
max_encoder_seq_length = max([len(txt) for txt in encoder_texts])
max_decoder_seq_length = max([len(txt) for txt in decoder_texts])  

In [9]:
# Character to index mapping

input_token_index = {char: i for i, char in enumerate(input_chars)}
target_token_index = {char: i for i, char in enumerate(target_chars)}


In [10]:
# Initializing the tensors for the inputs

num_samples = len(encoder_texts)

# Encoder input data
encoder_input_data = np.zeros(
    (num_samples, max_encoder_seq_length, num_encoder_tokens),
    dtype="float32"
)

# Decoder input data
decoder_input_data = np.zeros(
    (num_samples, max_decoder_seq_length, num_decoder_tokens),
    dtype="float32"
)

# Decoder target data which is basically one timestep ahead of decoder_input_data for teacher forcing
decoder_target_data = np.zeros(
    (num_samples, max_decoder_seq_length, num_decoder_tokens),
    dtype="float32"
)


In [11]:
# Filling the tensors with one-hot encoded vectors

for i, (enc_text, dec_text) in enumerate(zip(encoder_texts, decoder_texts)):
    # Encoder input
    for t, char in enumerate(enc_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0

    # Decoder input and target
    for t, char in enumerate(dec_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0

        if t > 0:
            # Decoder target is shifted by one timestep
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0


In [None]:
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Implementing a simple sequence-to-sequence model using Keras.
latent_dim = 64  # Latent dimensionality of the encoding space.

# Encoder
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens)) # Initialize encoder input using the number of unique input tokens
encoder = LSTM(latent_dim, return_state=True) # Initialize LSTM layer with latent dimension and return state
# return state parameter, makes sure the layer returns the last hidden state and cell state in addition to the output
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c] # context vector for the decoder

# Decoder
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax') # Dense layer with softmax activation for output
decoder_outputs = decoder_dense(decoder_outputs) 

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  if not hasattr(np, "object"):


In [55]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=3,
          epochs=500)

Epoch 1/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - loss: 0.2203
Epoch 2/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.2172
Epoch 3/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.2194
Epoch 4/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.2168
Epoch 5/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.2180
Epoch 6/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.2170
Epoch 7/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.2178
Epoch 8/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2169
Epoch 9/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.2176
Epoch 10/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 0.2168
Epoch 11/

<keras.src.callbacks.history.History at 0x2a061fa1210>

In [57]:
reverse_target_char_index = {
    i: char for char, i in target_token_index.items()
}

In [58]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [59]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


In [60]:
def encode_input_sentence(sentence):
    # Initialize empty encoder input
    sentence = sentence.lower() # convert to lowercase
    encoder_input = np.zeros(
        (1, max_encoder_seq_length, num_encoder_tokens),
        dtype="float32"
    )

    for t, char in enumerate(sentence):
        if char in input_token_index:
            encoder_input[0, t, input_token_index[char]] = 1.0
        # else: silently ignore unknown characters

    return encoder_input


In [66]:
english_input = "my friend"
input_seq = encode_input_sentence(english_input)

french = decode_sequence(input_seq)
print(french)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
mon ami

