#Use a simple dataset for English-to-French translation. You can either use a small dataset like this or download a more extensive dataset such as the Tab-delimited Bilingual Sentence Pairs dataset from Tatoeba or Parallel Corpus from the European Parliament.
Example data (small English to French pairs)

data = [ ("hello", "bonjour"), ("how are you", "comment ça va"), ("I am fine", "je vais bien"), ("what is your name", "comment tu t'appelles"), ("my name is", "je m'appelle"), ("thank you", "merci"), ("goodbye", "au revoir") ]

(a) Data Preprocessing

In [12]:

!pip install tensorflow numpy

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]

english_sentences, french_sentences = zip(*data)

eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
eng_word_index = eng_tokenizer.word_index
max_eng_len = max(len(seq) for seq in eng_sequences)

fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences)
fr_word_index = fr_tokenizer.word_index
max_fr_len = max(len(seq) for seq in fr_sequences)

eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')

eng_vocab_size = len(eng_word_index) + 1
fr_vocab_size = len(fr_word_index) + 1




(b) Build Seq2Seq Model

In [14]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

latent_dim = 256

encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_fr_len,))
dec_emb_layer = Embedding(fr_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


(c) Preparing the Data for Training

In [15]:

fr_padded_target = np.zeros_like(fr_padded)
fr_padded_target[:, :-1] = fr_padded[:, 1:]

train_size = int(0.8 * len(eng_padded))
eng_train, eng_val = eng_padded[:train_size], eng_padded[train_size:]
fr_train, fr_val = fr_padded[:train_size], fr_padded[train_size:]
fr_target_train, fr_target_val = fr_padded_target[:train_size], fr_padded_target[train_size:]

(d) Train the Model on the Dataset

In [16]:
batch_size = 64
epochs = 100

history = model.fit([eng_train, fr_train], fr_target_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=([eng_val, fr_val], fr_target_val))


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.2000 - loss: 2.6313 - val_accuracy: 0.8333 - val_loss: 2.5665
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5333 - loss: 2.5810 - val_accuracy: 0.8333 - val_loss: 2.4894
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.5333 - loss: 2.5268 - val_accuracy: 0.8333 - val_loss: 2.3981
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.5333 - loss: 2.4634 - val_accuracy: 0.8333 - val_loss: 2.2834
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.5333 - loss: 2.3850 - val_accuracy: 0.8333 - val_loss: 2.1351
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.5333 - loss: 2.2853 - val_accuracy: 0.8333 - val_loss: 1.9422
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━

(e) Inference Setup for Translation

In [17]:

encoder_model = Model(encoder_inputs, encoder_states)


decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs2] + decoder_states2)


(f) Translate New Sentences

In [18]:

french_sentences_with_tokens = ['start ' + sent + ' end' for sent in french_sentences]

fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences_with_tokens)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences_with_tokens)
fr_word_index = fr_tokenizer.word_index
max_fr_len = max(len(seq) for seq in fr_sequences)


eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')


In [20]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = None
        for word, index in fr_word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break

        if sampled_word == 'end' or len(decoded_sentence) > max_fr_len:
            stop_condition = True
        else:
            decoded_sentence += sampled_word + ' '

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence


In [24]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)


        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        sampled_word = None
        for word, index in fr_word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break

        if sampled_word is None:
            print(f"Warning: No word found for token index {sampled_token_index}")
            stop_condition = True
        elif sampled_word == 'end' or len(decoded_sentence.split()) > max_fr_len:
            stop_condition = True
        else:
            decoded_sentence += sampled_word + ' '

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence


In [26]:

test_sentence = "hello"
test_sequence = eng_tokenizer.texts_to_sequences([test_sentence])
test_sequence = pad_sequences(test_sequence, maxlen=max_eng_len, padding='post')

translation = decode_sequence(test_sequence)
print(f"Translation: {translation}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Translation: 


(g) Experimenting and Improving the Model with a Larger Dataset


In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam

latent_dim = 256
eng_vocab_size = 15
fr_vocab_size = 15
embedding_dim = 128


encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True)(encoder_embedding)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_lstm)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=fr_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True)(decoder_embedding, initial_state=encoder_states)
decoder_outputs = Dense(fr_vocab_size, activation='softmax')(decoder_lstm)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


opt = Adam(learning_rate=0.0005)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print(f"eng_train shape: {eng_train.shape}, fr_train shape: {fr_train.shape}, fr_target_train shape: {fr_target_train.shape}")

model.fit([eng_train, fr_train], fr_target_train, batch_size=batch_size, epochs=200, validation_data=([eng_val, fr_val], fr_target_val))


eng_train shape: (5, 4), fr_train shape: (5, 3), fr_target_train shape: (5, 3)
Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.0000e+00 - loss: 2.7107 - val_accuracy: 0.6667 - val_loss: 2.6981
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - accuracy: 0.4667 - loss: 2.6978 - val_accuracy: 0.8333 - val_loss: 2.6825
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.5333 - loss: 2.6846 - val_accuracy: 0.8333 - val_loss: 2.6656
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.5333 - loss: 2.6705 - val_accuracy: 0.8333 - val_loss: 2.6463
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.5333 - loss: 2.6550 - val_accuracy: 0.8333 - val_loss: 2.6237
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.5333 - loss: 2.63

<keras.src.callbacks.history.History at 0x7b7a4da316c0>