<a href="https://colab.research.google.com/github/ByriVarshini/NLP_2024/blob/main/NLP_Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use a simple dataset for English-to-French translation. You can either use a small dataset like the one below or download a larger one, such as the tab-delimited bilingual sentence pairs from Tatoeba or the European Parliament parallel corpus (Europarl).

Example data (small English to French pairs)
data = [ ("hello", "bonjour"), ("how are you", "comment ça va"), ("I am fine", "je vais bien"), ("what is your name", "comment tu t'appelles"), ("my name is", "je m'appelle"), ("thank you", "merci"), ("goodbye", "au revoir") ] [CO4]

(a) Data Preprocessing

(b) Build Seq2Seq Model

(c) Preparing the Data for Training

(d) Train the model on the dataset

(e) Inference Setup for Translation

(f) Translate New Sentences

(g) Experiment with and improve the model by training on a larger dataset and tuning hyperparameters.

In [2]:
# Install the notebook's dependencies into the kernel's own environment.
# scikit-learn is listed because the import cell pulls in
# sklearn.model_selection.train_test_split.
%pip install -q tensorflow numpy scikit-learn



In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [3]:
# Toy English -> French parallel corpus: one (english, french) tuple per pair.
data = [
    ("hello", "bonjour"),
    ("how are you", "comment ça va"),
    ("I am fine", "je vais bien"),
    ("what is your name", "comment tu t'appelles"),
    ("my name is", "je m'appelle"),
    ("thank you", "merci"),
    ("goodbye", "au revoir"),
]

# English side of every pair, in corpus order.
english_texts = tuple(pair[0] for pair in data)

# French side of every pair, wrapped in the literal marker words "start" and
# "end" so each target sentence carries explicit begin/stop tokens.
french_texts = [f"start {pair[1]} end" for pair in data]

# Tokenize English and French sentences
def tokenize(sentences):
    """Fit a fresh Keras ``Tokenizer`` on ``sentences`` and encode them.

    Args:
        sentences: Collection of text strings. It is used both to build the
            tokenizer's vocabulary and as the input that gets encoded.

    Returns:
        A ``(tokenizer, sequences)`` pair: the fitted ``Tokenizer`` and the
        value returned by its ``texts_to_sequences`` for the same sentences.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(sentences)
    encoded = fitted.texts_to_sequences(sentences)
    return fitted, encoded

# Build one tokenizer per language and encode every sentence as word indices.
eng_tokenizer, eng_sequences = tokenize(english_texts)
fr_tokenizer, fr_sequences = tokenize(french_texts)

# Longest encoded sentence per language; used as the padded length below.
max_eng_len = max(len(seq) for seq in eng_sequences)
max_fr_len = max(len(seq) for seq in fr_sequences)

# Pad at the end ("post") so every row has the same length.
eng_sequences = pad_sequences(eng_sequences, maxlen=max_eng_len, padding="post")
fr_sequences = pad_sequences(fr_sequences, maxlen=max_fr_len, padding="post")

# Vocabulary sizes: number of known words plus one extra slot, so that the
# padding value 0 is also a valid index for the Embedding/Dense layers.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1

# Split into training and validation sets. The seed is fixed so the same
# sentences land in each partition on every run (Restart & Run All gives
# comparable results).
SPLIT_SEED = 42
eng_train, eng_val, fr_train, fr_val = train_test_split(
    eng_sequences,
    fr_sequences,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [4]:
# Seed Python, NumPy and TensorFlow so weight initialisation (and therefore
# the training curve) is repeatable across runs.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Define model parameters
embedding_dim = 256  # size of each word-embedding vector
latent_dim = 512     # number of LSTM units (size of the h / c state vectors)

# Encoder: padded English token ids -> final LSTM states [h, c].
encoder_inputs = Input(shape=(max_eng_len,))
encoder_embedding = Embedding(eng_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: French token ids (teacher forcing) -> per-step vocabulary
# distribution, starting from the encoder's final states.
# The time dimension is left unspecified (None) because the same Input is
# reused at inference time, where the decoder is fed one token at a time
# rather than a full max_fr_len-long sequence.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(fr_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define Seq2Seq model: [encoder tokens, decoder tokens] -> next-token probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [5]:
# Teacher-forcing targets: each row is the decoder input moved one step to the
# left, with a 0 written into the last column, so position t of the target
# holds the token at position t + 1 of the input.
fr_train_target = fr_train.copy()
fr_train_target[:, :-1] = fr_train[:, 1:]
fr_train_target[:, -1] = 0

fr_val_target = fr_val.copy()
fr_val_target[:, :-1] = fr_val[:, 1:]
fr_val_target[:, -1] = 0


In [6]:
# Sanity-check the teacher-forcing targets built in the previous cell.
# (This cell previously repeated that computation verbatim; it now validates
# the result instead.)
for decoder_in, target in ((fr_train, fr_train_target), (fr_val, fr_val_target)):
    # Targets must line up element-for-element with the decoder inputs.
    assert target.shape == decoder_in.shape, "target/input shape mismatch"
    # Position t of the target is the token at position t + 1 of the input.
    assert np.array_equal(target[:, :-1], decoder_in[:, 1:]), "targets are not shifted by one"
    # The slot vacated by the shift is filled with 0.
    assert not target[:, -1].any(), "last target column should be all zeros"


In [7]:
# Training hyperparameters
batch_size = 64
epochs = 100

# Fit on the training pairs and evaluate on the validation pairs after every
# epoch. The integer targets are given a trailing length-1 axis before being
# handed to Keras.
history = model.fit(
    x=[eng_train, fr_train],
    y=fr_train_target[:, :, np.newaxis],
    validation_data=(
        [eng_val, fr_val],
        fr_val_target[:, :, np.newaxis],
    ),
    epochs=epochs,
    batch_size=batch_size,
)



Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0400 - loss: 2.7702 - val_accuracy: 0.6000 - val_loss: 2.7030
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 735ms/step - accuracy: 0.2800 - loss: 2.7236 - val_accuracy: 0.6000 - val_loss: 2.6186
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.2800 - loss: 2.6680 - val_accuracy: 0.6000 - val_loss: 2.4898
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.2800 - loss: 2.5897 - val_accuracy: 0.6000 - val_loss: 2.2814
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step - accuracy: 0.2800 - loss: 2.4701 - val_accuracy: 0.6000 - val_loss: 1.9511
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - accuracy: 0.2800 - loss: 2.2870 - val_accuracy: 0.6000 - val_loss: 1.5095
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━

In [8]:
# Inference encoder: English token ids -> the encoder LSTM's final [h, c] states.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference decoder: reuses the already-built decoder embedding, LSTM and Dense
# layers, but receives its initial LSTM states as explicit inputs and returns
# the updated states alongside the token probabilities.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    inputs=[decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_outputs, state_h, state_c],
)


In [10]:
# Define translation function
def translate_sentence(sentence, max_len=None):
    """Greedily decode a French translation for an English ``sentence``.

    The sentence is encoded with ``encoder_model``; ``decoder_model`` is then
    run one token at a time, always feeding back the most probable token,
    starting from the "start" marker.

    Decoding stops when any of the following happens:
      * the "end" marker is predicted,
      * the padding index 0 is predicted (it maps to no word),
      * ``max_len`` tokens have been generated.

    The length cap guarantees termination even if the model never emits
    "end"; without it the loop could run forever.

    Args:
        sentence: English text to translate.
        max_len: Maximum number of decoding steps. Defaults to ``max_fr_len``,
            the padded length of the French training sequences.

    Returns:
        The decoded French words joined by single spaces (without the
        start/end markers); an empty string if nothing was decoded.
    """
    step_limit = max_fr_len if max_len is None else max_len

    # Tokenize and pad sentence
    sequence = eng_tokenizer.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_eng_len, padding="post")

    # Get encoder states; verbose=0 keeps Keras from printing a progress bar
    # for every single predict call.
    states = encoder_model.predict(sequence, verbose=0)

    end_index = fr_tokenizer.word_index["end"]
    next_index = fr_tokenizer.word_index["start"]

    translated_words = []
    for _ in range(step_limit):
        # Feed exactly one token (the previous prediction) to the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = next_index

        output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)

        # Greedy choice: most probable token at the last time step.
        next_index = int(np.argmax(output_tokens[0, -1, :]))

        # Stop on the end marker or on padding (index 0 has no word).
        if next_index == end_index or next_index == 0:
            break

        sampled_word = fr_tokenizer.index_word.get(next_index, "")
        if sampled_word:
            translated_words.append(sampled_word)

        # Carry the LSTM states forward to the next step.
        states = [h, c]

    return " ".join(translated_words)
print(translate_sentence("how are you"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
comment ça va
