<a href="https://colab.research.google.com/github/2203A51251/NLP_Lab/blob/main/Assignment7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1.Use a simple dataset for English-to-French translation. You can either use a small dataset like this or download a more extensive dataset such as the Tab-delimited Bilingual Sentence Pairs dataset from Tatoeba or Parallel Corpus from the European Parliament.

Example data (small English to French pairs)

data = [ ("hello", "bonjour"), ("how are you", "comment ça va"), ("I am fine", "je vais bien"), ("what is your name", "comment tu t'appelles"), ("my name is", "je m'appelle"), ("thank you", "merci"), ("goodbye", "au revoir") ]  [CO4]



(a) Data Preprocessing

(b) Build Seq2Seq Model

(c) Preparing the Data for Training

(d) Train the model on the dataset

(e) Inference Setup for Translation

(f) Translate New Sentences

(g) Experimenting and Improving the Model by large dataset and hyper tune parameter.

In [2]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

(a) Data Preprocessing

In [22]:
# Example data (small English to French pairs)
data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]

# Splitting the data into input (English) and output (French) pairs
english_sentences, french_sentences = zip(*data)

# Tokenize English sentences
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
eng_word_index = eng_tokenizer.word_index
max_eng_len = max(len(seq) for seq in eng_sequences)

# Tokenize French sentences
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences)
fr_word_index = fr_tokenizer.word_index
max_fr_len = max(len(seq) for seq in fr_sequences)

# Padding sequences
eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')

# Vocabulary sizes
eng_vocab_size = len(eng_word_index) + 1
fr_vocab_size = len(fr_word_index) + 1

(b) Build Seq2Seq Model

In [24]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder
latent_dim = 256
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_fr_len,))
dec_emb_layer = Embedding(fr_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


(c) Preparing the Data for Training

In [25]:
# Preparing decoder target data (shifted French sentences)
fr_padded_target = np.zeros_like(fr_padded)
fr_padded_target[:, :-1] = fr_padded[:, 1:]

# Train/Val Split
train_size = int(0.8 * len(eng_padded))
eng_train, eng_val = eng_padded[:train_size], eng_padded[train_size:]
fr_train, fr_val = fr_padded[:train_size], fr_padded[train_size:]
fr_target_train, fr_target_val = fr_padded_target[:train_size], fr_padded_target[train_size:]


 (d) Train the model on the dataset

In [26]:
batch_size = 64
epochs = 100
history = model.fit([eng_train, fr_train], fr_target_train,batch_size=batch_size,epochs=epochs,
                    validation_data=([eng_val, fr_val], fr_target_val))


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.5333 - loss: 2.6264 - val_accuracy: 0.8333 - val_loss: 2.5462
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 0.5333 - loss: 2.5759 - val_accuracy: 0.8333 - val_loss: 2.4680
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5333 - loss: 2.5199 - val_accuracy: 0.8333 - val_loss: 2.3719
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.5333 - loss: 2.4531 - val_accuracy: 0.8333 - val_loss: 2.2488
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - accuracy: 0.5333 - loss: 2.3694 - val_accuracy: 0.8333 - val_loss: 2.0882
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.5333 - loss: 2.2621 - val_accuracy: 0.8333 - val_loss: 1.8794
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━

(e) Inference Setup for Translation

In [27]:
# Inference Encoder
encoder_model = Model(encoder_inputs, encoder_states)

# Inference Decoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs2] + decoder_states2)



(f) Translate New Sentences

In [28]:
# Add 'start' and 'end' tokens to French sentences
french_sentences_with_tokens = ['start ' + sent + ' end' for sent in french_sentences]

# Tokenize French sentences with start and end tokens
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences_with_tokens)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences_with_tokens)
fr_word_index = fr_tokenizer.word_index
max_fr_len = max(len(seq) for seq in fr_sequences)

# Padding sequences
eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')


In [29]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    # Start with the 'start' token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = None
        for word, index in fr_word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break

        if sampled_word == 'end' or len(decoded_sentence) > max_fr_len:
            stop_condition = True
        else:
            decoded_sentence += sampled_word + ' '

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [33]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    # Start with the 'start' token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the index of the most likely next word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Retrieve the corresponding word for the token index
        sampled_word = None
        for word, index in fr_word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break

        # Handle the case where the word is not found
        if sampled_word is None:
            print(f"Warning: No word found for token index {sampled_token_index}")
            stop_condition = True
        elif sampled_word == 'end' or len(decoded_sentence.split()) > max_fr_len:
            stop_condition = True
        else:
            decoded_sentence += sampled_word + ' '

        # Update the target sequence to the predicted word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states for the next iteration
        states_value = [h, c]

    return decoded_sentence


In [34]:
# Testing with a new sentence
test_sentence = "hello"
test_sequence = eng_tokenizer.texts_to_sequences([test_sentence])
test_sequence = pad_sequences(test_sequence, maxlen=max_eng_len, padding='post')

translation = decode_sequence(test_sequence)
print(f"Translation: {translation}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Translation: 


(g) Experimenting and Improving the Model with a Larger Dataset

In [35]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam

# Define the latent_dim and other hyperparameters
latent_dim = 256
eng_vocab_size = 15  # Example value; replace with actual vocab size
fr_vocab_size = 15   # Example value; replace with actual vocab size
embedding_dim = 128  # Size of the embedding vectors

# Define encoder
encoder_inputs = Input(shape=(None,))  # Only the sequence length is needed here
encoder_embedding = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True)(encoder_embedding)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_lstm)
encoder_states = [state_h, state_c]

# Define decoder
decoder_inputs = Input(shape=(None,))  # Only the sequence length is needed here
decoder_embedding = Embedding(input_dim=fr_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True)(decoder_embedding, initial_state=encoder_states)
decoder_outputs = Dense(fr_vocab_size, activation='softmax')(decoder_lstm)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
opt = Adam(learning_rate=0.0005)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Check the shapes of the training data
# Ensure eng_train, fr_train, and fr_target_train have the appropriate shapes
print(f"eng_train shape: {eng_train.shape}, fr_train shape: {fr_train.shape}, fr_target_train shape: {fr_target_train.shape}")

# Train the model
model.fit([eng_train, fr_train], fr_target_train, batch_size=batch_size, epochs=200, validation_data=([eng_val, fr_val], fr_target_val))



eng_train shape: (5, 4), fr_train shape: (5, 3), fr_target_train shape: (5, 3)
Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - accuracy: 0.0000e+00 - loss: 2.7091 - val_accuracy: 0.3333 - val_loss: 2.7010
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 0.5333 - loss: 2.6969 - val_accuracy: 0.8333 - val_loss: 2.6875
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.5333 - loss: 2.6843 - val_accuracy: 0.8333 - val_loss: 2.6729
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.5333 - loss: 2.6707 - val_accuracy: 0.8333 - val_loss: 2.6564
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.5333 - loss: 2.6557 - val_accuracy: 0.8333 - val_loss: 2.6374
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - accuracy: 0.5333 - loss: 2.6

<keras.src.callbacks.history.History at 0x7d4b75fe1810>