<a href="https://colab.research.google.com/github/Alisha210302/NLP/blob/main/29_Seq2Seq_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

### Sample data - English to French translation

In [None]:
# Parallel toy corpus, written as (English, French) pairs so each translation
# sits next to its source. zip() over the pairs transposes them back into the
# two aligned lists used by the rest of the notebook.
english_sentences, french_sentences = map(list, zip(
    ('hello', 'bonjour'),
    ('how are you', 'comment ça va'),
    ('good morning', 'bonjour'),
    ('good night', 'bonne nuit'),
    ('thankyou', 'merci'),
))

### Add START and END tokens to the French sentences

In [None]:
# Sentence-boundary markers for the decoder: every target sentence is wrapped
# as "starttoken <sentence> endtoken".
START_TOKEN = 'starttoken'
END_TOKEN = 'endtoken'

# This cell rebinds `french_sentences` from itself, so it must be idempotent:
# sentences that already carry both markers are left untouched, otherwise
# re-running the cell would stack the markers a second time.
french_sentences = [
    sentence
    if sentence.startswith(START_TOKEN + ' ') and sentence.endswith(' ' + END_TOKEN)
    else f'{START_TOKEN} {sentence} {END_TOKEN}'
    for sentence in french_sentences
]

# Hyperparameters
batch_size = 2    # samples per gradient update passed to model.fit
epochs = 100      # number of passes over the training data
latent_dim = 256  # Dimensionality of the encoding space (embedding size and LSTM units)

In [None]:
french_sentences

['starttoken bonjour endtoken',
 'starttoken comment ça va endtoken',
 'starttoken bonjour endtoken',
 'starttoken bonne nuit endtoken',
 'starttoken merci endtoken']

### Initialize the Tokenizer for both source and target languages

In [None]:
# Two independent word-level tokenizers (char_level=False): the first for the
# English source sentences, the second for the French target sentences.
eng_tokenizer, fra_tokenizer = (Tokenizer(char_level=False) for _ in range(2))

### Fit the tokenizer on the sentences

In [None]:
# Build each tokenizer's vocabulary (word -> integer id) from its own corpus.
for _tokenizer, _corpus in (
    (eng_tokenizer, english_sentences),
    (fra_tokenizer, french_sentences),
):
    _tokenizer.fit_on_texts(_corpus)

In [None]:
eng_tokenizer.word_index

{'good': 1,
 'hello': 2,
 'how': 3,
 'are': 4,
 'you': 5,
 'morning': 6,
 'night': 7,
 'thankyou': 8}

In [None]:
fra_tokenizer.word_index

{'starttoken': 1,
 'endtoken': 2,
 'bonjour': 3,
 'comment': 4,
 'ça': 5,
 'va': 6,
 'bonne': 7,
 'nuit': 8,
 'merci': 9}

### Convert the sentences into sequences of integers

In [None]:
# Replace every word with its integer id, giving one list of ids per sentence.
# At this point the lists still have different lengths; padding happens later.
encoder_input_data, decoder_input_data = (
    tokenizer.texts_to_sequences(sentences)
    for tokenizer, sentences in (
        (eng_tokenizer, english_sentences),
        (fra_tokenizer, french_sentences),
    )
)

### Pad the sequences to ensure uniform length

In [None]:
# Longest sequence (in tokens) on each side; used as the padding length below.
max_encoder_seq_length = max(map(len, encoder_input_data))
max_decoder_seq_length = max(map(len, decoder_input_data))

In [None]:
max_encoder_seq_length

3

In [None]:
max_decoder_seq_length

5

In [None]:
def _pad_right(sequences, length):
    """Zero-pad `sequences` at the end ('post') so each one has `length` entries."""
    return pad_sequences(sequences, maxlen=length, padding='post')


encoder_input_data = _pad_right(encoder_input_data, max_encoder_seq_length)
decoder_input_data = _pad_right(decoder_input_data, max_decoder_seq_length)

In [None]:
encoder_input_data

array([[2, 0, 0],
       [3, 4, 5],
       [1, 6, 0],
       [1, 7, 0],
       [8, 0, 0]], dtype=int32)

In [None]:
decoder_input_data

array([[1, 3, 2, 0, 0],
       [1, 4, 5, 6, 2],
       [1, 3, 2, 0, 0],
       [1, 7, 8, 2, 0],
       [1, 9, 2, 0, 0]], dtype=int32)

### Prepare the target data (decoder output, shifted by one time step)

In [None]:
# Teacher-forcing targets: the decoder input shifted one step to the left, so
# the target at position t is the input token at position t+1. The freed last
# column is filled with zeros (the padding id).
decoder_output_data = np.concatenate(
    (decoder_input_data[:, 1:], np.zeros_like(decoder_input_data[:, :1])),
    axis=1,
)

In [None]:
decoder_output_data

array([[3, 2, 0, 0, 0],
       [4, 5, 6, 2, 0],
       [3, 2, 0, 0, 0],
       [7, 8, 2, 0, 0],
       [9, 2, 0, 0, 0]], dtype=int32)

In [None]:
decoder_input_data

array([[1, 3, 2, 0, 0],
       [1, 4, 5, 6, 2],
       [1, 3, 2, 0, 0],
       [1, 7, 8, 2, 0],
       [1, 9, 2, 0, 0]], dtype=int32)

### Define the vocabulary size (total unique words)

In [None]:
# Vocabulary sizes. word_index ids start at 1, so one extra slot is added for
# id 0, which pad_sequences uses as the padding value.
num_encoder_tokens, num_decoder_tokens = (
    len(tokenizer.word_index) + 1 for tokenizer in (eng_tokenizer, fra_tokenizer)
)

In [None]:
num_encoder_tokens

9

In [None]:
num_decoder_tokens

10

### Define the encoder and decoder input tensors for the Seq2Seq model

In [None]:
# Symbolic inputs for the source and target token sequences. shape=(None,)
# leaves the time dimension unspecified, so sequences of any length are accepted.
encoder_inputs, decoder_inputs = Input(shape=(None,)), Input(shape=(None,))

In [None]:
encoder_inputs

<KerasTensor shape=(None, None), dtype=float32, sparse=False, name=keras_tensor_6>

### Encoder

In [None]:
# Encoder: embed the source token ids, run them through an LSTM and keep its
# final hidden and cell states (return_state=True), which initialise the decoder.
# `input_length` is not passed to Embedding: the argument is deprecated in
# Keras 3 and the input tensor already declares a variable-length time axis.
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

### Decoder

In [None]:
# Decoder: embed the target token ids and run an LSTM that starts from the
# encoder's final states. return_sequences=True yields one output per time
# step; the decoder's own final states are not needed during training.
# `input_length` is not passed to Embedding: the argument is deprecated in
# Keras 3 and the input tensor already declares a variable-length time axis.
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])



### Dense layer for generating predictions

In [None]:
# Project each decoder time step onto the target vocabulary; softmax turns the
# scores into a probability distribution over the num_decoder_tokens ids.
# NOTE: `decoder_outputs` is rebound here, so re-running only this cell would
# stack a fresh Dense layer on the previous Dense output; re-run the decoder
# cell first.
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

### Define the model

In [None]:
# Training model: (source ids, target-input ids) -> per-step distribution over
# the target vocabulary.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [None]:
model.summary()

### Compile the model

In [None]:
# The sparse loss takes integer class ids as targets, so the target arrays do
# not need to be one-hot encoded.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

### Train the model

In [None]:
# Teacher forcing: the decoder is fed the target-input sequence and trained to
# predict the one-step-shifted target. A trailing axis of size 1 is added to
# the targets, giving them shape (samples, steps, 1).
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data[..., np.newaxis],
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.2850 - loss: 2.2886
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.4775 - loss: 2.1580
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.4525 - loss: 1.9470
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.5150 - loss: 1.4935
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.4525 - loss: 1.3491
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.5275 - loss: 1.1076
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.4725 - loss: 1.1625
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.7150 - loss: 0.9687
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7f9149d42680>

### Create inference models for the encoder and decoder for prediction
### Encoder Model

In [None]:
# Inference encoder: maps a source sequence to [encoder_outputs, state_h, state_c],
# reusing the layers (and therefore the weights) of the training graph.
encoder_model = Model(
    inputs=encoder_inputs,
    outputs=[encoder_outputs, state_h, state_c],
)

### Decoder Model (for inference)

In [None]:
# Inference decoder: runs the decoder from explicitly supplied LSTM states and
# returns the token distribution together with the updated states, so decoding
# can proceed one token at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

# Reuse the decoder LSTM layer from the training graph. Instantiating a new
# LSTM here would give the inference model freshly initialised, untrained
# weights, so its predictions would not reflect what model.fit learned.
# `decoder_lstm_inf` is kept as an alias so the name remains available.
decoder_lstm_inf = decoder_lstm
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_inf(
    decoder_embedding,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)

# The trained Dense projection is shared with the training model as well.
decoder_output_inf = decoder_dense(decoder_outputs_inf)
decoder_model = Model(
    [decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    [decoder_output_inf, state_h_inf, state_c_inf],
)

### Function to decode the sequence

In [None]:
def decode_sequence(input_seq, max_tokens=None):
  """Greedily translate one encoded source sentence into French text.

  Args:
    input_seq: Padded source token ids with a leading batch axis of size 1,
      in the form expected by `encoder_model.predict`.
    max_tokens: Upper bound on the number of generated words. Defaults to
      `max_decoder_seq_length`.

  Returns:
    The decoded words joined by single spaces. The start/end markers are not
    part of the result, and the string is empty when nothing was generated.
  """
  if max_tokens is None:
    max_tokens = max_decoder_seq_length

  # encoder_model returns [encoder_outputs, state_h, state_c]; only the two
  # states are needed to initialise the decoder.
  _, h, c = encoder_model.predict(input_seq, verbose=0)

  # Generate an initial target sequence (the start token)
  target_seq = np.zeros((1,1))
  target_seq[0,0] = fra_tokenizer.word_index['starttoken']  # start token index

  decoded_tokens = []
  # Bound the loop by the number of generated *tokens* (not characters) so that
  # multi-word sentences can be produced and decoding always terminates.
  while len(decoded_tokens) < max_tokens:
    # Feed the previous token together with the current LSTM states; the
    # returned h, c become the states for the next step.
    output_tokens, h, c = decoder_model.predict([target_seq, h, c], verbose=0)

    # Sample the next token (greedy: most probable id at the last time step)
    sampled_token_index = int(np.argmax(output_tokens[0,-1,:]))
    # Id 0 is the padding id and has no entry in index_word, hence .get().
    sampled_token = fra_tokenizer.index_word.get(sampled_token_index)

    # Stop on the end marker or on padding; neither belongs in the output.
    if sampled_token is None or sampled_token == 'endtoken':
      break
    decoded_tokens.append(sampled_token)

    # update the target sequence with the token just produced
    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index

  return ' '.join(decoded_tokens)

### Test the decoder with the sample sentence

In [None]:
# The slice 2:3 (rather than the index 2) keeps the leading batch axis, so the
# result is a single padded sentence of shape (1, 3).
input_seq = encoder_input_data[2:3] # sample input sequence

In [None]:
input_seq

array([[1, 6, 0]], dtype=int32)

In [None]:
encoder_input_data

array([[2, 0, 0],
       [3, 4, 5],
       [1, 6, 0],
       [1, 7, 0],
       [8, 0, 0]], dtype=int32)

In [None]:
# Decode the sample input selected above and print the predicted French text.
decoded_sentence = decode_sequence(input_seq)
print('Decoded Sentence: ',decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Decoded Sentence:   bonjour


In [None]:
# Translate a raw English sentence: tokenise it with the fitted English
# tokenizer, pad it to the encoder length used in training, then decode.
new = ['good night']
new_input_seq = pad_sequences(
    eng_tokenizer.texts_to_sequences(new),
    maxlen=max_encoder_seq_length,
    padding='post',
)
decoded_sentence = decode_sequence(new_input_seq)
print('Decoded Sentence: ', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Decoded Sentence:   bonne
