In [1]:
######Most of the code is copied from eng_fra_seq2seq_basic_v1 just modified for attention


In [3]:
#import all required libraries
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import io
import os

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense

class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)
    
    def call(self, query, values):
        # query shape: (batch_size, hidden_size)
        # values shape: (batch_size, seq_len, hidden_size)
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [7]:
class AttentionConcatLayer(Layer):
    def __init__(self, lstm_units, **kwargs):
        super().__init__(**kwargs)
        self.attention = BahdanauAttention(lstm_units) 
    def call(self, decoder_outputs, encoder_outputs):
        #decoder_outputs(batch, tgt_seq_len, lstm_units)
        def apply_attention(dec_t):
            # dec_t-shape (batch, lstm_units) (for single time step)
            context_vector, _ = self.attention(dec_t, encoder_outputs)
            return context_vector
        #swap to time-major for map_fn(tgt_seq_len, batch, lstm_units)
        decoder_outputs_time_major = tf.transpose(decoder_outputs, [1, 0, 2])
        #compute context vector for each decoder timestep (returns: (tgt_seq_len, batch, lstm_units))
        context_seq_time_major = tf.map_fn(
            lambda dec_t: apply_attention(dec_t), 
            decoder_outputs_time_major, 
            fn_output_signature=tf.float32
        )
        #return to batch-major(batch, tgt_seq_len, lstm_units)
        context_sequence = tf.transpose(context_seq_time_major, [1, 0, 2])
        combined = Concatenate(axis=-1)([decoder_outputs, context_sequence])
        return combined

In [9]:
#we load our data by giving path as well as max. no of samples
def load_data(path , num_samples=10000):
    #open the txt file split by space
    with open(path , 'r' , encoding ='utf-8') as f:
        lines = f.read().split('\n')
    input_texts = []
    target_texts = []
    #separately store input and target text
    for line in lines[:num_samples]:
        parts = line.strip().split('\t')
        if len(parts) < 2:
            continue
        #now here the given dataset is french to english but i want eng to french trans so careful with the order
        eng , fra = parts[0] , parts[1]
        #add start and end for decoder
        target_text = "<start> " + fra + " <end>"
        input_texts.append(eng)
        target_texts.append(target_text)
        
    return input_texts, target_texts

input_texts, target_texts = load_data('fra.txt')

In [11]:
def preprocess_text(sentences):
    #convert in lowercase
    sentences =  [s.lower() for s in sentences]
    #remove everything except letters , digits , white spaces , and <>(remember the <start>)
    #dont forget ^ this its the negation symbol i forgot and spend 30 minutes looking for bug
    sentences = [re.sub(r"[^a-zA-Z0-9<>\s]", "", s) for s in sentences]
    return sentences
input_texts = preprocess_text(input_texts)
target_texts = preprocess_text(target_texts)

In [13]:
# Tokenize output
#as we have already cleaned the text filter is off('' - do not remove anything)
input_tokenizer = Tokenizer(filters = '')
#go through all sentences and build a word-index vocab
input_tokenizer.fit_on_texts(input_texts)
#use the vocab and replce each word with its integer ID
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
#extract the word to index vocab or dict. for future uses
input_word_index = input_tokenizer.word_index
#find the max.sequence length
max_input_len = max(len(seq) for seq in input_sequences)
#pad with 0 if length is lower than max length
encoder_input_data = pad_sequences(input_sequences , maxlen = max_input_len ,padding='post')



In [15]:
# Tokenize output
#No comments as everything is same as above
target_tokenizer = Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_word_index = target_tokenizer.word_index
max_target_len = max(len(seq) for seq in target_sequences)
decoder_input_data = pad_sequences(target_sequences, maxlen=max_target_len,padding='post')

In [17]:
## Creating decoder targets

#create a numpy array of same shape as decoder_input_data
decoder_target_data = np.zeros_like(decoder_input_data)
#now we take all the columns from 1 to end from decoder_input_data and fill it in decoder_target_data from 0 to end - 1 
#so basically we shifted left
decoder_target_data[:,:-1] = decoder_input_data[:, 1:]
#well this is actually not necessary but i still did that just to be safe , it does nothing but make sure last value is zero
decoder_target_data[:, -1] = 0

In [31]:
#embedding dimensions
embedding_dim = 256 
#LSTM hidden units
lstm_units = 512
#vocab size , +1 for padding token
input_vocab_size = len(input_word_index)+1
target_vocab_size = len(target_word_index)+1
print(input_vocab_size)
print(target_vocab_size)

2022
4566


In [33]:
#Encoder Model
#encoder inputs
encoder_inputs = Input(shape = (None , ))
#embedding layer take encoder inputs of size vocab size as defined above and embed them and convert into dimensions of embedding_dim
enc_emb = Embedding(input_vocab_size ,embedding_dim , mask_zero = True)(encoder_inputs)
#a LSTM layer with hidden units = lstm_units and we want each hidden state as well as cell state so return_state is true and return_sequence is true
encoder_outputs , state_h , state_c = LSTM(lstm_units, return_sequences = True, return_state = True)(enc_emb)
#store the stats
encoder_states = [state_h , state_c]

In [34]:
#Decoder Model
decoder_inputs = Input(shape = (None , ))
decoder_emb_layer = Embedding(target_vocab_size, embedding_dim, mask_zero=True, name='dec_emb')
dec_emb = decoder_emb_layer(decoder_inputs)
#just defining the LSTM layer and in addition to states we want output at each state/time step so return_sequence = True
decoder_lstm = LSTM(lstm_units,return_sequences = True, return_state = True)
#get the outputs by giving dec_emb as input and passing encoder's states as initial states
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = encoder_states)
# Attention application (replace tf.map_fn/transpose with this)
attention_concat = AttentionConcatLayer(lstm_units)
decoder_combined_context = attention_concat(decoder_outputs, encoder_outputs)
#final output dense layer applied to each time step (TimeDistributed)
decoder_dense = TimeDistributed(Dense(target_vocab_size, activation='softmax'))
decoder_outputs_final = decoder_dense(decoder_combined_context)




In [36]:
#Training Model
#combined model with teacher forcing
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)
#we are using 'sparse_categorical_crossentropy' cause we didnt one-hot encode targets
model.compile(optimizer = 'adam' , loss ='sparse_categorical_crossentropy' , metrics=['accuracy'])

In [37]:
#lets check our model 
model.summary()

In [41]:
#Train the Model
#... maens all the dimensions
history = model.fit([encoder_input_data,decoder_input_data], decoder_target_data[...,np.newaxis], 
                    batch_size = 64, epochs = 30,
                   validation_split = 0.2)

Epoch 1/30


InvalidArgumentError: Graph execution error:

Detected at node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Users\LENOVO\anaconda3\Lib\asyncio\windows_events.py", line 322, in run_forever

  File "C:\Users\LENOVO\anaconda3\Lib\asyncio\base_events.py", line 641, in run_forever

  File "C:\Users\LENOVO\anaconda3\Lib\asyncio\base_events.py", line 1987, in _run_once

  File "C:\Users\LENOVO\anaconda3\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\LENOVO\AppData\Local\Temp\ipykernel_44484\2680249506.py", line 3, in <module>

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 377, in fit

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 220, in function

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 133, in multi_step_on_iterator

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 114, in one_step_on_data

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 61, in train_step

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\trainers\trainer.py", line 383, in _compute_loss

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\trainers\trainer.py", line 351, in compute_loss

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\trainers\compile_utils.py", line 690, in __call__

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\trainers\compile_utils.py", line 699, in call

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\losses\loss.py", line 67, in __call__

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\losses\losses.py", line 33, in call

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\losses\losses.py", line 2330, in sparse_categorical_crossentropy

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\ops\nn.py", line 2000, in sparse_categorical_crossentropy

  File "C:\Users\LENOVO\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\nn.py", line 753, in sparse_categorical_crossentropy

Received a label value of 3716 which is outside the valid range of [0, 512).  Label values: 8 2770 2 0 0 0 0 0 0 0 0 0 11 1080 12 17 2623 2 0 0 0 0 0 0 8 64 3716 2 0 0 0 0 0 0 0 0 32 3247 2 0 0 0 0 0 0 0 0 0 570 2 0 0 0 0 0 0 0 0 0 0 13 206 2 0 0 0 0 0 0 0 0 0 3 18 72 303 2 0 0 0 0 0 0 0 4 1615 2 0 0 0 0 0 0 0 0 0 21 131 120 2 0 0 0 0 0 0 0 0 41 762 2 0 0 0 0 0 0 0 0 0 396 78 2 0 0 0 0 0 0 0 0 0 3 14 5 10 199 2 0 0 0 0 0 0 6 53 2 0 0 0 0 0 0 0 0 0 2350 2 0 0 0 0 0 0 0 0 0 0 181 604 379 2 0 0 0 0 0 0 0 0 1526 2 0 0 0 0 0 0 0 0 0 0 541 2 0 0 0 0 0 0 0 0 0 0 11 788 2 0 0 0 0 0 0 0 0 0 55 257 2 0 0 0 0 0 0 0 0 0 604 719 2 0 0 0 0 0 0 0 0 0 1779 2 0 0 0 0 0 0 0 0 0 0 6 19 136 3008 2 0 0 0 0 0 0 0 4 9 12 587 2 0 0 0 0 0 0 0 14 1071 10 2 0 0 0 0 0 0 0 0 4 9 300 2 0 0 0 0 0 0 0 0 3573 2 0 0 0 0 0 0 0 0 0 0 923 12 922 2 0 0 0 0 0 0 0 0 15 27 324 2 0 0 0 0 0 0 0 0 95 431 12 15 2 0 0 0 0 0 0 0 1500 140 2 0 0 0 0 0 0 0 0 0 3 269 110 2 0 0 0 0 0 0 0 0 32 3249 2 0 0 0 0 0 0 0 0 0 2776 91 2 0 0 0 0 0 0 0 0 0 7 616 4 2 0 0 0 0 0 0 0 0 1921 2 0 0 0 0 0 0 0 0 0 0 13 17 826 2 0 0 0 0 0 0 0 0 70 1191 2 0 0 0 0 0 0 0 0 0 32 2793 2 0 0 0 0 0 0 0 0 0 3 18 5 307 744 2 0 0 0 0 0 0 31 209 2 0 0 0 0 0 0 0 0 0 11 97 2 0 0 0 0 0 0 0 0 0 128 109 7 2 0 0 0 0 0 0 0 0 6 717 2 0 0 0 0 0 0 0 0 0 113 3176 2 0 0 0 0 0 0 0 0 0 4 15 800 16 317 2 0 0 0 0 0 0 2417 2 0 0 0 0 0 0 0 0 0 0 79 12 279 2 0 0 0 0 0 0 0 0 276 384 2 0 0 0 0 0 0 0 0 0 3 5 3422 2 0 0 0 0 0 0 0 0 6 3012 2 0 0 0 0 0 0 0 0 0 4 69 31 3667 2 0 0 0 0 0 0 0 3 269 297 2 0 0 0 0 0 0 0 0 11 787 2 0 0 0 0 0 0 0 0 0 3 5 421 2 0 0 0 0 0 0 0 0 11 51 2639 2 0 0 0 0 0 0 0 0 4 394 2 0 0 0 0 0 0 0 0 0 2297 2 0 0 0 0 0 0 0 0 0 0 32 763 2 0 0 0 0 0 0 0 0 0 13 17 339 2 0 0 0 0 0 0 0 0 3 5 3482 2 0 0 0 0 0 0 0 0 3558 2 0 0 0 0 0 0 0 0 0 0 35 18 1946 2 0 0 0 0 0 0 0 0 13 1830 2 0 0 0 0 0 0 0 0 0 15 27 23 2789 2 0 0 0 0 0 0 0
	 [[{{node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_multi_step_on_iterator_8502]

In [None]:
#inference encoder model- just takes input and gives out hidden states
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])
#Decoder setup
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_hidden_state_input = Input(shape=(max_input_len, lstm_units))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
#single step decoder input token (1 timestep)
decoder_inputs_single = Input(shape=(1,))
dec_emb2 = decoder_emb_layer(decoder_inputs_single)
#run the decoder LSTM for a single timestep with passed states
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)

#compute attention context vector based on decoder output and encoder hidden states
attention_layer = BahdanauAttention(lstm_units)
context_vector2, attention_weights2 = attention_layer(decoder_outputs2[:, 0, :], decoder_hidden_state_input)
#expand dims for concatenation to match decoder_outputs2 shape
context_vector2 = tf.expand_dims(context_vector2, 1)
decoder_combined_context2 = Concatenate(axis=-1)([decoder_outputs2, context_vector2])

#final output dense layer for next word prediction
decoder_outputs2 = decoder_dense(decoder_combined_context2)

#final inference decoder model
decoder_model = Model(
    [decoder_inputs_single, decoder_hidden_state_input] + decoder_states_inputs,
    [decoder_outputs2, state_h2, state_c2, attention_weights2]
)

In [None]:
reverse_target_index = {i: word for word, i in target_tokenizer.word_index.items()}
reverse_target_index[0] = ''
target_index_word = target_tokenizer.word_index
start_token = target_index_word['<start>']
end_token = target_index_word['<end>']

def decode_sequence(input_seq):
    #encode the input and get encoder outputs and states
    enc_outs, state_h, state_c = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token
    stop_condition = False
    decoded_sentence = []
    
    while not stop_condition:
        output_tokens, h, c, _ = decoder_model.predict([target_seq, enc_outs, state_h, state_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_index.get(sampled_token_index, '')
        if sampled_word == '<end>' or len(decoded_sentence) > max_target_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        state_h, state_c = h, c
    
    return ' '.join(decoded_sentence)

In [None]:
#lets check
def predict_sample(index):
    input_seq = encoder_input_data[index:index+1]
    decoded = decode_sequence(input_seq)
    print("Input:", input_texts[index])
    print("Target:", target_texts[index])
    print("Predicted:", decoded)


predict_sample(0)

In [None]:
# lets translate and check 5 sentences
def predict_samples(start_index, num_samples=5):
    for i in range(start_index, start_index + num_samples):
        input_seq = encoder_input_data[i:i+1]
        decoded = decode_sequence(input_seq)
        print(f"Sample {i}:")
        print("Input: ", input_texts[i])
        print("Target:", target_texts[i])
        print("Predicted:", decoded)
        print("="*50)

# predict 5 samples from index 0
predict_samples(0, 5)

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothie = SmoothingFunction().method4  

In [None]:
def evaluate_bleu_score(num_samples=100):
    total_score = 0.0
    individual_scores = []

    for i in range(num_samples):
        input_seq = encoder_input_data[i:i+1]
        decoded_sentence = decode_sequence(input_seq).strip().split()
        reference_sentence = target_texts[i].replace('<start>', '').replace('<end>', '').strip().split()

        score = sentence_bleu([reference_sentence], decoded_sentence, smoothing_function=smoothie)
        individual_scores.append(score)
        total_score += score

    avg_bleu = total_score / num_samples
    print(f"\nAverage BLEU score on {num_samples} samples: {avg_bleu:.4f}")
    return avg_bleu, individual_scores

In [None]:
avg_bleu, scores = evaluate_bleu_score(100)

In [None]:
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title("Accuracy and Loss over Epochs")
plt.xlabel("Epochs")
plt.ylabel("Value")
plt.legend()
plt.grid(True)
plt.show()