In [1]:
import os, re, numpy as np, pandas as pd, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Concatenate, TimeDistributed, Lambda
from tensorflow.keras.callbacks import EarlyStopping

keras.utils.set_random_seed(42)
tf.config.experimental.enable_op_determinism()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = '/content/drive/MyDrive/Dataset.txt'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError("Dataset.txt not found in /content/drive/MyDrive/")

lines = []
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    for ln in f:
        ln = ln.strip()
        if '\t' in ln:
            lines.append(ln)
lines = lines[:10000]  # Use first 10,000 samples

eng_texts, ar_texts = [], []
for ln in lines:
    en, ar = ln.split('\t')
    eng_texts.append("< " + en + " >")
    ar_texts.append("< " + ar + " >")

num_samples = len(eng_texts)
print(f"Loaded {num_samples} sentence pairs.")


Mounted at /content/drive
Loaded 10000 sentence pairs.


In [3]:
input_chars = sorted(list(set(''.join(eng_texts))))
output_chars = sorted(list(set(''.join(ar_texts))))

num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(output_chars)

max_encoder_seq_length = max(len(s) for s in eng_texts)
max_decoder_seq_length = max(len(s) for s in ar_texts)

input_token_index  = {ch:i for i,ch in enumerate(input_chars)}
output_token_index = {ch:i for i,ch in enumerate(output_chars)}
rev_output_index   = {i:ch for ch,i in output_token_index.items()}

print("Encoder tokens:", num_encoder_tokens)
print("Decoder tokens:", num_decoder_tokens)
print("Max encoder len:", max_encoder_seq_length)
print("Max decoder len:", max_decoder_seq_length)

Encoder tokens: 75
Decoder tokens: 107
Max encoder len: 48
Max decoder len: 69


In [4]:
encoder_input_data  = np.zeros((num_samples, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data  = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype='float32')
decoder_output_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype='float32')

for i, (src, tgt) in enumerate(zip(eng_texts, ar_texts)):
    for t, ch in enumerate(src):
        encoder_input_data[i, t, input_token_index[ch]] = 1.0
    encoder_input_data[i, t+1:, input_token_index[' ']] = 1.0

    for t, ch in enumerate(tgt):
        decoder_input_data[i, t, output_token_index[ch]] = 1.0
        if t > 0:
            decoder_output_data[i, t-1, output_token_index[ch]] = 1.0
    decoder_input_data[i, t+1:, output_token_index[' ']] = 1.0
    decoder_output_data[i, t:,   output_token_index[' ']] = 1.0

print("Data shapes → Encoder:", encoder_input_data.shape,
      "| Decoder In:", decoder_input_data.shape,
      "| Decoder Out:", decoder_output_data.shape)

Data shapes → Encoder: (10000, 48, 75) | Decoder In: (10000, 69, 107) | Decoder Out: (10000, 69, 107)


In [5]:
lstm_dim = 128

enc_inputs = Input(shape=(None, num_encoder_tokens))
enc_lstm   = LSTM(lstm_dim, return_sequences=True, return_state=True)
enc_outputs, enc_h, enc_c = enc_lstm(enc_inputs)
enc_states = [enc_h, enc_c]

dec_inputs = Input(shape=(None, num_decoder_tokens))
dec_lstm   = LSTM(lstm_dim, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=enc_states)

attn_ctx   = Attention(use_scale=True)([dec_outputs, enc_outputs])
dec_concat = Concatenate(axis=-1)([dec_outputs, attn_ctx])

proj_att   = TimeDistributed(Dense(lstm_dim, activation='tanh'))(dec_concat)
dec_dense  = TimeDistributed(Dense(num_decoder_tokens, activation='softmax'))(proj_att)

model_att = Model([enc_inputs, dec_inputs], dec_dense)
model_att.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_att.summary()


In [6]:
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True, verbose=1)

history = model_att.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[es],
    verbose=1
)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1s/step - accuracy: 0.7005 - loss: 2.3065 - val_accuracy: 0.6292 - val_loss: 1.6230
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 990ms/step - accuracy: 0.7579 - loss: 0.9865 - val_accuracy: 0.6264 - val_loss: 1.4466
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 970ms/step - accuracy: 0.7596 - loss: 0.9042 - val_accuracy: 0.6314 - val_loss: 1.3825
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 996ms/step - accuracy: 0.7742 - loss: 0.8524 - val_accuracy: 0.6669 - val_loss: 1.2927
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 960ms/step - accuracy: 0.7898 - loss: 0.8085 - val_accuracy: 0.6748 - val_loss: 1.2387
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 998ms/step - accuracy: 0.7944 - loss: 0.7774 - val_accuracy: 0.6766 - val_loss: 1.2180
Epoch 7/10
[1m63/63[0m 