### ENGLISH TO SPANISH TRANSLATOR


#### Loading and Pre-Processing Data

In [1]:
import pathlib
import numpy as np
import re
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Loading dataset 
zip_path = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
data_dir = pathlib.Path(zip_path).parent / "spa-eng"
text_file = data_dir / "spa.txt"
with open(text_file, "r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")[:10000]
sentence_pairs = [line.split("\t") for line in lines]


In [2]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    return sentence.strip()

cleaned_pairs = []
for eng, spa in sentence_pairs:
    eng = preprocess_sentence(eng)
    spa = preprocess_sentence(spa)
    spa = "sos " + spa + " eos"
    cleaned_pairs.append((eng, spa))


#### Tokenisation

In [3]:
eng_texts, spa_texts = zip(*cleaned_pairs)

eng_tokenizer = Tokenizer(filters='', lower=True)
spa_tokenizer = Tokenizer(filters='', lower=True)
eng_tokenizer.fit_on_texts(eng_texts)
spa_tokenizer.fit_on_texts(spa_texts)
reverse_spa_index = {v: k for k, v in spa_tokenizer.word_index.items()}


eng_seq = eng_tokenizer.texts_to_sequences(eng_texts)
spa_seq = spa_tokenizer.texts_to_sequences(spa_texts)

max_eng_len = max(len(seq) for seq in eng_seq)
max_spa_len = max(len(seq) for seq in spa_seq)

encoder_input = pad_sequences(eng_seq, maxlen=max_eng_len, padding='post')
decoder_input = pad_sequences([seq[:-1] for seq in spa_seq], maxlen=max_spa_len-1, padding='post')
decoder_target = pad_sequences([seq[1:] for seq in spa_seq], maxlen=max_spa_len-1, padding='post')

eng_vocab_size = len(eng_tokenizer.word_index) + 1
spa_vocab_size = len(spa_tokenizer.word_index) + 1


embedding_dim = 128
latent_dim = 256




#### Model building and training

In [5]:
# Encoder
encoder_inputs = Input(shape=(None,), name='encoder_input')
encoder_emb = Embedding(eng_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_sequences=True, return_state=True)(encoder_emb)

# Decoder
decoder_inputs = Input(shape=(None,), name='decoder_input')
decoder_emb = Embedding(spa_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(decoder_emb, initial_state=[state_h, state_c])

# Attention
attention = Attention()
context_vector = attention([decoder_lstm, encoder_lstm])
decoder_concat = Concatenate(axis=-1)([decoder_lstm, context_vector])

# Output layer
decoder_outputs = Dense(spa_vocab_size, activation='softmax')(decoder_concat)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


model.fit([encoder_input, decoder_input], decoder_target,
          batch_size=32,
          epochs=25,
          validation_split=0.2)

model.save("final_nmt_model.keras")


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 decoder_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, None, 128)            294912    ['encoder_input[0][0]']       
                                                                                                  
 embedding_3 (Embedding)     (None, None, 128)            556800    ['decoder_input[0][0]']       
                                                                                            

#### Inference setup and decoding

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate

encoder_inf_inputs = model.get_layer('encoder_input').input
encoder_outputs, state_h_enc, state_c_enc = model.get_layer('lstm_2').output
encoder_model = Model(encoder_inf_inputs, [encoder_outputs, state_h_enc, state_c_enc])

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_hidden_state_input = Input(shape=(None, 256))  

decoder_inf_inputs = Input(shape=(1,))


dec_emb_layer = model.get_layer('embedding_3')
decoder_lstm_layer = model.get_layer('lstm_3')
attention_layer = model.get_layer('attention_1')
concat_layer = model.get_layer('concatenate_1')
dense_layer = model.get_layer('dense_1')

# Embedding
dec_emb_inf = dec_emb_layer(decoder_inf_inputs)

# Decoder LSTM
decoder_outputs, state_h, state_c = decoder_lstm_layer(
    dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

# Attention
attn_out_inf = attention_layer([decoder_outputs, decoder_hidden_state_input])
decoder_concat_inf = concat_layer([decoder_outputs, attn_out_inf])

# Final output layer
decoder_outputs_final = dense_layer(decoder_concat_inf)

# Decoder inference model
decoder_model = Model(
    [decoder_inf_inputs, decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs_final, state_h, state_c]
)


In [7]:
def decode_sequence(input_seq):
    enc_outs, h, c = encoder_model.predict(input_seq)
    target_seq = np.array([[spa_tokenizer.word_index['sos']]])
    decoded_sentence = ''
    stop_condition = False

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq, enc_outs, h, c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_spa_index.get(sampled_token_index, '')

        if sampled_word == 'eos' or len(decoded_sentence.split()) > max_spa_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.array([[sampled_token_index]])

    return decoded_sentence.strip()


#### Testing and saving the model

In [8]:
def translate(sentence):
    sentence = preprocess_sentence(sentence)
    seq = eng_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_eng_len, padding='post')
    translation = decode_sequence(padded)
    print(f"English: {sentence}")
    print(f"Spanish: {translation}")

translate("who are you")
translate("how was your day")
translate("What are you doing?")


English: who are you
Spanish: aabe .
English: how was your day
Spanish: no hay un cerdo .
English: what are you doing ?
Spanish: ¿ c mo est n ?


In [9]:
import pickle
import json

model.save("spanish_translation_model.keras")

with open("eng_tokenizer.json", "w") as f:
    f.write(eng_tokenizer.to_json())

with open("spa_tokenizer.json", "w") as f:
    f.write(spa_tokenizer.to_json())

reverse_spa_index = {v: k for k, v in spa_tokenizer.word_index.items()}
with open("reverse_spa_index.pkl", "wb") as f:
    pickle.dump(reverse_spa_index, f)


with open("seq_lengths.json", "w") as f:
    json.dump({"max_eng_len": max_eng_len, "max_spa_len": max_spa_len}, f)
