In [4]:
import string
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy



In [7]:
raw_data = """
Hello, how are you?\tBonjour, comment ça va ?
I am fine, thank you.\tJe vais bien, merci.
What is your name?\tComment tu t'appelles ?
My name is John.\tJe m'appelle John.
Where do you live?\tOù habites-tu ?
I live in New York.\tJ'habite à New York.
Do you speak French?\tParles-tu français ?
Yes, I speak a little French.\tOui, je parle un peu français.
I like to read books.\tJ'aime lire des livres.
Can you help me?\tPeux-tu m'aider ?
The weather is nice today.\tLe temps est agréable aujourd'hui.
See you later!\tÀ plus tard !
I am learning French.\tJ'apprends le français.
What time is it?\tQuelle heure est-il ?
I need to go now.\tJe dois y aller maintenant.
Thank you for your help.\tMerci pour ton aide.
You're welcome.\tDe rien.
Good morning!\tBonjour !
Good night!\tBonne nuit !
Have a nice day!\tBonne journée !
"""

# Split the raw data into lines
raw_data = raw_data.strip().split('\n')

# Split each line into pairs of sentences
pairs = [sentence.split('\t') for sentence in raw_data]

# Display the pairs
for pair in pairs:
    print(pair)


['Hello, how are you?', 'Bonjour, comment ça va ?']
['I am fine, thank you.', 'Je vais bien, merci.']
['What is your name?', "Comment tu t'appelles ?"]
['My name is John.', "Je m'appelle John."]
['Where do you live?', 'Où habites-tu ?']
['I live in New York.', "J'habite à New York."]
['Do you speak French?', 'Parles-tu français ?']
['Yes, I speak a little French.', 'Oui, je parle un peu français.']
['I like to read books.', "J'aime lire des livres."]
['Can you help me?', "Peux-tu m'aider ?"]
['The weather is nice today.', "Le temps est agréable aujourd'hui."]
['See you later!', 'À plus tard !']
['I am learning French.', "J'apprends le français."]
['What time is it?', 'Quelle heure est-il ?']
['I need to go now.', 'Je dois y aller maintenant.']
['Thank you for your help.', 'Merci pour ton aide.']
["You're welcome.", 'De rien.']
['Good morning!', 'Bonjour !']
['Good night!', 'Bonne nuit !']
['Have a nice day!', 'Bonne journée !']


In [8]:
def clean_sentence(sentence):
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation
    string_punctuation = string.punctuation + "¡" + '¿'
    clean_sentence = lower_case_sent.translate(str.maketrans('', '', string_punctuation))

    return clean_sentence

In [9]:
def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer()
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

In [10]:
# Clean sentences
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
spanish_sentences = [clean_sentence(pair[1]) for pair in pairs]

# Tokenize words
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

print('Maximum length spanish sentence: {}'.format(len(max(spa_text_tokenized,key=len))))
print('Maximum length english sentence: {}'.format(len(max(eng_text_tokenized,key=len))))


# Check language length
spanish_vocab = len(spa_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
bprint("English vocabulary is of {} unique words".format(english_vocab))

Maximum length spanish sentence: 6
Maximum length english sentence: 6
Spanish vocabulary is of 54 unique words
English vocabulary is of 53 unique words


In [11]:
max_spanish_len = int(len(max(spa_text_tokenized,key=len)))
max_english_len = int(len(max(eng_text_tokenized,key=len)))

spa_pad_sentence = pad_sequences(spa_text_tokenized, max_spanish_len, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

# Reshape data
spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

In [18]:
eng_pad_sentence.shape

(20, 6, 1)

In [19]:
spa_pad_sentence.shape

(20, 6, 1)

In [None]:
'''
Input Layer: Takes Spanish sequences of fixed length.
Embedding Layer: Converts the Spanish sequences into dense vectors.
Encoder LSTM: Encodes the sequence into a context vector.
Repeat Vector: Repeats the context vector to match the length of the English sequences.
Decoder LSTM: Decodes the repeated context vector into English sequences.
TimeDistributed Dense: Applies a dense layer to each timestep of the decoded sequences to produce logits, which can then be used to predict the English words.
'''

In [20]:
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)

'''
RepeatVector simple
encoder_expanded = tf.expand_dims(encoder, axis=1)
r_vec = tf.tile(encoder_expanded, [1, max_english_len, 1]) 
'''
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(english_vocab))(decoder)

'''
TimeDistributed simple
reshaped_decoder = Reshape((max_english_len * 64,))(decoder)

# Apply Dense layer
dense_output = Dense(english_vocab)(reshaped_decoder)

# Reshape back to (batch_size, max_english_len, english_vocab)
logits = Reshape((max_english_len, english_vocab))(dense_output)

'''

'\nTimeDistributed\nreshaped_decoder = Reshape((max_english_len * 64,))(decoder)\n\n# Apply Dense layer\ndense_output = Dense(english_vocab)(reshaped_decoder)\n\n# Reshape back to (batch_size, max_english_len, english_vocab)\nlogits = Reshape((max_english_len, english_vocab))(dense_output)\n\n'

In [None]:
'''
Repeat Vector 
The decoder needs a context (or summary) of the input at each time step to generate the output sequence.
Takes the final output from the encoder (which summarizes the input) and copies it for every time step of the output sequence.

TimeDistributed
We need to apply the same transformation (like predicting the next word) to each time step of the decoder's output.
Ensures that a layer (like Dense) is applied independently to each step in the sequence.

'''

In [13]:
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
enc_dec_model.summary()
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-3),
              metrics=['accuracy'])
enc_dec_model.summary()

In [14]:
model_results = enc_dec_model.fit(spa_pad_sentence, eng_pad_sentence, batch_size=30, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0083 - loss: 3.9704
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.2917 - loss: 3.9585
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.3250 - loss: 3.9437
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.3250 - loss: 3.9315
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.3250 - loss: 3.9102
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3250 - loss: 3.8906
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.3250 - loss: 3.8656
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.3250 - loss: 3.8273
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [15]:
def logits_to_sentence(logits, tokenizer):

    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

index = 6
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(spa_pad_sentence[index:index+1])[0], eng_text_tokenizer))


The english sentence is: do you speak french
The spanish sentence is: parlestu français 
The predicted sentence is :
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step
you you <empty> <empty> <empty> <empty>
