<a href="https://colab.research.google.com/github/BRAHIMLOUARDI/movie-web-app/blob/main/Welcome_To_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Path to translation file
path_to_data = '/content/fra.txt'

# Read file
translation_file = open(path_to_data,"r", encoding='utf-8') 
raw_data = translation_file.read()
translation_file.close()

# Parse data
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in  raw_data]
pairs = pairs[1000:20000]


In [40]:
def clean_sentence(sentence):
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation
    string_punctuation = string.punctuation + "¡" + '¿'
    clean_sentence = lower_case_sent.translate(str.maketrans('', '', string_punctuation))
   
    return clean_sentence

In [41]:
def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer()
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

In [43]:

english_sentences = [clean_sentence(pair[0]) for pair in pairs]
french_sentences = [clean_sentence(pair[1]) for pair in pairs]

# Tokenize words
fra_text_tokenized, fra_text_tokenizer = tokenize(french_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

print('Maximum length french sentence: {}'.format(len(max(fra_text_tokenized,key=len))))
print('Maximum length english sentence: {}'.format(len(max(eng_text_tokenized,key=len))))


# Check language length
french_vocab = len(fra_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(french_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Maximum length spanish sentence: 11
Maximum length english sentence: 5
Spanish vocabulary is of 7575 unique words
English vocabulary is of 3423 unique words


In [62]:
max_french_len = int(len(max(fra_text_tokenized,key=len)))
max_english_len = int(len(max(eng_text_tokenized,key=len)))

fra_pad_sentence = pad_sequences(fra_text_tokenized, max_french_len, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

fra_pad_sentence1 = pad_sequences(fra_text_tokenized, max_french_len, padding = "post")
eng_pad_sentence1 = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

# Reshape data
fra_pad_sentence = fra_pad_sentence.reshape(*fra_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

In [68]:
input_sequence = Input(shape=(max_french_len,))
embedding = Embedding(input_dim=french_vocab, output_dim=128,)(input_sequence)

In [69]:
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(english_vocab))(decoder)


In [70]:
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-3),
              metrics=['accuracy'])
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 11)]              0         
                                                                 
 embedding (Embedding)       (None, 11, 128)           969600    
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVector  (None, 5, 64)            0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 64)             33024     
                                                                 
 time_distributed (TimeDistr  (None, 5, 3423)          222495    
 ibuted)                                                     

In [71]:
model_results = enc_dec_model.fit(fra_pad_sentence, eng_pad_sentence, batch_size=30, epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [73]:
enc_dec_model.save('my_model.h5')

In [75]:

def logits_to_sentence(logits, tokenizer):

    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>' 

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

index = 14
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(french_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(, eng_text_tokenizer))

The english sentence is: i crashed
The spanish sentence is: je suis tombée
The predicted sentence is :
i crashed <empty> <empty> <empty>


In [None]:

def logits_to_sentence(logits, tokenizer):

    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>' 

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

index = 14
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(french_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(, eng_text_tokenizer))

The english sentence is: i crashed
The spanish sentence is: je suis tombée
The predicted sentence is :
i crashed <empty> <empty> <empty>


In [83]:

def logits_to_sentence(logits, tokenizer):

    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>' 

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

index = 14
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(french_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(fra_pad_sentence[index:index+1])[0], eng_text_tokenizer))
print("rfrif")
print(fra_pad_sentence[index:index+1])






The english sentence is: i crashed
The spanish sentence is: je suis tombée
The predicted sentence is :
i crashed <empty> <empty> <empty>
rfrif
[[[   1]
  [   4]
  [1161]
  [   0]
  [   0]
  [   0]
  [   0]
  [   0]
  [   0]
  [   0]
  [   0]]]


In [95]:
new_model = tf.keras.models.load_model('my_model.h5')
index = 15


print(fra_text_tokenized)

print(fra_text_tokenizer.word_index)
print(fra_text_tokenized)


# fra_pad_sentence = pad_sequences(fra_text_tokenized, max_french_len, padding = "post")
# eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

# fra_pad_sentence1 = pad_sequences(fra_text_tokenized, max_french_len, padding = "post")
# eng_pad_sentence1 = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

# # Reshape data
# fra_pad_sentence = fra_pad_sentence.reshape(*fra_pad_sentence.shape, 1)
# eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)


print(fra_pad_sentence[0])



[[1, 4, 191], [1, 4, 272], [1, 4, 861], [1, 4, 422], [1, 8, 20, 136], [1, 8, 20, 3806], [1, 8, 20, 1411], [1, 40, 136], [13, 2397, 35, 172], [1, 157, 1412], [1, 157, 247], [1, 157, 631], [13, 1160], [3807], [1, 4, 1161], [1, 4, 1162], [1, 18, 4, 1413], [1, 18, 4, 3808], [2398, 15, 1414, 10, 2399], [13, 144, 15, 1414, 10, 2399], [1, 4, 862, 20, 498], [282, 3809], [13, 3810, 22, 2400], [13, 863], [13, 1791], [13, 1163], [282, 472], [282, 2401], [1, 1415, 38, 771, 67, 261], [3811], [13, 3812], [1, 18, 4, 306, 16, 283, 273], [1, 18, 4, 864, 16, 283, 273], [13, 577], [13, 3813], [13, 2402], [1, 18, 4, 578], [1, 18, 4, 632], [1, 18, 4, 2403], [1, 18, 4, 2404], [1, 83, 21], [1, 65], [13, 1164, 2], [706, 41], [1, 18, 4, 2405], [1, 18, 4, 2406], [1, 18, 4, 1416], [1, 18, 4, 2407], [282, 3814], [1, 11, 1165], [21, 18, 88], [52, 21], [52, 41], [1, 1166, 138], [75, 21], [75, 21], [1, 4, 1167], [1, 4, 284], [1, 104, 45, 66], [5, 49, 24, 183, 989], [5, 18, 49, 45, 66], [5, 18, 49, 89], [5, 18, 49, 1

In [119]:
new_model = tf.keras.models.load_model('my_model.h5')


print(fra_text_tokenized)

print(fra_text_tokenizer.word_index)
print(fra_text_tokenized)


# fra_pad_sentence = pad_sequences(fra_text_tokenized, max_french_len, padding = "post")
# eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

# fra_pad_sentence1 = pad_sequences(fra_text_tokenized, max_french_len, padding = "post")
# eng_pad_sentence1 = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

# # Reshape data
# fra_pad_sentence = fra_pad_sentence.reshape(*fra_pad_sentence.shape, 1)
# eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)


[[1, 4, 191], [1, 4, 272], [1, 4, 861], [1, 4, 422], [1, 8, 20, 136], [1, 8, 20, 3806], [1, 8, 20, 1411], [1, 40, 136], [13, 2397, 35, 172], [1, 157, 1412], [1, 157, 247], [1, 157, 631], [13, 1160], [3807], [1, 4, 1161], [1, 4, 1162], [1, 18, 4, 1413], [1, 18, 4, 3808], [2398, 15, 1414, 10, 2399], [13, 144, 15, 1414, 10, 2399], [1, 4, 862, 20, 498], [282, 3809], [13, 3810, 22, 2400], [13, 863], [13, 1791], [13, 1163], [282, 472], [282, 2401], [1, 1415, 38, 771, 67, 261], [3811], [13, 3812], [1, 18, 4, 306, 16, 283, 273], [1, 18, 4, 864, 16, 283, 273], [13, 577], [13, 3813], [13, 2402], [1, 18, 4, 578], [1, 18, 4, 632], [1, 18, 4, 2403], [1, 18, 4, 2404], [1, 83, 21], [1, 65], [13, 1164, 2], [706, 41], [1, 18, 4, 2405], [1, 18, 4, 2406], [1, 18, 4, 1416], [1, 18, 4, 2407], [282, 3814], [1, 11, 1165], [21, 18, 88], [52, 21], [52, 41], [1, 1166, 138], [75, 21], [75, 21], [1, 4, 1167], [1, 4, 284], [1, 104, 45, 66], [5, 49, 24, 183, 989], [5, 18, 49, 45, 66], [5, 18, 49, 89], [5, 18, 49, 1

In [120]:
fra_text = pad_sequences([[1,4,43,]], max_french_len, padding = "post")

fra_text = fra_text.reshape(*fra_text.shape, 1)
print(fra_text)
print(logits_to_sentence(enc_dec_model.predict(fra_text)[0], eng_text_tokenizer))

[[[1]
  [4]
  [2]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]]]
im mean tom <empty> <empty>
