In [47]:
import os, sys
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras

In [48]:
# Hyper-parameters shared by the cells below.
# Upper bound on how many corpus lines the parsing loop reads.
NUM_SENTENCES = 20000
# NOTE(review): not referenced by any cell visible in this notebook.
MAX_SENTENCE_LENGTH = 50
# `num_words` cap given to both tokenizers and to the input embedding matrix.
MAX_NUM_WORDS = 20000
# Width of each input embedding row (the GloVe file loaded below is the 50d one).
EMBEDDING_SIZE = 50
# Fraction of the sentence pairs held out as validation data.
VALIDATION_SPLIT = 0.1
# Units of the encoder/decoder LSTMs; also used as the decoder embedding width.
LSTM_NODES = 256
# Number of samples per batch produced by the training Sequence.
BATCH_SIZE = 64

In [49]:
# Open the tab-separated sentence-pair file (UTF-8); the handle is iterated
# line by line by the parsing loop further down.
corpus = open(r"/content/drive/MyDrive/NLP/fra_eng.txt", encoding="utf-8")

# Preprocess corpus

The input to the encoder LSTM is the sentence in the original
language; the input to the decoder LSTM is the sentence in the translated
language with a start-of-sentence token. The output is the actual target
sentence with an end-of-sentence token.

In [50]:
# Build three parallel lists from the corpus:
#   input_sentences          - source sentence (first tab-separated field)
#   output_sentences         - target sentence followed by ' <eos>' (decoder target)
#   output_sentences_inputs  - target sentence preceded by '<sos> ' (decoder input)
input_sentences = []
output_sentences = []
output_sentences_inputs = []
count = 0  # number of sentence pairs kept so far

try:
  for line in corpus:
    if count >= NUM_SENTENCES:
      break
    fields = line.rstrip().split('\t')
    # Skip lines that do not hold at least a source and a target field
    # (no tab at all, or only a trailing tab removed by rstrip()).
    if len(fields) < 2:
      continue
    input_sentence, output = fields[0:2]
    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output
    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)
    # Only kept pairs count towards NUM_SENTENCES.
    count += 1
finally:
  # The handle is not needed once the pairs are in memory.
  corpus.close()

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 20000
num samples output: 20000
num samples output input: 20000


In [51]:
# Show the same sample (index 172) from each of the three parallel lists.
for samples in (input_sentences, output_sentences, output_sentences_inputs):
  print(samples[172])

I'm hit!
Je suis touchée ! <eos>
<sos> Je suis touchée !


In [52]:
# Source-side tokenizer, limited to the MAX_NUM_WORDS most frequent words.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)

# Source sentences as lists of integer word ids.
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# Lookup tables in both directions: word -> id and id -> word.
word2idx_inputs = input_tokenizer.word_index
id2words_inputs = input_tokenizer.index_word
print('Total unique words in the input: {}'.format(len(word2idx_inputs)))

# Longest encoded source sentence; used later as the encoder padding length.
max_input_len = max(map(len, input_integer_seq))
print("Length of longest sentence in input: {:g}".format(max_input_len))

Total unique words in the input: 3511
Length of longest sentence in input: 6


In [53]:
# Target-side tokenizer. filters='' disables character filtering so the
# '<sos>' and '<eos>' markers are kept as tokens.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# Fit on both variants so '<sos>' and '<eos>' are both in the vocabulary.
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)

# Integer-encoded decoder targets ('... <eos>') and decoder inputs ('<sos> ...').
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

# Lookup tables in both directions: word -> id and id -> word.
word2idx_outputs = output_tokenizer.word_index
id2words_outputs = output_tokenizer.index_word
print('Total unique words in the output: {}'.format(len(word2idx_outputs)))

# One extra slot because index 0 is used for padding.
num_words_output = len(word2idx_outputs) + 1
max_out_len = max(map(len, output_integer_seq))
print("Length of longest sentence in the output: {:g}".format(max_out_len))

Total unique words in the output: 9523
Length of longest sentence in the output: 13


In [54]:
# Pad every encoded source sentence to max_input_len (pad_sequences default
# padding side, which puts the zeros in front as the printed sample shows).
encoder_input_sequences = pad_sequences(
  input_integer_seq,
  maxlen=max_input_len,
)
print(f"encoder_input_sequences.shape: {encoder_input_sequences.shape}")
print(f"encoder_input_sequences[172]: {encoder_input_sequences[172]}")

encoder_input_sequences.shape: (20000, 6)
encoder_input_sequences[172]: [  0   0   0   0   6 615]


In [55]:
# Decoder inputs ('<sos> ...') padded with trailing zeros up to max_out_len.
decoder_input_sequences = pad_sequences(
  output_input_integer_seq,
  maxlen=max_out_len,
  padding='post',
)
print(f"decoder_input_sequences.shape: {decoder_input_sequences.shape}")
print(f"decoder_input_sequences[172]: {decoder_input_sequences[172]}")

decoder_input_sequences.shape: (20000, 13)
decoder_input_sequences[172]: [   2    3    6 2024    5    0    0    0    0    0    0    0    0]


In [56]:
# Decoder targets ('... <eos>') padded with trailing zeros up to max_out_len.
decoder_output_sequences = pad_sequences(
  output_integer_seq,
  maxlen=max_out_len,
  padding="post",
)
print(f"decoder_output_sequences.shape: {decoder_output_sequences.shape}")
print(f"decoder_output_sequences[172]: {decoder_output_sequences[172]}")

decoder_output_sequences.shape: (20000, 13)
decoder_output_sequences[172]: [   3    6 2024    5    1    0    0    0    0    0    0    0    0]


# Word embeddings

In [57]:
from numpy import array
from numpy import asarray
from numpy import zeros
# Build a dictionary {word: embedding vector} from the pre-trained GloVe file.
# Each non-empty line holds a word followed by its float components.
embeddings_dictionary = dict()
# `with` closes the file even if a line fails to parse.
with open(r'/content/drive/MyDrive/NLP/6471382cdd837544bf3ac72497a38715e845897d265b2b424b4761832009c837/glove.6B.50d.txt', encoding="utf8") as glove_file:
  for line in glove_file:
    records = line.split()
    if not records:
      # Blank line: nothing to index, and records[0] would raise.
      continue
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions

In [58]:
# Build the initial weights of the input embedding layer: row `i` holds the
# GloVe vector of the word whose tokenizer index is `i`. Rows stay all-zero
# for index 0 (padding) and for words that have no GloVe vector.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
  # word_index lists every word seen, even those beyond the num_words cap;
  # such indices have no row in the matrix and must be skipped.
  if index >= num_words:
    continue
  embedding_vector = embeddings_dictionary.get(word)
  if embedding_vector is not None:
    embedding_matrix[index] = embedding_vector

In [59]:
embedding_matrix.shape

(3512, 50)

In [60]:
word2idx_inputs["hit"]

615

In [61]:
embedding_matrix[615]

array([-0.41659001, -0.47595999,  0.95744002,  0.27019   ,  0.17657   ,
        0.24828   , -1.29869998,  0.53851002,  0.35336   ,  0.58221   ,
       -0.33079001, -0.59680003, -0.97055   ,  0.72083998,  0.49463001,
       -0.83398002,  0.12236   , -0.37237   , -1.45459998,  0.41384   ,
       -0.36311001,  0.2202    ,  0.057482  , -0.24951001,  0.37654001,
       -1.30610001,  0.22596   ,  0.47510001,  1.28600001, -0.62642998,
        3.4058001 ,  0.18436   ,  1.26559997,  1.07410002,  0.3026    ,
        0.31395   ,  0.33682999, -0.31895   ,  0.31911999,  0.37919   ,
       -1.1652    ,  0.94625002, -0.044854  , -1.07790005, -0.16669001,
        0.11604   , -0.11983   , -0.23662999,  0.29087999,  0.11071   ])

In [62]:
# Input-side embedding layer, initialised with the GloVe-based matrix.
embedding_layer = Embedding(
  input_dim=num_words,
  output_dim=EMBEDDING_SIZE,
  weights=[embedding_matrix],
  input_length=max_input_len,
)

In [63]:
num_words

3512

# Creating the model

In [64]:
# Encoder: padded source ids -> embeddings -> LSTM.
# Only the final hidden and cell states are kept; they seed the decoder.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(units=LSTM_NODES, return_state=True)
encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [65]:
# Training-time decoder: it receives the whole '<sos> ...' sequence at once,
# starts from the encoder states and emits one output per time step.
decoder_inputs_placeholder = Input(shape=(max_out_len,))
decoder_embedding = Embedding(input_dim=num_words_output, output_dim=LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)
decoder_lstm = LSTM(
  units=LSTM_NODES,
  return_sequences=True,
  return_state=True,
)
# The returned states are not needed while training.
decoder_outputs, _, _ = decoder_lstm(
  decoder_inputs_x,
  initial_state=encoder_states,
)

In [66]:
# Softmax over the target vocabulary, applied to the decoder LSTM outputs.
decoder_dense = Dense(units=num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [67]:
# Training model: (encoder ids, decoder input ids) -> per-step word distribution.
model = Model(
  inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder],
  outputs=decoder_outputs,
)
# categorical_crossentropy matches the one-hot targets built by the batch Sequence.
model.compile(
  loss='categorical_crossentropy',
  optimizer='rmsprop',
  metrics=['accuracy'],
)

In [68]:
# You should train your neural net model as shown below:
import tensorflow
import numpy as np
from tensorflow.keras.utils import Sequence
from numpy.random           import seed
from tensorflow.random      import set_seed
#-------------------------------------------------------------------------------
# These steps would not be required if you were developing the auto-translation
# project for yourself. However, they help reduce the variance induced by the
# random neural weights assigned at the beginning of the gradient descent,
# which is useful when grading your answer.
#-------------------------------------------------------------------------------
# NOTE(review): the model's layers are created and called in earlier cells, so
# their initial weights were presumably drawn before these seeds are set --
# confirm whether the seeding should move ahead of the model construction.
seed(42)
set_seed(42)
#
#-------------------------------------------------------------------------------
# This sequence is used to feed the training process with batches that are not
# all loaded in ram at once
#------------------------------------------------------------------------------
class LazyLoadedSequence(Sequence):
  """Batch provider over the samples `begin` (included) to `end` (excluded).

  The one-hot decoder targets are built one batch at a time in
  `__getitem__`, so the full (samples, max_out_len, num_words_output)
  tensor is never held in memory at once.

  Reads the module-level arrays `encoder_input_sequences`,
  `decoder_input_sequences`, `decoder_output_sequences` and the constants
  `BATCH_SIZE`, `max_out_len`, `num_words_output`.
  """

  def __init__(self, begin, end):
      # Let the Keras base class set up whatever state it needs.
      super().__init__()
      self.begin      = begin        # beginning (included) of the data
      self.end        = end          # end (excluded) of the data
      self.nb_samples = end - begin  # number of data samples

  def __len__(self):
      """Return the number of batches (the last one may be smaller)."""
      # Built-in int: the `np.int` alias no longer exists in recent NumPy.
      return int(np.ceil(self.nb_samples / BATCH_SIZE))

  def __getitem__(self, idx):
      """Return batch number `idx` as `([encoder_x, decoder_x], one_hot_y)`."""
      start   = self.begin + BATCH_SIZE * idx
      end     = min(self.end, start + BATCH_SIZE)
      #
      enc_x   = encoder_input_sequences[start:end]
      dec_x   = decoder_input_sequences[start:end]
      one_hot = np.zeros((end-start, max_out_len, num_words_output), dtype='float16')
      # Build the one-hot representation of each target sentence: position
      # (i, t) gets a 1 in the column of the word id found at step t of
      # sample i. Padding positions (id 0) are marked in column 0.
      for i, d in enumerate(decoder_output_sequences[start:end]):
        for t, word in enumerate(d):
          one_hot[i, t, word] = 1
      # now return both the xs and the ys
      return [enc_x, dec_x], one_hot

#-------------------------------------------------------------------------------
# Actually fit it with custom batches
#------------------------------------------------------------------------------
# Split the samples by position: the first (1 - VALIDATION_SPLIT) share is used
# for training, the remainder for validation.
nb_sentences    = len(input_sentences)
# Built-in int: the `np.int` alias no longer exists in recent NumPy.
split_limit     = int(np.ceil(nb_sentences * (1 - VALIDATION_SPLIT)))
train_data      = LazyLoadedSequence(0, split_limit)
validation_data = LazyLoadedSequence(split_limit, nb_sentences)

# `r` keeps the object returned by fit() for later inspection.
r = model.fit(
    train_data,
    validation_data = validation_data,
    epochs          = 20,
)

Epoch 1/20
Epoch 2/20

KeyboardInterrupt: ignored

In [None]:
model.summary()

In [None]:
#model.save("/content/drive/MyDrive/NLP/model_translation")

# Making predictions

In [None]:
# Inference-time encoder: source ids -> [hidden state, cell state].
encoder_model = Model(
  inputs=encoder_inputs_placeholder,
  outputs=encoder_states,
)

In [None]:
# Placeholders for the decoder's previous hidden (h) and cell (c) states, fed
# back in at every decoding step.
decoder_states_inputs = [
  Input(shape=(LSTM_NODES,)),
  Input(shape=(LSTM_NODES,)),
]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

In [None]:
# At inference time the decoder is fed one token id per call.
decoder_inputs_single = Input(shape=(1,))
# Reuse the embedding layer object created for the training decoder.
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [None]:
# Run the existing decoder LSTM layer for a single step, starting from the
# state placeholders. Note: this rebinds `decoder_outputs`, `h` and `c`, which
# the training-graph cells above also assigned.
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

In [None]:
# New states produced by the single decoding step; returned by the inference
# decoder so they can be fed back on the next step.
decoder_states = [h, c]
# Apply the existing softmax layer object to the single-step LSTM output.
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Inference-time decoder:
#   inputs  - [current token id, previous h, previous c]
#   outputs - [word distribution, new h, new c]
decoder_model = Model(
  inputs=[decoder_inputs_single] + decoder_states_inputs,
  outputs=[decoder_outputs] + decoder_states,
)

In [None]:
# Reverse lookups (integer id -> word) for the source and target vocabularies.
idx2word_input = dict((index, word) for word, index in word2idx_inputs.items())
idx2word_target = dict((index, word) for word, index in word2idx_outputs.items())

# Translate sentences

In [None]:
def translate_sentence(sentence: str, encoder, decoder, word2idx_outputs, idx2word_target) -> str:
  """Translate `sentence` by greedy decoding, one target word per step.

  Args:
    sentence: source-language text; encoded with the module-level
      `input_tokenizer` and padded to `max_input_len`.
    encoder: object whose `predict` returns the list of initial decoder states.
    decoder: object whose `predict` takes `[token, *states]` and returns
      `(word scores, h, c)`.
    word2idx_outputs: target word -> id mapping; must contain '<sos>' and '<eos>'.
    idx2word_target: target id -> word mapping.

  Returns:
    The predicted words joined by single spaces. Decoding stops at '<eos>'
    or after `max_out_len` steps; id 0 (padding) adds no word.
  """
  encoded = input_tokenizer.texts_to_sequences([sentence])
  input_seq = pad_sequences(encoded, maxlen=max_input_len)
  states_value = encoder.predict(input_seq)

  # The decoder is fed a (1, 1) array holding the previously chosen token id.
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = word2idx_outputs['<sos>']
  eos_index = word2idx_outputs['<eos>']

  translated_words = []
  steps_left = max_out_len
  while steps_left > 0:
    steps_left -= 1
    output_tokens, h, c = decoder.predict([target_seq] + states_value)
    best = np.argmax(output_tokens[0, 0, :])
    if best == eos_index:
      break
    if best > 0:
      translated_words.append(idx2word_target[best])
    # Feed the chosen token and the new states into the next step.
    target_seq[0, 0] = best
    states_value = [h, c]
  return ' '.join(translated_words)

In [None]:
translate_sentence("I'm a lawyer.", encoder_model, decoder_model, word2idx_outputs, idx2word_target)

In [None]:
translate_sentence("Is anybody hurt?", encoder_model, decoder_model, word2idx_outputs, idx2word_target)

In [None]:
translate_sentence("I'm concentrating.", encoder_model, decoder_model, word2idx_outputs, idx2word_target)

# Evaluation

In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [None]:
# Toy BLEU example: two tokenised reference sentences and one candidate.
reference = [
  ['this', 'looks', 'highly', 'satisfactory', '<eos>'],
  ['this', 'looks', 'good', 'indeed', '<eos>'],
]
candidate = ['this', 'is', 'very', 'good', 'indeed', '<eos>']

In [None]:
# Score the toy candidate against its references, with method1 smoothing.
chencherry = SmoothingFunction()
sentence_bleu(
  references=reference,
  hypothesis=candidate,
  smoothing_function=chencherry.method1,
)

In [None]:
# Translate every validation sentence and encode each translation (with a
# trailing ' <eos>') as target-vocabulary ids.
weights = (1./2., 1./2.)
hyp = []
for sentence in input_sentences[split_limit:]:
  # Use the inference models; `encoder` is the LSTM layer and has no predict().
  translation = translate_sentence(sentence, encoder_model, decoder_model, word2idx_outputs, idx2word_target)
  # The leading space keeps '<eos>' a separate token from the last word.
  hyp.append(output_tokenizer.texts_to_sequences([translation + " <eos>"]))

In [None]:
# Sanity check: translate one training sentence with the inference models and
# encode the result as target-vocabulary ids.
output_tokenizer.texts_to_sequences([translate_sentence(input_sentences[1], encoder_model, decoder_model, word2idx_outputs, idx2word_target)])

In [None]:
# Reference translations for the validation split. Each sentence is passed
# through the tokenizer and mapped back to words, so the references use the
# tokenizer's own tokens. Every entry is a one-element list of token lists.
validation_output_sentences = output_sentences[split_limit:]
references = []
for sentence in validation_output_sentences:
  token_ids = output_tokenizer.texts_to_sequences([sentence])[0]
  references.append([[id2words_outputs[token_id] for token_id in token_ids]])

In [None]:
references[:4]

In [None]:
# Model translation (a plain string) for every validation source sentence.
hypotheses = []
for source_sentence in input_sentences[split_limit:]:
  hypotheses.append(
    translate_sentence(source_sentence, encoder_model, decoder_model, word2idx_outputs, idx2word_target)
  )

In [None]:
# Tokenise each translated string the same way as the references: encode it
# with the target tokenizer, map the ids back to words and append '<eos>'.
# Every entry is a one-element list holding the token list.
hyp = []
for translation in hypotheses:
  token_ids = output_tokenizer.texts_to_sequences([translation])[0]
  tokens = [id2words_outputs[token_id] for token_id in token_ids]
  tokens.append("<eos>")
  hyp.append([tokens])

In [None]:
hyp[:4]

In [None]:
hypotheses[:4]

In [None]:
# Corpus-level BLEU over the validation split, using unigrams and bigrams
# with equal weight.
weights = (1./2., 1./2.)
# corpus_bleu expects each hypothesis as a list of tokens. `hypotheses` holds
# raw strings (which would be scored character by character), so pass the
# token lists stored in `hyp` instead.
hypothesis_tokens = [tokens[0] for tokens in hyp]
corpus_bleu(references, hypothesis_tokens, weights, smoothing_function=chencherry.method1)