# **Word level text translator** : 
## the model will try to predict the next word from the a sequence of input word, requires a big corpus and the training is longer but its results are more efficient than those of the character-level.
## Word level translation is the most used nowadays.

In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense, CuDNNLSTM
from keras.models import Model
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from unicodedata import normalize

print('imported')

Using TensorFlow backend.


imported


## **Data preprocessing :**

In [15]:
#initial variables
num_samples = 10000
embedding_size = 256
input_texts = []
target_texts = []

#loading dataset
with open('fra.txt','r', encoding='utf-8') as f:
    lines = f.read().split('\n')
    
#shuffle dataset before deviding it into train/test sets    
lines = lines[: num_samples]
np.random.shuffle(lines)

#cleaning text
for line in lines:
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    line = line.replace('-',' ')
    line = line.lower()
    line = line.translate(str.maketrans('', '', string.punctuation))
    input_text, target_text = line.split('\t')
    input_text = [word for word in input_text.split() if word.isalpha()]
    target_text = [word for word in target_text.split() if word.isalpha()]
    input_texts.append(input_text)
    target_texts.append(target_text)

#creating and fitting tokenizers
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(input_texts)
fr_tokenizer  = Tokenizer()
fr_tokenizer.fit_on_texts(target_texts)

input_max_len = np.max([len(line) for line in input_texts]) #longest sequence in english
target_max_len = np.max([len(line) for line in target_texts]) #longest sequence in french
eng_vocab_size = len(eng_tokenizer.word_index) + 1 #size of eng vocab, the vocab starts at 1 so we add +1 for the 0 index
fr_vocab_size = len(fr_tokenizer.word_index) + 1 #same for fr

#tokens dict 
fr_tokens_dict = dict((i, char) for char, i in fr_tokenizer.word_index.items())
eng_tokens_dict = dict((i, char) for char, i in eng_tokenizer.word_index.items())

print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (input_max_len))
print('French Vocabulary Size: %d' % fr_vocab_size)
print('French Max Length: %d' % (target_max_len))

English Vocabulary Size: 2124
English Max Length: 5
French Vocabulary Size: 4026
French Max Length: 10


In [16]:
#tokenize texts then fixing the length of the array with pad_sequences
def encode_input(data, max_sequence_length):
    x = eng_tokenizer.texts_to_sequences(data)
    x = pad_sequences(x, maxlen=max_sequence_length, padding='post')
    return x
    
#same as last function, but also transforms the data into one-hot-encoding represtation
def encode_output(data, max_sequence_length):
    x = fr_tokenizer.texts_to_sequences(data)
    x = pad_sequences(x, maxlen=max_sequence_length, padding='post')
    array = []
    for seq in x:
        encoded = to_categorical(seq, num_classes=fr_vocab_size)
        array.append(encoded)
    array = np.array(array)
    return array

In [17]:
#splitting into train and test sets
x_train, x_test, y_train, y_test = train_test_split(input_texts, target_texts, test_size=0.20, random_state=42)

#input will be max sequence length
max_sequence_length = np.max([input_max_len, target_max_len])

#vectorizing data
X_train = encode_input(x_train, max_sequence_length)
Y_train = encode_output(y_train, max_sequence_length)

X_test = encode_input(x_test, max_sequence_length)
Y_test = encode_output(y_test, max_sequence_length)

print('Train : X = ', X_train.shape,', Y = ', Y_train.shape)
print('Test : X = ', X_test.shape,', Y = ', Y_test.shape)

Train : X =  (8000, 10) , Y =  (8000, 10, 4026)
Test : X =  (2000, 10) , Y =  (2000, 10, 4026)


## **Defining the model architecture :**

In [18]:
#defining model
model = Sequential()
model.add(Embedding(eng_vocab_size, embedding_size, input_length=max_sequence_length))
model.add(CuDNNLSTM(256))
model.add(RepeatVector(10))
model.add(CuDNNLSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(fr_vocab_size, activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 10, 256)           543744    
_________________________________________________________________
cu_dnnlstm_5 (CuDNNLSTM)     (None, 256)               526336    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 10, 256)           0         
_________________________________________________________________
cu_dnnlstm_6 (CuDNNLSTM)     (None, 10, 256)           526336    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 4026)          1034682   
Total params: 2,631,098
Trainable params: 2,631,098
Non-trainable params: 0
_________________________________________________________________


## **Training phase :**

In [20]:
#training with a checkpoint callback
filename = 'fra_eng_seq2seq.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(X_train, Y_train, epochs=30, batch_size=64, 
          validation_data=(X_test, Y_test), callbacks=[checkpoint], verbose=1)

Train on 8000 samples, validate on 2000 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 2.30838, saving model to fra_eng_seq2seq.h5
Epoch 2/30

Epoch 00002: val_loss improved from 2.30838 to 2.23870, saving model to fra_eng_seq2seq.h5
Epoch 3/30

Epoch 00003: val_loss improved from 2.23870 to 2.21850, saving model to fra_eng_seq2seq.h5
Epoch 4/30

Epoch 00004: val_loss improved from 2.21850 to 2.21294, saving model to fra_eng_seq2seq.h5
Epoch 5/30

Epoch 00005: val_loss improved from 2.21294 to 2.20213, saving model to fra_eng_seq2seq.h5
Epoch 6/30

Epoch 00006: val_loss did not improve from 2.20213
Epoch 7/30

Epoch 00007: val_loss did not improve from 2.20213
Epoch 8/30

Epoch 00008: val_loss improved from 2.20213 to 2.19164, saving model to fra_eng_seq2seq.h5
Epoch 9/30

Epoch 00009: val_loss improved from 2.19164 to 2.18651, saving model to fra_eng_seq2seq.h5
Epoch 10/30

Epoch 00010: val_loss did not improve from 2.18651
Epoch 11/30

Epoch 00011: val_loss did not im

<keras.callbacks.History at 0x19680733fd0>

In [49]:
#It may be possible to train for some more epochs (without a checkpoint callback ! we dont want to overwrite the last save)
model.fit(X_train, Y_train, epochs=10, batch_size=64, 
          validation_data=(X_test, Y_test), verbose=1)

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19681439d68>

## **Saving model :**

In [54]:
#saving model (Only if we have better performances)
model.save('fra_eng_seq2seq.h5')
print('saved')

saved


## **Loading model :**

In [9]:
#loading model
model.load_weights('fra_eng_seq2seq.h5')
print('loaded')

loaded


## **Testing :**

In [60]:
x = np.expand_dims(X_test[452], axis=0)
translation = model.predict(x, verbose=1)
integers = [np.argmax(vector) for vector in translation[0]]
target = list()
for i in integers:
    if i != 0:
        word = fr_tokens_dict[i]
        target.append(fr_tokens_dict[i])
print(' '.join(x_test[452]))
print(' '.join(target))

this is crazy
cest fou


## **API Model equivalent :**

In [7]:
#API model equivalent
encoder_inputs = Input(shape=(None,))
en_x = Embedding(fr_tokens_length, embedding_size)(encoder_inputs)
encoder = CuDNNLSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
dex = Embedding(eng_tokens_length, embedding_size)
final_dex = dex(decoder_inputs)
decoder_lstm = CuDNNLSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)
decoder_dense = Dense(fr_tokens_length, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

W1013 13:29:50.970685  3920 deprecation_wrapper.py:119] From c:\users\kino\anaconda3\envs\deeplearning\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1013 13:29:51.573176  3920 deprecation_wrapper.py:119] From c:\users\kino\anaconda3\envs\deeplearning\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1013 13:29:51.741045  3920 deprecation_wrapper.py:119] From c:\users\kino\anaconda3\envs\deeplearning\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    420700      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    212500      input_2[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm