In [1]:
import numpy as np
import tensorflow as tf
import random
import string
import re
from pickle import dump
from pickle import load
from unicodedata import normalize
from collections import Counter
from numpy.random import shuffle
import datetime

  from ._conv import register_converters as _register_converters


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


# CONTENT

## Purpose
This notebook aims to recreate a (very simplified) Google Neural Machine Transalation (https://ai.googleblog.com/2016/09/a-neural-network-for-machine.html) to translate from french to english.

Here are the missing parts for this to work better :
- Attention Mechanism
- Embedding layer
- Multi-layers RNN
- Teacher Forcing
- Gradient Clipping

The Data Preparation part is inspired by [this article](https://machinelearningmastery.com/prepare-french-english-dataset-machine-translation/).

## Parts
1. [Utils](#utils)
2. [Data Preparation](#data_preparation)
3. [Model definition](#model_definition)
4. [Training](#training)

----

## I - Utils
<span id='utils'><span>

In [3]:
# save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [4]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [5]:
# sentences length
def sentence_lengths(sentences):
    lengths = [len(s.split()) for s in sentences]
    return lengths

In [6]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [7]:
def get_minibatch(batch_size, samples_id, training_size):
    if batch_size > len(samples_id):
        new_samples = list(range(training_size-1))
        random.shuffle(new_samples)
        samples_id.extend(new_samples)
    
    next_batch = samples_id[:batch_size]
    samples_id = samples_id[batch_size:]
    
    return next_batch, samples_id 

---

## II - Data Preparation
<span id='data_preparation'><span>
    
### Dataset : europarl-v7.fr-en
### Max sentence length output (en) : 3 words
### Anomaly : 2*len(en) < len(fr) || len(fr) == 0 || len(en) == 0

In [25]:
threshold_len = 2
max_sentence_len_to_reduce = 3

### Load dataset

In [11]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [12]:
# split a loaded document into sentences
def to_sentences(doc):
    return doc.strip().split('\n')

In [13]:
# clean a list of lines
def clean_lines(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for line in lines:
        # normalize unicode characters
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lower case
        line = [word.lower() for word in line]
        # remove punctuation from each token
        line = [word.translate(table) for word in line]
        # remove non-printable chars form each token
        line = [re_print.sub('', w) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
    return cleaned

In [14]:
# load English data
filename = 'Data/europarl-v7.fr-en.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'Data/english.pkl')
# spot check
for i in range(10):
    print(sentences[i])

Saved: Data/english.pkl
resumption of the session
i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
you have requested a debate on this subject in the course of the next few days during this partsession
in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
please rise then for this minute s silence
the house rose and observed a minute s silence
madam president on a point of order
you will be aware from the press and television that there have been a number of bomb explosions and killings i

In [15]:
# load French data
filename = 'Data/europarl-v7.fr-en.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'Data/french.pkl')
# spot check
for i in range(10):
    print(sentences[i])

Saved: Data/french.pkl
reprise de la session
je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances
comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles
vous avez souhaite un debat a ce sujet dans les prochains jours au cours de cette periode de session
en attendant je souhaiterais comme un certain nombre de collegues me lont demande que nous observions une minute de silence pour toutes les victimes des tempetes notamment dans les differents pays de lunion europeenne qui ont ete touches
je vous invite a vous lever pour cette minute de silence
le parlement debout observe une minute de silence
madame la presidente cest une motion de procedure
vous avez probablement appris par la presse et par la televis

### Replace barely used words by token

In [16]:
# create a frequency table for all words
def to_vocab(lines):
    vocab = Counter()
    for line in lines:
        tokens = line.split()
        vocab.update(tokens)
    return vocab

In [17]:
# remove all words with a frequency below a threshold
def trim_vocab(vocab, min_occurance):
    tokens = [k for k,c in vocab.items() if c >= min_occurance]
    return set(tokens)

In [18]:
# mark all OOV with "unk" for all lines
def update_dataset(lines, vocab):
    new_lines = list()
    for line in lines:
        new_tokens = list()
        for token in line.split():
            if token in vocab:
                new_tokens.append(token)
            else:
                new_tokens.append('unk')
        new_line = ' '.join(new_tokens)
        new_lines.append(new_line)
    return new_lines

In [19]:
# load English dataset
filename = 'Data/english.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))
# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New English Vocabulary: %d' % len(vocab))
# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'Data/english_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check
for i in range(10):
    print(lines[i])

English Vocabulary: 105357
New English Vocabulary: 41746
Saved: Data/english_vocab.pkl
resumption of the session
i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
you have requested a debate on this subject in the course of the next few days during this partsession
in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
please rise then for this minute s silence
the house rose and observed a minute s silence
madam president on a point of order
you will be aware from the press and television 

In [20]:
# load French dataset
filename = 'Data/french.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('French Vocabulary: %d' % len(vocab))
# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New French Vocabulary: %d' % len(vocab))
# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'Data/french_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check
for i in range(10):
    print(lines[i])

French Vocabulary: 141642
New French Vocabulary: 58800
Saved: Data/french_vocab.pkl
reprise de la session
je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances
comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles
vous avez souhaite un debat a ce sujet dans les prochains jours au cours de cette periode de session
en attendant je souhaiterais comme un certain nombre de collegues me lont demande que nous observions une minute de silence pour toutes les victimes des tempetes notamment dans les differents pays de lunion europeenne qui ont ete touches
je vous invite a vous lever pour cette minute de silence
le parlement debout observe une minute de silence
madame la presidente cest une motion de procedure


### Create pairs dataset and reduce by keeping only short sentenses

In [21]:
def to_pairs(sentences_language_1, sentences_language_2):
    pairs = [[a, b] for a, b in zip(sentences_language_1, sentences_language_2)]
    return np.array(pairs)

In [22]:
def reduce_dataset(sentences, max_size=15):
    sentences_light = []
    indexes = []
    sentences_lengths = sentence_lengths(sentences)
    for i in range(len(sentences)):
        if sentences_lengths[i] <= max_size:
            sentences_light.append(sentences[i])
            indexes.append(i)
    return sentences_light, indexes

In [23]:
# load French dataset
french_filename = 'Data/french_vocab.pkl'
french_sentences = load_clean_sentences(french_filename)
french_sentences_light, indexes = reduce_dataset(french_sentences, max_sentence_len_to_reduce)

# load English dataset
english_filename = 'Data/english_vocab.pkl'
english_sentences = load_clean_sentences(english_filename)
english_sentences_light, indexes = reduce_dataset(english_sentences, max_sentence_len_to_reduce)

french_sentences_light = [french_sentences[i] for i in indexes]
print("Dataset size : " + str(len(french_sentences_light)))

# create pairs
translation_dataset = to_pairs(french_sentences_light, english_sentences_light)
filename = 'Data/french_to_english.pkl'
save_clean_sentences(translation_dataset, filename)
# # spot check
for i in range(100):
    print('[%s] => [%s]' % (translation_dataset[i,0], translation_dataset[i,1]))

Dataset size : 28261
Saved: Data/french_to_english.pkl
[ordre des travaux] => [agenda]
[en ce qui concerne le mercredi] => [relating to wednesday]
[le premier cest la subsidiarite] => [one is subsidiarity]
[pour quoi faire] => [with what aim]
[pourquoi] => [why]
[non] => [no]
[le parlement europeen y est favorable] => [parliament supports this]
[votes] => [vote]
[heatonharris ppede en monsieur le president cest une motion de procedure] => [heatonharris ppede]
[rapport koch] => [koch report]
[rapport koch] => [koch report]
[rapport schroedter] => [schroedter report]
[] => []
[] => []
[] => []
[rapport berend] => [berend report]
[] => []
[rapport rapkay] => [rapkay report]
[rapport jonckheer] => [jonckheer report]
[rapport langen] => [langen report]
[pourquoi] => [why not]
[elles veulent des reponses] => [they want answers]
[tempetes en europe] => [storms in europe]
[je m explique] => [let me explain]
[chers collegues cest faux] => [this is wrong]
[cest important] => [that is important]


### Remove anomalies

In [26]:
translation_dataset = load_clean_sentences('Data/french_to_english.pkl')
pairs_length = np.array([sentence_lengths(translation_dataset[:, 0]), sentence_lengths(translation_dataset[:, 1])])

# select potential anomalies
potential_anomalies = []
clean_indexes = []
for i in range(len(translation_dataset)):
    if threshold_len*pairs_length[1, i] < pairs_length[0, i] or pairs_length[0,i] == 0 or pairs_length[1, i] == 0:
        potential_anomalies.append(translation_dataset[i, :])
    else :
        clean_indexes.append(i)
potential_anomalies = np.array(potential_anomalies)

print("Number of potential anomalies (Threshold : " + str(threshold_len) + ") : " + str(len(potential_anomalies)))
for i in range(100):
    print('[%s] => [%s]' % (potential_anomalies[i,0], potential_anomalies[i,1]))
    
# Save clean dataset
translation_dataset = translation_dataset[clean_indexes]
print("New dataset size : " + str(len(translation_dataset)))
filename = 'Data/french_to_english_clean.pkl'
save_clean_sentences(translation_dataset, filename)

Number of potential anomalies (Threshold : 2) : 8284
[ordre des travaux] => [agenda]
[heatonharris ppede en monsieur le president cest une motion de procedure] => [heatonharris ppede]
[] => []
[] => []
[] => []
[] => []
[nous en sommes donc a la transmission] => [hence the transmission]
[ordre du jour] => [agenda]
[] => []
[nous devons faire en sorte que lunification europeenne sepanouisse egalement sur un plan politique] => []
[cest une situation bien triste] => []
[aucune de ces deux hypotheses nest exacte] => [neither is correct]
[] => []
[] => []
[] => []
[] => []
[ordre des travaux] => [agenda]
[le point renferme par consequent une idee capitale] => [on the contrary]
[ordre du jour] => [agenda]
[quant aux autres points monsieur le president je souleverai par exemple celui de la politique economique et sociale] => []
[] => []
[souhaits de bienvenue] => [welcome]
[] => []
[] => []
[] => []
[jappelle la] => []
[jappelle la] => []
[jappelle la] => []
[souhaits de bienvenue] => [welcom

### Final reducing + Split Train/Test

In [27]:
translation_dataset = load_clean_sentences('Data/french_to_english_clean.pkl')
translation_dataset_light = translation_dataset[:10000,:]
shuffle(translation_dataset)
train_size = round(len(translation_dataset_light)*0.9)
print("Train size : " + str(train_size))
# split into train/test
train, test = translation_dataset_light[:train_size], translation_dataset_light[train_size:]
# save
save_clean_sentences(translation_dataset_light, 'Data/french_to_english.pkl')
save_clean_sentences(train, 'Data/french_to_english-train.pkl')
save_clean_sentences(test, 'Data/french_to_english-test.pkl')

Train size : 9000
Saved: Data/french_to_english.pkl
Saved: Data/french_to_english-train.pkl
Saved: Data/french_to_english-test.pkl


---

## III - Model definition
<span id='model_definition'></span>

In [8]:
num_units = 256
batch_size = 32

In [9]:
class LST_Cell:
    
    def __init__(self, input_size, num_nodes):
        self.input_gate_input_weights = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
        self.input_gate_prevOutput_weights = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
        self.input_gate_bias = tf.Variable(tf.zeros([1, num_nodes]))
        
        # Forget gate: input, previous output, and bias.
        self.forget_gate_input_weights = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
        self.forget_gate_prevOutput_weights = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
        self.forget_gate_bias = tf.Variable(tf.zeros([1, num_nodes]))
        
        # State gate: input, state and bias.                             
        self.state_gate_input_weights = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
        self.state_gate_prevOutput_weights = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
        self.state_gate_bias = tf.Variable(tf.zeros([1, num_nodes]))
        
        # Output gate: input, previous output, and bias.
        self.output_gate_input_weights = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
        self.output_gate_prevOutput_weights = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
        self.output_gate_bias = tf.Variable(tf.zeros([1, num_nodes]))


    # Definition of the cell computation.
    def compute_cell(self, i, o, state):
        input_gate = tf.sigmoid(tf.matmul(i, self.input_gate_input_weights) + \
                                tf.matmul(o, self.input_gate_prevOutput_weights) + self.input_gate_bias)
        forget_gate = tf.sigmoid(tf.matmul(i, self.forget_gate_input_weights) + \
                                 tf.matmul(o, self.forget_gate_prevOutput_weights) + self.forget_gate_bias)
        update = tf.tanh(tf.matmul(i, self.state_gate_input_weights) + \
                                 tf.matmul(o, self.state_gate_prevOutput_weights) + self.state_gate_bias)
        state = forget_gate * state + input_gate * update
        output_gate = tf.sigmoid(tf.matmul(i, self.output_gate_input_weights) + \
                                 tf.matmul(o, self.output_gate_prevOutput_weights) + self.output_gate_bias)
        return output_gate * tf.tanh(state), state
    

In [62]:
def LSTM_Encoder(input_size, num_nodes, max_length, X):
    lstm_cell = LST_Cell(input_size, num_nodes)
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in range(max_length):
        output, state = lstm_cell.compute_cell(X[:, i, :], output, state)
        outputs.append(output)
    
    return outputs, output, state

In [63]:
def LSTM_Decoder(input_size, num_nodes, max_length, last_output, last_state, output_vocab_size):
    lstm_cell = LST_Cell(input_size, num_nodes)
    
    W_out = tf.Variable(tf.truncated_normal([num_nodes, output_vocab_size], -0.1, 0.1))
    b_out = tf.Variable(tf.zeros([output_vocab_size]))

    # Unrolled LSTM loop.
    outputs = list()
    output = last_output
    state = last_state
    for i in range(max_length):
        output, state = lstm_cell.compute_cell(last_output, output, state)
        final_output = tf.matmul(output, W_out) + b_out
        outputs.append(final_output)
    
    return outputs, output, state 

In [64]:
def seq2seq_model(input_vocab_size, input_max_length, output_vocab_size, output_max_length, padding_onehot):
    graph = tf.Graph()
    with graph.as_default():
        X  = tf.placeholder(tf.float32, shape=(batch_size, input_max_length, input_vocab_size))
        Y  = tf.placeholder(tf.float32, shape=(batch_size, output_max_length, output_vocab_size))
        encoder_outputs, last_encoder_output, last_encoder_state = LSTM_Encoder(input_vocab_size, num_units, input_max_length, X)
        decoder_outputs, _, _ = LSTM_Decoder(num_units, num_units, output_max_length, last_encoder_output, last_encoder_state, output_vocab_size)
        
        decoder_outputs = tf.convert_to_tensor(decoder_outputs, dtype=tf.float32)
        decoder_outputs = tf.reshape(decoder_outputs, [batch_size, output_max_length, output_vocab_size])
        
        predicted_encoded_word = tf.nn.softmax(decoder_outputs)
        
        padding_mask = [[[1 if x == 0 else 0 for x in padding_onehot] for i in range(output_max_length)] for j in range(batch_size)]
        target_weights = tf.reduce_sum(tf.multiply(Y, padding_mask), 2)
        loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(onehot_labels=Y, logits=decoder_outputs, weights=target_weights))
        optimizer = tf.train.AdamOptimizer(0.0001)
        
        # Calculate and clip gradients
        gradients = optimizer.compute_gradients(loss)
#         clipped_gradients, _ = tf.clip_by_global_norm(
#             gradients, 1)
#         update_step = optimizer.apply_gradients(zip(clipped_gradients, params))

        update_step = optimizer.minimize(loss)
        
        ################" TENSORBOARD #################
        tf.summary.scalar('Cross_Entropy', loss)
        grad_summ_op = tf.summary.merge([tf.summary.histogram("%s-grad" % g[1].name, g[0]) for g in gradients])
#         tf.summary.histogram("Clipped Gradients", clipped_gradients)
        merged = tf.summary.merge_all()
        
    return graph, X, Y,\
            [encoder_outputs, last_encoder_output, last_encoder_state, decoder_outputs, target_weights],\
            predicted_encoded_word, loss, update_step, merged

## IV - Training
<span id='training'></span>

### Prepare Data

In [33]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [34]:
# encode and pad sequences
def token_encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [35]:
# one hot encode sequence
def one_hot_encode_sequences(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [36]:
# load datasets
dataset = load_clean_sentences('Data/french_to_english.pkl')
train = load_clean_sentences('Data/french_to_english-train.pkl')
test = load_clean_sentences('Data/french_to_english-test.pkl')

In [37]:
# prepare French tokenizer
fr_tokenizer = create_tokenizer(dataset[:, 0])
fr_vocab_size = len(fr_tokenizer.word_index) + 1
fr_length = max_length(dataset[:, 0])
print('French Vocabulary Size: %d' % fr_vocab_size)
print('French Max Length: %d' % (fr_length))
# prepare English tokenizer
en_tokenizer = create_tokenizer(dataset[:, 1])
en_vocab_size = len(en_tokenizer.word_index) + 1
en_length = max_length(dataset[:, 1])
print('English Vocabulary Size: %d' % en_vocab_size)
print('English Max Length: %d' % (en_length))

French Vocabulary Size: 3422
French Max Length: 6
English Vocabulary Size: 2901
English Max Length: 3


In [38]:
# prepare training data
trainX = token_encode_sequences(fr_tokenizer, fr_length, train[:, 0])
trainX = one_hot_encode_sequences(trainX, fr_vocab_size)
trainY = token_encode_sequences(en_tokenizer, en_length, train[:, 1])
trainY = one_hot_encode_sequences(trainY, en_vocab_size)
# prepare validation data
testX = token_encode_sequences(fr_tokenizer, fr_length, test[:, 0])
testX = one_hot_encode_sequences(testX, fr_vocab_size)
testY = token_encode_sequences(en_tokenizer, en_length, test[:, 1])
testY = one_hot_encode_sequences(testY, en_vocab_size)

In [47]:
trainX = trainX[:5]
trainY = trainY[:5]

In [48]:
for i in range(4):
    print('Input : [%s] => [%s]' % (train[i, 0], trainX[i]))
    print('Output : [%s] => [%s]' % (train[i, 1], trainY[i]))

Input : [pourquoi] => [[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]]
Output : [why is that] => [[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]]
Input : [un bilan catastrophique] => [[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]]
Output : [that is atrocious] => [[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]]
Input : [legiferons honnetement] => [[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]]
Output : [legislate honestly] => [[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]]
Input : [estce clair] => [[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0.

In [49]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [50]:
def get_decoded_sequence(encoded_sequence, tokenizer):
    sentence = []
    for encoded_word in encoded_sequence:
        word_token_index = np.argmax(encoded_word)
        word = word_for_id(word_token_index, tokenizer)
        sentence.append(word)
    return sentence

In [51]:
padding_output_onehot = token_encode_sequences(en_tokenizer, en_length, ["sample"])
padding_output_onehot = one_hot_encode_sequences(padding_output_onehot, en_vocab_size)[0][1]

### Train

In [52]:
logdir = "nmt_tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
save_path = "nmt_saves/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

In [72]:
num_epochs = 100000
load_model = True
load_epoch = 600
load_path = "20180617-201559"

In [67]:
graph, X, Y, encoder_decoder_outputs, predicted_encoded_word, loss, update_step, merged_tb = seq2seq_model(fr_vocab_size, fr_length, en_vocab_size, en_length, padding_output_onehot)
nb_steps = trainX.shape[0] // batch_size
train_writer = tf.summary.FileWriter(logdir+"/train", graph)
validation_writer = tf.summary.FileWriter(logdir+"/validation", graph)
start_epoch = load_epoch if load_model else 0
total_steps = nb_steps * start_epoch
with tf.Session(graph=graph) as session:
        saver = tf.train.Saver()
        if load_model:
            saver.restore(session, "nmt_saves/" + str(load_path) + "/model-" + str(load_epoch) + ".ckpt")
            print("Restored model " + str(load_epoch))
        else:
            tf.global_variables_initializer().run()
            print("Initialized")
            
        for epoch in range(start_epoch, num_epochs):
            print("Epoch : " + str(epoch))
            ids = []
            test_ids = []
            for step in range(nb_steps):
                total_steps+=1
                
                batch_ids, ids = get_minibatch(batch_size, ids, trainX.shape[0])
                batch_data = trainX[batch_ids, :]
                batch_labels = trainY[batch_ids, :]

                feed_dict = {X : batch_data, Y : batch_labels}
                _, train_l, train_predictions, train_summary = session.run(
                  [update_step, loss, predicted_encoded_word, merged_tb], feed_dict=feed_dict)

                if total_steps % 1 == 0:
                    print("Minibatch loss at epoch %d and step %d: %f" % (epoch, step, train_l))
                    test_batch_ids, test_ids = get_minibatch(batch_size, test_ids, testX.shape[0])
                    test_batch_data = testX[test_batch_ids, :]
                    test_batch_labels = testY[test_batch_ids, :]

                    feed_dict = {X : test_batch_data, Y : test_batch_labels}
                    test_l, test_outputs, test_predictions, validation_summary = session.run(
                      [loss, encoder_decoder_outputs, predicted_encoded_word, merged_tb], feed_dict=feed_dict)
                    print("Validation loss at epoch %d and step %d: %f" % (epoch, step, test_l))
                    train_writer.add_summary(train_summary, total_steps)
                    validation_writer.add_summary(validation_summary, total_steps)
                    for i in range(3):
                        print('Input : [%s]' % get_decoded_sequence(test_batch_data[i], fr_tokenizer))
                        print('Output : [%s]' % get_decoded_sequence(test_batch_labels[i], en_tokenizer))
                        print('Predicted Output : [%s]' % get_decoded_sequence(test_predictions[i], en_tokenizer))
                        print('Predicted Distribution : [%s]' % test_outputs[3][i])
                        print('Word weights : [%s]' % test_outputs[4][i])
                    
            if epoch % 100 == 0 and epoch != 0:
                saved_path = saver.save(session, save_path + "model-" + str(epoch) + ".ckpt")
                print("Epoch : %s => Model saved in path: %s" % (epoch, saved_path))

INFO:tensorflow:Summary name Variable:0-grad is illegal; using Variable_0-grad instead.
INFO:tensorflow:Summary name Variable_1:0-grad is illegal; using Variable_1_0-grad instead.
INFO:tensorflow:Summary name Variable_2:0-grad is illegal; using Variable_2_0-grad instead.
INFO:tensorflow:Summary name Variable_3:0-grad is illegal; using Variable_3_0-grad instead.
INFO:tensorflow:Summary name Variable_4:0-grad is illegal; using Variable_4_0-grad instead.
INFO:tensorflow:Summary name Variable_5:0-grad is illegal; using Variable_5_0-grad instead.
INFO:tensorflow:Summary name Variable_6:0-grad is illegal; using Variable_6_0-grad instead.
INFO:tensorflow:Summary name Variable_7:0-grad is illegal; using Variable_7_0-grad instead.
INFO:tensorflow:Summary name Variable_8:0-grad is illegal; using Variable_8_0-grad instead.
INFO:tensorflow:Summary name Variable_9:0-grad is illegal; using Variable_9_0-grad instead.
INFO:tensorflow:Summary name Variable_10:0-grad is illegal; using Variable_10_0-grad

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\Programmes\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-d2be95f67787>", line 29, in <module>
    [update_step, loss, predicted_encoded_word, merged_tb], feed_dict=feed_dict)
  File "D:\Programmes\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 895, in run
    run_metadata_ptr)
  File "D:\Programmes\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1128, in _run
    feed_dict_tensor, options, run_metadata)
  File "D:\Programmes\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1344, in _do_run
    options, run_metadata)
  File "D:\Programmes\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1350, in _do_call
    return fn(*args)
  File "D:\Programmes\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1329, in _run_fn
 

KeyboardInterrupt: 

### Inference

In [10]:
graph, X, Y, encoder_decoder_outputs, predicted_encoded_word, loss, update_step, merged_tb = seq2seq_model(fr_vocab_size, fr_length, en_vocab_size, en_length, padding_output_onehot)
with tf.Session(graph=graph) as session:
        saver = tf.train.Saver()
        saver.restore(session, "nmt_saves/" + str(load_path) + "/model-" + str(load_epoch) + ".ckpt")
        print("Restored model " + str(load_epoch))
        
        ids, batch_ids = [], []
        test_batch_ids, ids = get_minibatch(batch_size, ids, trainX.shape[0])
        test_batch_data = trainX[test_batch_ids, :]
        test_batch_labels = trainY[test_batch_ids, :]

#         test_batch_ids, test_ids = get_minibatch(batch_size, test_ids, testX.shape[0])
#         test_batch_data = testX[test_batch_ids, :]
#         test_batch_labels = testY[test_batch_ids, :]

        feed_dict = {X : test_batch_data, Y : test_batch_labels}
        test_l, test_outputs, test_predictions, validation_summary = session.run(
          [loss, encoder_decoder_outputs, predicted_encoded_word, merged_tb], feed_dict=feed_dict)

        for i in range(5):
            print('Input : [%s]' % get_decoded_sequence(test_batch_data[i], fr_tokenizer))
            print('Output : [%s]' % get_decoded_sequence(test_batch_labels[i], en_tokenizer))
            print('Predicted Output : [%s]' % get_decoded_sequence(test_predictions[i], en_tokenizer))
            print('Predicted Distribution : [%s]' % test_outputs[3][i])
            print('Word weights : [%s]' % test_outputs[4][i])

NameError: name 'seq2seq_model' is not defined