In [2]:
from collections import Counter

import nltk
nltk.download('punkt')
import NMT_Model
import nmt_data_utils
import nmt_model_utils

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinzgoldwin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Read both halves of the parallel corpus into lists of raw lines
# (trailing newlines are still attached at this point).
with open('news-commentary-v8.de-en.en', mode='r', encoding='utf-8') as f:
    en = list(f)

with open('news-commentary-v8.de-en.de', mode='r', encoding='utf-8') as f:
    de = list(f)

In [4]:
# Show the first 5 (English, German) line pairs as raw tuples.
for en_line, de_line in zip(en[:5], de[:5]):
    print((en_line, de_line), '\n')

('SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.\n', 'SAN FRANCISCO – Es war noch nie leicht, ein rationales Gespräch über den Wert von Gold zu führen.\n') 

('Lately, with gold prices up more than 300% over the last decade, it is harder than ever.\n', 'In letzter Zeit allerdings ist dies schwieriger denn je, ist doch der Goldpreis im letzten Jahrzehnt um über 300 Prozent angestiegen.\n') 

('Just last December, fellow economists Martin Feldstein and Nouriel Roubini each penned op-eds bravely questioning bullish market sentiment, sensibly pointing out gold’s risks.\n', 'Erst letzten Dezember verfassten meine Kollegen Martin Feldstein und Nouriel Roubini Kommentare, in denen sie mutig die vorherrschende optimistische Marktstimmung hinterfragten und sehr überlegt auf die Risiken des Goldes \xa0hinwiesen.\n') 

('Wouldn’t you know it?\n', 'Und es kam, wie es kommen musste.\n') 

('Since their articles appeared, the price of gold has moved u

In [5]:
# Strip leading/trailing whitespace (including the trailing newline) from every line.
en = list(map(str.strip, en))
de = list(map(str.strip, de))

In [6]:
# We will only use sentences of similar length in order to make training easier.
# Collect the character lengths of all English sentences that are longer than 20
# and shorter than 50 characters, then count how often each length occurs.
len_en = [n_chars for n_chars in map(len, en) if 20 < n_chars < 50]
len_dist = Counter(len_en).most_common()
len_dist

[(49, 599),
 (48, 599),
 (46, 583),
 (47, 547),
 (43, 514),
 (44, 512),
 (45, 511),
 (41, 509),
 (40, 503),
 (42, 490),
 (39, 477),
 (38, 443),
 (37, 438),
 (36, 421),
 (34, 412),
 (33, 365),
 (32, 358),
 (31, 353),
 (35, 346),
 (30, 326),
 (28, 324),
 (27, 273),
 (25, 260),
 (29, 254),
 (26, 250),
 (24, 233),
 (23, 232),
 (22, 214),
 (21, 208)]

In [7]:
# 11554 English sentences are between 21 and 49 characters long
# (len() of a string counts characters, not words).
len(len_en)

11554

In [8]:
# Keep only the sentence pairs whose English side is longer than 20 and shorter
# than 50 characters; German and English stay aligned by index.
kept_pairs = [
    (sent_de, sent_en)
    for sent_de, sent_en in zip(de, en)
    if 20 < len(sent_en) < 50
]
_de = [pair[0] for pair in kept_pairs]
_en = [pair[1] for pair in kept_pairs]

In [9]:
%%time

# We will not use all of the length-filtered sentence pairs, only the first
# `num_sentences` of them for the beginning.
num_sentences = 2000


def _tokenize_corpus(text, language, lower=True):
    """Tokenize every sentence and count how often each token occurs.

    Args:
        text: iterable of sentence strings.
        language: language name handed to `nltk.word_tokenize`.
        lower: if True, every token is lowercased.

    Returns:
        A tuple `(tokenized_text, most_common)`:
        `tokenized_text` is a list with one token list per input sentence
        (same order as `text`); `most_common` is the list of
        `(token, count)` pairs produced by `Counter.most_common()`.
    """
    tokenized_text = []
    counts = Counter()
    for line in text:
        tokenized = nltk.word_tokenize(line, language=language)
        if lower:
            tokenized = [word.lower() for word in tokenized]
        tokenized_text.append(tokenized)
        # Tokens are counted in reading order, so ties in most_common() keep
        # first-seen order.
        counts.update(tokenized)
    return tokenized_text, counts.most_common()


# The same preprocessing is applied to both languages; only the tokenizer
# language differs.
en_preprocessed, en_most_common = _tokenize_corpus(_en[:num_sentences], language='english')
de_preprocessed, de_most_common = _tokenize_corpus(_de[:num_sentences], language='german')

Wall time: 787 ms


In [10]:
# Number of tokenized sentences per language.
len(en_preprocessed), len(de_preprocessed)

(2000, 2000)

In [11]:
# For some of the sentences there is no German or English counterpart, i.e. the
# token list is just an empty list []. Those sentence pairs are removed here.
non_empty_pairs = [
    (sent_en, sent_de)
    for sent_en, sent_de in zip(en_preprocessed, de_preprocessed)
    if sent_en and sent_de
]
en_preprocessed_clean = [pair[0] for pair in non_empty_pairs]
de_preprocessed_clean = [pair[1] for pair in non_empty_pairs]

In [12]:
# Number of sentence pairs left after dropping pairs with an empty side.
len(en_preprocessed_clean), len(de_preprocessed_clean)

(1992, 1992)

In [13]:
# Print the first five cleaned, tokenized sentence pairs.
for tokens_en, tokens_de in zip(en_preprocessed_clean[:5], de_preprocessed_clean[:5]):
    print('English:\n', tokens_en)
    print('German:\n', tokens_de, '\n'*3)

English:
 ['wouldn', '’', 't', 'you', 'know', 'it', '?']
German:
 ['und', 'es', 'kam', ',', 'wie', 'es', 'kommen', 'musste', '.'] 



English:
 ['since', 'then', ',', 'the', 'index', 'has', 'climbed', 'above', '10,000', '.']
German:
 ['seit', 'damals', 'ist', 'er', 'auf', 'über', '10.000', 'punkte', 'gestiegen', '.'] 



English:
 ['they', 'departed', 'pledging', 'to', 'revive', 'europe', "'s", 'growth', '.']
German:
 ['mit', 'der', 'zusicherung', ',', 'das', 'wachstum', 'in', 'europa', 'wieder', 'zu', 'beleben', ',', 'gingen', 'sie', 'auseinander', '.'] 



English:
 ['we', "'ve", 'heard', 'that', 'empty', 'promise', 'before', '.']
German:
 ['dieses', 'leere', 'versprechen', 'haben', 'wir', 'schon', 'einmal', 'gehört', '.'] 



English:
 ['many', 'europeans', 'are', 'sick', 'of', 'british', 'vetoes', '.']
German:
 ['viele', 'europäer', 'sind', 'die', 'britischen', 'vetos', 'leid', '.'] 





In [14]:
# The 15 most frequent English tokens, followed by the number of distinct
# tokens found in the English and in the German sentences.
en_most_common[:15], len(en_most_common), len(de_most_common)

([('.', 1747),
  ('the', 648),
  ('is', 559),
  (',', 437),
  ('to', 297),
  ('this', 258),
  ('a', 239),
  ('but', 234),
  ('of', 213),
  ('not', 197),
  ('in', 193),
  ('are', 192),
  ('?', 184),
  ('it', 182),
  ('be', 161)],
 3174,
 7078)

In [15]:
# Now we can create our lookup dicts for English and German, i.e. our vocab.
# We also include special tokens, which are used later on in the model.
specials = ["<unk>", "<s>", "</s>", '<pad>']

en_word2ind, en_ind2word, en_vocab_size = nmt_data_utils.create_vocab(en_most_common, specials)
de_word2ind, de_ind2word, de_vocab_size = nmt_data_utils.create_vocab(de_most_common, specials)

In [16]:
# In order to feed the sentences to the network, we have to convert them to ints corresponding to their indices
# in the lookup dicts.
# We reverse the source-language sentences, i.e. the English sentences, as this makes learning easier for the
# seq2seq model. The source sentences get an end-of-sentence tag (eos); the target (German) sentences get both a
# start-of-sentence (sos) and an end-of-sentence tag.
en_inds, en_unknowns = nmt_data_utils.convert_to_inds(en_preprocessed_clean, en_word2ind, reverse = True, eos = True)
de_inds, de_unknowns = nmt_data_utils.convert_to_inds(de_preprocessed_clean, de_word2ind, sos = True, eos = True)

In [17]:
# Convert the first two English index sequences back to words to check the conversion.
[nmt_data_utils.convert_to_words(sentence, en_ind2word) for sentence in  en_inds[:2]]

[['?', 'it', 'know', 'you', 't', '’', 'wouldn', '</s>'],
 ['.',
  '10,000',
  'above',
  'climbed',
  'has',
  'index',
  'the',
  ',',
  'then',
  'since',
  '</s>']]

In [18]:
# Convert the first two German index sequences back to words to check the conversion.
[nmt_data_utils.convert_to_words(sentence, de_ind2word) for sentence in  de_inds[:2]]

[['<s>',
  'und',
  'es',
  'kam',
  ',',
  'wie',
  'es',
  'kommen',
  'musste',
  '.',
  '</s>'],
 ['<s>',
  'seit',
  'damals',
  'ist',
  'er',
  'auf',
  'über',
  '10.000',
  'punkte',
  'gestiegen',
  '.',
  '</s>']]

In [19]:
# Hyperparameters.
# These are probably not perfect, but work fine for now.
# Every value below is passed as a keyword argument of the same name to
# NMT_Model.NMT when the training model is built; their exact meaning is
# defined there.
num_layers_encoder = 4
num_layers_decoder = 4
rnn_size_encoder = 128
rnn_size_decoder = 128
embedding_dim = 300

batch_size = 64
epochs = 100
clip = 5
keep_probability = 0.8
learning_rate = 0.01
learning_rate_decay_steps = 1000
learning_rate_decay = 0.9

In [20]:
# Create the graph and train the model.
nmt_model_utils.reset_graph()

# Positional arguments: the English and German lookup dicts, a model path
# (the same path is passed as restore_path at inference time) and the mode.
nmt = NMT_Model.NMT(en_word2ind,
                    en_ind2word,
                    de_word2ind,
                    de_ind2word,
                    './models/local_one/my_model',
                    'TRAIN',
                    embedding_dim = embedding_dim,
                    num_layers_encoder = num_layers_encoder,
                    num_layers_decoder = num_layers_decoder,
                    batch_size = batch_size,
                    clip = clip,
                    keep_probability = keep_probability,
                    learning_rate = learning_rate,
                    epochs = epochs,
                    rnn_size_encoder = rnn_size_encoder,
                    rnn_size_decoder = rnn_size_decoder, 
                    learning_rate_decay_steps = learning_rate_decay_steps,
                    learning_rate_decay = learning_rate_decay)
  
nmt.build_graph()
# English index sequences are the inputs, German index sequences the targets.
nmt.train(en_inds, de_inds)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Graph built.
-------------------- Epoch 0 of

Iteration: 8 of 31	train_loss: 6.0239
Iteration: 10 of 31	train_loss: 5.9435
Iteration: 12 of 31	train_loss: 6.3232
Iteration: 14 of 31	train_loss: 6.0963
Iteration: 16 of 31	train_loss: 6.2354
Iteration: 18 of 31	train_loss: 5.9653
Iteration: 20 of 31	train_loss: 6.2521
Iteration: 22 of 31	train_loss: 6.3977
Iteration: 24 of 31	train_loss: 6.2661
Iteration: 26 of 31	train_loss: 6.1277
Iteration: 28 of 31	train_loss: 6.0440
Iteration: 30 of 31	train_loss: 5.7323
Iteration: 31 of 31	train_loss: 5.9002
Average Score for this Epoch: 6.032395839691162
--- new best score ---


-------------------- Epoch 8 of 100 --------------------
Iteration: 0 of 31	train_loss: 5.7467
Iteration: 2 of 31	train_loss: 5.6890
Iteration: 4 of 31	train_loss: 5.7433
Iteration: 6 of 31	train_loss: 5.8070
Iteration: 8 of 31	train_loss: 5.7085
Iteration: 10 of 31	train_loss: 5.9460
Iteration: 12 of 31	train_loss: 6.1672
Iteration: 14 of 31	train_loss: 6.0839
Iteration: 16 of 31	train_loss: 5.9948
Iteration: 18 of 3

Iteration: 24 of 31	train_loss: 5.2162
Iteration: 26 of 31	train_loss: 5.7135
Iteration: 28 of 31	train_loss: 5.4243
Iteration: 30 of 31	train_loss: 5.2654
Iteration: 31 of 31	train_loss: 5.2633
Average Score for this Epoch: 5.321332931518555
--- new best score ---


-------------------- Epoch 18 of 100 --------------------
Iteration: 0 of 31	train_loss: 4.8557
Iteration: 2 of 31	train_loss: 5.2551
Iteration: 4 of 31	train_loss: 5.1069
Iteration: 6 of 31	train_loss: 5.1006
Iteration: 8 of 31	train_loss: 5.2928
Iteration: 10 of 31	train_loss: 5.2884
Iteration: 12 of 31	train_loss: 5.3599
Iteration: 14 of 31	train_loss: 5.3122
Iteration: 16 of 31	train_loss: 5.4240
Iteration: 18 of 31	train_loss: 5.1597
Iteration: 20 of 31	train_loss: 5.3267
Iteration: 22 of 31	train_loss: 5.2219
Iteration: 24 of 31	train_loss: 5.3784
Iteration: 26 of 31	train_loss: 5.4980
Iteration: 28 of 31	train_loss: 5.5893
Iteration: 30 of 31	train_loss: 5.4813
Iteration: 31 of 31	train_loss: 4.8793
Average Score fo

Iteration: 0 of 31	train_loss: 4.6543
Iteration: 2 of 31	train_loss: 4.8616
Iteration: 4 of 31	train_loss: 4.5556
Iteration: 6 of 31	train_loss: 4.9726
Iteration: 8 of 31	train_loss: 4.5749
Iteration: 10 of 31	train_loss: 4.6963
Iteration: 12 of 31	train_loss: 4.6731
Iteration: 14 of 31	train_loss: 4.8812
Iteration: 16 of 31	train_loss: 4.6806
Iteration: 18 of 31	train_loss: 5.0221
Iteration: 20 of 31	train_loss: 4.9882
Iteration: 22 of 31	train_loss: 4.8831
Iteration: 24 of 31	train_loss: 4.8225
Iteration: 26 of 31	train_loss: 5.0650
Iteration: 28 of 31	train_loss: 4.8162
Iteration: 30 of 31	train_loss: 4.8003
Iteration: 31 of 31	train_loss: 4.6289
Average Score for this Epoch: 4.816605567932129
--- new best score ---


-------------------- Epoch 29 of 100 --------------------
Iteration: 0 of 31	train_loss: 4.7634
Iteration: 2 of 31	train_loss: 4.6164
Iteration: 4 of 31	train_loss: 4.8278
Iteration: 6 of 31	train_loss: 4.7641
Iteration: 8 of 31	train_loss: 4.8409
Iteration: 10 of 31	t

Iteration: 16 of 31	train_loss: 4.6569
Iteration: 18 of 31	train_loss: 4.7628
Iteration: 20 of 31	train_loss: 4.2499
Iteration: 22 of 31	train_loss: 4.1984
Iteration: 24 of 31	train_loss: 4.3874
Iteration: 26 of 31	train_loss: 4.6468
Iteration: 28 of 31	train_loss: 4.0280
Iteration: 30 of 31	train_loss: 4.3759
Iteration: 31 of 31	train_loss: 4.3472
Average Score for this Epoch: 4.471663951873779
--- new best score ---


-------------------- Epoch 39 of 100 --------------------
Iteration: 0 of 31	train_loss: 4.1960
Iteration: 2 of 31	train_loss: 4.4980
Iteration: 4 of 31	train_loss: 4.4434
Iteration: 6 of 31	train_loss: 4.5013
Iteration: 8 of 31	train_loss: 4.4618
Iteration: 10 of 31	train_loss: 4.3016
Iteration: 12 of 31	train_loss: 4.5214
Iteration: 14 of 31	train_loss: 4.4220
Iteration: 16 of 31	train_loss: 4.6204
Iteration: 18 of 31	train_loss: 4.3614
Iteration: 20 of 31	train_loss: 4.3747
Iteration: 22 of 31	train_loss: 4.3656
Iteration: 24 of 31	train_loss: 4.4485
Iteration: 26 of

Iteration: 31 of 31	train_loss: 4.0170
Average Score for this Epoch: 4.196211814880371
--- new best score ---


-------------------- Epoch 49 of 100 --------------------
Iteration: 0 of 31	train_loss: 4.2463
Iteration: 2 of 31	train_loss: 4.2675
Iteration: 4 of 31	train_loss: 4.2238
Iteration: 6 of 31	train_loss: 4.2741
Iteration: 8 of 31	train_loss: 4.1197
Iteration: 10 of 31	train_loss: 4.2523
Iteration: 12 of 31	train_loss: 4.1349
Iteration: 14 of 31	train_loss: 4.2477
Iteration: 16 of 31	train_loss: 4.1217
Iteration: 18 of 31	train_loss: 4.3293
Iteration: 20 of 31	train_loss: 4.1250
Iteration: 22 of 31	train_loss: 4.3006
Iteration: 24 of 31	train_loss: 4.2090
Iteration: 26 of 31	train_loss: 4.2953
Iteration: 28 of 31	train_loss: 4.4796
Iteration: 30 of 31	train_loss: 4.4938
Iteration: 31 of 31	train_loss: 4.3968
Average Score for this Epoch: 4.1916399002075195
--- new best score ---


-------------------- Epoch 50 of 100 --------------------
Iteration: 0 of 31	train_loss: 4.0832
It

Iteration: 10 of 31	train_loss: 3.8631
Iteration: 12 of 31	train_loss: 3.7453
Iteration: 14 of 31	train_loss: 3.8707
Iteration: 16 of 31	train_loss: 3.6843
Iteration: 18 of 31	train_loss: 3.8314
Iteration: 20 of 31	train_loss: 4.2060
Iteration: 22 of 31	train_loss: 4.3216
Iteration: 24 of 31	train_loss: 4.0986
Iteration: 26 of 31	train_loss: 4.2137
Iteration: 28 of 31	train_loss: 4.0845
Iteration: 30 of 31	train_loss: 4.2382
Iteration: 31 of 31	train_loss: 3.8607
Average Score for this Epoch: 3.9736907482147217
--- new best score ---


-------------------- Epoch 60 of 100 --------------------
Iteration: 0 of 31	train_loss: 4.0040
Iteration: 2 of 31	train_loss: 4.0583
Iteration: 4 of 31	train_loss: 3.8432
Iteration: 6 of 31	train_loss: 4.0153
Iteration: 8 of 31	train_loss: 3.4668
Iteration: 10 of 31	train_loss: 3.9540
Iteration: 12 of 31	train_loss: 4.1521
Iteration: 14 of 31	train_loss: 3.9931
Iteration: 16 of 31	train_loss: 3.9577
Iteration: 18 of 31	train_loss: 3.8436
Iteration: 20 o

Iteration: 26 of 31	train_loss: 3.6975
Iteration: 28 of 31	train_loss: 4.2965
Iteration: 30 of 31	train_loss: 4.0844
Iteration: 31 of 31	train_loss: 3.7569
Average Score for this Epoch: 3.780916213989258
--- new best score ---


-------------------- Epoch 70 of 100 --------------------
Iteration: 0 of 31	train_loss: 3.6496
Iteration: 2 of 31	train_loss: 3.5325
Iteration: 4 of 31	train_loss: 3.8304
Iteration: 6 of 31	train_loss: 3.8498
Iteration: 8 of 31	train_loss: 3.8319
Iteration: 10 of 31	train_loss: 3.8251
Iteration: 12 of 31	train_loss: 3.9673
Iteration: 14 of 31	train_loss: 3.8365
Iteration: 16 of 31	train_loss: 4.0494
Iteration: 18 of 31	train_loss: 3.8727
Iteration: 20 of 31	train_loss: 3.4715
Iteration: 22 of 31	train_loss: 3.5972
Iteration: 24 of 31	train_loss: 3.8564
Iteration: 26 of 31	train_loss: 3.9984
Iteration: 28 of 31	train_loss: 4.2110
Iteration: 30 of 31	train_loss: 3.7998
Iteration: 31 of 31	train_loss: 3.5842
Average Score for this Epoch: 3.758573532104492
--- new

Iteration: 4 of 31	train_loss: 3.7658
Iteration: 6 of 31	train_loss: 3.4158
Iteration: 8 of 31	train_loss: 3.5356
Iteration: 10 of 31	train_loss: 4.1665
Iteration: 12 of 31	train_loss: 3.4573
Iteration: 14 of 31	train_loss: 3.6801
Iteration: 16 of 31	train_loss: 3.9270
Iteration: 18 of 31	train_loss: 3.5245
Iteration: 20 of 31	train_loss: 3.8032
Iteration: 22 of 31	train_loss: 3.6263
Iteration: 24 of 31	train_loss: 3.4774
Iteration: 26 of 31	train_loss: 3.5321
Iteration: 28 of 31	train_loss: 3.6601
Iteration: 30 of 31	train_loss: 3.2903
Iteration: 31 of 31	train_loss: 3.9225
Average Score for this Epoch: 3.6144165992736816
-------------------- Epoch 81 of 100 --------------------
Iteration: 0 of 31	train_loss: 3.7123
Iteration: 2 of 31	train_loss: 3.6177
Iteration: 4 of 31	train_loss: 3.6603
Iteration: 6 of 31	train_loss: 3.5395
Iteration: 8 of 31	train_loss: 3.4092
Iteration: 10 of 31	train_loss: 3.6737
Iteration: 12 of 31	train_loss: 3.2144
Iteration: 14 of 31	train_loss: 3.6022
Iter

Iteration: 24 of 31	train_loss: 3.4973
Iteration: 26 of 31	train_loss: 3.7026
Iteration: 28 of 31	train_loss: 3.9013
Iteration: 30 of 31	train_loss: 3.3102
Iteration: 31 of 31	train_loss: 3.5658
Average Score for this Epoch: 3.4817116260528564
-------------------- Epoch 91 of 100 --------------------
Iteration: 0 of 31	train_loss: 3.6375
Iteration: 2 of 31	train_loss: 3.4034
Iteration: 4 of 31	train_loss: 3.3268
Iteration: 6 of 31	train_loss: 3.4427
Iteration: 8 of 31	train_loss: 3.0395
Iteration: 10 of 31	train_loss: 3.5248
Iteration: 12 of 31	train_loss: 3.1942
Iteration: 14 of 31	train_loss: 3.6229
Iteration: 16 of 31	train_loss: 3.2848
Iteration: 18 of 31	train_loss: 3.2298
Iteration: 20 of 31	train_loss: 3.7195
Iteration: 22 of 31	train_loss: 3.3511
Iteration: 24 of 31	train_loss: 3.5717
Iteration: 26 of 31	train_loss: 3.3146
Iteration: 28 of 31	train_loss: 3.5086
Iteration: 30 of 31	train_loss: 3.4620
Iteration: 31 of 31	train_loss: 3.6544
Average Score for this Epoch: 3.45963978

In [None]:
# German index sequences that are used as targets in the inference cells.
# NOTE(review): this is the same call, with the same arguments, that produced
# `de_inds` earlier in the notebook — presumably redundant; confirm.
_de_inds, _de_unknowns = nmt_data_utils.convert_to_inds(de_preprocessed_clean, de_word2ind, sos = True,  eos = True)

In [None]:
# The inference model does not necessarily need to get input batches: we can give it the whole
# input data at once, but the batch size then has to be set to the length of that input data.
nmt_model_utils.reset_graph()

# The model configuration has to mirror the one used in the training cell.
# The training call does not pass `use_gru`, so it is not passed here either
# (the previous `use_gru = true` between the positional arguments was a
# SyntaxError, and `true` is not a Python name). `embedding_dim` is passed
# explicitly so that it matches the value the model was trained with.
nmt = NMT_Model.NMT(en_word2ind,
                    en_ind2word,
                    de_word2ind,
                    de_ind2word,
                    './models/local_one/my_model',
                    'INFER',
                    embedding_dim = embedding_dim,
                    num_layers_encoder = num_layers_encoder,
                    num_layers_decoder = num_layers_decoder,
                    batch_size = len(en_inds[:50]),
                    keep_probability = 1.0,
                    learning_rate = 0.0,
                    beam_width = 0,
                    rnn_size_encoder = rnn_size_encoder,
                    rnn_size_decoder = rnn_size_decoder)

nmt.build_graph()
# Translate the first 50 English sentences; the matching German sentences are passed as targets.
preds = nmt.infer(en_inds[:50], restore_path =  './models/local_one/my_model', targets = _de_inds[:50])

In [None]:
# Show some of the created translations.
# Note: the way the BLEU score is calculated is probably not the perfect way to do it.
nmt_model_utils.sample_results(preds, en_ind2word, de_ind2word, en_word2ind, de_word2ind, _de_inds[:50], en_inds[:50])