In [1]:
from collections import Counter

import nltk
nltk.download('punkt')
import NMT_Model
import nmt_data_utils
import nmt_model_utils

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinzgoldwin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read both sides of the parallel corpus, one list entry per line
# (the trailing newlines are still attached at this point).
with open('news-commentary-v8.de-en.en', mode='r', encoding='utf-8') as en_file:
    en = en_file.readlines()

with open('news-commentary-v8.de-en.de', mode='r', encoding='utf-8') as de_file:
    de = de_file.readlines()

In [3]:
# Peek at the first five (English, German) sentence pairs.
for en_line, de_line in zip(en[:5], de[:5]):
    print((en_line, de_line), '\n')

('SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.\n', 'SAN FRANCISCO – Es war noch nie leicht, ein rationales Gespräch über den Wert von Gold zu führen.\n') 

('Lately, with gold prices up more than 300% over the last decade, it is harder than ever.\n', 'In letzter Zeit allerdings ist dies schwieriger denn je, ist doch der Goldpreis im letzten Jahrzehnt um über 300 Prozent angestiegen.\n') 

('Just last December, fellow economists Martin Feldstein and Nouriel Roubini each penned op-eds bravely questioning bullish market sentiment, sensibly pointing out gold’s risks.\n', 'Erst letzten Dezember verfassten meine Kollegen Martin Feldstein und Nouriel Roubini Kommentare, in denen sie mutig die vorherrschende optimistische Marktstimmung hinterfragten und sehr überlegt auf die Risiken des Goldes \xa0hinwiesen.\n') 

('Wouldn’t you know it?\n', 'Und es kam, wie es kommen musste.\n') 

('Since their articles appeared, the price of gold has moved u

In [4]:
# Strip the trailing newline (and any other surrounding whitespace) from every line.
de = list(map(str.strip, de))
en = list(map(str.strip, en))

In [5]:
# To make training easier we only use sentences of similar length: keep the
# lengths that lie strictly between 20 and 50. Note that len() on a string
# counts characters, not words. Then tabulate how often each length occurs.
len_en = [n_chars for n_chars in map(len, en) if 20 < n_chars < 50]
len_dist = Counter(len_en).most_common()
len_dist

[(49, 599),
 (48, 599),
 (46, 583),
 (47, 547),
 (43, 514),
 (44, 512),
 (45, 511),
 (41, 509),
 (40, 503),
 (42, 490),
 (39, 477),
 (38, 443),
 (37, 438),
 (36, 421),
 (34, 412),
 (33, 365),
 (32, 358),
 (31, 353),
 (35, 346),
 (30, 326),
 (28, 324),
 (27, 273),
 (25, 260),
 (29, 254),
 (26, 250),
 (24, 233),
 (23, 232),
 (22, 214),
 (21, 208)]

In [6]:
# 11554 sentences are strictly between 20 and 50 characters long
# (the filter uses len() on the sentence string, so it counts characters, not words).
len(len_en)

11554

In [7]:
# Keep only the sentence pairs whose English side passes the length filter,
# so that the German and English lists stay aligned index by index.
_kept_pairs = [
    (sent_de, sent_en)
    for sent_de, sent_en in zip(de, en)
    if 20 < len(sent_en) < 50
]
_de = [sent_de for sent_de, _ in _kept_pairs]
_en = [sent_en for _, sent_en in _kept_pairs]

In [8]:
%%time

# We do not use all filtered sentence pairs, only the first NUM_SENTENCES to begin with.
NUM_SENTENCES = 3000


def tokenize_corpus(text, language, lower=True):
    """Tokenize every sentence in `text` with NLTK and count token frequencies.

    Args:
        text: iterable of sentence strings.
        language: language name forwarded to `nltk.word_tokenize`
            (e.g. 'english' or 'german').
        lower: if True, lowercase every token after tokenizing.

    Returns:
        A tuple `(tokenized_text, most_common)`: the list of token lists
        (one per sentence, in input order) and the `Counter.most_common()`
        list of `(token, count)` pairs over all tokens.
    """
    tokenized_text = []
    token_counts = Counter()
    for line in text:
        tokenized = nltk.word_tokenize(line, language=language)
        if lower:
            tokenized = [word.lower() for word in tokenized]
        tokenized_text.append(tokenized)
        # Counting sentence by sentence keeps first-occurrence order, so ties in
        # most_common() are ordered exactly as if all tokens were counted at once.
        token_counts.update(tokenized)
    return tokenized_text, token_counts.most_common()


# Same preprocessing for both languages; only the tokenizer language differs.
en_preprocessed, en_most_common = tokenize_corpus(_en[:NUM_SENTENCES], language='english')
de_preprocessed, de_most_common = tokenize_corpus(_de[:NUM_SENTENCES], language='german')

Wall time: 420 ms


In [9]:
# Sanity check: number of tokenized sentences per language (the two should match).
len(en_preprocessed), len(de_preprocessed)

(1000, 1000)

In [10]:
# Some sentences have no German or English counterpart and tokenize to an
# empty list []; drop every pair in which either side is empty.
_non_empty_pairs = [
    (sent_en, sent_de)
    for sent_en, sent_de in zip(en_preprocessed, de_preprocessed)
    if sent_en != [] and sent_de != []
]
en_preprocessed_clean = [sent_en for sent_en, _ in _non_empty_pairs]
de_preprocessed_clean = [sent_de for _, sent_de in _non_empty_pairs]

In [11]:
# Number of sentence pairs left after dropping pairs with an empty side.
len(en_preprocessed_clean), len(de_preprocessed_clean)

(997, 997)

In [12]:
# Show the first five cleaned sentence pairs, English above German.
for english_tokens, german_tokens in zip(en_preprocessed_clean[:5], de_preprocessed_clean[:5]):
    print('English:\n', english_tokens)
    print('German:\n', german_tokens, '\n' * 3)

English:
 ['wouldn', '’', 't', 'you', 'know', 'it', '?']
German:
 ['und', 'es', 'kam', ',', 'wie', 'es', 'kommen', 'musste', '.'] 



English:
 ['since', 'then', ',', 'the', 'index', 'has', 'climbed', 'above', '10,000', '.']
German:
 ['seit', 'damals', 'ist', 'er', 'auf', 'über', '10.000', 'punkte', 'gestiegen', '.'] 



English:
 ['they', 'departed', 'pledging', 'to', 'revive', 'europe', "'s", 'growth', '.']
German:
 ['mit', 'der', 'zusicherung', ',', 'das', 'wachstum', 'in', 'europa', 'wieder', 'zu', 'beleben', ',', 'gingen', 'sie', 'auseinander', '.'] 



English:
 ['we', "'ve", 'heard', 'that', 'empty', 'promise', 'before', '.']
German:
 ['dieses', 'leere', 'versprechen', 'haben', 'wir', 'schon', 'einmal', 'gehört', '.'] 



English:
 ['many', 'europeans', 'are', 'sick', 'of', 'british', 'vetoes', '.']
German:
 ['viele', 'europäer', 'sind', 'die', 'britischen', 'vetos', 'leid', '.'] 





In [13]:
# The 15 most frequent English tokens, followed by the number of distinct
# tokens in the English and in the German data.
en_most_common[:15], len(en_most_common), len(de_most_common)

([('.', 868),
  ('the', 331),
  ('is', 263),
  (',', 198),
  ('to', 145),
  ('this', 129),
  ('a', 122),
  ('but', 108),
  ('of', 103),
  ('not', 100),
  ('are', 97),
  ('in', 97),
  ('it', 92),
  ('be', 85),
  ('?', 81)],
 2013,
 2444)

In [14]:
# now we can create our lookup dicts for english and german, i.e. our vocab.
# we also include special tokens, which are used later on in the model.
specials = ["<unk>", "<s>", "</s>", '<pad>']

en_word2ind, en_ind2word, en_vocab_size = nmt_data_utils.create_vocab(en_most_common, specials)
de_word2ind, de_ind2word, de_vocab_size = nmt_data_utils.create_vocab(de_most_common, specials)

In [15]:
# in order to feed the sentences to the network, we have to convert them to ints, corresponding to their indices
# in the lookup dicts.
# we reverse the source language sentences, i.e. the english sentences, as this makes learning easier for the
# seq2seq model. Apart from this, an EndOfSentence tag is appended to both sides, while a StartOfSentence tag is
# prepended to the german sentences only.
en_inds, en_unknowns = nmt_data_utils.convert_to_inds(en_preprocessed_clean, en_word2ind, reverse = True, eos = True)
de_inds, de_unknowns = nmt_data_utils.convert_to_inds(de_preprocessed_clean, de_word2ind, sos = True, eos = True)

In [16]:
# Decode the first two English index sequences back to words to check the
# conversion (the words come out in reversed order, followed by </s>).
[nmt_data_utils.convert_to_words(sentence, en_ind2word) for sentence in  en_inds[:2]]

[['?', 'it', 'know', 'you', 't', '’', 'wouldn', '</s>'],
 ['.',
  '10,000',
  'above',
  'climbed',
  'has',
  'index',
  'the',
  ',',
  'then',
  'since',
  '</s>']]

In [17]:
# Decode the first two German index sequences back to words to check the
# conversion (original word order, wrapped in <s> ... </s>).
[nmt_data_utils.convert_to_words(sentence, de_ind2word) for sentence in  de_inds[:2]]

[['<s>',
  'und',
  'es',
  'kam',
  ',',
  'wie',
  'es',
  'kommen',
  'musste',
  '.',
  '</s>'],
 ['<s>',
  'seit',
  'damals',
  'ist',
  'er',
  'auf',
  'über',
  '10.000',
  'punkte',
  'gestiegen',
  '.',
  '</s>']]

In [18]:
# hyperparams.
# these are probably not perfect, but work fine for now.
# every value below is passed as a keyword argument to NMT_Model.NMT in the training cell.
num_layers_encoder = 4
num_layers_decoder = 4
rnn_size_encoder = 128
rnn_size_decoder = 128
embedding_dim = 300

batch_size = 64
# NOTE(review): the saved training log in this notebook reports "of 150" epochs,
# so that output was produced with a different value than the one set here.
epochs = 250
clip = 5  # presumably the gradient clipping threshold -- confirm in NMT_Model
keep_probability = 0.8  # presumably the dropout keep probability -- confirm in NMT_Model
learning_rate = 0.01
learning_rate_decay_steps = 1000
learning_rate_decay = 0.9

In [None]:
# Start from a fresh graph, then build the training-mode model and fit it
# on the index-encoded sentence pairs.
nmt_model_utils.reset_graph()

# All keyword settings for the training model, gathered in one place.
train_settings = dict(
    embedding_dim=embedding_dim,
    num_layers_encoder=num_layers_encoder,
    num_layers_decoder=num_layers_decoder,
    batch_size=batch_size,
    clip=clip,
    keep_probability=keep_probability,
    learning_rate=learning_rate,
    epochs=epochs,
    rnn_size_encoder=rnn_size_encoder,
    rnn_size_decoder=rnn_size_decoder,
    learning_rate_decay_steps=learning_rate_decay_steps,
    learning_rate_decay=learning_rate_decay,
)

nmt = NMT_Model.NMT(
    en_word2ind,
    en_ind2word,
    de_word2ind,
    de_ind2word,
    './models/local_one/my_model',
    'TRAIN',
    **train_settings,
)

nmt.build_graph()
nmt.train(en_inds, de_inds)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Graph built.
-------------------- Epoch 0 of

Iteration: 0 of 15	train_loss: 3.7786
Iteration: 2 of 15	train_loss: 3.9165
Iteration: 4 of 15	train_loss: 3.8842
Iteration: 6 of 15	train_loss: 4.1170
Iteration: 8 of 15	train_loss: 4.0415
Iteration: 10 of 15	train_loss: 4.0676
Iteration: 12 of 15	train_loss: 4.0868
Iteration: 14 of 15	train_loss: 4.0057
Iteration: 15 of 15	train_loss: 3.7603
Average Score for this Epoch: 3.9627456665039062
--- new best score ---


-------------------- Epoch 13 of 150 --------------------
Iteration: 0 of 15	train_loss: 3.8823
Iteration: 2 of 15	train_loss: 3.7612
Iteration: 4 of 15	train_loss: 3.9176
Iteration: 6 of 15	train_loss: 3.7326
Iteration: 8 of 15	train_loss: 3.9204
Iteration: 10 of 15	train_loss: 3.8732
Iteration: 12 of 15	train_loss: 3.9084
Iteration: 14 of 15	train_loss: 4.0497
Iteration: 15 of 15	train_loss: 4.0004
Average Score for this Epoch: 3.8537750244140625
--- new best score ---


-------------------- Epoch 14 of 150 --------------------
Iteration: 0 of 15	train_loss: 3.5590
Iterat

In [None]:
# German index sequences used as targets during inference. This is the same
# call (same arguments) that produced de_inds earlier in the notebook.
_de_inds, _de_unknowns = nmt_data_utils.convert_to_inds(de_preprocessed_clean, de_word2ind, sos = True,  eos = True)

In [None]:
# The inference model does not necessarily need to be fed in batches: we can give it the whole input
# data at once, but then the batch size has to be specified as the length of the input data.
nmt_model_utils.reset_graph()

# No `use_gru` argument here: the training cell does not pass it, and the inference graph should be
# built with the same settings as the checkpoint it restores. If GRU cells are wanted (assuming
# NMT_Model.NMT accepts the flag), pass `use_gru = True` as a keyword argument AFTER the positional
# arguments, and pass it in the training cell as well.
# embedding_dim is forwarded so that it is guaranteed to match the value used for training.
nmt = NMT_Model.NMT(en_word2ind,
                    en_ind2word,
                    de_word2ind,
                    de_ind2word,
                    './models/local_one/my_model',
                    'INFER',
                    embedding_dim = embedding_dim,
                    num_layers_encoder = num_layers_encoder,
                    num_layers_decoder = num_layers_decoder,
                    batch_size = len(en_inds[:50]),
                    keep_probability = 1.0,
                    learning_rate = 0.0,
                    beam_width = 0,
                    rnn_size_encoder = rnn_size_encoder,
                    rnn_size_decoder = rnn_size_decoder)

nmt.build_graph()
preds = nmt.infer(en_inds[:50], restore_path =  './models/local_one/my_model', targets = _de_inds[:50])

In [None]:
# show some of the created translations
# Note: the way the BLEU score is computed here is probably not ideal.
nmt_model_utils.sample_results(preds, en_ind2word, de_ind2word, en_word2ind, de_word2ind, _de_inds[:50], en_inds[:50])