In [1]:
%%html
<style>
/* Use the full browser width for the notebook container. */
.container {
    width: 100%;
    padding: 0px !important;
}

/* Compact layout for narrow windows. */
@media (max-width:1200px) {
    #notebook[tabindex="-1"] {
        padding-top: 0px !important;
    }
    .prompt {
        min-width: 8ex;
    }
    .input_prompt {
        font-weight: bold;
        position: absolute;
        padding-right: 0px;
        padding-top: 2px;
        padding-bottom: 0px;
        padding-left: 8px;
        font-size: 10px !important;
    }
    .CodeMirror-lines {
        padding-top: 12px !important;
    }
    pre {
        white-space: pre-wrap !important;
    }
}
</style>

## Imports and experiment parameters

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os, re, sys
import tensorflow as tf
import tqdm
from itertools import compress
import pickle
from preprocessing import *
import glob

tf.__version__

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


'1.8.0'

##  Data preprocessing

In [3]:
# Directory with the raw Multi30k files; list it to see which splits are present.
path = 'data/multi30k/'
os.listdir(path)

['train.de', 'train.en', 'val.en', 'test.en', 'test.de', 'val.de']

### Run preprocessing Multi30k

In [4]:
# Root directory of the Multi30k corpus.
p = os.path.join('data', 'multi30k')

# Per-split paths: the English file is the source, the German file the target.
dict_srcl_tgtl_path_multi30k = {
    split: {
        'src': os.path.join(p, split + '.en'),
        'tgt': os.path.join(p, split + '.de'),
    }
    for split in ('train', 'val', 'test')
}

# Choose ONE of the two `path_save` values:
# -> for preprocessing (no '.p' extension: the data is processed from scratch)
path_save = 'data/saved_preprocess/multi30k'
# -> to load preprocessing (a '.p' pickle produced by an earlier run)
# path_save = 'data/saved_preprocess/multi30k_20000_50_glove_charemb.p'

# Decide on the real file extension rather than on the text after the last dot,
# so that dots in directory names cannot be mistaken for an extension.
if os.path.splitext(path_save)[1] != '.p':
    # preprocess data...
    Preprocess_multi30K = PreprocessorMT(
        dict_srcl_tgtl_path_multi30k,
        path_save=path_save,
        max_vocab_src=20000,
        max_vocab_tgt=20000,
        max_seq_length=50,
        glove_path='data/GloVe/glove.840B.300d.txt',
        charemb_path='data/kazuma/charNgram.txt',
    )

    Preprocess_multi30K.process_and_save()
else:
    # or load preprocessed file
    print ('Loading the preprocessed data...')
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself.
    # The context manager closes the file handle even if unpickling fails.
    with open(path_save, 'rb') as pickle_file:
        Preprocess_multi30K = pickle.load(pickle_file)
    print ('Done loading the preprocessed data!')

Loading the preprocessed data...
Done loading the preprocessed data!


In [5]:
# Report how many examples each split contains.
for mode in ('train', 'val', 'test'):
    set_size = Preprocess_multi30K.get_length_set(mode)
    print ("Size of set {}: {}".format(mode, set_size))

Size of set train: 4750
Size of set val: 148
Size of set test: 181


## Modelling

In [6]:
from NMT import *

# Wrap the preprocessed corpus in the dataset object consumed by the NMT model.
dataset_MT = dataset(Preprocess_multi30K)

# Baseline hyper-parameters; the cells below override a few of them per run.
# Each key appears exactly once: the earlier duplicate `nb_layers_enc` /
# `nb_layers_dec` entries set to None were silently overridden by the values
# below and have been removed.
default_params = {
    'dataset': None,
    'save_path': '',
    'mode': 'TRAIN',
    'embedding_dim': 300,
    'train_emb_src': True,
    'init_emb': None,
    'lstm_size_enc': 600,
    'lstm_size_dec': 600,
    'nb_layers_enc': 2,
    'nb_layers_dec': 2,
    'keep_probability': 0.8,
    'batch_size': 64,
    'epochs': 20,
    'EOS': "</s>",
    'SOS': "<s>",
    'PAD': '<pad>',
    'clip': 5,
    'lr': 0.001,
    'lr_decay': 0.9,
    'lr_decay_steps': 100,
    'print_update': 25,
    'patience': 8
}

### Start training of seq2seq model

In [7]:
reset_graph()

# Work on a copy: assigning `default_params` directly would alias the dict, so
# every override below would silently rewrite the shared defaults.
train_params = dict(default_params)

# Initial source embedding matrix: one row per vocabulary index, in index order.
vocab_src = dataset_MT.data.vocab_src
init_emb_src = np.vstack(
    [vocab_src.embeddings[vocab_src.idx2lab[i]] for i in range(vocab_src.size)]
).astype(np.float32)

train_params.update({
    'dataset': dataset_MT,
    'lstm_size_enc': 600,
    'lstm_size_dec': 600,
    'nb_layers_enc': 2,
    'nb_layers_dec': 2,
    'embedding_dim': 400,  # because GLoVe+charemb
    'init_emb': init_emb_src,
    'batch_size': 256,
    'epochs': 1,
    'mode': 'TRAIN',
    'save_path': 'save_run/MT_S/run1',
})

nmt = NMT(train_params)

nmt.build_model()

model_summary()

Graph of model is built.
---------
Variables: name (type shape) [size]
---------
src_embedding:0 (float32_ref 3023x400) [1209200, bytes: 4836800]
tgt_embedding:0 (float32_ref 3819x400) [1527600, bytes: 6110400]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/decoder/memory_layer/kernel:0 (float32_ref 600x600) [360000, bytes: 1440000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel:0 (float32_ref 1600x2400) [3840000, bytes: 15360000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias:0 (float32_ref 2400) [2400, bytes: 9600]
dynamic_seq2seq/

In [8]:
# Train the model built in the previous cell.
# NOTE(review): `print_nb` / `print_bleu` presumably control how many examples
# are shown and how often BLEU is reported — confirm against NMT.train.
nmt.train(
    save=True,
    print_nb=2,
    print_bleu=100,
)

-------------------- Epoch 0 of 1 --------------------
Iteration: 0 of 17	train_loss: 8.2484


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Overall BLEU-4 on 256 sentences: 0.0000
----------------------------------------
Example n°1:

Src sentence:  a man practices boxing <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Tgt sentence:  <s> ein mann <unk> </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Tgt predicted: <s> <s> <s> . . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Bleu score:    0
----------------------------------------
Example n°2:

Src sentence:  a man and two children crossing a street . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <

### Inference

#### Inference on training set (2048 samples)

In [9]:
reset_graph()

# Number of training sentences to decode; also used as the inference batch
# size so the whole sample is predicted in a single batch.
N_TRAIN_SAMPLES = 2048

# Work on a copy: assigning `default_params` directly would alias the dict, so
# every override below would silently rewrite the shared defaults.
infer_params = dict(default_params)

# Initial source embedding matrix: one row per vocabulary index, in index order.
vocab_src = dataset_MT.data.vocab_src
init_emb_src = np.vstack(
    [vocab_src.embeddings[vocab_src.idx2lab[i]] for i in range(vocab_src.size)]
).astype(np.float32)

infer_params.update({
    'dataset': dataset_MT,
    'lstm_size_enc': 600,
    'lstm_size_dec': 600,
    'nb_layers_enc': 2,
    'nb_layers_dec': 2,
    'embedding_dim': 400,  # because GLoVe+charemb
    'init_emb': init_emb_src,
    'batch_size': N_TRAIN_SAMPLES,
    'mode': 'INFER',
    'save_path': 'save_run/MT_S/run1',
})

nmt_infer = NMT(infer_params)

nmt_infer.build_model()

model_summary()

# np.random.choice samples WITH replacement by default, so `idx` may contain
# duplicates; no seed is set, so the sample differs from run to run.
idx = np.random.choice(dataset_MT.data.get_length_set('train'), N_TRAIN_SAMPLES)
preds = nmt_infer.infer(
    dataset_MT.data.data_tok['train'],
    idx,
    restore_path=infer_params['save_path'],
)

Graph of model is built.
---------
Variables: name (type shape) [size]
---------
src_embedding:0 (float32_ref 3023x400) [1209200, bytes: 4836800]
tgt_embedding:0 (float32_ref 3819x400) [1527600, bytes: 6110400]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/decoder/memory_layer/kernel:0 (float32_ref 600x600) [360000, bytes: 1440000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel:0 (float32_ref 1600x2400) [3840000, bytes: 15360000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias:0 (float32_ref 2400) [2400, bytes: 9600]
dynamic_seq2seq/

In [10]:
# Score the predictions for the sampled training sentences and print 15 examples.
BLEU_set(
    dataset_MT,
    idx,
    preds,
    'train',
    samples=15,
)

Overall BLEU-4 on 2048 sentences: 0.0184
------------------------------------------------------------
Example n°1:

Src sentence:  four boys run away up an incline .
Tgt sentence:  vier jungen rennen einen hang hinauf davon .
Tgt predicted: <s> ein ein in einem einem . . </s> </s> </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°2:

Src sentence:  four women interact at a conference .
Tgt sentence:  vier frauen interagieren auf einer konferenz .
Tgt predicted: <s> ein mann in einem . . </s> </s> </s> </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°3:

Src sentence:  a woman in a dress doing a craft .
Tgt sentence:  eine frau in einem kleid bastelt etwas .
Tgt predicted: <s> ein ein mann in einem einem . . </s> </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°4:

Src sentence:  men are sitting on bags on the ground .
Tgt sentence:  män

#### Inference on validation set

In [11]:
reset_graph()

# Size of the validation split; used as batch size so the whole set is
# predicted in a single batch.
n_val = dataset_MT.data.get_length_set('val')

# Work on a copy: assigning `default_params` directly would alias the dict, so
# every override below would silently rewrite the shared defaults.
infer_params = dict(default_params)

# Initial source embedding matrix: one row per vocabulary index, in index order.
vocab_src = dataset_MT.data.vocab_src
init_emb_src = np.vstack(
    [vocab_src.embeddings[vocab_src.idx2lab[i]] for i in range(vocab_src.size)]
).astype(np.float32)

infer_params.update({
    'dataset': dataset_MT,
    'lstm_size_enc': 600,
    'lstm_size_dec': 600,
    'nb_layers_enc': 2,
    'nb_layers_dec': 2,
    'embedding_dim': 400,  # because GLoVe+charemb
    'init_emb': init_emb_src,
    'batch_size': n_val,
    'mode': 'INFER',
    'save_path': 'save_run/MT_S/run1',
})

nmt_infer = NMT(infer_params)

nmt_infer.build_model()

model_summary()

idx = np.arange(n_val)  # predict whole set at once
preds = nmt_infer.infer(
    dataset_MT.data.data_tok['val'],
    idx,
    restore_path=infer_params['save_path'],
)

Graph of model is built.
---------
Variables: name (type shape) [size]
---------
src_embedding:0 (float32_ref 3023x400) [1209200, bytes: 4836800]
tgt_embedding:0 (float32_ref 3819x400) [1527600, bytes: 6110400]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/decoder/memory_layer/kernel:0 (float32_ref 600x600) [360000, bytes: 1440000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel:0 (float32_ref 1600x2400) [3840000, bytes: 15360000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias:0 (float32_ref 2400) [2400, bytes: 9600]
dynamic_seq2seq/

In [12]:
# Score the validation predictions and print 15 examples.
BLEU_set(
    dataset_MT,
    idx,
    preds,
    'val',
    samples=15,
)

Overall BLEU-4 on 148 sentences: 0.0162
------------------------------------------------------------
Example n°1:

Src sentence:  a man in a cluttered office is using the telephone
Tgt sentence:  ein mann telefoniert in einem unaufgeräumten büro
Tgt predicted: <s> ein ein mann in einem einem . . </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°2:

Src sentence:  a female playing a song on her violin .
Tgt sentence:  eine frau spielt ein lied auf ihrer geige .
Tgt predicted: <s> ein ein mann in einem . . . </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°3:

Src sentence:  a young woman is making rugs in the rain forest
Tgt sentence:  eine junge frau fertigt im regenwald teppiche an
Tgt predicted: <s> ein ein mann in einem einem . . </s> </s>
Bleu score:    0
------------------------------------------------------------
Example n°4:

Src sentence:  a cute baby is smiling at another child

#### Inference on test set

In [13]:
reset_graph()

# Size of the test split; used as batch size so the whole set can be
# predicted in a single batch.
n_test = dataset_MT.data.get_length_set('test')

# Work on a copy: assigning `default_params` directly would alias the dict, so
# every override below would silently rewrite the shared defaults.
infer_params = dict(default_params)

# Initial source embedding matrix: one row per vocabulary index, in index order.
vocab_src = dataset_MT.data.vocab_src
init_emb_src = np.vstack(
    [vocab_src.embeddings[vocab_src.idx2lab[i]] for i in range(vocab_src.size)]
).astype(np.float32)

infer_params.update({
    'dataset': dataset_MT,
    'lstm_size_enc': 600,
    'lstm_size_dec': 600,
    'nb_layers_enc': 2,
    'nb_layers_dec': 2,
    'embedding_dim': 400,  # because GLoVe+charemb
    'init_emb': init_emb_src,
    'batch_size': n_test,
    'mode': 'INFER',
    'save_path': 'save_run/MT_S/run1',
})

nmt_infer = NMT(infer_params)

nmt_infer.build_model()

model_summary()

Graph of model is built.
---------
Variables: name (type shape) [size]
---------
src_embedding:0 (float32_ref 3023x400) [1209200, bytes: 4836800]
tgt_embedding:0 (float32_ref 3819x400) [1527600, bytes: 6110400]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/kernel:0 (float32_ref 700x1200) [840000, bytes: 3360000]
dynamic_seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/bias:0 (float32_ref 1200) [1200, bytes: 4800]
dynamic_seq2seq/decoder/memory_layer/kernel:0 (float32_ref 600x600) [360000, bytes: 1440000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/kernel:0 (float32_ref 1600x2400) [3840000, bytes: 15360000]
dynamic_seq2seq/decoder/attention_wrapper/multi_rnn_cell/cell_0/lstm_cell/bias:0 (float32_ref 2400) [2400, bytes: 9600]
dynamic_seq2seq/

In [14]:
# Indices of every test example: the whole set is predicted at once.
test_set_size = dataset_MT.data.get_length_set('test')
idx = np.arange(test_set_size)

# Restore the saved weights and decode the test sentences.
preds = nmt_infer.infer(
    dataset_MT.data.data_tok['test'],
    idx,
    restore_path=infer_params['save_path'],
)

Restore graph from save_run/MT_S/run1
INFO:tensorflow:Restoring parameters from save_run/MT_S/run1
Graph restored!


In [15]:
# Score the test predictions and print 15 examples.
BLEU_set(
    dataset_MT,
    idx,
    preds,
    'test',
    samples=15,
)

Overall BLEU-4 on 181 sentences: 0.0143
------------------------------------------------------------
Example n°1:

Src sentence:  people are fixing the roof of a house .
Tgt sentence:  leute reparieren das dach eines hauses .
Tgt predicted: <s> ein mann in auf einem . . </s> </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°2:

Src sentence:  a group of people standing in front of an igloo .
Tgt sentence:  eine gruppe von menschen steht vor einem iglu .
Tgt predicted: <s> ein ein mann auf einem einem . . . </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°3:

Src sentence:  a guy works on a building .
Tgt sentence:  ein typ arbeitet an einem gebäude .
Tgt predicted: <s> ein mann in einem einem . . </s> </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°4:

Src sentence:  three people sit in a cave .
Tgt sentence:  drei leute sitzen in einer höh

#### Running the encoder to get CoVe embeddings

In [16]:
# Same test-set inference as in the previous section, repeated here so `idx`
# and `preds` are defined before the CoVe extraction below.
# NOTE(review): the recorded overall BLEU differs slightly between the two runs
# (0.0143 vs 0.0144) — check whether inference is fully deterministic
# (e.g. that dropout is disabled in INFER mode).
test_set_size = dataset_MT.data.get_length_set('test')
idx = np.arange(test_set_size)  # predict whole set at once
preds = nmt_infer.infer(
    dataset_MT.data.data_tok['test'],
    idx,
    restore_path=infer_params['save_path'],
)

Restore graph from save_run/MT_S/run1
INFO:tensorflow:Restoring parameters from save_run/MT_S/run1
Graph restored!


In [17]:
# Score the test predictions again, printing only 2 examples.
BLEU_set(
    dataset_MT,
    idx,
    preds,
    'test',
    samples=2,
)

Overall BLEU-4 on 181 sentences: 0.0144
------------------------------------------------------------
Example n°1:

Src sentence:  people are fixing the roof of a house .
Tgt sentence:  leute reparieren das dach eines hauses .
Tgt predicted: <s> ein mann in auf einem . . </s> </s> </s>
Bleu score:    0.0
------------------------------------------------------------
Example n°2:

Src sentence:  a group of people standing in front of an igloo .
Tgt sentence:  eine gruppe von menschen steht vor einem iglu .
Tgt predicted: <s> ein ein mann auf einem einem . . . </s>
Bleu score:    0.0
------------------------------------------------------------


In [18]:
# Compute CoVe embeddings for the test sentences from the restored model.
# with_emb: whether to add the original embeddings (GloVe+char emb) or not.
emb = nmt_infer.CoVe(
    dataset_MT.data.data_tok['test'],
    idx,
    restore_path=infer_params['save_path'],
    with_emb=True,
)

Restore graph from save_run/MT_S/run1
INFO:tensorflow:Restoring parameters from save_run/MT_S/run1
Graph restored!


Verify that the number of tokens in a sentence matches the number of non-zero embedding vectors output by the model.
Sentence 0 has 9 tokens while sentence 1 has 11 tokens.

In [26]:
# Shape of the CoVe output; the recorded value is (181, 50, 1000).
# NOTE(review): presumably (test sentences, max_seq_length, feature size) — confirm.
emb.shape

(181, 50, 1000)

In [24]:
# Number of positions in sentence 0 whose embedding vector has no zero entry.
# `.all(axis=1)` avoids hard-coding the feature size (1000).
(emb[0] != 0).all(axis=1).sum()

9

In [25]:
# Number of positions in sentence 1 whose embedding vector has no zero entry.
# `.all(axis=1)` avoids hard-coding the feature size (1000).
(emb[1] != 0).all(axis=1).sum()

11