# OpenNMT-tf 2.1.1

https://opennmt.net/OpenNMT-tf/package/opennmt.html

## OpenNMT 是一个用于神经机器翻译和神经序列学习的开源生态系统，提供 PyTorch 与 TensorFlow 两种实现。本文介绍基于 TensorFlow 2.0 的 OpenNMT-tf 2.1.1。

In [41]:
# Pin the release this walkthrough is written for (2.1.1, per the title) so
# the API calls below match the installed package.
!pip install OpenNMT-tf==2.1.1




In [42]:
import opennmt
import tensorflow as tf

# src

In [43]:
# Read the source sentences, one per line. Stripping only the newline from
# each line keeps the final sentence even when the file has no trailing
# newline (the previous `split('\n')[:-1]` silently dropped it in that case).
with open('src.txt', 'r', encoding='utf-8') as f:
    src_data = [line.rstrip('\n') for line in f]
src_data

['which group are you in ?', 'where you are ?']

# target

In [44]:
# Read the target sentences, one per line. Stripping only the newline from
# each line keeps the final sentence even when the file has no trailing
# newline (the previous `split('\n')[:-1]` silently dropped it in that case).
with open('tgt.txt', 'r', encoding='utf-8') as f:
    tgt_data = [line.rstrip('\n') for line in f]
tgt_data

['the search group .', 'i am in ir .']

# package opennmt.data

#### 字典构建 opennmt.data.Vocab(special_tokens=None)

In [46]:
# Reserved tokens used to seed the vocabulary.
special_tokens = [
    '<s>',
    '</s>',
    '<unk>',
    '<pad>',
]

In [47]:
# Create a vocabulary that starts out with the special tokens only.
vocab = opennmt.data.Vocab(special_tokens=special_tokens)
vocab.words

['<s>', '</s>', '<unk>', '<pad>']

In [48]:
# Add the words found in the source corpus to the vocabulary, then show the
# resulting word list.
vocab.add_from_text('src.txt')
vocab.words

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 'which',
 'group',
 'are',
 'you',
 'in',
 '?',
 'where']

In [49]:
# Add the words found in the target corpus to the same vocabulary, then show
# the resulting word list.
vocab.add_from_text('tgt.txt')
vocab.words

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 'which',
 'group',
 'are',
 'you',
 'in',
 '?',
 'where',
 'the',
 'search',
 '.',
 'i',
 'am',
 'ir']

In [50]:
# Looking up a token returns its integer id in the vocabulary.
vocab.lookup('group')

5

In [51]:
# Looking up an integer id returns the token stored at that index.
vocab.lookup(5)

'group'

In [52]:
# Show the word list of a pruned vocabulary (min_frequency=2). The pruned
# vocabulary is only displayed here; it is not assigned back to `vocab`.
vocab.prune(max_size=0, min_frequency=2).words

['<s>', '</s>', '<unk>', '<pad>', 'group', 'are', 'you', 'in', '?', '.']

In [53]:
# Save the vocabulary to vocab.txt.
vocab.serialize('vocab.txt')

In [54]:
# Print the saved vocabulary, one token per line. Only the line terminator is
# stripped: the previous `w[:-1]` would cut a real character off the last
# token if the file did not end with a newline.
with open('vocab.txt', encoding='utf-8') as f:
    for w in f:
        print(w.rstrip('\n'))

<s>
</s>
<unk>
<pad>
which
group
are
you
in
?
where
the
search
.
i
am
ir


#### 数据噪声 opennmt.data.WordOmission(1)

In [55]:
# WordOmission(1) applied to the first source sentence; in the recorded run
# the result has one token fewer than the input.
noise = opennmt.data.WordOmission(1)
tokens = tf.constant(src_data[0].split())
noise(tokens).numpy()

array([b'which', b'group', b'are', b'you', b'in'], dtype=object)

In [56]:
# WordOmission(2) applied to the first source sentence; in the recorded run
# the result has two tokens fewer than the input.
noise = opennmt.data.WordOmission(2)
tokens = tf.constant(src_data[0].split())
noise(tokens).numpy()

array([b'which', b'are', b'you', b'?'], dtype=object)

#### 数据噪声 opennmt.data.WordDropout

In [57]:
# WordDropout(0.5) applied to the first source sentence. The argument is
# presumably the per-token drop probability — confirm against the API docs.
noise = opennmt.data.WordDropout(0.5)
tokens = tf.constant(src_data[0].split())
noise(tokens).numpy()

array([b'group', b'you'], dtype=object)

In [58]:
# WordDropout(0.9) applied to the first source sentence; in the recorded run
# only a single token survives.
noise = opennmt.data.WordDropout(0.9)
tokens = tf.constant(src_data[0].split())
noise(tokens).numpy()

array([b'?'], dtype=object)

In [59]:
# Apply WordDropout(0.2) to every source sentence. Each sentence becomes a
# list of byte-string tokens, and src_data is rebound to that result.
noise = opennmt.data.WordDropout(0.2)
noisy_src = []
for sentence in src_data:
    kept = noise(tf.constant(sentence.split()))
    noisy_src.append(kept.numpy().tolist())
src_data = noisy_src
src_data

[[b'group', b'are', b'you', b'in', b'?'], [b'where', b'?']]

In [60]:
# Apply the same `noise` object (WordDropout(0.2) from the previous cell) to
# every target sentence. Each sentence becomes a list of byte-string tokens,
# and tgt_data is rebound to that result.
noisy_tgt = []
for sentence in tgt_data:
    kept = noise(tf.constant(sentence.split()))
    noisy_tgt.append(kept.numpy().tolist())
tgt_data = noisy_tgt
tgt_data

[[b'the', b'search', b'group'], [b'i', b'am', b'.']]

#### lookup

In [61]:
# Map every source token to its vocabulary id, using the id of '<unk>' for
# tokens that are not in the vocabulary.
src_id = []
for sentence in src_data:
    sentence_ids = []
    for token in sentence:
        if token in vocab:
            sentence_ids.append(vocab.lookup(token))
        else:
            sentence_ids.append(vocab.lookup('<unk>'))
    src_id.append(sentence_ids)
src_id

[[5, 6, 7, 8, 9], [10, 9]]

In [62]:
# Map every target token to its vocabulary id, using the id of '<unk>' for
# tokens that are not in the vocabulary.
tgt_id = []
for sentence in tgt_data:
    sentence_ids = []
    for token in sentence:
        if token in vocab:
            sentence_ids.append(vocab.lookup(token))
        else:
            sentence_ids.append(vocab.lookup('<unk>'))
    tgt_id.append(sentence_ids)
tgt_id

[[11, 12, 5], [14, 15, 13]]

# Package opennmt.encoders
#### class opennmt.encoders.LSTMEncoder(num_layers, num_units, bidirectional=False, residual_connections=False, dropout=0.3)

In [63]:
import numpy

In [64]:
max_len = 10
# Pad AFTER the real tokens. pad_sequences defaults to padding='pre', which
# would put the real tokens at the end of each row. The encoders below are
# given src_seq_len, and later cells index position src_seq_len[0] - 1; both
# only make sense when the first src_seq_len[i] positions of row i hold the
# real tokens.
pad_src_id = tf.keras.preprocessing.sequence.pad_sequences(
    src_id, value=vocab.lookup('<pad>'), maxlen=max_len,
    padding='post', truncating='post')
# Clamp to max_len so the lengths stay valid if a sentence gets truncated.
src_seq_len = [min(len(x), max_len) for x in src_data]
pad_src_id

array([[ 5,  6,  7,  8,  9,  3,  3,  3,  3,  3],
       [10,  9,  3,  3,  3,  3,  3,  3,  3,  3]], dtype=int32)

In [65]:
# Number of tokens in each source sentence (before padding).
src_seq_len

[5, 2]

In [66]:
max_len = 10
# Pad AFTER the real tokens. pad_sequences defaults to padding='pre', which
# would put the real tokens at the end of each row and make tgt_seq_len
# describe the padding instead of the sentence.
pad_tgt_id = tf.keras.preprocessing.sequence.pad_sequences(
    tgt_id, value=vocab.lookup('<pad>'), maxlen=max_len,
    padding='post', truncating='post')
# Clamp to max_len so the lengths stay valid if a sentence gets truncated.
tgt_seq_len = [min(len(x), max_len) for x in tgt_data]
pad_tgt_id

array([[11, 12,  5,  3,  3,  3,  3,  3,  3,  3],
       [14, 15, 13,  3,  3,  3,  3,  3,  3,  3]], dtype=int32)

In [67]:
# Embed the padded source ids into emb_dim-dimensional vectors. The layer is
# created here and applied immediately, so its weights are untrained.
emb_dim = 20
# input_length follows max_len instead of repeating the literal 10, so the
# two cannot drift apart if the padding length is changed.
embedding = tf.keras.layers.Embedding(vocab.size, emb_dim, input_length=max_len)
pad_src_emb = embedding(pad_src_id)
pad_src_emb.shape

TensorShape([2, 10, 20])

In [68]:
# One-layer LSTM encoder with 15 units, run on the embedded source batch.
# The call returns the per-position outputs, the encoder state and the
# sequence lengths.
enc = opennmt.encoders.LSTMEncoder(num_layers=1, num_units=15)
outputs, states, sequence_length = enc(pad_src_emb, src_seq_len)
outputs.shape

TensorShape([2, 10, 15])

In [69]:
# Encoder output of the first sentence at the last position (index -1).
outputs[0,-1,:]

<tf.Tensor: shape=(15,), dtype=float32, numpy=
array([ 0.04737591,  0.00166916,  0.00071735, -0.01284687,  0.01742313,
       -0.04498446,  0.02106411,  0.00564089, -0.03366548,  0.01916166,
       -0.01094256, -0.00107302, -0.00540767, -0.00640422,  0.00955995],
      dtype=float32)>

In [70]:
# Encoder state returned alongside the outputs.
states

((<tf.Tensor: shape=(2, 15), dtype=float32, numpy=
  array([[ 0.04737591,  0.00166916,  0.00071735, -0.01284687,  0.01742313,
          -0.04498446,  0.02106411,  0.00564089, -0.03366548,  0.01916166,
          -0.01094256, -0.00107302, -0.00540767, -0.00640422,  0.00955995],
         [ 0.02719584,  0.00054847, -0.00033527, -0.01090283,  0.007499  ,
          -0.0263254 ,  0.01515887,  0.00537936, -0.01710399,  0.01374206,
          -0.00555701, -0.0001357 , -0.00215719, -0.00632402,  0.00690264]],
        dtype=float32)>,
  <tf.Tensor: shape=(2, 15), dtype=float32, numpy=
  array([[ 0.09311996,  0.00331793,  0.00139865, -0.02529183,  0.0345932 ,
          -0.09121793,  0.04238915,  0.01127814, -0.06712874,  0.03761647,
          -0.02202717, -0.00213244, -0.01092287, -0.01310171,  0.01894908],
         [ 0.05332884,  0.00108977, -0.00065884, -0.02153684,  0.01488502,
          -0.05316443,  0.03050686,  0.01073244, -0.03401763,  0.02713889,
          -0.01113   , -0.00027121, -0.00435

In [71]:
# Outputs at the last position for every sentence in the batch.
outputs[:, -1, :]

<tf.Tensor: shape=(2, 15), dtype=float32, numpy=
array([[ 0.04737591,  0.00166916,  0.00071735, -0.01284687,  0.01742313,
        -0.04498446,  0.02106411,  0.00564089, -0.03366548,  0.01916166,
        -0.01094256, -0.00107302, -0.00540767, -0.00640422,  0.00955995],
       [ 0.02719584,  0.00054847, -0.00033527, -0.01090283,  0.007499  ,
        -0.0263254 ,  0.01515887,  0.00537936, -0.01710399,  0.01374206,
        -0.00555701, -0.0001357 , -0.00215719, -0.00632402,  0.00690264]],
      dtype=float32)>

In [72]:
# Sequence lengths as returned by the encoder call.
sequence_length

[5, 2]

* ConvEncoder
* GNMTEncoder
* LSTMEncoder 
* MeanEncoder 
* SelfAttentionEncoder 
* ...

#### class opennmt.encoders.ParallelEncoder(encoders)

In [73]:
# Two independent one-layer LSTM encoders with 20 units each, combined by the
# ParallelEncoder examples that follow.
enc1 = opennmt.encoders.LSTMEncoder(num_layers=1, num_units=20)
enc2 = opennmt.encoders.LSTMEncoder(num_layers=1, num_units=20)

In [74]:
# Run enc1 and enc2 on the same input and merge their outputs with a
# ConcatReducer (20 + 20 = 40 features per position in the recorded run).
concat_reducer = opennmt.layers.ConcatReducer()
p_enc = opennmt.encoders.ParallelEncoder([enc1, enc2], outputs_reducer=concat_reducer)
outputs, states, sequence_length = p_enc(pad_src_emb, src_seq_len)
# Output of the first sentence at position src_seq_len[0] - 1.
position = src_seq_len[0] - 1
outputs[0, position, :]

<tf.Tensor: shape=(40,), dtype=float32, numpy=
array([-1.6207565e-02, -5.9631411e-03, -1.5756657e-02,  2.4876524e-02,
       -2.8930521e-03,  4.3384757e-04, -1.7809410e-02,  1.0466436e-02,
       -6.8887551e-03,  2.1672165e-02, -6.8886746e-03, -2.3159022e-03,
       -3.4424413e-03, -7.1377545e-03,  6.8965665e-04,  2.8718684e-02,
        2.2850968e-02,  3.8266361e-02,  3.2604844e-03, -1.0929327e-02,
        2.1015802e-02,  4.0262691e-03,  1.0469296e-02,  8.2764233e-04,
        2.9064114e-03,  1.3466297e-04,  9.1320379e-03, -1.1087879e-02,
       -1.0378932e-02,  1.6162093e-03, -6.6105556e-04,  9.6056797e-03,
        1.3419278e-02,  1.8408947e-02,  9.9306898e-03, -9.6405456e-03,
       -1.8454060e-02, -9.0356507e-06, -1.3730046e-02, -9.9743390e-03],
      dtype=float32)>

In [75]:
# Reproduce the ConcatReducer result by hand: run each encoder separately and
# concatenate the two outputs along the last axis.
outputs1, _, _ = enc1(pad_src_emb, src_seq_len)
outputs2, _, _ = enc2(pad_src_emb, src_seq_len)
manual_concat = tf.concat([outputs1, outputs2], axis=-1)
position = src_seq_len[0] - 1
manual_concat[0, position, :]

<tf.Tensor: shape=(40,), dtype=float32, numpy=
array([-1.6207565e-02, -5.9631411e-03, -1.5756657e-02,  2.4876524e-02,
       -2.8930521e-03,  4.3384757e-04, -1.7809410e-02,  1.0466436e-02,
       -6.8887551e-03,  2.1672165e-02, -6.8886746e-03, -2.3159022e-03,
       -3.4424413e-03, -7.1377545e-03,  6.8965665e-04,  2.8718684e-02,
        2.2850968e-02,  3.8266361e-02,  3.2604844e-03, -1.0929327e-02,
        2.1015802e-02,  4.0262691e-03,  1.0469296e-02,  8.2764233e-04,
        2.9064114e-03,  1.3466297e-04,  9.1320379e-03, -1.1087879e-02,
       -1.0378932e-02,  1.6162093e-03, -6.6105556e-04,  9.6056797e-03,
        1.3419278e-02,  1.8408947e-02,  9.9306898e-03, -9.6405456e-03,
       -1.8454060e-02, -9.0356507e-06, -1.3730046e-02, -9.9743390e-03],
      dtype=float32)>

In [76]:
# Same two encoders, merged with DenseReducer(10); the recorded output has 10
# features per position.
dense_reducer = opennmt.layers.DenseReducer(10)
p_enc = opennmt.encoders.ParallelEncoder([enc1, enc2], outputs_reducer=dense_reducer)
outputs, states, sequence_length = p_enc(pad_src_emb, src_seq_len)
# Output of the first sentence at position src_seq_len[0] - 1.
position = src_seq_len[0] - 1
outputs[0, position, :]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([-0.01656474,  0.01210644,  0.00080729,  0.00348837,  0.00234598,
       -0.03267   , -0.00923768,  0.02688324, -0.00740472,  0.03477048],
      dtype=float32)>

In [77]:
# Same two encoders, merged with SumReducer; the recorded output keeps 20
# features per position.
sum_reducer = opennmt.layers.SumReducer()
p_enc = opennmt.encoders.ParallelEncoder([enc1, enc2], outputs_reducer=sum_reducer)
outputs, states, sequence_length = p_enc(pad_src_emb, src_seq_len)
# Output of the first sentence at position src_seq_len[0] - 1.
position = src_seq_len[0] - 1
outputs[0, position, :]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([ 4.8082378e-03, -1.9368720e-03, -5.2873613e-03,  2.5704166e-02,
        1.3359357e-05,  5.6851056e-04, -8.6773718e-03, -6.2144268e-04,
       -1.7267687e-02,  2.3288375e-02, -7.5497301e-03,  7.2897775e-03,
        9.9768369e-03,  1.1271192e-02,  1.0620346e-02,  1.9078139e-02,
        4.3969080e-03,  3.8257327e-02, -1.0469561e-02, -2.0903666e-02],
      dtype=float32)>

In [78]:
# Feed one encoder state tensor through a 2-unit Dense layer, giving two
# logits per sentence. The Dense layer is created here, so it is untrained.
outputs, state, _ = enc(pad_src_emb, src_seq_len)
first_state = state[0][0]  # first tensor of the first layer's state
projection = opennmt.layers.Dense(2)
logits = projection(first_state)
logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1.06895175e-02, 4.13929336e-02],
       [1.24075450e-06, 2.27661896e-02]], dtype=float32)>

# Package opennmt.models

* opennmt.models.GPT2Small
* opennmt.models.LanguageModel
* opennmt.models.LstmCnnCrfTagger
* opennmt.models.SequenceClassifier
* opennmt.models.SequenceGenerator
* opennmt.models.SequenceTagger
* opennmt.models.SequenceToSequence
* opennmt.models.Transformer
* ...

# Package opennmt.utils

* opennmt.utils.BLEUScorer
* opennmt.utils.PRFScorer
* opennmt.utils.ROUGEScorer
* opennmt.utils.BeamSearch
* opennmt.utils.GreedySearch

# Package opennmt.decoders

* opennmt.decoders.AttentionalRNNDecoder
* opennmt.decoders.RNNDecoder
* opennmt.decoders.SelfAttentionDecoder