# OpenNMT-tf 2.1.1

## OpenNMT是一个开源生态系统，用于神经机器翻译和神经序列学习。支持pytorch与tensorflow，介绍基于tensorflow2.0的openNMT 2.1.1

In [44]:
!pip install OpenNMT-tf






In [45]:
import opennmt
import tensorflow as tf

# src

In [46]:
with open('src.txt', 'r') as f:
    src_data = f.read().split('\n')[:-1]
src_data

['which group are you in ?', 'where you are ?']

# target

In [47]:
with open('tgt.txt', 'r') as f:
    tgt_data = f.read().split('\n')[:-1]
tgt_data

['the search group .', 'i am in ir .']

# package opennmt.data

#### 字典构建 opennmt.data.Vocab(special_tokens=None)

In [5]:
special_tokens = ['<s>', '</s>', '<unk>', '<pad>']

In [6]:
vocab = opennmt.data.Vocab(special_tokens)
vocab.words

['<s>', '</s>', '<unk>', '<pad>']

In [7]:
vocab.add_from_text('src.txt')
vocab.words

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 'which',
 'group',
 'are',
 'you',
 'in',
 '?',
 'where']

In [8]:
vocab.add_from_text('tgt.txt')
vocab.words

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 'which',
 'group',
 'are',
 'you',
 'in',
 '?',
 'where',
 'the',
 'search',
 '.',
 'i',
 'am',
 'ir']

In [9]:
vocab.lookup('group')

5

In [10]:
vocab.lookup(5)

'group'

In [11]:
vocab.prune(max_size=0, min_frequency=2).words

['<s>', '</s>', '<unk>', '<pad>', 'group', 'are', 'you', 'in', '?', '.']

In [12]:
vocab.serialize('vocab.txt')

In [13]:
with open('vocab.txt') as f:
    for w in f.readlines():
        print(w[:-1])

<s>
</s>
<unk>
<pad>
which
group
are
you
in
?
where
the
search
.
i
am
ir


#### 数据噪声 noise = opennmt.data.WordOmission(1)

In [14]:
noise = opennmt.data.WordOmission(1)
noise(tf.constant(src_data[0].split())).numpy()

array([b'which', b'group', b'are', b'you', b'?'], dtype=object)

In [15]:
noise = opennmt.data.WordOmission(2)
noise(tf.constant(src_data[0].split())).numpy()

array([b'which', b'group', b'in', b'?'], dtype=object)

#### 数据噪声 opennmt.data.WordDropout

In [16]:
noise = opennmt.data.WordDropout(0.5)
noise(tf.constant(src_data[0].split())).numpy()

array([b'which', b'group', b'are', b'you', b'in'], dtype=object)

In [17]:
noise = opennmt.data.WordDropout(0.9)
noise(tf.constant(src_data[0].split())).numpy()

array([b'which'], dtype=object)

In [18]:
noise = opennmt.data.WordDropout(0.2)
src_data = [noise(tf.constant(line.split())).numpy().tolist() for line in src_data]
src_data

[[b'group', b'are', b'in'], [b'where', b'you', b'are', b'?']]

In [19]:
tgt_data = [noise(tf.constant(line.split())).numpy().tolist() for line in tgt_data]
tgt_data

[[b'the', b'search', b'group', b'.'], [b'am', b'in', b'ir', b'.']]

#### lookup

In [20]:
src_id = [[vocab.lookup(word) if word in vocab else 
           vocab.lookup('<unk>') for word in line] for line in src_data]
src_id

[[5, 6, 8], [10, 7, 6, 9]]

In [21]:
tgt_id = [[vocab.lookup(word) if word in vocab else 
           vocab.lookup('<unk>') for word in line] for line in tgt_data]
tgt_id

[[11, 12, 5, 13], [15, 8, 16, 13]]

# Package opennmt.encoders
##### class opennmt.encoders.LSTMEncoder(num_layers, num_units, bidirectional=False, residual_connections=False, dropout=0.3)

In [22]:
import numpy

In [23]:
max_len = 10
pad_src_id = tf.keras.preprocessing.sequence.pad_sequences(src_id, value = vocab.lookup('<pad>'), 
                                                       maxlen = max_len)
src_seq_len = [len(x) for x in src_data]
pad_src_id

array([[ 3,  3,  3,  3,  3,  3,  3,  5,  6,  8],
       [ 3,  3,  3,  3,  3,  3, 10,  7,  6,  9]], dtype=int32)

In [24]:
src_seq_len

[3, 4]

In [25]:
max_len = 10
pad_tgt_id = tf.keras.preprocessing.sequence.pad_sequences(tgt_id, value = vocab.lookup('<pad>'), 
                                                       maxlen = max_len)
tgt_seq_len = [len(x) for x in tgt_data]
pad_tgt_id

array([[ 3,  3,  3,  3,  3,  3, 11, 12,  5, 13],
       [ 3,  3,  3,  3,  3,  3, 15,  8, 16, 13]], dtype=int32)

In [26]:
emb_dim = 20
pad_src_emb = tf.keras.layers.Embedding(vocab.size, emb_dim, input_length=10)(pad_src_id)
pad_src_emb.shape

TensorShape([2, 10, 20])

In [27]:
enc = opennmt.encoders.LSTMEncoder(1, 15)
outputs, states, sequence_length = enc(pad_src_emb, src_seq_len)
outputs.shape

TensorShape([2, 10, 15])

In [28]:
outputs[0,-1,:]

<tf.Tensor: shape=(15,), dtype=float32, numpy=
array([ 0.00263265,  0.00270151, -0.00246869, -0.01116184,  0.0044781 ,
       -0.00544795, -0.00939717, -0.00074116,  0.00283719,  0.00317579,
       -0.0209918 , -0.00500038,  0.00414584, -0.00178641,  0.00918154],
      dtype=float32)>

In [29]:
states

((<tf.Tensor: shape=(2, 15), dtype=float32, numpy=
  array([[ 0.00263265,  0.00270151, -0.00246869, -0.01116184,  0.0044781 ,
          -0.00544795, -0.00939717, -0.00074116,  0.00283719,  0.00317579,
          -0.0209918 , -0.00500038,  0.00414584, -0.00178641,  0.00918154],
         [ 0.00260052,  0.0035042 , -0.00318885, -0.01382197,  0.00480902,
          -0.00694631, -0.01076731, -0.00148092,  0.00358938,  0.00395388,
          -0.02524526, -0.00543948,  0.00530553, -0.00180258,  0.00998819]],
        dtype=float32)>,
  <tf.Tensor: shape=(2, 15), dtype=float32, numpy=
  array([[ 0.00526918,  0.00541064, -0.00498785, -0.02257553,  0.00897897,
          -0.01069824, -0.01888357, -0.00147908,  0.00574807,  0.0063295 ,
          -0.04177443, -0.00991749,  0.0082333 , -0.00360373,  0.01835097],
         [ 0.00520667,  0.00702028, -0.00644645, -0.02795127,  0.00965023,
          -0.01363943, -0.02165095, -0.00295668,  0.00727191,  0.00786874,
          -0.0502619 , -0.01078528,  0.01052

In [30]:
outputs[:, -1, :]

<tf.Tensor: shape=(2, 15), dtype=float32, numpy=
array([[ 0.00263265,  0.00270151, -0.00246869, -0.01116184,  0.0044781 ,
        -0.00544795, -0.00939717, -0.00074116,  0.00283719,  0.00317579,
        -0.0209918 , -0.00500038,  0.00414584, -0.00178641,  0.00918154],
       [ 0.00260052,  0.0035042 , -0.00318885, -0.01382197,  0.00480902,
        -0.00694631, -0.01076731, -0.00148092,  0.00358938,  0.00395388,
        -0.02524526, -0.00543948,  0.00530553, -0.00180258,  0.00998819]],
      dtype=float32)>

In [31]:
sequence_length

[3, 4]

* ConvEncoder
* GNMTEncoder
* LSTMEncoder 
* MeanEncoder 
* SelfAttentionEncoder 
* PositionEncoder

#### class opennmt.encoders.ParallelEncoder(encoders)

In [32]:
enc1 = opennmt.encoders.LSTMEncoder(1, 20)
enc2 = opennmt.encoders.LSTMEncoder(1, 20)

In [33]:
p_enc = opennmt.encoders.ParallelEncoder([
    enc1,
    enc2,
], outputs_reducer = opennmt.layers.ConcatReducer())
outputs, states, sequence_length = p_enc(pad_src_emb, src_seq_len)
outputs[0,src_seq_len[0]-1,:]

<tf.Tensor: shape=(40,), dtype=float32, numpy=
array([ 4.27721580e-03, -1.23676815e-04,  3.24125448e-03, -1.46875530e-03,
       -1.34765105e-02, -1.64073566e-03,  1.74806651e-03, -6.06455002e-03,
       -8.01201910e-03, -6.79677445e-03,  1.38312075e-02, -4.11572959e-03,
        2.75489129e-03, -3.81474989e-03, -9.28935595e-03, -2.24214066e-02,
       -1.05688795e-02,  4.38976288e-03, -1.19652329e-02, -9.21650697e-03,
        8.09337478e-03,  3.73086799e-03,  5.36764553e-03, -9.33439191e-03,
        1.46115869e-02, -8.60170182e-03,  6.91711390e-03,  7.95671996e-03,
        3.59884161e-03, -9.58624575e-03, -1.82081982e-02,  2.73429730e-04,
        7.97878113e-03,  1.07945474e-02, -1.93337929e-02,  1.48923192e-02,
        3.65795591e-03,  4.78065666e-03, -3.62304017e-05,  1.04617607e-02],
      dtype=float32)>

In [34]:
outputs1, _, _ = enc1(pad_src_emb, src_seq_len)
outputs2, _, _ = enc2(pad_src_emb, src_seq_len)
tf.concat([outputs1, outputs2], axis = -1)[0,src_seq_len[0]-1,:]

<tf.Tensor: shape=(40,), dtype=float32, numpy=
array([ 4.27721580e-03, -1.23676815e-04,  3.24125448e-03, -1.46875530e-03,
       -1.34765105e-02, -1.64073566e-03,  1.74806651e-03, -6.06455002e-03,
       -8.01201910e-03, -6.79677445e-03,  1.38312075e-02, -4.11572959e-03,
        2.75489129e-03, -3.81474989e-03, -9.28935595e-03, -2.24214066e-02,
       -1.05688795e-02,  4.38976288e-03, -1.19652329e-02, -9.21650697e-03,
        8.09337478e-03,  3.73086799e-03,  5.36764553e-03, -9.33439191e-03,
        1.46115869e-02, -8.60170182e-03,  6.91711390e-03,  7.95671996e-03,
        3.59884161e-03, -9.58624575e-03, -1.82081982e-02,  2.73429730e-04,
        7.97878113e-03,  1.07945474e-02, -1.93337929e-02,  1.48923192e-02,
        3.65795591e-03,  4.78065666e-03, -3.62304017e-05,  1.04617607e-02],
      dtype=float32)>

In [42]:
outputs, state, _ = enc(pad_src_emb, src_seq_len)
logits = opennmt.layers.Dense(2)(state[0][0])
logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.0188968 , 0.00430593],
       [0.02246414, 0.00638116]], dtype=float32)>

# Package opennmt.models

* opennmt.models.GPT2Small
* opennmt.models.LanguageModel
* opennmt.models.LstmCnnCrfTagger
* opennmt.models.SequenceClassifier
* opennmt.models.SequenceGenerator
* opennmt.models.SequenceTagger
* opennmt.models.SequenceToSequence
* opennmt.models.Transformer
* ...

# Package opennmt.utils

* opennmt.utils.BLEUScorer
* opennmt.utils.BeamSearch
* opennmt.utils.GreedySearch
* opennmt.utils.PRFScorer
* opennmt.utils.ROUGEScorer