In [4]:
!pip3 install tensorflow -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tensorflow
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/72/8a/033b584f8dd863c07aa8877c2dd231777de0bb0b1338f4ac6a81999980ee/tensorflow-2.7.0-cp38-cp38-manylinux2010_x86_64.whl (489.6 MB)
[K     |████████████████████████████████| 489.6 MB 46 kB/s s eta 0:00:01     |██████████████████              | 276.2 MB 17.7 MB/s eta 0:00:13     |██████████████████▍             | 281.6 MB 17.7 MB/s eta 0:00:12     |█████████████████████▎          | 325.5 MB 27.5 MB/s eta 0:00:06     |████████████████████████▉       | 379.2 MB 56.1 MB/s eta 0:00:02     |███████████████████████████     | 412.6 MB 17.0 MB/s eta 0:00:05     |████████████████████████████▏   | 431.5 MB 9.8 MB/s eta 0:00:06
[?25hCollecting libclang>=9.0.1
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/76/2d/7b0f7f5519669f11e66028fa227d4bcda4d77411d52d26c661676db82338/libclang-12.0.0-py2.py3-none-manylinux1_x86_64.whl (13.4 MB)
[K     |██████████

# Seq2seq 结构

## RNN 结构

### 普通的 RNN

In [8]:
import tensorflow as tf
import warnings

warnings.filterwarnings("ignore")

In [9]:
# Smallest possible recurrent model: one SimpleRNN layer with 4 units that
# reads sequences declared as 3 time steps x 2 features.
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(4, input_shape=(3, 2)),
])

![](assets/rnn.jpg)

In [17]:
# One random batch: 1 sample, 3 time steps, 2 features per step.
x = tf.random.normal((1, 3, 2))

# Without return_sequences the layer only returns its last-step output.
layer = tf.keras.layers.SimpleRNN(4, input_shape=(3, 2))
output = layer(x)

# Show the shape first, then the tensor itself, one per line.
print(output.shape, output, sep="\n")

(1, 4)
tf.Tensor([[0.8968392  0.53277504 0.07782626 0.42022803]], shape=(1, 4), dtype=float32)


## 输入的 embedding 层

https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [18]:
# Token ids -> 2-dim embeddings (vocabulary of 3) -> SimpleRNN with 4 units.
# NOTE(review): the input_shape on the SimpleRNN is not the model's first layer
# here; it looks like it has no effect on the built model - confirm.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(3, 2),
    tf.keras.layers.SimpleRNN(4, input_shape=(3, 2)),
])

In [19]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 2)           6         
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 4)                 28        
Total params: 34
Trainable params: 34
Non-trainable params: 0
_________________________________________________________________


两个 trick：
- 使用已有的（预训练）embedding 矩阵来初始化 Embedding 层
- 使用 `tf.nn.embedding_lookup` 直接按索引从 embedding 矩阵中查表


https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup

In [65]:
embedding_matrix = tf.constant(
        [[0.21,0.41,0.51,0.11],
        [0.22,0.42,0.52,0.12],
        [0.23,0.43,0.53,0.13],
        [0.24,0.44,0.54,0.14]],dtype=tf.float32)

tf.keras.layers.Embedding(4, 
                          4,  
                          embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                          trainable=True)

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fd25d0d9460>

In [20]:
# embedding
# Embedding table: 4 rows (one per id), 4 values per row.
embedding = tf.constant(
    [
        [0.21, 0.41, 0.51, 0.11],
        [0.22, 0.42, 0.52, 0.12],
        [0.23, 0.43, 0.53, 0.13],
        [0.24, 0.44, 0.54, 0.14],
    ],
    dtype=tf.float32,
)

# Ids to look up; each id selects the matching row of the table.
feature_batch = tf.constant([2, 3, 1, 0])

get_embedding1 = tf.nn.embedding_lookup(params=embedding, ids=feature_batch)
print(get_embedding1)

tf.Tensor(
[[0.23 0.43 0.53 0.13]
 [0.24 0.44 0.54 0.14]
 [0.22 0.42 0.52 0.12]
 [0.21 0.41 0.51 0.11]], shape=(4, 4), dtype=float32)


### 多输出的 RNN

In [10]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.SimpleRNN(4, input_shape=(3, 2), 
                    return_sequences=True))

![](assets/rnn-mul.jpg)

In [22]:
x = tf.random.normal((1, 3, 2))

layer = tf.keras.layers.SimpleRNN(4, input_shape=(3, 2), return_sequences=True)
output = layer(x)

print(output.shape)

output

(1, 3, 4)


<tf.Tensor: shape=(1, 3, 4), dtype=float32, numpy=
array([[[-0.08282938, -0.50415444,  0.17402259,  0.38521335],
        [-0.56408477,  0.6669254 ,  0.8670968 ,  0.15518458],
        [ 0.28368235, -0.06337585,  0.7859039 , -0.07593489]]],
      dtype=float32)>

### 每个时间步增加层

In [13]:
# The RNN returns its output at every time step; TimeDistributed then applies
# the same softmax Dense layer to each of those steps.
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(4, input_shape=(3, 2), return_sequences=True),
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(4, activation='softmax')
    ),
])

![](assets/rnn-time-distributed.jpg)

### 多层叠加

In [14]:
# Three stacked SimpleRNN layers.
# Only the first layer declares the model input shape (3 time steps x 2 features).
# The inner layers receive the 4-unit sequence output of the layer before them,
# so they must not repeat input_shape=(3, 2). All but the last layer keep
# return_sequences=True so the next recurrent layer still gets a 3-D input.
model = tf.keras.Sequential()
model.add(tf.keras.layers.SimpleRNN(4, input_shape=(3, 2), return_sequences=True))
model.add(tf.keras.layers.SimpleRNN(4, return_sequences=True))
model.add(tf.keras.layers.SimpleRNN(4))

![rnn-stacking.jpg](assets/rnn-stacking.jpg)

## 双向的RNN

![](assets/bi-rnn.jpg)

In [44]:
# Two stacked bidirectional LSTMs followed by a 5-way softmax classifier.
# The first wrapper returns the full sequence so the second one has a
# sequence to read; the second returns only its final output.
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(10, return_sequences=True),
        input_shape=(5, 10),
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(10)),
    tf.keras.layers.Dense(5),
    tf.keras.layers.Activation('softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [48]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 5, 20)             1680      
_________________________________________________________________
dense_3 (Dense)              (None, 5, 5)              105       
_________________________________________________________________
activation_1 (Activation)    (None, 5, 5)              0         
Total params: 1,785
Trainable params: 1,785
Non-trainable params: 0
_________________________________________________________________


## LSTM

In [23]:
inputs = tf.random.normal([32, 10, 8])
lstm = tf.keras.layers.LSTM(4)
output = lstm(inputs)
print(output.shape)

(32, 4)


![](assets/lstm.jpg)

In [24]:
# With return_sequences and return_state the LSTM yields three tensors:
# the per-step outputs plus the two final state tensors.
lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True)
out, h_state, c_state = lstm(inputs)

# One shape per line: outputs, then the two states.
print(out.shape, h_state.shape, c_state.shape, sep="\n")

(32, 10, 4)
(32, 4)
(32, 4)


## GRU

In [18]:
inputs = tf.random.normal([32, 10, 8])
gru = tf.keras.layers.GRU(4)
output = gru(inputs)
print(output.shape)

(32, 4)


![](assets/gru.jpg)

In [25]:
# With return_sequences and return_state the GRU yields two tensors:
# the per-step outputs and a single final state.
gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True)
out, final_state = gru(inputs)

# One shape per line: outputs, then the final state.
print(out.shape, final_state.shape, sep="\n")

(32, 10, 4)
(32, 4)


## Seq2seq

![](assets/seq2seq.jpg)

In [26]:
class Encoder(tf.keras.Model):
    """Embeds token ids and runs them through a single GRU layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        enc_units: Number of GRU units.
        batch_sz: Batch size used to build the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Return the GRU's per-step outputs and its final state for input `x`."""
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [27]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that attends over the encoder output.

    Args:
        vocab_size: Size of the embedding table and of the output Dense layer.
        embedding_dim: Size of each embedding vector.
        dec_units: Number of GRU units; also passed to the attention module.
        batch_sz: Batch size (stored on the instance, not used by `call`).
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Attention module used to build the context vector.
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        """Decode one step; returns (Dense output, GRU state, attention weights)."""
        # enc_output shape == (batch size, max length, hidden size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after the embedding layer == (batch size, 1, embedding dim)
        embedded = self.embedding(x)

        # shape after concatenation == (batch size, 1, embedding dim + hidden size)
        context_step = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_step, embedded], axis=-1)

        # Feed the merged vector to the GRU.
        gru_output, state = self.gru(gru_input)

        # Collapse the time axis: shape == (batch size * 1, hidden size)
        flat_output = tf.reshape(gru_output, (-1, gru_output.shape[2]))

        # Project to the vocabulary: shape == (batch size, vocab)
        projected = self.fc(flat_output)

        return projected, state, attention_weights

### N v 1 

In [12]:
# N-to-1: a bidirectional LSTM condenses the whole sequence into one vector,
# which two Dense layers reduce to a single output value.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1))

## Attention 机制

In [28]:
!pip install tensorflow-addons



In [26]:
import tensorflow_addons as tfa

In [24]:
batch_size = 4
max_time = 7
hidden_size = 32

memory = tf.random.uniform([batch_size, max_time, hidden_size])
memory_sequence_length = tf.fill([batch_size], max_time)

print(memory.shape)
print(memory_sequence_length)

(4, 7, 32)
tf.Tensor([7 7 7 7], shape=(4,), dtype=int32)


In [27]:
attention_mechanism = tfa.seq2seq.LuongAttention(hidden_size)
attention_mechanism.setup_memory(memory, memory_sequence_length)

In [28]:
cell = tf.keras.layers.LSTMCell(hidden_size)
cell = tfa.seq2seq.AttentionWrapper(
    cell, attention_mechanism, attention_layer_size=hidden_size)

In [31]:
inputs = tf.random.uniform([batch_size, hidden_size])
state = cell.get_initial_state(inputs)

outputs, state = cell(inputs, state)
outputs.shape

TensorShape([4, 32])

In [None]:
# https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/BahdanauAttention
# Additive attention over `memory`. The layer width reuses `hidden_size` from
# the configuration cell above; the name `units` is not defined in this notebook.
tfa.seq2seq.BahdanauAttention(
    hidden_size,
    memory=memory,
    memory_sequence_length=memory_sequence_length,
)

In [None]:
# https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/LuongAttention
# Multiplicative attention over `memory`. The layer width reuses `hidden_size`
# from the configuration cell above; the name `units` is not defined in this notebook.
tfa.seq2seq.LuongAttention(
    hidden_size,
    memory=memory,
    memory_sequence_length=memory_sequence_length,
)

## tf2 一些 API 操作

学习教程 https://github.com/lyhue1991/eat_tensorflow2_in_30_days

### 一些 tensor 的变换操作

In [2]:
import tensorflow as tf

### 连接的操作

tf.concat

In [32]:
t1 = [[1, 2, 3], [4, 5, 6]] # 2, 3
t2 = [[7, 8, 9], [10, 11, 12]] 
tf.concat([t1, t2], axis=1) 

<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]], dtype=int32)>

### 增加维度的操作
tf.expand_dims

In [3]:
t3 = [[1, 2, 3],[4, 5, 6]] # shape [2, 3] 

In [4]:
tf.shape(t3)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>

In [5]:
tf.expand_dims(t3, axis=2) 

<tf.Tensor: shape=(2, 3, 1), dtype=int32, numpy=
array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]], dtype=int32)>

In [6]:
tf.expand_dims(t3, 1) 

<tf.Tensor: shape=(2, 1, 3), dtype=int32, numpy=
array([[[1, 2, 3]],

       [[4, 5, 6]]], dtype=int32)>

In [7]:
tf.expand_dims(t3, 2) 

<tf.Tensor: shape=(2, 3, 1), dtype=int32, numpy=
array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]], dtype=int32)>

### 减维操作

tf.squeeze

In [29]:
t4 = tf.expand_dims(t3, 2) 

In [30]:
t4

<tf.Tensor: shape=(2, 3, 1), dtype=int32, numpy=
array([[[1],
        [2],
        [3]],

       [[4],
        [5],
        [6]]], dtype=int32)>

In [31]:
tf.squeeze(t4, 2)

<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)>

### 更改维度操作

tf.reshape

### 类型转换操作

tf.cast

In [9]:
tf.cast

<function tensorflow.python.ops.math_ops.cast(x, dtype, name=None)>

In [10]:
x = tf.constant([1.8, 2.2], dtype=tf.float32)
tf.dtypes.cast(x, tf.int32) 
# The float32 -> int32 cast drops the fractional part for these values
# (1.8 -> 1, 2.2 -> 2).
# The same op turns a boolean mask such as [True, False] into the dtype of a
# loss tensor, which is how loss_function uses it later in this notebook
# (roughly the TensorFlow counterpart of NumPy's `astype`).

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>

### 堆叠操作

tf.stack

In [11]:
x = tf.constant([1, 4]) 
y = tf.constant([2, 5]) 
z = tf.constant([3, 6]) 

In [12]:
tf.stack([x, y, z], axis=0) 

<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[1, 4],
       [2, 5],
       [3, 6]], dtype=int32)>

In [13]:
tf.stack([x, y, z], axis=1) 

<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)>

## Layer

### Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embeds token ids with a given embedding matrix and encodes them with a GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        enc_units: Number of GRU units.
        batch_sz: Batch size used to build the initial hidden state.
        embedding_matrix: Initial values for the embedding table, expected to be
            (vocab_size, embedding_dim). Pass None to use Keras' default
            initializer instead.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embedding_matrix):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units

        # Same pattern as the embedding demo earlier in this notebook: seed the
        # table from the matrix and leave it trainable.
        if embedding_matrix is None:
            embeddings_initializer = 'uniform'
        else:
            embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix)
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            embeddings_initializer=embeddings_initializer,
            trainable=True)

        # return_sequences + return_state: call() needs both the per-step
        # outputs and the final state.
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        """Return the GRU's per-step outputs and its final state for input `x`."""
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape (batch_sz, enc_units)."""
        return tf.zeros((self.batch_sz, self.enc_units))

### Attention

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over the encoder outputs.

    Args:
        units: Width of the two projection layers applied before scoring.
    """

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, dec_hidden, enc_output):
        """Score every encoder step against the decoder state.

        Args:
            dec_hidden: Decoder hidden state; assumed (batch_size, hidden_size).
            enc_output: Encoder outputs; assumed
                (batch_size, max_length, hidden_size).

        Returns:
            A pair (context_vector, attention_weights): the attention-weighted
            sum of `enc_output` over axis 1, and the weights themselves
            (softmax over axis 1, last dimension of size 1).
        """
        # Add a time axis so the decoder state broadcasts against every
        # encoder step when the two projections are summed.
        hidden_with_time_axis = tf.expand_dims(dec_hidden, 1)

        # One scalar score per encoder step (V projects down to size 1).
        score = self.V(tf.nn.tanh(
            self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        # Normalise across the time axis so the weights of a sample sum to 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs over the time axis.
        context_vector = tf.reduce_sum(attention_weights * enc_output, axis=1)

        return context_vector, attention_weights

### Decoder

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Single-step GRU decoder that consumes a pre-computed context vector.

    Args:
        vocab_size: Size of the embedding table and of the output Dense layer.
        embedding_dim: Size of each embedding vector.
        dec_units: Number of GRU units.
        batch_sz: Batch size (stored on the instance, not used by `call`).
        embedding_matrix: Initial values for the embedding table, expected to be
            (vocab_size, embedding_dim). Pass None to use Keras' default
            initializer instead.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, embedding_matrix):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units

        # Seed the table from the matrix and leave it trainable, as in the
        # embedding demo earlier in this notebook.
        if embedding_matrix is None:
            embeddings_initializer = 'uniform'
        else:
            embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix)
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            embeddings_initializer=embeddings_initializer,
            trainable=True)

        # call() unpacks (output, state) and indexes output.shape[2], so the GRU
        # must return both the sequence and the final state.
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

        # No activation: the loss defined in this notebook uses from_logits=True.
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, context_vector):
        """Decode one step.

        Returns:
            A triple (x, out, state): the concatenated GRU input, the Dense
            output over the vocabulary, and the GRU state after this step.
        """
        # enc_output shape == (batch_size, max_length, hidden_size)
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        # output shape == (batch_size, vocab)
        out = self.fc(output)
        return x, out, state

## Model

In [None]:
class SEQ2SEQ(tf.keras.Model):
    """Teacher-forced decoder loop tying encoder, attention and decoder together.

    Args:
        encoder: Encoder layer. It is only stored here; `call` expects the
            caller to have already run it and to pass `enc_output`.
        attention: Callable as attention(dec_hidden, enc_output) returning
            (context_vector, attention_weights).
        decoder: Callable as decoder(step_input, context_vector) returning
            (gru_input, prediction, dec_hidden).

    Raises:
        ValueError: If any of the three sub-modules is missing. They take
            constructor arguments this class does not know, so they cannot be
            built here.
    """

    def __init__(self, encoder=None, attention=None, decoder=None):
        # Keras requires the base initialiser to run before layers are assigned.
        super(SEQ2SEQ, self).__init__()
        if encoder is None or attention is None or decoder is None:
            raise ValueError(
                'SEQ2SEQ needs pre-built encoder, attention and decoder modules')
        self.encoder = encoder
        self.attention = attention
        self.decoder = decoder

    def call(self, enc_output, dec_hidden, enc_inp, dec_inp):
        """Decode every position of `dec_inp` with teacher forcing.

        Args:
            enc_output: Encoder outputs to attend over.
            dec_hidden: Initial decoder hidden state.
            enc_inp: Encoder input ids; kept for interface compatibility and
                not used by the loop.
            dec_inp: Decoder input ids; assumed (batch_size, dec_len).

        Returns:
            A triple (predictions, dec_hidden, attentions): the per-step
            predictions stacked on axis 1, i.e. (batch_size, dec_len, vocab),
            e.g. [batch_size, 20, 30000]; the final decoder state; and the list
            of per-step attention weights.
        """
        predictions = []
        attentions = []

        # Context for the first step comes from the initial decoder state.
        context_vector, _ = self.attention(dec_hidden, enc_output)

        for t in range(dec_inp.shape[1]):
            # Teacher forcing: feed the t-th target token as this step's input.
            step_input = tf.expand_dims(dec_inp[:, t], 1)
            _, pred, dec_hidden = self.decoder(step_input, context_vector)

            # Refresh the context with the state produced by this step.
            context_vector, attn = self.attention(dec_hidden, enc_output)

            predictions.append(pred)
            attentions.append(attn)

        return tf.stack(predictions, 1), dec_hidden, attentions

## 定义优化器和损失函数

In [38]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy between `real` and `pred` with id-0 positions zeroed out.

    The per-position loss comes from the module-level `loss_object`. Positions
    where `real` equals 0 contribute a loss of zero, and the mean is then taken
    over all positions, the zeroed ones included.
    """
    per_position_loss = loss_object(real, pred)

    # 1 where the target is a real token, 0 where it is id 0.
    keep = tf.math.not_equal(real, 0)
    weights = tf.cast(keep, dtype=per_position_loss.dtype)

    return tf.reduce_mean(per_position_loss * weights)

## 训练

- 将 输入 传送至 编码器，编码器返回 编码器输出 和 编码器隐藏层状态。
- 将编码器输出、编码器隐藏层状态和解码器输入（即 开始标记）传送至解码器。
- 解码器返回 预测 和 解码器隐藏层状态。
- 解码器隐藏层状态被传送回模型，预测被用于计算损失。
- 使用 教师强制 （teacher forcing） 决定解码器的下一个输入。
- 教师强制 是将 目标词 作为 下一个输入 传送至解码器的技术。
- 最后一步是通过反向传播计算梯度，并由优化器将梯度应用到模型参数上。

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced training step and update encoder/decoder weights.

    Relies on the module-level `encoder`, `decoder`, `optimizer`,
    `loss_function`, `targ_lang` and `BATCH_SIZE`.

    Args:
        inp: Batch of encoder input ids.
        targ: Batch of target ids; the time axis is axis 1.
        enc_hidden: Initial encoder hidden state.

    Returns:
        The summed step losses divided by the target length.
    """
    loss = 0

    # The whole forward pass must run inside the tape context so the
    # gradients can be computed from it afterwards.
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        # The first decoder input is the <start> token for every sample.
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feed the target token as the next input
        for t in range(1, targ.shape[1]):
            # Pass the encoder output (enc_output) to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # Use teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
# `time` is used for the per-epoch timing below and is not imported by any
# other cell of this notebook.
import time

EPOCHS = 10

# NOTE(review): `encoder`, `dataset`, `steps_per_epoch`, `checkpoint` and
# `checkpoint_prefix` are expected to exist already; none of them is created in
# the cells shown in this notebook - confirm before running.
for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # Report progress every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # Save (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                          total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))