In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import sys
sys.path.append('/home/abner/Documents/Project_02/lecture04/kaikeba_project02/')
import jieba
import pandas as pd
import numpy as np
import tensorflow as tf
tf.__version__

'2.0.0'

In [2]:
def config_gpu(use_cpu=False):
    """Choose the device TensorFlow runs on.

    Args:
        use_cpu: When True, hide every CUDA device so that TensorFlow falls
            back to the CPU. This only takes effect if it runs before
            TensorFlow initializes its GPUs. When False (default), the first
            physical GPU is exposed as one virtual device capped at 2048 MB.

    Returns:
        None. A RuntimeError raised while configuring the GPU is printed
        rather than propagated.
    """
    if use_cpu:
        # '-1' is an invalid device id, which makes every CUDA device
        # invisible. The previous value '1' selected the *second* GPU instead
        # of disabling GPU execution.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        return

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if not gpus:
        return

    try:
        # Cap the memory TensorFlow may take on the first GPU.
        # Alternative: call tf.config.experimental.set_memory_growth(gpu, True)
        # for each GPU to allocate memory on demand instead.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

In [3]:
# Default path (use_cpu=False): expose the first GPU as a memory-capped virtual device.
config_gpu()

1 Physical GPUs, 1 Logical GPUs


# 导入数据

In [4]:
import time
from data_helper import data_loader,root
from collections import namedtuple,defaultdict

In [5]:
# Run configuration shared by the data loader and the input pipeline.
# A SimpleNamespace is a plain mutable attribute bag, which is how `params`
# is used here; the previous namedtuple class was never instantiated and two
# of the attributes (BUFFER_SIZE, BATCH_SIZE) were not even declared fields.
from types import SimpleNamespace

params = SimpleNamespace(
    data_path=os.path.join(root, 'data', 'baidu_95.csv'),
    vocab_save_dir=os.path.join(root, 'data'),
    vocab_size=50000,
    padding_size=300,   # matches the sequence length of the batches built below
    BUFFER_SIZE=3000,   # shuffle buffer size of the training dataset
    BATCH_SIZE=32,      # batch size of both the training and the test dataset
)

In [6]:
# Load the train/test split together with the vocabulary and the label binarizer (mlb).
# NOTE(review): is_rebuild_dataset=False presumably reuses a previously built dataset — confirm in data_helper.
X_train, X_test, y_train, y_test, vocab , mlb=data_loader(params,is_rebuild_dataset=False)

In [7]:
# Turn both label matrices into float32 tensors (the dtype the loss and metrics receive).
y_train, y_test = (
    tf.constant(labels, tf.float32) for labels in (y_train, y_test)
)

In [8]:
# Evaluation pipeline: plain batching, the final (smaller) batch is kept.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(params.BATCH_SIZE)
)

train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    # Cache the dataset in memory to speed up reads.
    .cache()
    # Reshuffle on every pass and drop the incomplete final batch.
    .shuffle(params.BUFFER_SIZE, reshuffle_each_iteration=True)
    .batch(params.BATCH_SIZE, drop_remainder=True)
    # Pipelining: overlap input preprocessing with model execution. While the
    # accelerator runs training step N, the CPU prepares the data for step
    # N + 1, so the step time approaches the maximum (rather than the sum) of
    # the training time and the data extraction/transformation time. Without
    # pipelining, the CPU and the GPU/TPU sit idle much of the time.
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [9]:
# Pull a single training batch to inspect the shapes and dtypes of inputs and labels.
x, y = next(iter(train_dataset))
x, y

(<tf.Tensor: id=30, shape=(32, 300), dtype=int32, numpy=
 array([[3983,   74,    4, ...,    0,    0,    0],
        [   4,  975,    2, ...,    0,    0,    0],
        [6728,   15,  102, ...,    0,    0,    0],
        ...,
        [   4,  415,  569, ...,    0,    0,    0],
        [1640,  180,   38, ...,    0,    0,    0],
        [ 671, 5018,   37, ...,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: id=31, shape=(32, 95), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)>)

<img src="https://tensorflow.google.cn/images/tutorials/transformer/transformer.png" width="600" alt="transformer">

## 创建 Transformer

本笔记本中的 Transformer 只使用编码器，不包含解码器：编码器的输出先经过展平（Flatten），再送入带 sigmoid 激活的全连接层，得到每个标签的概率（多标签分类）。

In [10]:
from model.transformer_tf2.layers import Encoder,Decoder

In [11]:
class Transformer(tf.keras.Model):
    """Encoder-only classifier: Encoder -> Flatten -> sigmoid Dense layer.

    The constructor forwards num_layers, d_model, num_heads, dff,
    input_vocab_size, maximum_position_encoding and rate to `Encoder`;
    output_dim is the number of units of the final sigmoid layer.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               output_dim, maximum_position_encoding, rate=0.1):
        """Create the encoder, the flatten layer and the sigmoid output layer."""
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                               input_vocab_size, maximum_position_encoding, rate)
        
        # Collapses every non-batch dimension of the encoder output into one.
        self.x_flatten = tf.keras.layers.Flatten()
        
        # One sigmoid unit per output dimension.
        self.final_layer = tf.keras.layers.Dense(output_dim, activation='sigmoid')
    
    def call(self, inp, training, enc_padding_mask):
        """Run the forward pass.

        Args:
            inp: Input batch handed to the encoder.
            training: Training-mode flag forwarded to the encoder.
            enc_padding_mask: Padding mask forwarded to the encoder (the
                sample call in this notebook passes None).

        Returns:
            The sigmoid activations of the final layer, shape
            (batch_size, output_dim).
        """

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        flatten_output=self.x_flatten(enc_output)
        
        final_output = self.final_layer(flatten_output)  # (batch_size, output_dim)

        return final_output

In [12]:
# Smoke test: build a throwaway model and check the shape of its output.
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048,
    input_vocab_size=50000, output_dim=95,
    maximum_position_encoding=10000)

# Feed integer token ids drawn from the vocabulary range. Without dtype/maxval
# tf.random.uniform returns float32 values in [0, 1), which are not valid
# token ids for the encoder.
temp_input = tf.random.uniform((64, 300), minval=0, maxval=50000, dtype=tf.int32)

fn_out = sample_transformer(temp_input, training=False,
                            enc_padding_mask=None)

fn_out.shape  # (batch_size, output_dim)

TensorShape([64, 95])

## 配置超参数（hyperparameters）

为了让本示例小且相对较快，已经减小了*num_layers、 d_model 和  dff* 的值。 

Transformer 的基础模型使用的数值为：*num_layers=6*，*d_model = 512*，*dff = 2048*。关于所有其他版本的 Transformer，请查阅[论文](https://arxiv.org/abs/1706.03762)。

Note：通过改变以下数值，您可以获得在许多任务上达到最先进水平的模型。

In [13]:
# Model hyperparameters, passed to Transformer(...) when the training model is built.
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

# Same value as params.vocab_size.
input_vocab_size = 50000
# Width of the label vectors (the label batches shown above have 95 columns).
output_dim = 95
dropout_rate = 0.1
maximum_position_encoding=10000

## 优化器（Optimizer）

根据[论文](https://arxiv.org/abs/1706.03762)中的公式，将 Adam 优化器与自定义的学习速率调度程序（scheduler）配合使用。

$$\Large{lrate = d_{model}^{-0.5} * min(step{\_}num^{-0.5}, step{\_}num * warmup{\_}steps^{-1.5})}$$


In [14]:
from model.transformer_tf2.layers import CustomSchedule

In [15]:
# Adam driven by the custom learning-rate schedule.
# NOTE(review): CustomSchedule lives in model.transformer_tf2.layers; presumably it
# implements the warm-up formula quoted in the preceding markdown — confirm.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

## 损失函数与指标（Loss and metrics）

本任务是多标签分类，标签是 0/1 向量，因此使用二元交叉熵（binary cross-entropy）作为损失函数。填充遮挡（padding mask）只作用于编码器的输入，不参与损失的计算。

In [16]:
# The model ends in a sigmoid layer, so it emits probabilities: from_logits=False.
# reduction='none' leaves the loss unreduced over the batch.
loss_function=tf.keras.losses.BinaryCrossentropy(from_logits=False, reduction='none')

In [17]:
from utils.metrics import micro_f1,macro_f1
from model.transformer_tf2.utils import create_padding_mask

In [18]:
# Running metrics; the training loop resets them at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(
    name='train_accuracy')

## 训练与检查点（Training and checkpointing）

In [19]:
# The model that is actually trained, built from the hyperparameters configured above.
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, output_dim, 
                          maximum_position_encoding, 
                          rate=dropout_rate)

创建检查点的路径和检查点管理器（manager）。这将用于在每 `n` 个周期（epochs）保存检查点。

In [20]:
# Checkpoints track both the model and the optimizer state.
checkpoint_path = "data/checkpoints/train"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

# Keep only the five most recent checkpoints in `checkpoint_path`.
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    checkpoint_path,
    max_to_keep=5,
)

# Restore the latest checkpoint if one exists.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [21]:
# Number of passes over the training dataset.
EPOCHS = 10

In [22]:
# @tf.function trace-compiles train_step into a TF graph for faster execution.
# The traced function is specialized to the exact shapes of its argument
# tensors. Passing an input_signature with more generic shapes avoids
# re-tracing caused by variable sequence lengths or variable batch sizes (a
# smaller last batch). The signature below is only used by the disabled
# decorator variant; the active decorator does not receive it.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
]


# @tf.function(input_signature=train_step_signature)
@tf.function()
def train_step(inp, tar):
    """Run one optimization step on a batch and update the running metrics.

    Args:
        inp: Batch of model inputs.
        tar: Batch of target label vectors.

    Returns:
        A tuple with the values of micro_f1 and macro_f1 for this batch.
    """
    padding_mask = create_padding_mask(inp)

    with tf.GradientTape() as tape:
        # Second positional argument is the training flag.
        predictions = transformer(inp, True, enc_padding_mask=padding_mask)
        loss = loss_function(tar, predictions)

    # Read the variable list after the forward pass.
    trainable = transformer.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    train_loss(loss)
    train_accuracy(tar, predictions)

    return micro_f1(tar, predictions), macro_f1(tar, predictions)

In [23]:
def predict(inp,tar,enc_padding_mask):
    """Score the model's inference-mode output for one batch.

    Args:
        inp: Batch of model inputs.
        tar: Batch of target label vectors.
        enc_padding_mask: Padding mask passed to the model.

    Returns:
        A tuple with the values of micro_f1 and macro_f1 for this batch.
    """
    # training=False: run the model in inference mode.
    outputs = transformer(inp, False, enc_padding_mask=enc_padding_mask)
    return micro_f1(tar, outputs), macro_f1(tar, outputs)

输入是填充后的词 id 序列，目标是对应的 0/1 标签向量。

In [24]:
# The validation batch is loop-invariant: test_dataset is not shuffled, so a
# fresh iterator always yields the same first batch. Fetch it and its padding
# mask once instead of building a new iterator every 50 training steps.
test_input, test_target = next(iter(test_dataset))
enc_padding_mask = create_padding_mask(test_input)

for epoch in range(EPOCHS):
    start = time.time()

    # Running metrics cover a single epoch.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> batch of padded token ids, tar -> batch of 0/1 label vectors
    for (batch, (inp, tar)) in enumerate(train_dataset):
        mic_f1, mac_f1 = train_step(inp, tar)

        # Report progress, including a quick check on the fixed validation batch.
        if batch % 50 == 0:
            val_mic_f1, val_mac_f1 = predict(test_input, test_target, enc_padding_mask)

            print('Epoch {} Batch {} Loss {:.4f} micro_f1 {:.4f} macro_f1 {:.4f} val_micro_f1 {:.4f} val_macro_f1 {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), mic_f1, mac_f1, val_mic_f1, val_mac_f1))

    # Save a checkpoint every 5 epochs.
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.0186 micro_f1 0.9505 macro_f1 0.5956 val_micro_f1 0.9388 val_macro_f1 0.5000


KeyboardInterrupt: 

## 评估（Evaluate）

In [25]:
from sklearn.metrics import f1_score
from tqdm import tqdm

In [26]:
def evaluate(test_dataset):
    """Score the model on every batch of `test_dataset`.

    Args:
        test_dataset: Iterable of (inputs, targets) batches.

    Returns:
        A 5-tuple: the micro_f1 value, the macro_f1 value, the
        sample-averaged F1 from sklearn's f1_score, the targets binarized at
        0.5 and the model outputs binarized at 0.5 (both as 0/1 arrays).
    """
    batch_outputs, batch_targets = [], []
    for _, (inp, tar) in tqdm(enumerate(test_dataset)):
        mask = create_padding_mask(inp)
        # training=False: run the model in inference mode.
        batch_outputs.append(transformer(inp, False, enc_padding_mask=mask))
        batch_targets.append(tar)

    # Stitch the per-batch tensors back together along the batch axis.
    scores = tf.concat(batch_outputs, axis=0)
    targets = tf.concat(batch_targets, axis=0)

    mi_f1 = micro_f1(targets, scores)
    ma_f1 = macro_f1(targets, scores)

    # sklearn's f1_score needs hard 0/1 decisions, so threshold at 0.5.
    hard_predictions = np.where(scores > 0.5, 1, 0)
    hard_targets = np.where(targets > 0.5, 1, 0)
    sample_f1 = f1_score(hard_targets, hard_predictions, average='samples')

    return mi_f1, ma_f1, sample_f1, hard_targets, hard_predictions

In [27]:
# Evaluate on the full test set; tars/predictions come back as 0/1 arrays.
mi_f1,ma_f1,smaple_f1,tars,predictions=evaluate(test_dataset)

142it [00:15,  8.90it/s]


In [28]:
# Display the micro, macro and sample-averaged F1 scores.
mi_f1,ma_f1,smaple_f1

(<tf.Tensor: id=226404, shape=(), dtype=float32, numpy=0.90773433>,
 <tf.Tensor: id=226441, shape=(), dtype=float32, numpy=0.79114944>,
 0.9054041007810072)

In [None]:
# Map the 0/1 target rows back to their labels.
# NOTE(review): presumably mlb is a fitted sklearn MultiLabelBinarizer — confirm in data_helper.
mlb.inverse_transform(tars)

In [None]:
# Map the 0/1 prediction rows back to their labels.
# NOTE(review): presumably mlb is a fitted sklearn MultiLabelBinarizer — confirm in data_helper.
mlb.inverse_transform(predictions)

## 总结

在本教程中，您已经学习了位置编码，多头注意力，遮挡的重要性以及如何创建一个 transformer。

尝试使用一个不同的数据集来训练 transformer。您也可以通过修改上述的超参数来创建基础 transformer 或者 transformer XL。您也可以使用这里定义的层来创建 [BERT](https://arxiv.org/abs/1810.04805) 并训练最先进的模型。此外，您可以实现 beam search 得到更好的预测。