In [1]:
pip install torchsummary

Note: you may need to restart the kernel to use updated packages.


In [2]:
!nvidia-smi

Tue Oct 25 16:40:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.76       Driver Version: 515.76       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| 40%   36C    P8    N/A /  75W |    223MiB /  4096MiB |     25%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import torch
from torch import nn

# Build three device handles and display them as the cell output. The 'cuda:1'
# handle is constructed successfully here even though the nvidia-smi listing
# above shows a single GPU (index 0), so this cell is not a hardware check.
torch.device('cpu'), torch.device('cuda'), torch.device('cuda:1')
# Left disabled: explicitly selecting GPU 0 as the current CUDA device.
# torch.cuda.set_device(0)

(device(type='cpu'), device(type='cuda'), device(type='cuda', index=1))

In [4]:
# 1 导入必备的库
import copy
import math

import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torchsummary import summary
from torch.optim.lr_scheduler import LambdaLR
import altair as alt

import time
import pandas as pd

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR


import altair as alt

# 3 Model Architecture
class EncoderDecoder(nn.Module):
    """Top-level encoder-decoder container.

    The class only wires its five components together; it holds no
    parameters of its own.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        """
        Store the five building blocks.

        :param encoder: callable taking (embedded source, src_mask)
        :param decoder: callable taking (embedded target, memory, src_mask, tgt_mask)
        :param src_embed: callable that embeds the source ids
        :param tgt_embed: callable that embeds the target ids
        :param generator: output head; stored here but not invoked by forward()
        """
        super().__init__()
        # Assignment order is kept: it determines sub-module registration order.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """
        Encode the source, then decode the target against the encoder output.

        :param src: source data
        :param tgt: target data
        :param src_mask: mask for the source
        :param tgt_mask: mask for the target
        :return: whatever the decoder returns (the generator is NOT applied)
        """
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` with ``src_embed`` and run the encoder on it with ``src_mask``."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` with ``tgt_embed`` and run the decoder against ``memory``."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


class Generator(nn.Module):
    """Output head: a linear projection followed by log-softmax."""

    def __init__(self, d_model, vocab):
        """
        :param d_model: size of the incoming feature dimension
        :param vocab: number of output classes (vocabulary size)
        """
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """
        Project ``x`` to vocabulary size and return log-probabilities
        computed over the last dimension.
        """
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)


# 3.1 Encoder and Decoder Stacks

# 3.1.1 Encoder
def clone(mudule, N):
    """
    Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``mudule``.

    The parameter name (including its spelling) is part of the interface:
    callers in this file pass it by keyword.
    """
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(mudule))
    return copies


class Encoder(nn.Module):
    """A stack of ``N`` identical layers followed by a final LayerNorm."""

    def __init__(self, layer, N):
        """
        :param layer: prototype layer; it is deep-copied ``N`` times and must
                      expose a ``size`` attribute used to size the final norm
        :param N: number of stacked copies
        """
        super().__init__()
        self.layers = clone(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)


# 3.1.2 Layer Normalization and Residual Connections
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift."""

    def __init__(self, feature_size, eps=1e-6):
        """
        :param feature_size: length of the last dimension being normalised
        :param eps: constant added to the standard deviation to avoid division by zero
        """
        super().__init__()
        # Registered as parameters so they are trained with the model.
        self.gamma = nn.Parameter(torch.ones(feature_size))   # scale, starts at 1
        self.beta = nn.Parameter(torch.zeros(feature_size))   # shift, starts at 0
        self.eps = eps

    def forward(self, x):
        """
        Normalise ``x`` along its last dimension.

        The mean and standard deviation are taken over the last dimension
        (kept as size-1 dims for broadcasting). Note that ``eps`` is added to
        the standard deviation itself, not to the variance.

        :param x: output of the previous layer
        """
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        normalised = centered / spread
        return normalised * self.gamma + self.beta


class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sub-layer (attention or feed-forward).

    As implemented here the output is ``x + LayerNorm(Dropout(sublayer(x)))``:
    the normalisation is applied to the sub-layer branch, not to the input.
    """

    def __init__(self, size, dropout):
        """
        :param size: feature size handed to LayerNorm (d_model)
        :param dropout: dropout probability applied to the sub-layer output
        """
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        """
        :param x: input tensor, also used as the residual
        :param sublayer: callable applied to ``x``
        """
        # The alternative considered by the author normalises the input first:
        #     x + dropout(sublayer(norm(x)))
        # The variant below was kept instead; per the author's note it
        # converges faster.
        branch = self.dropout(sublayer(x))
        return x + self.norm(branch)


# 3.1.3 Encoder Layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self, size, self_attention, feed_forward, dropout):
        """
        :param size: model dimension, forwarded to the residual wrappers
        :param self_attention: attention module called as (query, key, value, mask)
        :param feed_forward: position-wise feed-forward module
        :param dropout: dropout probability used by both residual wrappers
        """
        super().__init__()
        self.self_attn = self_attention
        self.feed_forward = feed_forward
        # Two residual wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = clone(mudule=SublayerConnection(size, dropout), N=2)
        self.size = size

    def forward(self, x, mask):
        """Run masked self-attention, then the feed-forward network."""

        def self_attention(inp):
            # Query, key and value are all the same tensor: self-attention.
            return self.self_attn(inp, inp, inp, mask)

        attended = self.sublayer[0](x, self_attention)
        return self.sublayer[1](attended, self.feed_forward)


# 3.1.4 Decoder

class Decoder(nn.Module):
    """A stack of ``N`` identical decoder layers followed by a final LayerNorm."""

    def __init__(self, layer, N):
        """
        :param layer: prototype decoder layer; deep-copied ``N`` times and
                      expected to expose a ``size`` attribute
        :param N: number of stacked copies
        """
        super().__init__()
        self.layers = clone(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through each layer together with ``memory`` and both masks, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)


# 3.1.5 Decoder Layer
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention, feed-forward."""

    def __init__(self, size, self_attention, src_attn, feed_forward, dropout):
        """
        :param size: model dimension, forwarded to the residual wrappers
        :param self_attention: attention module used with Q = K = V = x
        :param src_attn: attention module used with Q = x and K = V = memory
        :param feed_forward: position-wise feed-forward module
        :param dropout: dropout probability used by the residual wrappers
        """
        super().__init__()
        self.self_attn = self_attention
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # Three residual wrappers, one per sub-layer below.
        self.sublayer = clone(mudule=SublayerConnection(size, dropout), N=3)
        self.size = size

    def forward(self, x, memory, src_mask, tgt_mask):
        """
        :param x: output of the previous layer
        :param memory: encoder output
        :param src_mask: mask over the source positions
        :param tgt_mask: mask over the target positions
        """

        def masked_self_attention(inp):
            # Self-attention over the target. tgt_mask hides later positions so
            # that a position is predicted only from the tokens before it.
            return self.self_attn(inp, inp, inp, tgt_mask)

        def encoder_attention(inp):
            # Queries come from the decoder, keys/values from the encoder output.
            # src_mask is not about leakage here; it blanks out source padding.
            return self.src_attn(inp, memory, memory, src_mask)

        x = self.sublayer[0](x, masked_self_attention)
        x = self.sublayer[1](x, encoder_attention)
        # Final sub-layer: position-wise feed-forward.
        return self.sublayer[2](x, self.feed_forward)


# 3.1.6 Mask
def subsequent_mask(size):
    """
    Build the "no peeking ahead" mask.

    :param size: length of the last two dimensions (they form a square)
    :return: boolean tensor of shape (1, size, size) that is True on and below
             the diagonal and False strictly above it
    """
    # Ones strictly above the diagonal mark the future positions ...
    future = torch.triu(torch.ones((1, size, size), dtype=torch.uint8), diagonal=1)
    # ... and comparing with 0 flips that into "allowed" = True.
    return future == 0


# 3.2 Attention

# 3.2.1 Scaled Dot-Product Attention
def attention(query, key, value, mask=None, dropout=None):
    """
    Scaled dot-product attention.

    :param query: query tensor; its last dimension size is used as d_k
    :param key: key tensor; its last two dimensions are transposed for the product
    :param value: value tensor
    :param mask: optional mask; positions where ``mask == 0`` get a score of -1e9
                 before the softmax
    :param dropout: optional callable applied to the attention weights
    :return: tuple (weighted values, attention weights)
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        blocked = mask == 0
        scores = scores.masked_fill(blocked, value=-1e9)

    weights = F.softmax(scores, dim=-1)
    weights = weights if dropout is None else dropout(weights)

    return torch.matmul(weights, value), weights


# 3.2.2 Multi-Head Attention


class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on top of ``attention``."""

    def __init__(self, n_heads, d_model, dropout=0.1):
        """
        :param n_heads: number of attention heads
        :param d_model: model dimension; must be divisible by ``n_heads``
        :param dropout: dropout probability applied to the attention weights
        """
        super().__init__()
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads   # per-head dimension
        self.h = n_heads

        # Four d_model x d_model projections: Q, K, V and the output projection.
        self.linear = clone(mudule=nn.Linear(d_model, d_model), N=4)
        self.p_attn = None              # attention weights of the latest forward pass
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """
        1. Project query/key/value and split each into ``h`` heads.
        2. Apply attention to all heads at once.
        3. Merge the heads back and apply the output projection.
        """
        if mask is not None:
            # Extra dimension so the same mask broadcasts over every head.
            mask = mask.unsqueeze(1)
        n_batches = query.size(0)

        def split_heads(projection, tensor):
            # view(): add a head dimension (-1 lets the remaining length be inferred);
            # transpose(1, 2): move the head dimension ahead of that inferred one so
            # the last two dimensions are the ones ``attention`` works on.
            projected = projection(tensor)
            return projected.view(n_batches, -1, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.linear[0], query)
        key = split_heads(self.linear[1], key)
        value = split_heads(self.linear[2], value)

        attn, self.p_attn = attention(query, key, value, mask, self.dropout)

        # Undo the head split: transpose back, make the memory contiguous so that
        # view() is legal, and fold the heads into one feature dimension again.
        merged = attn.transpose(1, 2).contiguous().view(n_batches, -1, self.h * self.d_k)
        return self.linear[-1](merged)


# 3.3 Position-wise Feed-Forward Networks
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        :param d_model: input and output feature size (unchanged by this block)
        :param d_ff: hidden size between the two linear layers
        :param dropout: dropout probability applied after the activation
        """
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand with ``w_1``, apply ReLU and dropout, then project back with ``w_2``."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)


# 3.4 Embeddings and Softmax
class Embeddings(nn.Module):
    """Embedding lookup scaled by sqrt(d_model)."""

    def __init__(self, d_model, vocab):
        """
        :param d_model: embedding dimension
        :param vocab: number of rows in the embedding table
        """
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """
        :param x: index tensor looked up in the embedding table
        :return: the looked-up embeddings multiplied by sqrt(d_model)
        """
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale


# 3.5 Positional Encoding
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to the input, then applies dropout."""

    def __init__(self, d_model, dropout, max_len=5000):
        """
        :param d_model: embedding dimension (even or odd)
        :param dropout: dropout probability applied after adding the encoding
        :param max_len: number of positions pre-computed; inputs must not be longer
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pos_embed = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)  # shape [max_len, 1]
        # Denominator 10000^(2i/d_model), evaluated as exp(-2i * ln(10000) / d_model)
        # so that no huge intermediate power has to be computed.
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )  # shape [ceil(d_model / 2)]
        angles = position * div_term  # shape [max_len, ceil(d_model / 2)]

        # Even feature indices carry the sine, odd indices the cosine.
        pos_embed[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd index fewer than even ones, so the
        # cosine part only uses the first d_model // 2 frequencies. For an even
        # d_model this slice is the full tensor.
        pos_embed[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pos_embed = pos_embed.unsqueeze(0)  # shape [1, max_len, d_model]
        # A buffer moves with the module (.cuda(), state_dict) but is not trained.
        self.register_buffer('pe', pos_embed)

    def forward(self, x):
        """
        :param x: tensor whose dimension 1 is the sequence position and whose last
                  dimension is ``d_model``
        :return: ``x`` plus the encoding of its first ``x.size(1)`` positions,
                 passed through dropout
        """
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)


# 3.6 Full Model
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """
    Assemble a full encoder-decoder model.

    :param src_vocab: source vocabulary size
    :param tgt_vocab: target vocabulary size
    :param N: number of stacked layers in both the encoder and the decoder
    :param d_model: embedding / model dimension
    :param d_ff: hidden dimension of the position-wise feed-forward network
    :param h: number of attention heads
    :param dropout: dropout probability used throughout the model
    :return: an ``EncoderDecoder`` whose multi-dimensional parameters have been
             re-initialised with Xavier/Glorot uniform
    """
    duplicate = copy.deepcopy

    # Prototypes; every place that uses one receives its own deep copy.
    attention_proto = MultiHeadedAttention(n_heads=h, d_model=d_model, dropout=dropout)
    feed_forward_proto = PositionwiseFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout)
    position_proto = PositionalEncoding(d_model=d_model, dropout=dropout)

    encoder_block = EncoderLayer(
        size=d_model,
        self_attention=duplicate(attention_proto),
        feed_forward=duplicate(feed_forward_proto),
        dropout=dropout,
    )
    decoder_block = DecoderLayer(
        size=d_model,
        self_attention=duplicate(attention_proto),
        src_attn=duplicate(attention_proto),
        feed_forward=duplicate(feed_forward_proto),
        dropout=dropout,
    )
    source_embedding = Embeddings(d_model=d_model, vocab=src_vocab)
    target_embedding = Embeddings(d_model=d_model, vocab=tgt_vocab)
    output_head = Generator(d_model=d_model, vocab=tgt_vocab)

    model = EncoderDecoder(
        encoder=Encoder(layer=encoder_block, N=N),
        decoder=Decoder(layer=decoder_block, N=N),
        src_embed=nn.Sequential(source_embedding, duplicate(position_proto)),
        tgt_embed=nn.Sequential(target_embedding, duplicate(position_proto)),
        generator=output_head,
    )

    # Glorot initialisation for every weight with more than one dimension;
    # one-dimensional parameters (biases, norm scale/shift) keep their defaults.
    for weight in model.parameters():
        if weight.dim() <= 1:
            continue
        nn.init.xavier_uniform_(weight)
    return model

In [5]:
# 5 Training
# 5.1 Batched and Masking
class Batch:
    """Holds one batch of source/target data together with its masks."""

    def __init__(self, src, tgt, pad=2):
        """
        :param src: source id tensor
        :param tgt: target id tensor, or None when only the source is available
        :param pad: id treated as padding (default 2, i.e. <blank>)
        """
        self.src = src
        # True where the source is a real token, False on padding; the extra
        # dimension is inserted in front of the last one.
        self.src_mask = (src != pad).unsqueeze(-2)

        if tgt is None:
            return

        self.tgt = tgt[:, :-1]    # decoder input: everything but the last token
        self.tgt_y = tgt[:, 1:]   # expected output: everything but the first token
        self.tgt_mask = self.make_std_mask(self.tgt, pad)
        self.ntokens = (self.tgt_y != pad).data.sum()  # count of non-padding targets

    @staticmethod
    def make_std_mask(tgt, pad):
        """
        Build the target mask: a position is visible only if it is not padding
        AND not in the future (see ``subsequent_mask``).
        """
        padding_mask = (tgt != pad).unsqueeze(-2)
        # subsequent_mask returns shape (1, size, size); type_as aligns it with
        # the padding mask before the element-wise AND.
        causal_mask = subsequent_mask(size=tgt.size(-1)).type_as(padding_mask.data)
        return padding_mask & causal_mask


# 5.2 Training Loop
class TrainState:
    """Counters describing training progress: steps, samples and tokens seen."""

    # Number of processed batches (incremented once per training batch).
    step: int = 0
    # Number of optimizer updates performed.
    accum_step: int = 0
    # Total number of examples used.
    samples: int = 0
    # Total number of tokens processed.
    tokens: int = 0


def run_epoch(data_iter, model, loss_compute,
              optimizer, scheduler,
              mode="train", accum_iter=1,
              train_state=None,
              device=None):
    """
    Run one epoch: forward pass, loss computation and, in training modes,
    backward pass plus optimizer/scheduler updates. Progress is printed
    periodically while training.

    :param data_iter: iterable of batches exposing src, tgt, src_mask, tgt_mask,
                      tgt_y and ntokens
    :param model: object whose ``forward(src=, tgt=, src_mask=, tgt_mask=)`` is called
    :param loss_compute: callable ``(x=, y=, norm=)`` returning
                         ``(reported loss, loss node to back-propagate)``
    :param optimizer: optimizer providing ``step``, ``zero_grad`` and ``param_groups``
    :param scheduler: learning-rate scheduler providing ``step``
    :param mode: "train" / "train+log" to update parameters; any other value only evaluates
    :param accum_iter: an optimizer update happens on batches whose index is a
                       multiple of this value
    :param train_state: counters to update; a fresh ``TrainState`` is created when
                        omitted. Pass the returned state back in to keep accumulating
                        across epochs.
    :param device: currently unused; kept for interface compatibility
    :return: tuple (total loss / total tokens, train_state)
    """
    # A default instance in the signature would be created once and silently
    # shared (and mutated) by every call, so it is created here instead.
    if train_state is None:
        train_state = TrainState()
    is_training = mode == "train" or mode == "train+log"

    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0  # optimizer updates performed in this epoch
    for i, batch in enumerate(data_iter):
        # No autograd graph is needed when nothing is back-propagated.
        with torch.set_grad_enabled(is_training):
            out = model.forward(src=batch.src, tgt=batch.tgt,
                                src_mask=batch.src_mask, tgt_mask=batch.tgt_mask)
            loss, loss_node = loss_compute(x=out, y=batch.tgt_y, norm=batch.ntokens)
        if is_training:
            # Gradients are not cleared before backward(), so they accumulate
            # until the next optimizer update.
            loss_node.backward()
            train_state.step += 1
            train_state.samples += batch.src.shape[0]
            train_state.tokens += batch.ntokens
            # NOTE(review): this fires on i = 0, accum_iter, 2*accum_iter, ... so for
            # accum_iter > 1 the first update uses a single batch; confirm whether
            # (i + 1) % accum_iter was intended.
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens

        if i % 40 == 1 and is_training:
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.time() - start  # time since the previous report
            print(("Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f " +
                   "| Tokens / Sec: %7.1f | Learning Rate: %6.1e") %
                  (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr))
            start = time.time()
            tokens = 0

        # Drop the references so the graph can be freed before the next batch.
        del loss
        del loss_node
    return total_loss / total_tokens, train_state


# 5.2 Optimizer
def rate(step, model_size, factor, warmup):
    """
    Learning-rate multiplier with linear warm-up followed by inverse
    square-root decay.

    :param step: current step; 0 is treated as 1 so that zero is never raised
                 to a negative power (LambdaLR starts at step 0)
    :param model_size: model dimension
    :param factor: overall scale of the schedule
    :param warmup: number of warm-up steps
    :return: factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5)
    """
    effective_step = 1 if step == 0 else step
    decay = effective_step ** (-0.5)
    ramp = effective_step * warmup ** (-1.5)
    return factor * (model_size ** (-0.5) * min(decay, ramp))


def example_learning_schedule():
    """
    Plot the learning-rate schedule for three (model_size, factor, warmup)
    settings.

    Each setting is simulated for 20000 optimizer steps on a dummy model; the
    learning rate seen at every step is recorded and the three curves are
    returned as an interactive Altair line chart.
    """
    n_steps = 20000
    opts = [
        [512, 1, 4000],  # example 1
        [512, 1, 8000],  # example 2
        [256, 1, 4000],  # example 3
    ]
    dummy_model = torch.nn.Linear(1, 1)
    learning_rates = []

    for example in opts:
        optimizer = torch.optim.Adam(dummy_model.parameters(),
                                     lr=1,
                                     betas=(0.9, 0.98),
                                     eps=1e-9)
        lr_scheduler = LambdaLR(
            optimizer=optimizer,
            lr_lambda=lambda step: rate(step, *example))
        history = []
        for _ in range(n_steps):
            # Record the rate in effect, then advance optimizer and schedule.
            history.append(optimizer.param_groups[0]["lr"])
            optimizer.step()
            lr_scheduler.step()
        learning_rates.append(history)

    learning_rates = torch.tensor(learning_rates)

    # ---- visualisation ----
    # Allow Altair to handle more than 5000 rows.
    alt.data_transformers.disable_max_rows()

    labels = ["512:4000", "512:8000", "256:4000"]
    frames = []
    for warmup_idx in [0, 1, 2]:
        frames.append(
            pd.DataFrame(
                {
                    "Learning Rate": learning_rates[warmup_idx, :],
                    "model_size:warmup": labels[warmup_idx],
                    "step": range(n_steps),
                }
            )
        )
    opts_data = pd.concat(frames)

    chart = alt.Chart(opts_data).mark_line().properties(width=600)
    chart = chart.encode(x="step", y="Learning Rate", color="model_size:warmup:N")
    return chart.interactive()


# 5.3 Regularization

# 5.3.2 Label Smoothing
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution."""

    def __init__(self, size, padding_idx, smoothing=0.0):
        """
        :param size: number of classes; must equal ``x.size(1)`` in forward()
        :param padding_idx: class id of the padding token
        :param smoothing: probability mass taken away from the true class
        """
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None  # smoothed targets of the latest forward pass

    def forward(self, x, target):
        """
        :param x: tensor with ``size`` columns; it is handed to ``nn.KLDivLoss``
                  as the input (log-probabilities in the examples of this file)
        :param target: class ids, one per row of ``x``
        :return: summed KL divergence between ``x`` and the smoothed targets
        """
        assert x.size(1) == self.size

        # Start from a uniform share of the smoothing mass; "- 2" leaves out the
        # true class and the padding class.
        smoothed = x.data.clone().fill_(self.smoothing / (self.size - 2))
        # The true class receives the confidence mass.
        smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
        # The padding class never receives any probability ...
        smoothed[:, self.padding_idx] = 0
        # ... and rows whose target is padding contribute nothing at all.
        padding_rows = target.data == self.padding_idx
        smoothed[padding_rows] = 0.0

        self.true_dist = smoothed
        return self.criterion(x, smoothed.clone().detach())


# Global switch for the example helpers below.
RUN_EXAMPLES = True


def show_example(fn, args=[]):
    """
    Call ``fn(*args)`` and return its result, but only when this code runs as
    ``__main__`` and ``RUN_EXAMPLES`` is enabled; otherwise return None.
    """
    should_run = __name__ == "__main__" and RUN_EXAMPLES
    if not should_run:
        return None
    return fn(*args)


def example_label_smoothing():
    """
    Visualise the smoothed target distribution produced by ``LabelSmoothing``
    (5 classes, padding id 0, smoothing 0.4) for five example targets.

    :return: interactive Altair heat-map with one cell per (row, column) of
             the target distribution
    """
    crit = LabelSmoothing(5, 0, 0.4)
    prediction_row = [0, 0.2, 0.7, 0.1, 0]
    predict = torch.FloatTensor([prediction_row for _ in range(5)])
    # Only the side effect matters here: forward() stores crit.true_dist.
    crit(x=predict.log(), target=torch.LongTensor([2, 1, 0, 3, 3]))

    cells = []
    for y in range(5):
        for x in range(5):
            cells.append(
                pd.DataFrame(
                    {
                        "target distribution": crit.true_dist[x, y].flatten(),
                        "columns": y,
                        "rows": x,
                    }
                )
            )
    LS_data = pd.concat(cells)

    chart = alt.Chart(LS_data).mark_rect(color="Blue", opacity=1)
    chart = chart.properties(height=200, width=200)
    chart = chart.encode(
        alt.X("columns:O", title=None),
        alt.Y("rows:O", title=None),
        alt.Color(
            "target distribution:Q", scale=alt.Scale(scheme="viridis")
        ),
    )
    return chart.interactive()


def loss(x, crit):
    """
    Evaluate ``crit`` on a single 5-class prediction whose class 1 has
    probability x / (x + 3), classes 2-4 have 1 / (x + 3) each and class 0 has
    probability 0; the target class is 1.

    :param x: unnormalised weight given to class 1
    :param crit: callable taking (log-probabilities, target ids)
    :return: the ``.data`` of whatever ``crit`` returns
    """
    denominator = x + 3 * 1
    share = 1 / denominator
    predict = torch.FloatTensor([[0, x / denominator, share, share, share]])
    target = torch.LongTensor([1])
    return crit(predict.log(), target).data


def penalization_visualization():
    """
    Plot the label-smoothing loss (5 classes, padding id 0, smoothing 0.1) as
    the prediction for the true class is made increasingly confident.

    :return: interactive Altair line chart of "Loss" against "Steps"
    """
    crit = LabelSmoothing(5, 0, 0.1)

    losses = []
    for x in range(1, 100):
        losses.append(loss(x, crit))

    loss_data = pd.DataFrame({
        "Loss": losses,
        "Steps": list(range(99)),
    }).astype("float")

    chart = alt.Chart(loss_data).mark_line().properties(width=350)
    return chart.encode(x="Steps", y="Loss").interactive()


# 6 A First Example

# 6.1 Synthetic Data
def data_gen(V, n_batches, batch_size, s_len=10, device=None):
    """
    Random batch generator for the encoder-decoder copy task.

    Source tokens are drawn uniformly from [2, V - 1]. Id 0 is passed to
    ``Batch`` as the padding id and id 1 is written into the first target
    position as the start symbol, so neither appears in the source.

    :param V: vocabulary size (exclusive upper bound of the sampled ids)
    :param n_batches: number of batches to generate
    :param batch_size: number of sequences per batch
    :param s_len: length of every generated sequence
    :param device: where to place the data; None keeps it on the CPU. Any value
                   accepted by ``Tensor.to`` works ("cuda", "cuda:0",
                   ``torch.device(...)``), not just the string "cuda".
    :return: generator yielding ``Batch`` objects built with ``pad=0``
    """
    for _ in range(n_batches):
        src = torch.randint(2, V, size=(batch_size, s_len))
        # The target is the source with its first time step replaced by the
        # start symbol; Batch later shifts it into decoder input / expected output.
        tgt = src.clone()
        tgt[:, 0] = 1
        if device is not None:
            src = src.to(device)
            tgt = tgt.to(device)
        yield Batch(src=src, tgt=tgt, pad=0)


# 6.2 Loss Computation
class SimpleLossCompute:
    """Applies the generator and the criterion to a decoder output."""

    def __init__(self, generator, criterion):
        """
        :param generator: callable applied to the decoder output first
        :param criterion: loss taking (flattened predictions, flattened targets);
                          label smoothing in this file
        """
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        """
        :param x: decoder output
        :param y: target labels
        :param norm: normalisation factor (number of valid tokens in the batch)
        :return: tuple (detached loss value, loss node for back-propagation)
        """
        predictions = self.generator(x)
        # view() needs contiguous memory, hence contiguous() before flattening
        # everything but the last dimension.
        flat_predictions = predictions.contiguous().view(-1, predictions.size(-1))
        flat_targets = y.contiguous().view(-1)
        loss = self.criterion(flat_predictions, flat_targets)

        # Dividing by norm and multiplying back yields the same total, detached.
        reported = (loss / norm).data * norm
        # NOTE(review): the node returned for backward() is the un-normalised
        # loss, not loss / norm — confirm this is intentional.
        return reported, loss


# 6.3 Greedy Decoding
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """
    Greedy auto-regressive decoding.

    Works for any batch size: every sequence in ``src`` is decoded in parallel
    (a batch of one behaves exactly as before).

    :param model: model exposing ``encode``, ``decode`` and ``generator``
    :param src: source id tensor with the batch in dimension 0
    :param src_mask: mask for the source
    :param max_len: length of the returned sequences, start symbol included
    :param start_symbol: id placed at position 0 of every output sequence
    :return: tensor of shape (batch, max_len) with the dtype/device of ``src``
    """
    # Pure inference: no gradients are needed for any of the steps below.
    with torch.set_grad_enabled(False):
        memory = model.encode(src=src, src_mask=src_mask)
        batch_size = src.size(0)
        # Sequences generated so far; each starts with just the start symbol.
        ys = torch.zeros(batch_size, 1).fill_(start_symbol).type_as(src.data)
        for _ in range(max_len - 1):
            out = model.decode(memory=memory,
                               src_mask=src_mask,
                               tgt=ys,
                               tgt_mask=subsequent_mask(size=ys.size(1)).type_as(src.data))
            # Score only the newest position and keep its most likely token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_column = next_word.unsqueeze(1).type_as(src.data)
            ys = torch.cat([ys, next_column], dim=1)
    return ys


# 6.4 Training Example
# def execute_example(fn, args=[]):
#     if __name__ == "__main__" and RUN_EXAMPLES:
#         fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """No-op optimizer used where an optimizer is required but nothing is updated."""

    def __init__(self):
        # The base-class initialiser is intentionally not called; only the
        # attribute read by the training loop is provided.
        self.param_groups = [{"lr": 0}]

    def step(self):
        """Do nothing."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing."""
        return None


class DummyScheduler:
    """No-op learning-rate scheduler."""

    def step(self):
        """Do nothing."""
        return None


def example_simple_model(device=None):
    """
    Train a small 2-layer model on the synthetic copy task and save it.

    Each of the 40 epochs trains on 30 freshly generated batches and then
    evaluates on 10 more; the final model is written to
    './example_1_copy.pth' with ``torch.save``.

    :param device: pass "cuda" to move the model and the generated data to the GPU
    """
    V = 11  # vocabulary size
    n_epochs = 40
    n_batch_train_epoch = 30  # training batches per epoch
    n_batch_val_epoch = 10    # validation batches per epoch
    batch_size = 40

    criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
    model = make_model(src_vocab=V, tgt_vocab=V, N=2)
    if device == "cuda":
        model.cuda()
    model_size = model.src_embed[0].d_model  # 512

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.5,
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(step=step, model_size=model_size, factor=0.1, warmup=400)
    )
    # Stateless wrapper, so a single instance serves every epoch.
    loss_compute = SimpleLossCompute(generator=model.generator,
                                     criterion=criterion)

    for epoch in range(n_epochs):
        print(f"\n|   批次: {epoch}   |")

        # ---- training ----
        print("*" * 5 + "训练" + "*" * 5)
        model.train()
        train_batches = data_gen(V=V, n_batches=n_batch_train_epoch,
                                 batch_size=batch_size, device=device)
        run_epoch(data_iter=train_batches,
                  model=model,
                  loss_compute=loss_compute,
                  optimizer=optimizer,
                  scheduler=lr_scheduler,
                  mode="train")

        # ---- validation ----
        print("*" * 5 + "验证" + "*" * 5)
        model.eval()
        val_batches = data_gen(V=V, n_batches=n_batch_val_epoch,
                               batch_size=batch_size, device=device)
        val_result = run_epoch(data_iter=val_batches,
                               model=model,
                               loss_compute=loss_compute,
                               optimizer=DummyOptimizer(),
                               scheduler=DummyScheduler(),
                               mode="eval")
        valid_mean_loss = val_result[0]  # total_loss / total_tokens
        print(f"|验证损失: {valid_mean_loss} |")

    model.eval()
    torch.save(model, './example_1_copy.pth')

In [6]:
example_simple_model("cuda")


|   批次: 0   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   3.17 | Tokens / Sec:  2327.3 | Learning Rate: 5.5e-07
*****验证*****
|验证损失: 2.394113779067993 |

|   批次: 1   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   2.40 | Tokens / Sec:  9412.5 | Learning Rate: 8.8e-06
*****验证*****
|验证损失: 1.9547275304794312 |

|   批次: 2   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   2.21 | Tokens / Sec:  9569.1 | Learning Rate: 1.7e-05
*****验证*****
|验证损失: 1.7825837135314941 |

|   批次: 3   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.97 | Tokens / Sec:  9915.8 | Learning Rate: 2.5e-05
*****验证*****
|验证损失: 1.6604243516921997 |

|   批次: 4   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.90 | Tokens / Sec:  9883.9 | Learning Rate: 3.4e-05
*****验证*****
|验证损失: 1.5723007917404175 |

|   批次: 5   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.72 | Tokens / Sec:  9371.5 | Lea

In [7]:
# Reload the whole pickled model written by example_simple_model(). Unpickling
# executes code, so only load checkpoint files you trust.
# NOTE(review): recent PyTorch releases may default torch.load to weights-only
# loading, which rejects whole-module pickles — confirm against the installed version.
model = torch.load('./example_1_copy.pth')
# One source sequence of ten ids, moved to the GPU (this cell requires CUDA).
src = torch.LongTensor([[1, 6, 3, 3, 5, 6, 5, 8, 8, 10]]).cuda()
# Decode as many tokens as the source contains.
max_len = src.shape[1]
# All-ones mask: every source position is visible (nothing is treated as padding).
src_mask = torch.ones(1, 1, max_len).cuda()
print(greedy_decode(model, src, src_mask, max_len=max_len, start_symbol=1))

tensor([[ 1,  6,  3,  3,  5,  6,  5,  8,  8, 10]], device='cuda:0')


In [8]:
from datetime import date
from torch.nn.utils.rnn import pad_sequence

In [9]:
# 1. 随机生成日期, 并以输入格式和目标格式显示
MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]


def random_dates(n_dates):
    """
    Draw ``n_dates`` random dates between 1000-01-01 and 9999-12-31 (inclusive).

    :return: tuple (X, y) of equally long lists, where X holds the dates in
             "Month DD, YYYY" form and y holds the same dates in ISO
             "YYYY-MM-DD" form
    """
    first_ordinal = date(1000, 1, 1).toordinal()
    last_ordinal = date(9999, 12, 31).toordinal()

    # randint's upper bound is exclusive, hence the + 1.
    ordinals = np.random.randint(low=first_ordinal, high=last_ordinal + 1, size=n_dates)

    X, y = [], []
    for ordinal in ordinals:
        day = date.fromordinal(ordinal)
        X.append(MONTHS[day.month - 1] + " " + day.strftime("%d, %Y"))
        y.append(day.isoformat())

    return X, y


# 2. 确定输入,目标的词汇(字符)表
# Input alphabet: every distinct character that can occur in a "Month DD, YYYY"
# string (letters of the month names, digits, comma, space), sorted.
input_chars = "".join(sorted(set("".join(MONTHS) + "0123456789, ")))
# Output alphabet: the characters of an ISO date.
output_chars = "0123456789-"
# Vocabulary sizes are the number of distinct characters on each side.
src_vocab = len(input_chars)
tgt_vocab = len(output_chars)


# 3. 编写函数将字符串转化为IDs形式
def date_str_to_ids(date_str, chars_list):
    """
    Map every character of ``date_str`` to its 1-based position in ``chars_list``.

    :raises ValueError: if a character is not contained in ``chars_list``
    """
    ids = []
    for char in date_str:
        ids.append(chars_list.index(char) + 1)
    return ids


# 4. Handle variable-length sequences
def prepare_date_strs(date_strs, chars=input_chars, device=None):
    """
    Encode date strings as one zero-padded tensor of character ids.

    :param date_strs: iterable of strings to encode
    :param chars: vocabulary handed to date_str_to_ids
    :param device: device of the returned tensor; when None the tensor goes to
                   CUDA if it is available (the previous hard-coded behaviour)
                   and stays on the CPU otherwise
    :return: tensor of shape (len(date_strs), longest string); shorter rows are
             right-padded with 0
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode and pad on the CPU, then transfer once instead of once per string.
    X_ids = [torch.tensor(date_str_to_ids(date_str, chars)) for date_str in date_strs]
    X = pad_sequence(X_ids, batch_first=True, padding_value=0)
    return X.to(device)


# 5. Batching and masking
class Batch:
    """Holds one batch of source/target ids together with the matching masks."""

    def __init__(self, src, tgt, pad=0):
        """
        :param src: source id tensor
        :param tgt: target id tensor including the leading start token, or None
        :param pad: id that marks padding (default 0, i.e. <blank>)
        """
        self.src = src
        # True where the source holds a real token, False on padding; the extra
        # axis is inserted in front of the last dimension.
        self.src_mask = src.ne(pad).unsqueeze(-2)

        if tgt is None:
            return

        # Decoder input: everything except the last position.
        self.tgt = tgt[:, :-1]
        # Expected decoder output: everything except the first (start) position.
        self.tgt_y = tgt[:, 1:]
        self.tgt_mask = self.make_std_mask(self.tgt, pad)
        # Number of non-padding target tokens in the batch.
        self.ntokens = self.tgt_y.ne(pad).sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """
        Build the target mask that hides both padding and future positions.

        :param tgt: decoder input ids
        :param pad: id that marks padding
        :return: boolean mask, the AND of the padding mask and subsequent_mask
        """
        not_pad = tgt.ne(pad).unsqueeze(-2)
        # subsequent_mask is cast to the dtype/device of the padding mask so that
        # the two can be combined element-wise.
        causal = subsequent_mask(size=tgt.size(-1)).type_as(not_pad)
        return not_pad & causal


# 6. Build the dataset

# Start-of-sequence id. date_str_to_ids assigns ids 1..tgt_vocab to the target
# characters and 0 is used for padding, so tgt_vocab + 1 is the next free id.
sos_id = tgt_vocab + 1  # 11+1=12


def shift_output_sequences(y, device=None):
    """
    Prepend the start-of-sequence id to every target sequence.

    The complete target is kept (nothing is cut off at the end).

    :param y: 2-D tensor of target ids, one row per sequence
    :param device: pass "cuda" to move the result to the GPU; any other value
                   leaves the result on the device of `y`
    :return: tensor with one more column than `y`, whose first column is sos_id
    """
    # Create the <sos> column with y's dtype and on y's device so that the
    # concatenation never mixes devices, whatever `device` says.
    sos_column = torch.full((len(y), 1), sos_id, dtype=y.dtype, device=y.device)
    decoder = torch.cat((sos_column, y), dim=1)
    if device == "cuda":
        decoder = decoder.cuda()
    return decoder


def create_dataset(n_dates, device=None):
    """
    Sample `n_dates` random dates and encode them as model inputs and targets.

    :param n_dates: number of (input, target) pairs to create
    :param device: forwarded to shift_output_sequences
    :return: tuple (source ids, target ids with the start token prepended)
    """
    source_strs, target_strs = random_dates(n_dates)
    source_ids = prepare_date_strs(source_strs, input_chars)
    target_ids = prepare_date_strs(target_strs, output_chars)
    return source_ids, shift_output_sequences(target_ids, device=device)


def data_gen(n_batches, batch_size, device=None):
    """
    Random batch generator for the date-string conversion task.

    :param n_batches: number of batches to yield
    :param batch_size: number of date pairs per batch
    :param device: forwarded to create_dataset; "cuda" additionally moves the
                   tensors of every batch to the GPU
    :return: generator of Batch objects built with pad=0
    """
    on_gpu = device == "cuda"
    for _ in range(n_batches):
        sources, targets = create_dataset(batch_size, device=device)
        # Work on detached copies that never take part in gradient computation.
        src = sources.requires_grad_(False).clone().detach()
        tgt = targets.requires_grad_(False).clone().detach()
        if on_gpu:
            src, tgt = src.cuda(), tgt.cuda()
        yield Batch(src=src, tgt=tgt, pad=0)


# 7. Train and evaluate the model
def example_simple_model(device=None):
    """
    Train an encoder-decoder (make_model with N=2) on random dates and save it.

    Every epoch runs one training pass and one validation pass on freshly
    generated batches and prints the mean validation loss.  After the last
    epoch the model is put into eval mode and written to
    './example_2_date.pth'.

    :param device: pass "cuda" to move the model and all batches to the GPU
    """
    # Target ids: 0 = padding, 1..tgt_vocab = characters, tgt_vocab + 1 = <sos>,
    # i.e. tgt_vocab + 2 classes.  The source side only adds the padding id.
    n_tgt_classes = tgt_vocab + 2
    criterion = LabelSmoothing(size=n_tgt_classes, padding_idx=0, smoothing=0.0)
    model = make_model(src_vocab=src_vocab + 1, tgt_vocab=n_tgt_classes, N=2)
    if device == "cuda":
        model.cuda()
    d_model = model.src_embed[0].d_model  # model width, consumed by rate()

    epochs = 20
    train_batches_per_epoch = 200  # batches generated per training epoch
    val_batches_per_epoch = 50  # batches generated per validation epoch
    samples_per_batch = 100

    def lr_factor(step):
        # Learning-rate multiplier computed by rate() for the given step.
        return rate(step=step, model_size=d_model, factor=0.1, warmup=600)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.5,
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
    lr_scheduler = LambdaLR(optimizer=optimizer, lr_lambda=lr_factor)

    stars = "*" * 5
    for epoch in range(epochs):
        torch.cuda.empty_cache()
        loss_compute = SimpleLossCompute(generator=model.generator,
                                         criterion=criterion)

        # The printed label reads "batch", but the number is the epoch index.
        print(f"\n|   批次: {epoch}   |")

        # ---- training pass ----
        print(stars + "训练" + stars)
        model.train()  # self.training=True
        train_batches = data_gen(n_batches=train_batches_per_epoch,
                                 batch_size=samples_per_batch,
                                 device=device)
        run_epoch(data_iter=train_batches,
                  model=model,
                  loss_compute=loss_compute,
                  optimizer=optimizer,
                  scheduler=lr_scheduler,
                  mode="train")

        # ---- validation pass ----
        print(stars + "验证" + stars)
        model.eval()  # self.training=False
        val_batches = data_gen(n_batches=val_batches_per_epoch,
                               batch_size=samples_per_batch,
                               device=device)
        # Dummy optimizer/scheduler objects are passed so that nothing is updated.
        val_stats = run_epoch(data_iter=val_batches,
                              model=model,
                              loss_compute=loss_compute,
                              optimizer=DummyOptimizer(),
                              scheduler=DummyScheduler(),
                              mode="eval")
        # First element of the result: total_loss / total_tokens.
        valid_mean_loss = val_stats[0]
        print(f"|验证损失: {valid_mean_loss} |")

    model.eval()
    torch.save(model, './example_2_date.pth')

In [10]:
# Train the date-conversion model on the GPU; the trained model is written to
# ./example_2_date.pth by example_simple_model.
example_simple_model("cuda")


|   批次: 0   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   3.15 | Tokens / Sec:  9014.5 | Learning Rate: 3.0e-07
Epoch Step:     41 | Accumulation Step:  42 | Loss:   2.39 | Tokens / Sec:  7208.7 | Learning Rate: 6.3e-06
Epoch Step:     81 | Accumulation Step:  82 | Loss:   2.02 | Tokens / Sec:  7394.0 | Learning Rate: 1.2e-05
Epoch Step:    121 | Accumulation Step: 122 | Loss:   1.67 | Tokens / Sec:  7564.0 | Learning Rate: 1.8e-05
Epoch Step:    161 | Accumulation Step: 162 | Loss:   1.36 | Tokens / Sec:  7263.0 | Learning Rate: 2.4e-05
*****验证*****
|验证损失: 0.9065974950790405 |

|   批次: 1   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.14 | Tokens / Sec:  9374.3 | Learning Rate: 3.0e-05
Epoch Step:     41 | Accumulation Step:  42 | Loss:   0.98 | Tokens / Sec:  7221.5 | Learning Rate: 3.6e-05
Epoch Step:     81 | Accumulation Step:  82 | Loss:   0.83 | Tokens / Sec:  7419.8 | Learning Rate: 4.2e-05
Epoch Step:    121 | Accumulation Step:

Epoch Step:     81 | Accumulation Step:  82 | Loss:   0.00 | Tokens / Sec:  7148.6 | Learning Rate: 4.3e-05
Epoch Step:    121 | Accumulation Step: 122 | Loss:   0.00 | Tokens / Sec:  7350.1 | Learning Rate: 4.2e-05
Epoch Step:    161 | Accumulation Step: 162 | Loss:   0.01 | Tokens / Sec:  7151.9 | Learning Rate: 4.2e-05
*****验证*****
|验证损失: 3.36340963258408e-05 |

|   批次: 14   |
*****训练*****
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.01 | Tokens / Sec:  9927.8 | Learning Rate: 4.2e-05
Epoch Step:     41 | Accumulation Step:  42 | Loss:   0.00 | Tokens / Sec:  7275.8 | Learning Rate: 4.1e-05
Epoch Step:     81 | Accumulation Step:  82 | Loss:   0.00 | Tokens / Sec:  7266.8 | Learning Rate: 4.1e-05
Epoch Step:    121 | Accumulation Step: 122 | Loss:   0.00 | Tokens / Sec:  7150.6 | Learning Rate: 4.1e-05
Epoch Step:    161 | Accumulation Step: 162 | Loss:   0.01 | Tokens / Sec:  7349.0 | Learning Rate: 4.1e-05
*****验证*****
|验证损失: 2.267956551804673e-05 |

|   批次: 15   |
****

In [11]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """
    Greedily decode one output sequence per source sequence.

    Works for any batch size and on whatever device `src` lives on.

    :param model: object providing encode(src, src_mask),
                  decode(memory, src_mask, tgt, tgt_mask) and generator(x)
    :param src: source id tensor with the batch on the first axis
    :param src_mask: mask handed unchanged to encode and decode
    :param max_len: length of the returned sequences including the start symbol
    :param start_symbol: id placed at the first position of every output
    :return: tensor of shape (batch, max_len) with the dtype and device of `src`
    """
    # Pure inference: do not record an autograd graph across the decode steps.
    with torch.no_grad():
        memory = model.encode(src=src, src_mask=src_mask)
        # ys holds what has been generated so far; it starts as one start symbol
        # per batch row and grows by one column per step.
        ys = torch.full((src.size(0), 1), start_symbol,
                        dtype=src.dtype, device=src.device)
        for _ in range(max_len - 1):
            out = model.decode(memory=memory,
                               src_mask=src_mask,
                               tgt=ys,
                               tgt_mask=subsequent_mask(size=ys.size(1)).type_as(src.data))
            # Only the last position is needed to predict the next token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1, keepdim=True)
            ys = torch.cat([ys, next_word.to(ys.dtype)], dim=1)
    return ys

In [12]:
# 8. Convert id sequences back into strings
def ids_to_date_strs(ids, chars_list):
    """
    Decode sequences of character ids into strings.

    Id 0 (padding) is rendered as a space; id k > 0 is rendered as
    chars_list[k - 1].

    :param ids: iterable of id sequences (nested lists or a 2-D integer tensor)
    :param chars_list: vocabulary string the ids refer to
    :return: list with one decoded string per sequence
    :raises IndexError: if an id is larger than len(chars_list)
    """
    # Built once instead of once per character: position 0 is the padding blank.
    lookup = " " + chars_list
    return ["".join(lookup[index] for index in sequence) for sequence in ids]


# 9. Preprocess sequences: zero-pad every input to length 18 (the maximum)
# The longest string random_dates can produce is of the form
# "September 30, 9999", which has 18 characters.
max_input_length = 18


def prepare_date_strs_padded(date_strs):
    """
    Encode input date strings and right-pad them with zeros to max_input_length.

    :param date_strs: iterable of long-form date strings
    :return: id tensor with at least max_input_length columns; a tensor that is
             already that wide (or wider) is returned unchanged
    """
    X = prepare_date_strs(date_strs, input_chars)
    shortfall = max_input_length - X.shape[1]
    if shortfall > 0:
        # (left, right, top, bottom): only the right side of the last axis grows.
        X = F.pad(X, (0, shortfall, 0, 0), 'constant', 0)
    return X

# 10. Predict ISO date strings with the trained model
def pred_date_strs(model, date_strs):
    """
    Convert long-form date strings to ISO strings with greedy decoding.

    :param model: trained encoder-decoder passed on to greedy_decode
    :param date_strs: list of strings such as "May 15, 2021"
    :return: list of decoded strings (the leading <sos> token is removed)
    """
    X = prepare_date_strs_padded(date_strs)
    # Mask the zero padding exactly like Batch.src_mask does during training,
    # on the same device as X (no hard-coded length or .cuda()).
    src_mask = (X != 0).unsqueeze(-2)

    # An ISO date has 10 characters; one extra position holds the <sos> token.
    max_output_length = 10  # len("YYYY-MM-DD")
    ys = greedy_decode(model, src=X, src_mask=src_mask,
                       max_len=max_output_length + 1, start_symbol=sos_id)

    # Drop <sos> before turning the ids back into characters.
    y_pred_str = ids_to_date_strs(ys[:, 1:], output_chars)

    return y_pred_str

In [13]:
# Reload the model object that example_simple_model saved.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model = torch.load('./example_2_date.pth')

In [14]:
# Convert one long-form date to its ISO form with the reloaded model.
pred_date_strs(model, ["May 15, 2021"])

2022-05-18 15:08:10.450909: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-18 15:08:10.451964: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-18 15:08:10.452621: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-18 15:08:10.454577: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

['2021-05-15']