# 生成掩码张量

In [1]:
import numpy as np
import torch
import math
import torch.nn as nn

In [2]:
def subsequent_mask(size):
    ones = np.ones((1, size, size))
    triu = np.triu(ones, k=1).astype('uint8')
    mask = 1 - triu
    mask = torch.from_numpy(mask)
    return mask

In [3]:
subsequent_mask(5)

tensor([[[1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0],
         [1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1]]], dtype=torch.uint8)

## 思路:  生成一个numpy数字 什么样的形状很重要,用np.triu()完成1,0数组, 然后1-这个数组 最后torch.from_numpy()转换成张量 

# attention 方法

## 思路: q点乘k的转置, 除以根号d_k  这一步可以称作1 scale dot_product
2 进行掩码
3 softmax
4 dropout
5 跟v 做点积

In [4]:
def attention(q, k, v, mask=None, dropout=None):
    scaled = torch.matmul(q, k.transpose(-1, -2), -1) / math.sqrt(q.size(-1))
    if mask is not None:
        scaled = scaled.filled_mask(mask==0, -1e9)
    attn = torch.functional.softmax(scaled, -1)
    if droupout is not None:
        attn = dropout(attn)
    output = torch.matmul(attn, v)
    return output

# 多头注意力

## 思路: 1q,k,v 每个进行一个全连接层, 2  转换维度, 形状 3 attention 4 转换维度形状 5 全连接 

In [5]:
import copy
def clones(module, n):
    return torch.nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
    

In [6]:
class MultiHeadedAttention(torch.nn.Module):
    def __init__(self, embedding_dim, head, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        self.linears = clones(torch.nn.Linear(embedding_dim, embedding_dim), 4)
        self.norm = LayerNorm(embedding_dim)
        assert embedding_dim % head == 0
        self.head = head
        self.embedding_dim = embedding_dim
        self.d_k = self.embedding_dim // self.head
        self.dropout1 = torch.nn.Dropout(p=dropout)
        self.attn = None
        self.dropout2 = torch.nn.Dropout(p=dropout)
        
    def forward(self, q, k, v, mask=None):
        q_view, k_view, v_view = [model(x).view(q.size(0), -1, self.head, self.d_k).transpose(1, 2) for model,x in zip(self.linears, (q, k, v))]
        if mask is not None:
            mask = mask.unsqueeze(0)
        x, self.attn = attention(q_view, k_view, v_view, mask=mask, dropout=self.dropout1)
        x = x.transpose(1, 2).congurous().view(q.size(0), -1, self.embedding_dim)
        x = self.linears[-1](x)
        return self.norm(q + self.dropout2(x))

In [7]:
clones(torch.nn.Linear(5, 5), 3)

ModuleList(
  (0): Linear(in_features=5, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=5, bias=True)
  (2): Linear(in_features=5, out_features=5, bias=True)
)

# 前馈全连接层

## 思路: 全连接 > relu> dropout> 全连接

In [8]:
class PositionwiseFeedForward(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linears = clones(torch.nn.Linear(d_model, d_ff), 2)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.dropout2 = torch.nn.Dropout(p=dropout)
        self.norm = LayerNorm(d_model)
        
    def forward(self, x):
        output = self.linears[1](self.dropout(torch.functional.relu(self.linears[0](x))))
        return self.norm(x + self.dropout2(output))

# Normalize层

## 思路: 减均值 除标准差+极小值防止分母为零 加参数张量

In [9]:
class LayerNorm(torch.nn.Module):
    def __init__(self, embedding_dim, epsilon=1e-6):
        super(LayerNorm, self).__init__()
        self.gamma = torch.nn.Parameter(torch.ones(embedding_dim))
        self.beta = torch.nn.Parameter(torch.zeros(embedding_dim))
        self.epsilon = epsilon
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        x = self.gamma * (x - mean) / (std + self.epsilon)  + self.beta
        return x
        
        
        

In [10]:
layernorm = LayerNorm(512)

# 子层链接

## 思路 sublayer > dropout > +x > normalize

In [11]:
# class SublayerConnection(torch.nn.Module):
#     def __init(self, embedding_dim):
#         super(SublayerConnection, self).__init__()
#         self.normalize = LayerNorm(embedding_dim)
#     def forward(x, )
        

# 编码层

## 思路: multiattention > feedforward

In [22]:
class EncoderLayer(torch.nn.Module):
    def __init__(self, self_attn, ff):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.ff = ff
        
    def forward(self, x, mask):
        attention = self.self_attn(x, x, x, mask)
        return self.ff(attention)
        
        

# 编码器

In [13]:
class Encoder(torch.nn.Module):
    def __init__(self, encoder_layer, n):
        super(Encoder, self).__init__()
        self.layers = clones(encoder_layer, n)
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return x
        

# 解码器层

## 思路:  self_attention >  attention > feedforward

In [14]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, attention_layer, ff_layer):
        super(DecoderLayer, self).__init__()
        self.attn_layers =  clones(attention_layer, 2)
        self.ff = ff_layer
    def forward(self, x, encoder_output, source_mask, target_mask):
        self_attn = self.attn_layers[0](x, x, x, target_mask)
        x = self.attn_layers[1](self_attn, encoder_output, encoder_output, source_mask)
        x = self.ff(x)
        return x
        

# 解码器

In [15]:
class Decoder(torch.nn.Module):
    def __init__(self, DecoderLayer, n):
        super(Decoder, self).__init__()
        self.decoder_layers = clones(DecoderLayer, n)
        
    def forward(self, x, encoder_output, source_mask, target_mask):
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, source_mask, target_mask)
        return x
        

# Generator

## 思路: 全连接 > log_softmax

In [16]:
class Generator(torch.nn.Module):
    def __init__(self, embedding_dim, output_dim):
        super(Generator, self).__init__()
        self.linear = torch.nn.Linear(embedding_dim, output_dim)
    def forward(self, x):
        x = torch.functional.log_softmax(self.linear(x), dim=-1)
        return x
        

# encoderdecoder

## 思路: source, source_mask > encoder > encoder_output,    target, encoder_output, source_mask, target_mask>decoder > decoder_output>generator

In [17]:
class EncoderDecoder(torch.nn.Module):
    def __init__(self, encoder, decoder, source_embedding, target_embedding, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding = source_embedding
        self.target_embedding = target_embedding
        self.generator = generator
    def forward(self, source, target, source_mask, target_mask):
        encoder_output = self.encoder(self.source_embedding(x), source_mask)
        decoder_output = self.decoder(self.target_embedding(target), encoder_output, source_mask, target_mask)
        return self.generator(decoder_output)

In [18]:
def make_model(source_vocab, target_vocab, N=6, 
               d_model=512, d_ff=2048, head=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadedAttention(d_model, head)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(Encoder(EncoderLayer(c(attn), c(ff)), N),
        Decoder(DecoderLayer(c(attn), c(ff)), N),
        torch.nn.Sequential(Embeddings(d_model, source_vocab), c(position)),
        torch.nn.Sequential(Embeddings(d_model, target_vocab), c(position)),
        Generator(d_model, target_vocab))
    for p in model.parameters():
        if p.dim() > 1:
            torch.nn.init.xavier_uniform(p)
    return model

In [19]:
class Embeddings(torch.nn.Module):
    def __init__(self, d_model, vocab):
        """类的初始化函数, 有两个参数, d_model: 指词嵌入的维度, vocab: 指词表的大小."""
        # 接着就是使用super的方式指明继承nn.Module的初始化函数, 我们自己实现的所有层都会这样去写.
        super(Embeddings, self).__init__()
        # 之后就是调用nn中的预定义层Embedding, 获得一个词嵌入对象self.lut
        self.lut = torch.nn.Embedding(vocab, d_model)
        # 最后就是将d_model传入类中
        self.d_model = d_model

    def forward(self, x):
        """可以将其理解为该层的前向传播逻辑，所有层中都会有此函数
           当传给该类的实例化对象参数时, 自动调用该类函数
           参数x: 因为Embedding层是首层, 所以代表输入给模型的文本通过词汇映射后的张量"""

        # 将x传给self.lut并与根号下self.d_model相乘作为结果返回
        return self.lut(x) * math.sqrt(self.d_model)

In [20]:
# 定义位置编码器类, 我们同样把它看做一个层, 因此会继承nn.Module    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        """位置编码器类的初始化函数, 共有三个参数, 分别是d_model: 词嵌入维度, 
           dropout: 置0比率, max_len: 每个句子的最大长度"""
        super(PositionalEncoding, self).__init__()

        # 实例化nn中预定义的Dropout层, 并将dropout传入其中, 获得对象self.dropout
        self.dropout = nn.Dropout(p=dropout)

        # 初始化一个位置编码矩阵, 它是一个0阵，矩阵的大小是max_len x d_model.
        pe = torch.zeros(max_len, d_model)

        
        position = torch.arange(0, max_len).unsqueeze(1)

     
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

       
        pe = pe.unsqueeze(0)

        # 最后把pe位置编码矩阵注册成模型的buffer，什么是buffer呢，
        # 我们把它认为是对模型效果有帮助的，但是却不是模型结构中超参数或者参数，不需要随着优化步骤进行更新的增益对象. 
        # 注册之后我们就可以在模型保存后重加载时和模型结构与参数一同被加载.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """forward函数的参数是x, 表示文本序列的词嵌入表示"""
       
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        # 最后使用self.dropout对象进行'丢弃'操作, 并返回结果.
        return self.dropout(x)

In [23]:
make_model(11, 11)
# model = MultiHeadedAttention(embedding_dim=512, head=8)

  torch.nn.init.xavier_uniform(p)


EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (norm): LayerNorm()
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (ff): PositionwiseFeedForward(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): Linear(in_features=512, out_features=2048, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
          (norm): LayerNorm()
        )
      )
      (1): EncoderLaye