In [15]:
import torch.nn as nn
import math
from torch.autograd import Variable
import torch
import numpy as np
import copy

In [16]:


def subsequent_mask(size):
    """Build a mask that hides subsequent (future) positions.

    Args:
        size: Side length of the square mask.

    Returns:
        A tensor of shape ``(1, size, size)`` that is true on and below the
        main diagonal and false strictly above it.
    """
    shape = (1, size, size)
    # Ones strictly above the main diagonal flag the positions to hide.
    upper_triangle = np.triu(np.ones(shape), k=1)
    hidden = torch.from_numpy(upper_triangle.astype('uint8'))
    # Flip the sense: entries that are zero (not hidden) become true.
    return hidden == 0

In [17]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    Args:
        query: Tensor whose last dimension has size ``d_k``.
        key: Tensor whose last two dimensions are swapped and multiplied
            with ``query`` to form the score matrix.
        value: Tensor that is multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are excluded from attention.
        dropout: Optional callable applied to the attention weights.

    Returns:
        A tuple ``(output, p_attn)``: the weighted combination of ``value``
        and the attention weights (after dropout, if one was given).
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Fill with the most negative finite value of the score dtype. A
        # hard-coded -1e9 cannot be represented in float16 and makes
        # masked_fill raise an overflow error; the softmax result for wider
        # float types is unchanged because both values underflow to zero.
        fill_value = torch.finfo(scores.dtype).min
        scores = scores.masked_fill(mask == 0, fill_value)
    p_attn = nn.functional.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

# Demo: masked attention over three independent random (1, 8, 512) tensors.
# torch.rand already returns tensors, so the deprecated Variable wrapper
# is not needed.
attention(torch.rand(1, 8, 512),
          torch.rand(1, 8, 512),
          torch.rand(1, 8, 512),
          mask=subsequent_mask(8), dropout=None)


(tensor([[[0.6555, 0.3519, 0.8120,  ..., 0.2309, 0.8126, 0.7065],
          [0.2924, 0.4052, 0.6049,  ..., 0.5189, 0.7465, 0.5138],
          [0.2548, 0.3130, 0.7644,  ..., 0.4686, 0.8381, 0.3453],
          ...,
          [0.4834, 0.3927, 0.4853,  ..., 0.4872, 0.6983, 0.5081],
          [0.4996, 0.3764, 0.4590,  ..., 0.4866, 0.6616, 0.5476],
          [0.5256, 0.4462, 0.4957,  ..., 0.5151, 0.6428, 0.5716]]]),
 tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.3953, 0.6047, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.2441, 0.3541, 0.4017, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.1972, 0.2863, 0.2930, 0.2236, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.1861, 0.2338, 0.2298, 0.1864, 0.1640, 0.0000, 0.0000, 0.0000],
          [0.1347, 0.1948, 0.2270, 0.1456, 0.1485, 0.1493, 0.0000, 0.0000],
          [0.1362, 0.1617, 0.2092, 0.1386, 0.1163, 0.1390, 0.0991, 0.0000],
          [0.1086, 0.1817, 0.1560, 0.1191, 0.1109, 0.1

In [18]:
def clones(module, N):
    """Produce N identical layers.

    Args:
        module: The module to replicate.
        N: Number of copies to create.

    Returns:
        An ``nn.ModuleList`` holding ``N`` deep copies of ``module``.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives each replica its own copy of the module's state.
        replicas.append(copy.deepcopy(module))
    return replicas

In [19]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention: project, attend per head, merge, project again."""

    def __init__(self, h, d_model, dropout=0.1):
        """Take in model size and number of heads.

        Args:
            h: Number of attention heads.
            d_model: Model width; must be an integer multiple of ``h``.
            dropout: Probability handed to ``nn.Dropout``.
        """
        super(MultiHeadedAttention, self).__init__()
        # The model width must be an integer multiple of the head count.
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        # Why four linear layers? One each for query, key, value, and output.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Attention weights returned by the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: Input tensors; each is passed through its own
                linear layer and reshaped to ``(nbatches, h, -1, d_k)``.
            mask: Optional mask; a head axis is inserted at dim 1 so the
                same mask is applied to all heads.

        Returns:
            The output of the final linear layer applied to the merged
            heads, viewed as ``(nbatches, -1, h * d_k)``.
        """
        batch_size = query.size(0)
        # Same mask applied to all h heads.
        head_mask = None if mask is None else mask.unsqueeze(1)

        def split_heads(projection, tensor):
            # d_model => h x d_k, with the head axis moved to position 1.
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        # 2) Apply attention on all the projected vectors in batch.
        context, self.attn = attention(query, key, value,
                                       mask=head_mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.h * self.d_k)
        return self.linears[-1](merged)



In [20]:
class PositionwiseFeedForward(nn.Module):
    """Implements FFN equation: ``w_2(dropout(relu(w_1(x))))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        """Create the two linear layers and the dropout between them.

        Args:
            d_model: Input and output feature size.
            d_ff: Hidden feature size between the two linear layers.
            dropout: Probability handed to ``nn.Dropout``.
        """
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand with ``w_1``, apply ReLU and dropout, project back with ``w_2``."""
        hidden = self.w_1(x)
        activated = nn.functional.relu(hidden)
        return self.w_2(self.dropout(activated))

In [21]:
class LayerNorm(nn.Module):
    """Construct a layernorm module with a learnable gain and bias."""

    def __init__(self, features, eps=1e-6):
        """Create the gain ``a_2`` (ones) and bias ``b_2`` (zeros).

        Args:
            features: Length of the gain and bias vectors.
            eps: Constant added to the standard deviation before dividing.
        """
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` over its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        centered = x - mean
        # Gain is applied before the division, matching the original
        # evaluation order; eps is added to the std, not the variance.
        scaled = self.a_2 * centered
        return scaled / (std + self.eps) + self.b_2

In [22]:
class SublayerConnection(nn.Module):
    """Residual connection around a sublayer whose input is normalised first."""

    def __init__(self, size, dropout):
        """Create the normalisation and dropout modules.

        Args:
            size: Feature size handed to ``nn.LayerNorm``.
            dropout: Probability handed to ``nn.Dropout``.
        """
        super(SublayerConnection, self).__init__()
        # NOTE(review): this uses torch's built-in nn.LayerNorm, whereas
        # Encoder uses the hand-written LayerNorm class -- confirm intended.
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch


In [23]:
# Demo configuration for exercising multi-head self-attention.
size = 512      # model width passed to MultiHeadedAttention as d_model
dropout = 0.1   # dropout probability
head = 8        # number of attention heads
x = torch.rand(2, 4, size)
mask = subsequent_mask(4)
# Pass `dropout` explicitly: previously the setting above was never used, so
# editing it had no effect (the module silently fell back to its default).
self_attn = MultiHeadedAttention(head, size, dropout=dropout)


def sublayer(x):
    """Run self-attention on ``x`` using the module-level ``self_attn`` and ``mask``."""
    return self_attn(x, x, x, mask)



In [24]:
class EncoderLayer(nn.Module):
    """Self-attention then feed-forward, each wrapped in a SublayerConnection."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        """Store the two sublayers and create their residual wrappers.

        Args:
            size: Feature size handed to each ``SublayerConnection``; also
                kept as ``self.size``.
            self_attn: Module called as ``self_attn(x, x, x, mask)``.
            feed_forward: Module called with the attention sublayer's output.
            dropout: Dropout probability handed to each ``SublayerConnection``.
        """
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply the attention sublayer, then the feed-forward sublayer."""

        def attend(normed):
            # Self-attention: the same tensor serves as query, key and value.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

In [25]:
class Encoder(nn.Module):
    """A stack of N cloned layers followed by a final LayerNorm."""

    def __init__(self, layer, N):
        """Clone ``layer`` N times and create the final normalisation.

        Args:
            layer: Module to replicate; its ``size`` attribute sets the
                feature size of the final ``LayerNorm``.
            N: Number of copies in the stack.
        """
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (with ``mask``) through every layer in turn, then normalise."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.norm(hidden)