In [21]:
import torch.nn as nn
import math
from torch.autograd import Variable
import torch
import numpy as np
import copy
from ipynb.fs.full.encoder import Encoder, EncoderLayer, MultiHeadedAttention, PositionwiseFeedForward, LayerNorm, sublayerConnection

In [22]:
def clones(module, N):
    """Build an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``.

    Args:
        module: The object to replicate; it is duplicated with ``copy.deepcopy``,
            so the copies do not share state with each other or with the original.
        N: How many copies to create.

    Returns:
        An ``nn.ModuleList`` of length ``N``.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [23]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    Each of the three stages is routed through its own ``sublayerConnection``
    wrapper (imported from the encoder notebook), which is invoked as
    ``wrapper(x, fn)``.

    Args:
        size: Width passed to ``sublayerConnection``; also exposed as ``self.size``.
        self_attn: Callable used as ``self_attn(x, x, x, tgt_mask)``.
        src_attn: Callable used as ``src_attn(x, memory, memory, src_mask)``.
        feed_forward: Callable applied in the last stage.
        dropout: Dropout value forwarded to ``sublayerConnection`` and ``nn.Dropout``.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        # Registration order is kept as-is so parameter/state-dict ordering is unchanged.
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(sublayerConnection(size, dropout), 3)
        # Stored on the layer but not referenced by forward().
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run ``x`` through the three wrapped stages and return the result.

        Args:
            x: Decoder-side input.
            memory: Value used as both key and value arguments of ``src_attn``.
            src_mask: Mask handed to ``src_attn``.
            tgt_mask: Mask handed to ``self_attn``.
        """

        def attend_to_self(hidden):
            # Query, key and value are all the decoder-side input.
            return self.self_attn(hidden, hidden, hidden, tgt_mask)

        def attend_to_memory(hidden):
            # Query comes from the decoder; key and value come from `memory`.
            return self.src_attn(hidden, memory, memory, src_mask)

        after_self_attn = self.sublayer[0](x, attend_to_self)
        after_src_attn = self.sublayer[1](after_self_attn, attend_to_memory)
        return self.sublayer[2](after_src_attn, self.feed_forward)

In [24]:
class Decoder(nn.Module):
    """Stack of ``N`` deep-copied layers followed by a final ``LayerNorm``.

    Args:
        layer: Prototype layer; it is cloned ``N`` times and its ``size``
            attribute is passed to ``LayerNorm``.
        N: Number of layer copies in the stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Feed ``x`` through every layer in order, then apply ``self.norm``.

        ``memory``, ``src_mask`` and ``tgt_mask`` are passed unchanged to each layer.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        normalized = self.norm(hidden)
        return normalized

In [25]:
# Hyperparameters for this cell, followed by construction of one DecoderLayer.
# `size` is handed to both PositionwiseFeedForward and DecoderLayer below.
size = 512
# Dropout value passed to the feed-forward block and to the decoder layer.
dropout = 0.1
# Not used in this cell; presumably the layer count for Decoder(layer, N) — TODO confirm.
N = 6
# First argument to MultiHeadedAttention.
head = 8
# Second argument to PositionwiseFeedForward.
d_ff = 64
# Second argument to MultiHeadedAttention; numerically equal to size / head.
d_k = 64
# Short alias so each sub-module passed to DecoderLayer is an independent deep copy.
c = copy.deepcopy
# NOTE(review): MultiHeadedAttention comes from the encoder notebook. Confirm its second
# parameter is meant to be 64 here and not the full layer width (`size` = 512).
attn = MultiHeadedAttention(head, d_k)
ff = PositionwiseFeedForward(size, d_ff, dropout)
# Self-attention and source-attention each get their own copy of `attn`.
layer = DecoderLayer(size, c(attn), c(attn), c(ff), dropout)