In [2]:
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

In [3]:
"""
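S: start-of-sequence symbol placed at the beginning of every decoder input
E: end-of-sequence symbol placed at the end of every decoder output
P: padding symbol used to fill a sequence that is shorter than the fixed sequence length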
"""
# Toy parallel corpus. Every row holds three space-separated token strings:
#   row[0] -> encoder input, row[1] -> decoder input, row[2] -> decoder output
sentences = [
    [
        'ich mochte ein bier P',
        'S i want a beer .',
        'i want a beer . E',
    ],
    [
        'ich mochte ein cola P',
        'S i want a coke .',
        'i want a coke . E',
    ],
]

In [4]:
# Source vocabulary: each word's id is its position in the list below,
# so the padding token 'P' gets id 0.
src_vocab = {
    word: idx
    for idx, word in enumerate(['P', 'ich', 'mochte', 'ein', 'bier', 'cola'])
}
src_vocab_size = len(src_vocab)

# Target vocabulary, built the same way; 'P' is again id 0.
tgt_vocab = {
    word: idx
    for idx, word in enumerate(['P', 'i', 'want', 'a', 'beer', 'coke', 'S', 'E', '.'])
}
tgt_vocab_size = len(tgt_vocab)

In [6]:
# Reverse lookup: target-token id -> word. Built by inverting the ids stored in
# tgt_vocab rather than by enumeration position, so the mapping stays correct
# even if the vocabulary ids are ever not 0..n-1 in insertion order.
id2word = {idx: word for word, idx in tgt_vocab.items()}

In [7]:
# Maximum sequence lengths (in tokens) of the encoder input and decoder input.
src_len, tgt_len = 5, 6

# Transformer hyper-parameters
d_model = 512   # embedding size
d_ff = 2048     # feed-forward dimension
d_k = 64        # dimension of K (and Q)
d_v = 64        # dimension of V
n_layers = 6
n_heads = 8

In [10]:
def make_data(sentences):
    """Convert whitespace-tokenised sentence triples into id tensors.

    Args:
        sentences: sequence of rows where row[0] is the encoder-input text,
            row[1] the decoder-input text and row[2] the decoder-output text.
            row[0] tokens are looked up in ``src_vocab``; row[1] and row[2]
            tokens are looked up in ``tgt_vocab``.

    Returns:
        Tuple ``(enc_inputs, dec_inputs, dec_outputs)`` of ``torch.LongTensor``,
        each with one row of token ids per input row.

    Raises:
        KeyError: if a token is missing from the corresponding vocabulary.
    """
    def to_ids(text, vocab):
        # Split on whitespace and map every token to its vocabulary id.
        return [vocab[token] for token in text.split()]

    enc_rows, dec_in_rows, dec_out_rows = [], [], []
    for row in sentences:
        enc_rows.append(to_ids(row[0], src_vocab))
        dec_in_rows.append(to_ids(row[1], tgt_vocab))
        dec_out_rows.append(to_ids(row[2], tgt_vocab))

    return (
        torch.LongTensor(enc_rows),
        torch.LongTensor(dec_in_rows),
        torch.LongTensor(dec_out_rows),
    )

# Encode the toy corpus once; make_data returns three LongTensors with one row per sentence pair.
enc_inputs, dec_inputs, dec_outputs = make_data(sentences)

In [12]:
class MyDataSet(Data.Dataset):
    """Dataset serving aligned (enc_input, dec_input, dec_output) samples.

    The three containers are stored as given and indexed with the same index,
    so they are expected to have the same number of entries along axis 0.
    """

    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        super().__init__()
        self.enc_inputs, self.dec_inputs, self.dec_outputs = (
            enc_inputs,
            dec_inputs,
            dec_outputs,
        )

    def __len__(self):
        # The sample count is the size of the first axis of the encoder inputs.
        n_samples = self.enc_inputs.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # Return the idx-th entry of each container as one training sample.
        sample = (
            self.enc_inputs[idx],
            self.dec_inputs[idx],
            self.dec_outputs[idx],
        )
        return sample
        
# Batches of two samples, reshuffled on every pass over the data.
loader = Data.DataLoader(
    MyDataSet(enc_inputs, dec_inputs, dec_outputs),
    batch_size=2,
    shuffle=True,
)

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
        28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        42., 43., 44., 45., 46., 47., 48., 49.])

In [15]:
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is computed once in the
    constructor and registered as a buffer, so it is part of the module state
    but is not a trainable parameter. Even feature columns hold
    ``sin(pos * w_i)`` and odd columns hold ``cos(pos * w_i)`` with
    ``w_i = exp(-2i * ln(10000) / d_model)``.

    Args:
        d_model: size of the feature (last) dimension of the inputs.
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions precomputed; the first dimension of the
            input passed to ``forward`` must not exceed this.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        # Zero-argument super() avoids spelling out the class name, which is
        # easy to get out of sync with the actual class definition.
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )  # [ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the frequencies to match; for even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # [max_len, d_model] -> [max_len, 1, d_model] so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x: tensor of shape ``[seq_len, batch_size, d_model]``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
def get_attn_pad_mask(seq_q, seq_k):
    """Build a boolean attention mask that marks padding positions of the keys.

    Args:
        seq_q: token ids of shape ``[batch_size, len_q]``; only ``len_q`` is used.
        seq_k: token ids of shape ``[batch_size, len_k]``; id 0 is treated as padding.
            ``len_q`` and ``len_k`` may differ (e.g. tgt_len vs. src_len).

    Returns:
        Bool tensor of shape ``[batch_size, len_q, len_k]`` that is ``True``
        wherever the key token is padding (id 0) and ``False`` elsewhere. The
        result is an expanded view: every query row shares the same key mask.
    """
    len_q = seq_q.size(1)
    batch_size, len_k = seq_k.size()
    # eq(0) flags PAD tokens: True means "mask this key position".
    pad_attn_mask = seq_k.eq(0).unsqueeze(1)  # [batch_size, 1, len_k]
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # [batch_size, len_q, len_k]
    

In [None]:
class Encoder(nn.Module):
    """Encoder stack: token embedding, position encoding, then ``n_layers`` encoder layers."""

    def __init__(self):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionEncoding(d_model)
        encoder_stack = [EncoderLayer() for _ in range(n_layers)]
        self.layers = nn.ModuleList(encoder_stack)

    def forward(self, enc_inputs):
        """Encode a batch of source token ids.

        Args:
            enc_inputs: token ids of shape ``[batch_size, src_len]``.

        Returns:
            ``(enc_outputs, enc_self_attns)`` where ``enc_outputs`` is the output
            of the last layer and ``enc_self_attns`` is a list with the attention
            returned by each layer, in layer order.
        """
        token_emb = self.src_emb(enc_inputs)  # [batch_size, src_len, d_model]

        # pos_emb works on [src_len, batch_size, d_model], so swap the first two
        # axes before the call and swap them back afterwards.
        seq_first = token_emb.transpose(0, 1)
        hidden = self.pos_emb(seq_first).transpose(0, 1)  # [batch_size, src_len, d_model]

        # [batch_size, src_len, src_len]; hides the padding tokens (id 0).
        pad_mask = get_attn_pad_mask(enc_inputs, enc_inputs)

        attn_per_layer = []
        for enc_layer in self.layers:
            # Per the layer's contract: hidden is [batch_size, src_len, d_model],
            # layer_attn is [batch_size, n_heads, src_len, src_len].
            hidden, layer_attn = enc_layer(hidden, pad_mask)
            attn_per_layer.append(layer_attn)
        return hidden, attn_per_layer
        

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model with a bias-free linear projection onto the target vocabulary.

    The sub-modules are placed on the GPU when CUDA is available and on the CPU
    otherwise, so the model can also be constructed on hosts without a GPU.
    """

    def __init__(self):
        super().__init__()
        # Prefer CUDA (same placement as calling .cuda()), but fall back to the
        # CPU instead of raising when no GPU is present.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.encoder = Encoder().to(device)
        self.decoder = Decoder().to(device)
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False).to(device)

    def forward(self, enc_inputs, dec_inputs):
        """Run the full model on a batch.

        Args:
            enc_inputs: source token ids, ``[batch_size, src_len]``.
            dec_inputs: target token ids, ``[batch_size, tgt_len]``.

        Returns:
            Tuple ``(logits, enc_self_attns, dec_self_attns, dec_enc_attns)``.
            ``logits`` is flattened over every leading axis to 2-D with the
            vocabulary dimension last (``[batch_size * tgt_len, tgt_vocab_size]``);
            the three attention values are passed through unchanged from the
            encoder and decoder.
        """
        # enc_outputs: [batch_size, src_len, d_model]
        # enc_self_attns: [n_layers, batch_size, n_heads, src_len, src_len]
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)

        # dec_outputs: [batch_size, tgt_len, d_model]
        # dec_self_attns: [n_layers, batch_size, n_heads, tgt_len, tgt_len]
        # dec_enc_attns: [n_layers, batch_size, n_heads, tgt_len, src_len]
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)

        # dec_logits: [batch_size, tgt_len, tgt_vocab_size]
        dec_logits = self.projection(dec_outputs)

        # Collapse batch and time so the logits line up with flattened targets.
        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns
