## todo_list
1. [ ] 是否已经实现了 post_norm？（当前 SublayerConnection 采用的是 pre_norm 形式：x + dropout(sublayer(norm(x)))，即先归一化再进入子层）

# 1. 导入相关的库

In [5]:
import math
import time
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

# 2. 实现相关的函数

In [6]:
def clones(module, N):
	"""Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``.

	Each copy is produced with ``copy.deepcopy``, so the copies do not share
	parameters with ``module`` or with each other.
	"""
	replicas = []
	for _ in range(N):
		replicas.append(copy.deepcopy(module))
	return nn.ModuleList(replicas)


class LayerNorm(nn.Module):
	"""Normalise the last dimension of the input, then apply a learned scale and shift.

	``a_2`` (initialised to ones) is the per-feature scale and ``b_2``
	(initialised to zeros) is the per-feature shift; both have ``size`` elements.
	"""

	def __init__(self, size, eps=1e-6):
		super().__init__()
		self.eps = eps
		# Registration order (a_2 before b_2) is kept so parameter ordering is unchanged.
		self.a_2 = nn.Parameter(torch.ones(size))
		self.b_2 = nn.Parameter(torch.zeros(size))

	def forward(self, x):
		"""Return ``a_2 * (x - mean) / (std + eps) + b_2`` computed over the last axis."""
		centered = x - x.mean(dim=-1, keepdim=True)
		# eps is added to the standard deviation itself (not to the variance).
		spread = x.std(dim=-1, keepdim=True) + self.eps
		return self.a_2 * centered / spread + self.b_2


class SublayerConnection(nn.Module):
	"""Residual connection around a sublayer, with normalisation applied first.

	The input is normalised, passed through ``sublayer``, run through dropout,
	and added back to the un-normalised input (a pre-norm residual block).
	"""

	def __init__(self, size, dropout):
		"""
		size: feature size handed to ``LayerNorm``.
		dropout: drop probability for the sublayer output.
		"""
		super(SublayerConnection, self).__init__()
		self.norm = LayerNorm(size)
		self.dropout = nn.Dropout(dropout)

	def forward(self, x, sublayer):
		"""Return ``x + dropout(sublayer(norm(x)))``.

		``sublayer`` is any callable taking the normalised tensor and returning
		a tensor that can be added to ``x``.
		"""
		# Fixed: the original referenced the misspelled attribute ``self.dropput``.
		return x + self.dropout(sublayer(self.norm(x)))


class Generator(nn.Module):
	"""Project model features to vocabulary log-probabilities."""

	def __init__(self, d_model, vocab):
		"""
		d_model: size of the incoming feature dimension.
		vocab: number of output classes (vocabulary size).
		"""
		super(Generator, self).__init__()
		self.proj = nn.Linear(d_model, vocab)

	def forward(self, x):
		"""Return ``log_softmax(proj(x))`` taken over the last (vocabulary) axis."""
		# Fixed: the attribute is ``proj`` (``self.project`` never existed), and the
		# normalisation must run over the last axis so that every position gets its
		# own distribution over the vocabulary regardless of the input rank.
		return F.log_softmax(self.proj(x), dim=-1)



# 3. 实现transformer

In [7]:
class Transformer(nn.Module):
	"""Encoder-decoder wrapper that wires embeddings, encoder and decoder together.

	``generator`` is stored on the module but is not applied by ``forward``;
	callers apply it to the decoder output themselves.
	"""

	def __init__(self, encoder, decoder, src_emb, tgt_emb, generator):
		super().__init__()
		self.encoder = encoder
		self.decoder = decoder
		self.src_emb = src_emb
		self.tgt_emb = tgt_emb
		self.generator = generator

	def forward(self, src, tgt, src_mask, tgt_mask):
		r"""
		Encode ``src`` and decode ``tgt`` against the result (used while training).
		"""
		memory = self.encode(src, src_mask)
		return self.decode(memory, src_mask, tgt, tgt_mask)

	def encode(self, src, src_mask):
		"""Embed ``src`` and run the encoder on it with ``src_mask``."""
		embedded_src = self.src_emb(src)
		return self.encoder(embedded_src, src_mask)

	def decode(self, memory, src_mask, tgt, tgt_mask):
		"""Embed ``tgt`` and run the decoder against ``memory`` with both masks."""
		embedded_tgt = self.tgt_emb(tgt)
		return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)


class Encoder(nn.Module):
	"""Stack of ``N`` copies of ``layer`` followed by a final ``LayerNorm``."""

	def __init__(self, layer, N):
		"""
		layer: the encoder layer to replicate; it must expose a ``size`` attribute.
		N: number of copies to stack.
		"""
		super(Encoder, self).__init__()
		self.layers = clones(layer, N)
		self.norm = LayerNorm(layer.size)

	def forward(self, x, mask):
		"""Pass ``x`` (with ``mask``) through every layer in order, then normalise."""
		# Fixed: iterate over ``self.layers``; the bare name ``layers`` is undefined
		# and raised NameError on every call.
		for layer in self.layers:
			x = layer(x, mask)
		return self.norm(x)


class EncoderLayer(nn.Module):
	"""One encoder layer: self-attention, then feed-forward.

	Each of the two steps is wrapped in its own ``SublayerConnection``.
	"""

	def __init__(self, size, self_attn, feed_forward, dropout):
		super().__init__()
		self.size = size
		self.self_attn = self_attn
		self.feed_forward = feed_forward
		# Index 0 wraps self-attention, index 1 wraps the feed-forward module.
		self.sublayer = clones(SublayerConnection(size, dropout), 2)

	def forward(self, x, mask):
		"""Apply self-attention over ``x`` with ``mask``, then the feed-forward module."""

		def attend(inner):
			# The same tensor is used as query, key and value.
			return self.self_attn(inner, inner, inner, mask)

		attended = self.sublayer[0](x, attend)
		return self.sublayer[1](attended, self.feed_forward)


class Decoder(nn.Module):
	"""Stack of ``N`` copies of ``layer`` followed by a final ``LayerNorm``."""

	def __init__(self, layer, N):
		"""
		layer: the decoder layer to replicate; it must expose a ``size`` attribute.
		N: number of copies to stack.
		"""
		super(Decoder, self).__init__()
		self.layers = clones(layer, N)
		self.norm = LayerNorm(layer.size)

	def forward(self, x, memory, src_mask, tgt_mask):
		"""Pass ``x`` through every layer (each also receives ``memory`` and both
		masks), then normalise the result."""
		# Fixed: iterate over ``self.layers``; the bare name ``layers`` is undefined
		# and raised NameError on every call.
		for layer in self.layers:
			x = layer(x, memory, src_mask, tgt_mask)
		return self.norm(x)


class DecoderLayer(nn.Module):
	"""One decoder layer: self-attention, cross-attention over ``memory``, feed-forward.

	Each of the three steps is wrapped in its own ``SublayerConnection``.
	"""

	def __init__(self, size, self_attn, cross_attn, feed_forward, dropout):
		super().__init__()
		self.size = size
		self.self_attn = self_attn
		self.cross_attn = cross_attn
		self.feed_forward = feed_forward
		# Index 0: self-attention, 1: cross-attention, 2: feed-forward.
		self.sublayer = clones(SublayerConnection(size, dropout), 3)

	def forward(self, x, memory, cross_mask, tgt_mask):
		"""Run the three sublayers in order and return the final tensor.

		``tgt_mask`` is passed to self-attention; ``cross_mask`` is passed to the
		attention whose keys and values are ``memory``.
		"""

		def attend_to_self(inner):
			return self.self_attn(inner, inner, inner, tgt_mask)

		def attend_to_memory(inner):
			# Queries come from the decoder stream; keys and values are the memory.
			return self.cross_attn(inner, memory, memory, cross_mask)

		after_self = self.sublayer[0](x, attend_to_self)
		after_cross = self.sublayer[1](after_self, attend_to_memory)
		return self.sublayer[2](after_cross, self.feed_forward)

# 4. 实现attention

In [8]:
def subsequent_mask(size):
	"""Build a boolean mask that hides later positions.

	Returns a ``(1, size, size)`` bool tensor whose entry ``[0, i, j]`` is True
	when ``j <= i`` (the diagonal and everything below it) and False otherwise.
	"""
	positions = torch.arange(size)
	row_index = positions.unsqueeze(1)
	col_index = positions.unsqueeze(0)
	# Broadcasting the comparison yields the lower-triangular pattern directly.
	return (col_index <= row_index).unsqueeze(0)


def attention(query, key, value, mask=None, dropout=None):
	"""Compute scaled dot-product attention.

	query, key, value: tensors whose last two axes are combined by matmul; the
		size of ``query``'s last axis is used as the scaling factor ``d_k``.
	mask: optional tensor broadcastable to the score matrix; positions where
		``mask == 0`` are excluded from attention.
	dropout: optional callable applied to the attention weights.

	Returns ``(output, p_attn)``: the weighted sum of ``value`` and the
	attention weights after softmax (and after dropout, when given).
	"""
	d_k = query.size(-1)
	scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
	if mask is not None:
		# Fixed: masked scores must be a large *negative* number so softmax sends
		# them to ~0. The original filled with 1e-9, which leaves masked positions
		# with an ordinary near-zero score that still receives attention.
		scores = scores.masked_fill(mask == 0, -1e9)
	p_attn = scores.softmax(dim=-1)
	# Fixed: the original tested the undefined name ``droput`` (NameError).
	if dropout is not None:
		p_attn = dropout(p_attn)
	return torch.matmul(p_attn, value), p_attn


class MHA(nn.Module):
	"""Multi-head attention built on top of ``attention``.

	``d_model`` is split evenly into ``h`` heads of width ``d_k = d_model // h``.
	"""

	def __init__(self, h, d_model, dropout):
		"""
		h: number of heads; must divide ``d_model``.
		d_model: feature width of the inputs and of the output.
		dropout: drop probability applied to the attention weights.
		"""
		super(MHA, self).__init__()
		assert d_model % h == 0
		self.d_k = d_model // h
		self.h = h
		# Why four clones? There are four linear maps: the first three project
		# query, key and value; the last one projects the merged heads.
		self.linears = clones(nn.Linear(d_model, d_model), 4)
		# Attention weights from the most recent forward pass.
		self.attn = None
		self.dropout = nn.Dropout(p=dropout)

	def forward(self, query, key, value, mask=None):
		"""Project the inputs, attend per head, merge the heads, project the result.

		The views below fix the input layout as (batch, seq_len, d_model); the
		returned tensor has the same layout.
		"""
		if mask is not None:
			# Insert a head axis so the same mask is broadcast over all heads.
			mask = mask.unsqueeze(1)
		batch_num = query.size(0)

		# (batch, seq, d_model) -> (batch, h, seq, d_k); zip stops after the first
		# three linears, leaving the fourth for the output projection.
		query, key, value = [
			linear(x).view(batch_num, -1, self.h, self.d_k).transpose(1, 2)
			for linear, x in zip(self.linears, (query, key, value))
		]

		x, self.attn = attention(
			query, key, value, mask=mask, dropout=self.dropout
		)

		# Fixed: merge heads back to (batch, seq, h * d_k). The original view kept
		# separate (h, d_k) axes, so the output Linear(d_model, d_model) received a
		# last dimension of d_k and failed whenever h > 1.
		x = (
			x.transpose(1, 2)
			.contiguous()
			.view(batch_num, -1, self.h * self.d_k)
		)
		del query
		del key
		del value

		return self.linears[-1](x)