In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

In [3]:
class EncoderDecoder(nn.Module):
    """Generic encoder-decoder wrapper.

    Holds an encoder, a decoder, the source/target embedding modules and a
    generator. The generator is only stored here; it is not applied by
    ``forward``.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the resulting memory.

        Routed through ``encode``/``decode`` so that both embedding modules
        are applied and the decoder receives its arguments in the order
        ``(embedded_tgt, memory, src_mask, tgt_mask)``.
        """
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run the encoder on it."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder against ``memory``."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [4]:
class Generator(nn.Module):
    """Linear projection to vocabulary size followed by log-softmax."""

    def __init__(self, d_model, vocab):
        super().__init__()
        # Maps a d_model-sized feature vector to one logit per vocabulary entry.
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``proj(x)``."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

In [5]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``.

    Each copy has its own parameters, so the resulting layers do not share
    weights.
    """
    # The bracket must enclose the whole comprehension: ModuleList expects an
    # iterable of modules, not an iterable of one-element lists.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        features: size of the last dimension of the inputs.
        eps: constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # learnable scale
        self.b_2 = nn.Parameter(torch.zeros(features))  # learnable shift
        # Must be stored: forward() reads self.eps.
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class Encoder(nn.Module):
    """Stack of ``N`` cloned layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # The norm width is taken from the prototype layer's ``size`` attribute.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order (each receives ``mask``), then normalise."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.norm(hidden)
    

In [6]:
class SublayerConnection(nn.Module):
    """Residual connection around a sub-layer, with layer norm applied first.

    Computes ``x + dropout(sublayer(norm(x)))``.

    Args:
        size: feature size passed to ``LayerNorm``.
        dropout: dropout probability applied to the sub-layer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` (a callable taking one tensor) inside the residual branch."""
        return x + self.dropout(sublayer(self.norm(x)))


# Backward-compatible alias: the class was originally published under this
# misspelled name, and other cells still refer to it.
SuberlayerConnection = SublayerConnection
    
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Each of the two steps is wrapped in its own residual connection.

    Args:
        size: feature size, also exposed as ``self.size``.
        self_attn: attention module called as ``self_attn(q, k, v, mask)``.
        feed_forward: module called with a single tensor.
        dropout: dropout probability for the residual connections.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SuberlayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply self-attention (query = key = value = x), then the feed-forward block."""
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)


In [7]:
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        # The norm width is taken from the prototype layer's ``size`` attribute.
        self.norm = LayerNorm(layer.size)

    # Must be spelled ``forward``: nn.Module.__call__ dispatches to that name.
    def forward(self, x, memory, src_mask, tgt_mask):
        """Run ``x`` through every layer, passing ``memory`` and both masks, then normalise."""
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    Each of the three steps is wrapped in its own residual connection.

    Args:
        size: feature size, also exposed as ``self.size``.
        self_attn: attention module applied to the decoder input itself.
        src_attn: attention module applied with ``memory`` as key and value.
        feed_forward: module called with a single tensor.
        dropout: dropout probability for the residual connections.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # Uses the residual-connection class under the name this file defines.
        self.sublayer = clones(SuberlayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-layers in order and return the result."""
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)
    
        

In [8]:
def subsequent_mask(size):
    """Build a ``(1, size, size)`` boolean mask that hides later positions.

    Entry ``[0, i, j]`` is True when ``j <= i`` and False otherwise.
    """
    # Ones strictly above the diagonal mark the positions to hide.
    future = torch.from_numpy(
        np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    )
    return future.eq(0)

def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors; the size of ``query``'s last dimension is
            used as the scaling factor ``d_k``.
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are excluded from the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        A tuple ``(output, p_attn)`` of the weighted values and the attention
        weights.
    """
    # The second parameter was previously misspelled ``kay`` while the body
    # read ``key``, so every call raised NameError.
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    # The softmax must run whether or not a mask was supplied.
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class MultiHeadAttention(nn.Module):
    """Multi-head attention with ``h`` heads over ``d_model`` features.

    Args:
        h: number of heads; must divide ``d_model``.
        d_model: feature size of the inputs and of the output.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus one output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None  # attention weights from the most recent forward pass
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge the heads and project the result."""
        if mask is not None:
            # Extra dimension so the same mask broadcasts over all heads.
            mask = mask.unsqueeze(1)
        # Needed on every path, not only when a mask is supplied.
        nbatches = query.size(0)

        # Project, then split the feature dimension into (h, d_k) and move the
        # head dimension in front of the sequence dimension.
        query, key, value = [
            proj(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for proj, x in zip(self.linears, (query, key, value))
        ]

        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

        # ``contiguous`` is a method: it must be called before ``view``.
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
        

In [9]:
class PositionwiseFeedForward(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

class Embeddings(nn.Module):
    """Token embedding lookup scaled by ``sqrt(d_model)``.

    Args:
        d_model: embedding dimension.
        vocab: number of entries in the lookup table.
    """

    def __init__(self, d_model, vocab):
        # The class is named ``Embeddings``; naming any other class in super()
        # raises NameError at construction time.
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for the indices in ``x`` and scale them."""
        return self.lut(x) * math.sqrt(self.d_model)

In [10]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Even feature indices hold ``sin(pos / 10000^(2i/d_model))`` and odd
    indices hold the matching cosine.

    Args:
        d_model: feature size of the inputs.
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions precomputed; inputs longer than this
            along dimension 1 cannot be encoded.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        # Explicit float dtype: integer aranges rely on implicit type promotion.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Wavelength base is 10000, as in the reference formulation.
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        # A buffer is saved and moved with the module but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions and apply dropout."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

    

In [11]:
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Assemble a full encoder-decoder model from hyperparameters.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size.
        N: number of encoder layers and of decoder layers.
        d_model: model feature size.
        d_ff: hidden size of the feed-forward blocks.
        h: number of attention heads.
        dropout: dropout probability. The default is 0.1; a probability of 1
            would zero every activation during training.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        are Xavier-uniform initialised.
    """
    c = copy.deepcopy
    # NOTE(review): the attention modules keep their own default dropout; the
    # ``dropout`` argument is not forwarded to them — confirm this is intended.
    attn = MultiHeadAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    for p in model.parameters():
        if p.dim() > 1:
            # In-place initialiser; the un-suffixed name is deprecated.
            nn.init.xavier_uniform_(p)
    return model

In [None]:
def read_data(data_chinese_dir, data_english_dir):
    """Read two UTF-8 text files line by line.

    Returns a dict with keys ``"chinese"`` and ``"english"``, each mapping to
    the list of lines of the corresponding file. Lines are kept exactly as
    read, including any trailing newline.
    """
    corpus = {}
    sources = (("chinese", data_chinese_dir), ("english", data_english_dir))
    for language, path in sources:
        with open(path, 'r', encoding='utf-8') as handle:
            corpus[language] = list(handle)
    return corpus

def build_worddict(data, worddict_dir=None, keys=("premise", "hypothesis")):
    """Build word<->id lookup tables from the sentences in ``data``.

    Ids 0-3 are reserved for ``_PAD_``, ``_OOV_``, ``_BOS_`` and ``_EOS_``;
    every other word gets the next free id in order of first appearance.
    Sentences are stripped and split on single spaces.

    Args:
        data: dict mapping each name in ``keys`` to a list of sentences.
        worddict_dir: path of the file the dictionary is saved to, one
            ``word<TAB>id`` line per word. When omitted, a module-level
            ``worddict_dir`` variable is used if one exists; otherwise the
            dictionary is not saved.
        keys: which entries of ``data`` to read. The default keeps the
            original behaviour; pass ``("chinese", "english")`` for the
            output of ``read_data``.

    Returns:
        A tuple ``(word_id, id_word)`` of dicts.
    """
    words = ["_PAD_", "_OOV_", "_BOS_", "_EOS_"]
    for key in keys:
        for sentence in data[key]:
            words.extend(sentence.strip().split(" "))

    word_id = {}
    id_word = {}
    for word in words:
        if word not in word_id:
            next_id = len(word_id)
            word_id[word] = next_id
            id_word[next_id] = word

    # Fall back to the notebook-level variable the function used to rely on.
    if worddict_dir is None:
        worddict_dir = globals().get("worddict_dir")

    # Save the dictionary.
    if worddict_dir is not None:
        with open(worddict_dir, "w", encoding='utf-8') as f:
            for word, idx in word_id.items():
                f.write("%s\t%d\n" % (word, idx))
    return word_id, id_word

def sentence2idList(sentence, word_id):
    """Convert a space-separated sentence into a list of word ids.

    The result starts with the ``_BOS_`` id and ends with the ``_EOS_`` id;
    words missing from ``word_id`` are replaced by the ``_OOV_`` id.
    """
    tokens = sentence.strip().split(" ")
    return (
        [word_id["_BOS_"]]
        + [word_id[tok] if tok in word_id else word_id["_OOV_"] for tok in tokens]
        + [word_id["_EOS_"]]
    )

def data2id(data, word_id):
    """Convert premises, hypotheses and labels in ``data`` to integer ids.

    Examples whose label is not one of ``entailment``, ``neutral`` or
    ``contradiction`` are dropped. Returns a dict with the parallel lists
    ``premise_id``, ``hypothesis_id`` and ``labels_id``.
    """
    labels_map = {"entailment": 0, "neutral": 1, "contradiction": 2}
    encoded = {"premise_id": [], "hypothesis_id": [], "labels_id": []}

    for idx, label in enumerate(data["labels"]):
        if label in labels_map:
            encoded["premise_id"].append(sentence2idList(data["premise"][idx], word_id))
            encoded["hypothesis_id"].append(sentence2idList(data["hypothesis"][idx], word_id))
            encoded["labels_id"].append(labels_map[label])
        # Otherwise: skip examples that have no usable label.

    return encoded

def build_embeddings(embedding_file, word_id):
    """Load pretrained vectors for the words in ``word_id`` into a matrix.

    Args:
        embedding_file: UTF-8 text file with one ``word v1 v2 ...`` entry per
            line, separated by whitespace.
        word_id: dict of the vocabulary words.

    Returns:
        A tuple ``(embeddings_map, embedding_matrix)``. ``embeddings_map``
        maps each vocabulary word found in the file to its vector as a list
        of strings. ``embedding_matrix`` has one row per vocabulary word:
        the pretrained vector when available, zeros for ``_PAD_``, and a
        random normal vector for any other missing word.

    Raises:
        ValueError: if no vocabulary word appears in ``embedding_file``, so
            the embedding dimension cannot be determined.
    """
    # Read the file and keep only the vectors of vocabulary words.
    embeddings_map = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:  # tolerate blank lines
                continue
            word = fields[0]
            if word in word_id:
                embeddings_map[word] = fields[1:]

    if not embeddings_map:
        raise ValueError(
            "no vocabulary word was found in %s; cannot infer the embedding dimension"
            % embedding_file)

    # Fill the matrix. The dimension comes from any loaded vector instead of
    # assuming that one particular word is present.
    words_num = len(word_id)
    embedding_dim = len(next(iter(embeddings_map.values())))
    embedding_matrix = np.zeros((words_num, embedding_dim))
    missed_cnt = 0
    # NOTE(review): rows follow the iteration order of word_id, which is
    # assumed to match the ids stored in it — confirm against the caller.
    for i, word in enumerate(word_id):
        if word in embeddings_map:
            embedding_matrix[i] = np.asarray(embeddings_map[word], dtype=float)
        elif word != "_PAD_":
            missed_cnt += 1
            embedding_matrix[i] = np.random.normal(size=embedding_dim)
    print("missed word count: %d" % (missed_cnt))
    return embeddings_map, embedding_matrix

In [None]:
# Load the two corpus files and preview the first ten lines of each.
# NOTE(review): data_chinese_dir and data_english_dir are not defined in the
# cells visible here — confirm they are set before this cell runs.
d=read_data(data_chinese_dir,data_english_dir)
print(d["chinese"][:10])
print(d["english"][:10])