In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):
    """Causal language model built on ``nn.TransformerEncoder``.

    Pipeline: token embedding (scaled by ``sqrt(ninp)``) -> positional
    encoding -> stack of Transformer encoder layers with a causal mask ->
    linear projection to the vocabulary -> ``log_softmax``.

    Args:
        ntoken: Vocabulary size (embedding rows and output classes).
        nhead: Number of attention heads in each encoder layer.
        ninp: Embedding / model dimension.
        nlayers: Number of stacked ``nn.TransformerEncoderLayer`` modules.
        dropout: Dropout probability shared by the positional encoding and
            the encoder layers.
        nhid: Width of the feed-forward sublayer in each encoder layer.
            Keyword-only so that existing positional call sites keep their
            meaning; defaults to ``ninp`` when omitted.
    """

    def __init__(self, ntoken, nhead, ninp, nlayers, dropout=0.5, *, nhid=None):
        super().__init__()
        # Previously `nhid` was silently read from a notebook global that is
        # only assigned in a later cell; make the dependency explicit.
        if nhid is None:
            nhid = ninp
        self.model_type = 'Transformer'
        self.src_mask = None  # causal mask, built lazily in forward()
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` additive attention mask.

        Entries on and below the diagonal are ``0.0``; entries above the
        diagonal are ``-inf`` so that position ``i`` cannot attend to any
        position ``j > i``.
        """
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Initialise embedding/projection weights uniformly in [-0.1, 0.1) and zero the projection bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src):
        """Compute per-position log-probabilities over the vocabulary.

        Args:
            src: Integer token ids whose first dimension is the sequence
                axis (the causal mask is sized from ``src.size(0)``).

        Returns:
            Tensor of log-probabilities; the last dimension has size
            ``ntoken``.
        """
        seq_len = src.size(0)
        # Rebuild the cached mask only when the sequence length changes.
        if self.src_mask is None or self.src_mask.size(0) != seq_len:
            # `device` is an attribute of a tensor, not a method.
            self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [2]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed: even
    feature columns hold ``sin(pos * w_k)`` and odd feature columns hold
    ``cos(pos * w_k)``, with ``w_k = exp(-2k * ln(10000) / d_model)``.

    Args:
        d_model: Feature dimension of the inputs.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Largest sequence length the table supports.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # Odd columns take the cosine terms. For odd d_model there is one
        # fewer odd column than even column, so trim div_term to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dim 1 of the input.
        pe = pe.unsqueeze(1)
        # Buffer: moves with .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Args:
            x: Tensor whose first dimension is the sequence axis and whose
                last dimension is ``d_model``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [3]:
import torchtext
# get_tokenizer lives in torchtext.data.utils; there is no `torch.text` package.
from torchtext.data.utils import get_tokenizer

# NOTE(review): `torchtext.data.Field` and `WikiText2.splits` belong to the
# legacy torchtext API; newer torchtext releases appear to have moved or
# removed them -- confirm the installed torchtext version supports them.
# The field lower-cases text, tokenizes with the "basic_english" tokenizer and
# wraps each example in <sos> / <eos> markers.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
# The vocabulary is built from the training split only.
TEXT.build_vocab(train_txt)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data, bsz):
    """Turn a text dataset into ``bsz`` parallel columns of token ids.

    The first example's token list is converted to ids with the module-level
    ``TEXT`` field, trailing ids that do not fill a whole row of ``bsz`` are
    dropped, and the remainder is laid out so that each column is one
    contiguous stretch of the corpus.

    Args:
        data: Dataset exposing ``examples[0].text``.
        bsz: Number of columns (parallel streams) to produce.

    Returns:
        Tensor with ``bsz`` columns, moved to the module-level ``device``.
    """
    # presumably numericalize returns a tensor with the token axis first -- verify
    # against the installed torchtext version.
    token_ids = TEXT.numericalize([data.examples[0].text])
    nbatch = token_ids.size(0) // bsz
    # Trim off any trailing elements that would not fit evenly into bsz columns.
    token_ids = token_ids.narrow(0, 0, nbatch * bsz)
    # (bsz, nbatch) -> (nbatch, bsz): rows are time steps, columns are streams.
    token_ids = token_ids.view(bsz, -1).t().contiguous()
    return token_ids.to(device)

# Column counts handed to batchify: one for training, one shared by the
# validation and test splits.
batch_size = 20
eval_batch_size = 10

train_data = batchify(train_txt, batch_size)
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (val_txt, test_txt)
)


ModuleNotFoundError: No module named 'torchtext'

In [4]:
bptt = 35  # maximum number of time steps per chunk


def get_batch(source, i):
    """Slice one input/target pair for next-token prediction.

    Args:
        source: Tensor indexed along dim 0 by time step.
        i: Index of the first time step of the chunk.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted forward by one step and
        flattened to 1-D. ``seq_len`` is ``bptt`` except near the end of
        ``source``, where it shrinks so the shifted window stays in range.
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    # reshape (rather than view) also accepts a non-contiguous `source`.
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, target

In [None]:
ntokens = len(TEXT.vocab.stoi)  # size of the vocabulary
emsize = 200   # embedding dimension
nhid = 200     # width of the feed-forward sublayer in nn.TransformerEncoderLayer
nlayers = 2    # number of nn.TransformerEncoderLayer modules stacked in nn.TransformerEncoder
nhead = 2      # number of heads in multi-head attention