https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 12

# Seed Python's `random`, NumPy, and PyTorch (CPU generator and the current
# CUDA device) with the same value so repeated runs start from the same state.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN algorithms for run-to-run reproducibility.
torch.backends.cudnn.deterministic = True

In [4]:
# Load the German and English spaCy pipelines by package name. Only their
# `.tokenizer` attribute is used below (see tokenize_de / tokenize_en).
# NOTE(review): both model packages must already be installed for
# `spacy.load` to succeed.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load("en_core_web_sm")

In [5]:
def tokenize_de(text):
    """Return the token strings the German spaCy tokenizer produces for *text*."""
    german_tokens = spacy_de.tokenizer(text)
    return [token.text for token in german_tokens]

def tokenize_en(text):
    """Return the token strings the English spaCy tokenizer produces for *text*."""
    english_tokens = spacy_en.tokenizer(text)
    return [token.text for token in english_tokens]

In [6]:
# Source (German) and target (English) field definitions.
#
# - init_token / eos_token: distinct start- and end-of-sequence markers added
#   around every example. They must be non-empty and different from each
#   other; with both set to '' the two markers collapse into one empty-string
#   vocabulary entry, so the start of a sequence cannot be told apart from
#   its end.
# - lower=True: text is lowercased.
# - batch_first=True: batches are shaped [batch size, seq len], which is what
#   Encoder.forward relies on when it reads src.shape[0] as the batch size.
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)

In [9]:
# Load the Multi30k train / validation / test splits. The two tuples are
# given in the same order: '.de' files go with SRC, '.en' files with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

In [10]:
# Build both vocabularies from the training split only, using the same
# minimum-frequency cutoff (min_freq=2) for source and target.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)

In [12]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device('cpu')

In [13]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# One iterator per split, in the same order as the datasets passed in; every
# batch is placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [15]:
class Encoder(nn.Module):
    """Embed source tokens with learned positions and pass them through a stack of encoder layers.

    Args:
        input_dim: Number of rows in the token embedding table.
        hid_dim: Width of the token and positional embeddings.
        n_layers: Number of ``EncoderLayer`` modules to stack.
        n_heads: Forwarded unchanged to every ``EncoderLayer``.
        pf_dim: Forwarded unchanged to every ``EncoderLayer``.
        dropout: Probability of the dropout applied to the summed embeddings;
            also forwarded to every ``EncoderLayer``.
        device: Device on which the scale constant and the position indices
            are created.
        max_length: Number of rows in the positional embedding table, i.e. the
            longest sequence ``forward`` accepts.
    """

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()

        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList(
            [EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)]
        )

        self.dropout = nn.Dropout(dropout)

        # sqrt(hid_dim) multiplier applied to the token embeddings in forward().
        # It is a plain tensor attribute (not a parameter or buffer), so it
        # stays on the `device` given here even if the module is moved later.
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        """Encode a batch of token-index sequences.

        Args:
            src: Integer tensor of token indices, shape [batch size, src len].
            src_mask: Mask handed unchanged to every layer; the original
                annotation gives its shape as [batch size, 1, 1, src len].

        Returns:
            The output of the last layer. With no layers this is the embedded
            input, shape [batch size, src len, hid dim].

        Raises:
            IndexError: If ``src len`` exceeds the ``max_length`` the
                positional embedding table was built with.
        """
        # src = [batch size, src len]
        # src_mask = [batch size, 1, 1, src len]
        batch_size, src_len = src.shape[0], src.shape[1]

        # Fail early with a clear message instead of an opaque out-of-range
        # lookup inside the positional embedding.
        max_length = self.pos_embedding.num_embeddings
        if src_len > max_length:
            raise IndexError(
                f"src length {src_len} exceeds max_length {max_length} of the positional embedding"
            )

        # Position indices 0..src_len-1 for every sequence, created directly
        # on the target device and expanded as a view (no per-batch copy).
        # pos = [batch size, src len]
        pos = torch.arange(0, src_len, device=self.device).unsqueeze(0).expand(batch_size, -1)

        # src = [batch size, src len, hid dim]
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)

        return src
        
        

In [16]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        
        self.self_attn_layer_norm=nn.LayerNorm(hid_dim)
        self.ff_layer_norm=nn.LayerNorm(hid)