# Preprocessing


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch import Tensor

import torchtext
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [2]:

### (PARAMS)
# Single seed shared by every random number generator seeded below.
SEED = 1234

# Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) generators.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

# spaCy pipelines whose tokenizers back tokenize_de / tokenize_en.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = torch.device('cpu')


In [3]:
def tokenize_de(text):
    """
    Split a German string into a list of token strings.

    Only the tokenizer of the module-level ``spacy_de`` pipeline is run.
    """
    german_doc = spacy_de.tokenizer(text)
    return [token.text for token in german_doc]

def tokenize_en(text):
    """
    Split an English string into a list of token strings.

    Only the tokenizer of the module-level ``spacy_en`` pipeline is run.
    """
    english_doc = spacy_en.tokenizer(text)
    return [token.text for token in english_doc]

In [4]:
# The source (German) and target (English) fields differ only in their
# tokenizer: both wrap sentences in <sos>/<eos>, lowercase the text and
# produce sequence-first tensors (batch_first = False).
_shared_field_options = dict(init_token = '<sos>',
                             eos_token = '<eos>',
                             lower = True,
                             batch_first = False)

SRC = Field(tokenize = tokenize_de, **_shared_field_options)

TRG = Field(tokenize = tokenize_en, **_shared_field_options)

In [5]:
# Load the Multi30k train/validation/test splits. `exts` selects the German
# (.de) files as source and the English (.en) files as target, and `fields`
# pairs them with SRC and TRG in that order.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

In [6]:
### (TEST)
# Print the tokenised source/target pair at `test_index` from every split.
test_index = 3
print(type(train_data.examples))
for split_name, split in (("train_data", train_data),
                          ("valid_data", valid_data),
                          ("test_data", test_data)):
    example_fields = vars(split.examples[test_index])
    print(split_name)
    print(example_fields['src'])
    print(example_fields['trg'])
    print("")


<class 'list'>
train_data
['ein', 'mann', 'in', 'einem', 'blauen', 'hemd', 'steht', 'auf', 'einer', 'leiter', 'und', 'putzt', 'ein', 'fenster', '.']
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.']

valid_data
['zwei', 'männer', 'bauen', 'eine', 'blaue', 'eisfischerhütte', 'auf', 'einem', 'zugefrorenen', 'see', 'auf']
['two', 'men', 'setting', 'up', 'a', 'blue', 'ice', 'fishing', 'hut', 'on', 'an', 'iced', 'over', 'lake']

test_data
['fünf', 'leute', 'in', 'winterjacken', 'und', 'mit', 'helmen', 'stehen', 'im', 'schnee', 'mit', 'schneemobilen', 'im', 'hintergrund', '.']
['five', 'people', 'wearing', 'winter', 'jackets', 'and', 'helmets', 'stand', 'in', 'the', 'snow', ',', 'with', 'snowmobiles', 'in', 'the', 'background', '.']



In [7]:
### (lzj)
# Build both vocabularies from the training split only. The min_freq = 2
# variant is kept commented out below; build_vocab is called without it.
# SRC.build_vocab(train_data, min_freq = 2)
# TRG.build_vocab(train_data, min_freq = 2)
SRC.build_vocab(train_data)
TRG.build_vocab(train_data)

In [8]:
### (TEST)
# Vocabulary sizes: source (German) first, then target (English).
print(len(SRC.vocab))
print(len(TRG.vocab))

18668
9799


In [9]:
### (lzj)
### (PARAMS)
# Number of sentence pairs per batch.
BATCH_SIZE = 32

# One BucketIterator per split; batches are created directly on DEVICE.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = DEVICE)

# batch.src : [S,N] / batch.trg : [T,N]  (sequence-first, because both fields
# are built with batch_first = False; N is the batch size)


In [10]:

### (TEST) generate correct form of data
# Peek at the first two training batches to check the tensor layout:
# dim 0 is the position in the sequence, dim 1 is the batch.
for i,batch in enumerate(train_iterator):
    if i < 2 :
        print(f'i = {i}')
        src = batch.src
        trg = batch.trg
        print(f'src.shape: {src.shape}')
        print(f'trg.shape: {trg.shape}')
        print("")
        
        # First and last three columns, i.e. three sentences of the batch each.
        print(f'src[:,:3]: {src[:,:3]}')
        print(f'src[:,-3:]: {src[:,-3:]}')  
        print("")
        
        print(f'trg[:,:3]: {trg[:,:3]}')
        print(f'trg[:,-3:]: {trg[:,-3:]}')        
        
#         src = torch.unsqueeze(src, dim = 0)
#         trg = torch.unsqueeze(trg, dim = 0)
#         print(f'src.shape: {src.shape}')
#         print(f'trg.shape: {trg.shape}')
#         print(f'src: {src[:,:3,:]}')    
#         print(f'trg: {trg[:,:3,:]}')        
        
        print("")  
    else:
        # Two batches are enough for the sanity check.
        break
    

i = 0
src.shape: torch.Size([22, 32])
trg.shape: torch.Size([28, 32])

src[:,:3]: tensor([[    2,     2,     2],
        [   18,     5,    18],
        [ 4706,    13,    54],
        [   57,    11,    52],
        [    5,     6,   100],
        [  318,   713,     6],
        [    4,    95, 18255],
        [    3,    23,    27],
        [    1,    17,    14],
        [    1, 15182,   189],
        [    1,     7,     4],
        [    1,     6,     3],
        [    1, 16887,     1],
        [    1,    21,     1],
        [    1,     4,     1],
        [    1,     3,     1],
        [    1,     1,     1],
        [    1,     1,     1],
        [    1,     1,     1],
        [    1,     1,     1],
        [    1,     1,     1],
        [    1,     1,     1]], device='cuda:0')
src[:,-3:]: tensor([[    2,     2,     2],
        [    5,    18,     8],
        [   66,     7,    36],
        [   25, 12909,    73],
        [   60,   196,    60],
        [   21,    26,    21],
        [    6,    3

In [11]:
# Special-token strings and their vocabulary indices for both languages.
# Naming note: *_PAD_INIT is the start-of-sentence token and *_PAD_EOS is the
# end-of-sentence token; only *_PAD is the actual padding token.
SRC_PAD = SRC.pad_token
SRC_PAD_INIT = SRC.init_token
SRC_PAD_EOS = SRC.eos_token

SRC_PAD_IDX = SRC.vocab.stoi[SRC_PAD]
SRC_PAD_INIT_IDX = SRC.vocab.stoi[SRC_PAD_INIT]
SRC_PAD_EOS_IDX = SRC.vocab.stoi[SRC_PAD_EOS]

TRG_PAD = TRG.pad_token
TRG_PAD_INIT = TRG.init_token
TRG_PAD_EOS = TRG.eos_token

# Target indices must be looked up in the *target* vocabulary. The previous
# code used SRC.vocab here, which only matched because both vocabularies
# place their special tokens at the same positions.
TRG_PAD_IDX = TRG.vocab.stoi[TRG_PAD]
TRG_PAD_INIT_IDX = TRG.vocab.stoi[TRG_PAD_INIT]
TRG_PAD_EOS_IDX = TRG.vocab.stoi[TRG_PAD_EOS]

In [12]:
# ### (TEST)
# print(f'SRC_PAD: {SRC_PAD}')
# print(f'SRC_PAD_INIT: {SRC_PAD_INIT}')
# print(f'SRC_PAD_EOS: {SRC_PAD_EOS}')
# print("")

# print(f'SRC_PAD_IDX: {SRC_PAD_IDX}')
# print(f'SRC_PAD_INIT_IDX: {SRC_PAD_INIT_IDX}')
# print(f'SRC_PAD_EOS_IDX: {SRC_PAD_EOS_IDX}')
# print("")

# print(f'TRG_PAD: {TRG_PAD}')
# print(f'TRG_PAD_INIT: {TRG_PAD_INIT}')
# print(f'TRG_PAD_EOS: {TRG_PAD_EOS}')
# print("")

# print(f'TRG_PAD_IDX: {TRG_PAD_IDX}')
# print(f'TRG_PAD_INIT_IDX: {TRG_PAD_INIT_IDX}')
# print(f'TRG_PAD_EOS_IDX: {TRG_PAD_EOS_IDX}')
# print("")

# Creating model


## TransformerModel Class 

In [13]:
from torch.nn import Transformer,TransformerEncoder,TransformerDecoder,\
TransformerEncoderLayer,TransformerDecoderLayer


In [14]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Token ids are embedded, combined with positional encodings, passed through
    a ``TransformerEncoder`` / ``TransformerDecoder`` stack and projected onto
    the target vocabulary. All tensors are sequence-first: ``src`` is [S,N]
    and ``trg`` is [T,N].

    Args:
        input_dim: size of the source vocabulary.
        output_dim: size of the target vocabulary.
        n_head: number of attention heads.
        hid_dim: embedding / model width (E).
        n_encoder_layer: number of encoder layers.
        n_decoder_layer: number of decoder layers.
        ff_dim: width of the position-wise feed-forward sublayers.
        dropout: dropout probability, used by the positional encoding and by
            every encoder/decoder layer.
        device: device on which masks (and the embedding / output layers) live.
        max_len: accepted for interface compatibility; currently unused.
        src_pad_idx: padding index of the source vocabulary. When ``None`` the
            module-level ``SRC_PAD_IDX`` is used at mask-creation time.
        trg_pad_idx: padding index of the target vocabulary. When ``None`` the
            module-level ``TRG_PAD_IDX`` is used at mask-creation time.
    """
    def __init__(self,
                input_dim,
                output_dim,
                n_head,
                hid_dim,
                n_encoder_layer,
                n_decoder_layer,
                ff_dim,
                dropout,
                device,
                max_len = 200,
                src_pad_idx = None,
                trg_pad_idx = None):
        super(TransformerModel,self).__init__()
        self.device = device
        self.hid_dim = hid_dim
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        # The configured dropout is forwarded explicitly; previously the layers
        # silently used the library default regardless of `dropout`.
        encoder_layer = TransformerEncoderLayer(d_model=hid_dim, nhead=n_head,
                                                dim_feedforward=ff_dim,
                                                dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, n_encoder_layer)

        decoder_layer = TransformerDecoderLayer(d_model=hid_dim, nhead=n_head,
                                                dim_feedforward=ff_dim,
                                                dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, n_decoder_layer)

        # These submodules are moved to `device` right away (as before), so any
        # later re-initialisation of their weights happens on that device.
        self.src_tok_emb = TokenEmbedding(input_dim,hid_dim).to(device)
        self.trg_tok_emb = TokenEmbedding(output_dim,hid_dim).to(device)
        self.pos_emb = PositionalEncoding(hid_dim,dropout=dropout).to(device)

        self.out_linear = nn.Linear(hid_dim,output_dim).to(device)

    def forward(self,src,trg,src_attn_mask,trg_attn_mask,
                src_pad_mask,trg_pad_mask,memory_pad_mask,
                enable_test = False):
        """Return unnormalised target-vocabulary scores of shape [T,N,out_dim].

        Args:
            src: source token ids, [S,N].
            trg: decoder input token ids, [T,N].
            src_attn_mask: [S,S] attention mask for the encoder.
            trg_attn_mask: [T,T] causal attention mask for the decoder.
            src_pad_mask: [N,S] mask, True where the source is padding.
            trg_pad_mask: [N,T] mask, True where the target is padding.
            memory_pad_mask: [N,S] padding mask applied to the encoder output.
            enable_test: unused; kept so existing callers keep working.
        """
        # [S,N,E],[T,N,E]
        src_emb = self.pos_emb(self.src_tok_emb(src))
        trg_emb = self.pos_emb(self.trg_tok_emb(trg))

        memory = self.transformer_encoder(src_emb, mask=src_attn_mask,
                                          src_key_padding_mask=src_pad_mask)

        # [T,N,E]
        output = self.transformer_decoder(trg_emb, memory,
                                          tgt_mask=trg_attn_mask,
                                          memory_mask=None,
                                          tgt_key_padding_mask=trg_pad_mask,
                                          memory_key_padding_mask=memory_pad_mask)
        # [T,N,out_dim]
        output = self.out_linear(output).to(self.device)

        return output

    def create_trg_attn_mask(self,sz):
        """Return a [sz,sz] float causal mask on ``self.device``.

        Entries on and below the diagonal are 0.0; entries above it are -inf,
        so position i cannot attend to positions after i.
        """
        return torch.triu(torch.full((sz, sz), float('-inf'), device=self.device),
                          diagonal=1)

    def create_mask(self,src:Tensor, trg:Tensor):
        """Build the attention and padding masks for one batch.

        Args:
            src: source token ids, [S,N].
            trg: decoder input token ids, [T,N].

        Returns:
            ``(src_attn_mask, src_pad_mask, trg_attn_mask, trg_pad_mask)`` with
            shapes [S,S] (bool, all False), [N,S] (bool), [T,T] (float causal
            mask) and [N,T] (bool).
        """
        S = src.shape[0]
        T = trg.shape[0]

        # Fall back to the notebook-level constants when the indices were not
        # supplied to the constructor.
        src_pad_idx = SRC_PAD_IDX if self.src_pad_idx is None else self.src_pad_idx
        trg_pad_idx = TRG_PAD_IDX if self.trg_pad_idx is None else self.trg_pad_idx

        # [S,S],[T,T]
        src_attn_mask = torch.zeros((S,S), device = self.device).type(torch.bool)
        trg_attn_mask = self.create_trg_attn_mask(T).to(self.device)

        # [N,S],[N,T]
        src_pad_mask = (src == src_pad_idx).transpose(0,1).to(self.device)
        trg_pad_mask = (trg == trg_pad_idx).transpose(0,1).to(self.device)

        return src_attn_mask,src_pad_mask,trg_attn_mask,trg_pad_mask

    def encode(self,src,src_attn_mask):
        """Encode source ids ``src`` ([S,N]) and return the encoder output."""
        return self.transformer_encoder(self.pos_emb(self.src_tok_emb(src)),src_attn_mask)

    def decode(self,trg,memory,trg_attn_mask):
        """Decode target ids ``trg`` ([T,N]) against ``memory``; returns [T,N,E]."""
        return self.transformer_decoder(self.pos_emb(self.trg_tok_emb(trg)),memory,trg_attn_mask)
    

In [15]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings.

    A table of shape [maxlen, 1, emb_size] is precomputed (sine in the even
    feature columns, cosine in the odd ones) and stored as the non-trainable
    buffer ``pos_embedding``. Dropout is applied after the addition.
    """
    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) column pair.
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Singleton middle axis so the table broadcasts over the batch.
        self.register_buffer('pos_embedding', table.unsqueeze(1))

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)`` for the first
        ``token_embedding.size(0)`` positions."""
        seq_len = token_embedding.size(0)
        encoded = token_embedding + self.pos_embedding[:seq_len]
        return self.dropout(encoded)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""
    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Cast ``tokens`` to int64, look them up and scale the vectors."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

## Creating TransformerModel

In [16]:

### (PARAMS)
# Vocabulary sizes fix the width of the embedding tables and output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

HEAD = 8          # attention heads
HID_DIM = 256     # model / embedding width
ENC_LAYER = 3     # encoder layers
DEC_LAYER = 3     # decoder layers
FF_DIM = 512      # feed-forward width inside each layer
DROPOUT = 0.1

model = TransformerModel(
                input_dim = INPUT_DIM,
                output_dim = OUTPUT_DIM,
                n_head = HEAD,
                hid_dim = HID_DIM,
                n_encoder_layer = ENC_LAYER,
                n_decoder_layer = DEC_LAYER,
                ff_dim = FF_DIM,
                dropout = DROPOUT,
                device = DEVICE,
                max_len = 200)

# Re-initialise every parameter with more than one dimension (weight
# matrices and embedding tables) with Xavier-uniform values.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
        
model = model.to(DEVICE)

In [17]:
# Source and target vocabulary sizes used to build the model.
print(INPUT_DIM)
print(OUTPUT_DIM)

18668
9799


## Creating optimizer

In [18]:
class NoamOpt:
    """Optimizer wrapper that applies a warm-up / inverse-square-root schedule.

    The learning rate at step ``s`` is
    ``factor * model_size**-0.5 * min(s**-0.5, s * warmup**-1.5)``:
    it grows linearly for ``warmup`` steps and then decays as ``s**-0.5``.

    Args:
        model_size: model width used to scale the rate.
        factor: constant multiplier applied to the schedule.
        warmup: number of warm-up steps.
        optimizer: wrapped optimizer; must expose ``param_groups``, ``step()``
            and ``zero_grad()``.
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Advance the step counter, set the new rate on every param group, then step."
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step = None):
        """Return the learning rate for ``step`` (defaults to the current step).

        Steps <= 0 yield 0.0: the warm-up ramp starts from zero, and evaluating
        ``step ** -0.5`` at step 0 would otherwise raise ZeroDivisionError
        (e.g. when ``rate()`` is queried before the first ``step()``).
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def zero_grad(self):
        "Clear the gradients held by the wrapped optimizer."
        return self.optimizer.zero_grad()
    
def get_std_opt(model):
    """Wrap a fresh Adam optimizer for ``model`` in a ``NoamOpt`` schedule.

    Uses factor 2 and 4000 warm-up steps, scaled by ``model.hid_dim``.
    """
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(model_size=model.hid_dim, factor=2, warmup=4000, optimizer=adam)



In [19]:
### (PARAMS)
# Plain Adam with a fixed learning rate is the active optimizer; the
# NoamOpt warm-up schedule below is kept as a commented-out alternative.
LEARNING_RATE = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
# optimizer = NoamOpt(model.hid_dim, 1, 400,
#         torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))



## Creating the loss function

In [20]:
# Cross-entropy over the target vocabulary; positions whose label is the
# target padding index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX).to(DEVICE)
# criterion = nn.CrossEntropyLoss()

# Training & Testing


## Train fcn

In [21]:

### (PARAMS)
# When True, train_epoch prints a progress line every 200 batches.
enable_test = False

def train_epoch(model, iterator, optimizer, criterion, max_norm):
    """Train ``model`` for one pass over ``iterator``.

    Each batch is run with teacher forcing: the decoder receives the target
    without its last position and is scored against the target without its
    first position. Gradients are clipped to ``max_norm`` before the update.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    losses = 0.0

    for batch_no, batch in enumerate(iterator):
        # [S,N],[T,N]
        src = batch.src.to(DEVICE)
        trg = batch.trg.to(DEVICE)

        # Decoder input drops the final position; labels drop the first one.
        decoder_in = trg[:-1,:]
        labels = trg[1:,:].contiguous().view(-1)

        masks = model.create_mask(src, decoder_in)
        src_attn_mask, src_pad_mask, trg_attn_mask, trg_pad_mask = masks

        # The source padding mask doubles as the memory padding mask.
        logits = model(src, decoder_in, src_attn_mask, trg_attn_mask,
                       src_pad_mask, trg_pad_mask, src_pad_mask)
        vocab_size = logits.shape[-1]

        optimizer.zero_grad()

        loss = criterion(logits.contiguous().view(-1, vocab_size), labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        losses += loss.item()

        ### (TEST)
        if enable_test and batch_no % 200 == 0:
            print(f'in train_epoch(): loss={loss.item()} \t losses={losses} \t len(iterator)={len(iterator)} \n')

    return losses / len(iterator)
    

## Evaluate fcn

In [22]:

def eval_epoch(model, iterator, criterion):
    """Evaluate ``model`` on every batch of ``iterator`` without gradients.

    Uses the same teacher-forced input/label split as training.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for batch in iterator:
            src = batch.src.to(DEVICE)
            trg = batch.trg.to(DEVICE)

            # Decoder input drops the final position; labels drop the first one.
            decoder_in = trg[:-1,:]
            labels = trg[1:,:].contiguous().view(-1)

            masks = model.create_mask(src, decoder_in)
            src_attn_mask, src_pad_mask, trg_attn_mask, trg_pad_mask = masks

            logits = model(src, decoder_in, src_attn_mask, trg_attn_mask,
                           src_pad_mask, trg_pad_mask, src_pad_mask)
            flat_logits = logits.contiguous().view(-1, logits.shape[-1])

            total_loss += criterion(flat_logits, labels).item()

    return total_loss / len(iterator)


## Training process

In [23]:
### (PARAMS)
EPOCH = 12    # (TEMP) number of training epochs
MAX_NORM = 1.0    # gradient-norm clipping threshold passed to train_epoch
MODEL_NAME = 'model_simple.pt'    # file the best model's state_dict is saved to

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into
    ``(whole_minutes, remaining_whole_seconds)``."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

# def draw_epoch_loss():
#     pass

def train():
    """Train the module-level ``model`` for ``EPOCH`` epochs.

    After each epoch the train/validation loss and perplexity are printed, and
    the model's state_dict is written to ``MODEL_NAME`` whenever the
    validation loss improves on the best value seen so far.

    Returns:
        ``(train_losses, valid_losses)``: two lists with one mean loss per
        epoch. They were previously collected and then discarded; returning
        them lets callers plot or inspect the learning curves.
    """
    best_valid_loss = float('inf')
    train_losses = []
    valid_losses = []

    for epoch in range(EPOCH):
        start_time = time.time()
        train_loss = \
            train_epoch(model = model, iterator = train_iterator, \
                        optimizer = optimizer, criterion = criterion, \
                        max_norm = MAX_NORM)
        valid_loss = \
            eval_epoch(model = model, iterator = valid_iterator, criterion = criterion)
        end_time = time.time()

        m,s = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {m}m {s}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
        print("")

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        # Checkpoint only on improvement so the file always holds the best model.
        if (valid_loss < best_valid_loss):
            best_valid_loss = valid_loss
            torch.save(model.state_dict(),MODEL_NAME)

    return train_losses, valid_losses
            

In [24]:
### Training process: run the full training loop defined in train().
train()

Epoch: 01 | Time: 0m 45s
	Train Loss: 3.901 | Train PPL:  49.476
	 Val. Loss: 2.922 |  Val. PPL:  18.586

Epoch: 02 | Time: 0m 45s
	Train Loss: 2.614 | Train PPL:  13.654
	 Val. Loss: 2.220 |  Val. PPL:   9.207

Epoch: 03 | Time: 0m 45s
	Train Loss: 1.995 | Train PPL:   7.355
	 Val. Loss: 1.944 |  Val. PPL:   6.983

Epoch: 04 | Time: 0m 46s
	Train Loss: 1.595 | Train PPL:   4.929
	 Val. Loss: 1.838 |  Val. PPL:   6.286

Epoch: 05 | Time: 0m 46s
	Train Loss: 1.317 | Train PPL:   3.732
	 Val. Loss: 1.805 |  Val. PPL:   6.079

Epoch: 06 | Time: 0m 46s
	Train Loss: 1.106 | Train PPL:   3.021
	 Val. Loss: 1.829 |  Val. PPL:   6.227

Epoch: 07 | Time: 0m 47s
	Train Loss: 0.948 | Train PPL:   2.580
	 Val. Loss: 1.871 |  Val. PPL:   6.495

Epoch: 08 | Time: 0m 46s
	Train Loss: 0.824 | Train PPL:   2.280
	 Val. Loss: 1.908 |  Val. PPL:   6.743

Epoch: 09 | Time: 0m 46s
	Train Loss: 0.732 | Train PPL:   2.080
	 Val. Loss: 1.983 |  Val. PPL:   7.264

Epoch: 10 | Time: 0m 45s
	Train Loss: 0.660 | 

## Test

In [25]:
# Restore the best checkpoint written during training. map_location remaps
# the stored tensors onto DEVICE, so a checkpoint saved on a GPU can still be
# loaded on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_NAME, map_location = DEVICE))

# Mean loss and perplexity on the held-out test split.
test_loss = eval_epoch(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 1.840 | Test PPL:   6.298 |


# Use trained model to translate

## Translate fcn

In [31]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, end_symbol=None):
    """Greedily decode one source sentence into target token ids.

    Args:
        model: object providing ``encode``, ``decode``, ``out_linear`` and
            ``create_trg_attn_mask``.
        src: source token ids, shape [S,1].
        src_mask: [S,S] attention mask for the encoder.
        max_len: maximum length of the returned sequence, start symbol included.
        start_symbol: target-vocabulary index that seeds the decoder.
        end_symbol: target-vocabulary index that stops decoding. Defaults to
            the module-level ``TRG_PAD_EOS_IDX`` (the previous code compared
            against the *source* end-of-sentence index).

    Returns:
        Long tensor of shape [L,1] starting with ``start_symbol`` and, if it
        was produced within ``max_len``, ending with ``end_symbol``.
    """
    if end_symbol is None:
        end_symbol = TRG_PAD_EOS_IDX

    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)

        for _ in range(max_len-1):
            # Boolean causal mask: True marks positions that may not be attended.
            trg_mask = (model.create_trg_attn_mask(ys.size(0))
                                        .type(torch.bool)).to(DEVICE)

            out = model.decode(ys, memory, trg_mask)

            # Only the newest position is needed to choose the next token.
            scores = model.out_linear(out[-1, 0, :])
            next_word = int(torch.argmax(scores))

            ys = torch.cat([ys,
                            torch.full((1, 1), next_word, dtype=ys.dtype, device=ys.device)],
                           dim=0)
            if next_word == end_symbol:
                break

    return ys



def translate(model, src, src_vocab,trg_vocab, src_tokenizer):
    """Translate one already-tokenised source sentence with greedy decoding.

    Args:
        model: trained translation model (switched to eval mode here).
        src: iterable of source token strings (pre-tokenised and normalised
            the same way as the training data).
        src_vocab: source vocabulary; its ``stoi`` maps tokens to indices.
        trg_vocab: target vocabulary; its ``itos`` maps indices to tokens.
        src_tokenizer: currently unused; kept so existing callers keep working.

    Returns:
        The predicted tokens joined by single spaces, with the "<sos>" and
        "<eos>" markers replaced by empty strings (the surrounding spaces
        are kept).
    """
    model.eval()
    token_ids = [SRC_PAD_INIT_IDX] + [src_vocab.stoi[tok] for tok in src] + [SRC_PAD_EOS_IDX]
    num_tokens = len(token_ids)
    src_tensor = torch.LongTensor(token_ids).reshape(num_tokens, 1)
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)

    # The decoder consumes target-vocabulary ids, so it is seeded with the
    # target start-of-sentence index (previously the source index was used).
    trg_ids = greedy_decode(model, src_tensor, src_mask,
                            max_len=num_tokens + 5,
                            start_symbol=TRG_PAD_INIT_IDX).flatten()

    words = [trg_vocab.itos[idx] for idx in trg_ids.tolist()]
    return " ".join(words).replace("<sos>", "").replace("<eos>", "")
    

In [32]:

# Translate a single test example and show it next to its reference.
test_index = 8
test_src = vars(test_data.examples[test_index])['src']
test_trg = vars(test_data.examples[test_index])['trg']
print(test_src)
print(test_trg)
print("")

## for dataset, input de
# test_src is already a list of tokens taken from the dataset.
result_str = translate(model, test_src,\
          SRC.vocab, TRG.vocab, spacy_de)

print(result_str)

['ein', 'typ', 'arbeitet', 'an', 'einem', 'gebäude', '.']
['a', 'guy', 'works', 'on', 'a', 'building', '.']

 a guy is working on a building . 


## Test BLEU score & output

In [36]:
from torchtext.data.metrics import bleu_score

def output_text(data, src_field, trg_field, model, device, max_len = 50,
                out_path = "output_text.txt"):
    """Translate every example in ``data`` and write one prediction per line.

    Args:
        data: iterable of examples exposing a ``src`` attribute (token list).
        src_field: field whose ``vocab`` is the source vocabulary.
        trg_field: field whose ``vocab`` is the target vocabulary.
        model: trained translation model.
        device: unused; kept for interface compatibility.
        max_len: unused; kept for interface compatibility.
        out_path: destination file, overwritten if it already exists.
    """
    # Explicit UTF-8 so non-ASCII tokens are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(out_path, 'w', encoding='utf-8') as file:
        for datum in data:
            src = vars(datum)['src']
            pred_trg = translate(model, src, src_field.vocab, trg_field.vocab, spacy_de) + '\n'
            file.write(pred_trg)


In [37]:
# Container type and number of examples in the test split.
print(type(test_data.examples))
print(len(test_data.examples))

<class 'list'>
1000


In [38]:
# Translate the whole test split to disk and report the wall-clock time.
t0 = time.time()
output_text(test_data, SRC, TRG, model, DEVICE)
t1 = time.time() 
tspan = t1 - t0
print(f'time:(s) {tspan}')


time:(s) 65.01646447181702
