<a href="https://colab.research.google.com/github/DmitryKutsev/eng_to_jap_translator/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tinysegmenter

Collecting tinysegmenter
  Downloading https://files.pythonhosted.org/packages/9c/70/488895cb11e160b548c9ba5847c171b65b86a8ca1e54d206d55b2976bf7b/tinysegmenter-0.4.tar.gz
Building wheels for collected packages: tinysegmenter
  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Created wheel for tinysegmenter: filename=tinysegmenter-0.4-cp36-none-any.whl size=13536 sha256=b8ed9f37cf1595d1ab024521cf69dc49f1c710217f510bb516bdd792da2da9b4
  Stored in directory: /root/.cache/pip/wheels/68/71/2b/6402196bf28012826e507ef7b99df6ebd98cce78bd99023471
Successfully built tinysegmenter
Installing collected packages: tinysegmenter
Successfully installed tinysegmenter-0.4


In [2]:
import sys
import os
import math
from tqdm import tqdm

import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np

import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import random
import spacy
import tinysegmenter

import torch
import torch.nn as nn
import random


In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
spacy_en = spacy.load('en')

In [6]:
segmenter = tinysegmenter.TinySegmenter()

In [7]:
my_frame = pd.read_excel(
'http://nlp.ist.i.kyoto-u.ac.jp/EN/?plugin=attach&refer=JEC%20Basic%20Sentence%20Data&openfile=JEC_basic_sentence_v1-2.xls')

In [8]:
#remove Chineese column
my_frame = my_frame.drop(['难道不会是X吗，我实在是感到怀疑。'], axis=1)
my_frame.columns = ['index', 'jp', 'en']
my_frame = my_frame.drop(['index'], axis=1)

In [9]:
my_frame

Unnamed: 0,jp,en
0,Xがいいなといつも思います,I always think X would be nice.
1,それがあるようにいつも思います,It always seems like it is there.
2,それが多すぎないかと正直思う,I honestly feel like there is too much.
3,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.
4,〜と誰かが思った,Someone thought that 〜
...,...,...
5298,チームが４人のメンバーで構成されています,The team consists of four members.
5299,彼が実際に動画を再生する,He actually plays the video.
5300,政府が銀行に公的資金をどんどん投入しました,The government injected massive public funds i...
5301,レベル１の機能に下記の機能をプラスする,The following will be added to the level 1 fun...


In [10]:
segmenter.tokenize(my_frame['jp'][1])

['それ', 'が', 'ある', 'よう', 'にいつも', '思い', 'ます']

In [11]:
[tok.text for tok in spacy_en.tokenizer(my_frame['en'][1])]

['It', 'always', 'seems', 'like', 'it', 'is', 'there', '.']

In [12]:
my_frame.to_csv('my_frame.csv', index=False)  

In [13]:
!ls

my_frame.csv  sample_data


In [14]:
def tokenize_jp(text):
    """
    Tokenizes JP text from a string into a list of strings
    """
    return segmenter.tokenize(text)

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [15]:
SRC = Field(tokenize=tokenize_jp, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

In [16]:
dataset = TabularDataset(path='my_frame.csv', 
                         format='csv', 
                         fields=[('en', SRC), ('jp', TRG)],
                         skip_header=True)

In [17]:
train_dt, valid_dt, test_dt = dataset.split(split_ratio=[0.7, 0.1, 0.2], 
                                            random_state=random.getstate())

In [18]:
SRC.build_vocab(train_dt, min_freq=2)
TRG.build_vocab(train_dt, min_freq=2)

In [19]:
print (len(SRC.vocab), len(TRG.vocab))
print (SRC.vocab.freqs.most_common(10))
print (TRG.vocab.freqs.most_common(10))

2399 2406
[('が', 2865), ('の', 2473), ('を', 2320), ('に', 2011), ('ます', 1082), ('た', 1079), ('彼', 927), ('し', 680), ('は', 668), ('、', 524)]
[('.', 3477), ('the', 1641), ('He', 872), ('of', 648), ('to', 628), ('a', 615), ('in', 535), ('The', 498), ('I', 485), ('will', 435)]


In [20]:
gpu = False
device = torch.device('cuda' if gpu and torch.cuda.is_available() else 'cpu')

In [44]:
batch_size = 32
train_it, valid_it, test_it = BucketIterator.splits((train_dt, valid_dt, test_dt),
                                                    batch_size=batch_size, 
                                                    device=device)


In [45]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [51]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, src len]
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask) # multi-head( query, key, value, mask = None)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

In [46]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim   # in paper, 512
        self.n_heads = n_heads   # in paper, 8
        self.head_dim = hid_dim // n_heads  # in paper, 512 // 8 = 64
         
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)  # sqrt(64)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)  #x = [batch size, n heads, query len, head dim]
        
        # 将x还原成linear layer可以process的size
        x = x.permute(0, 2, 1, 3).contiguous() 
        # contiguous 返回一个内存连续的有相同数据的tensor，如果原tensor内存连续，则返回原tensor. 一般与transpose，permute, view搭配使用
        # transpose、permute等维度变换操作后，tensor在内存中不再是连续存储的，而view操作要求tensor的内存连续存储，所以需要contiguous来返回一个contiguous copy
        
        
        x = x.view(batch_size, -1, self.hid_dim) #x = [batch size, query len, n heads, head dim]
        
        x = self.fc_o(x) #x = [batch size, query len, hid dim]
                
        return x, attention

In [47]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

In [54]:
class Decoder:
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

In [55]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [56]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

In [57]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [58]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [60]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters') # The model has 9,038,853 trainable parameters

The model has 3,628,639 trainable parameters


In [61]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(2399, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): DecoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (enc_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias

TRAINING PROCESS


In [36]:
def train(model, train_it, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(train_it)):
        src = batch.jp
        trg = batch.en
        optimizer.zero_grad()
        output = model(src, trg)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss/ len(train_it)

In [37]:
def evaluate(model, data_it, criterion):
    model.eval()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(data_it)):
        src = batch.jp
        trg = batch.en
        output = model(src, trg, 0)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        epoch_loss += loss.item()
    return epoch_loss/ len(data_it)

In [39]:
input_dim = len(SRC.vocab)
out_dim = len(TRG.vocab)
enc_emb_dim = 128
dec_emb_dim = 128
hidden_dim = 256
nlayers = 2
enc_dropout = 0.3
dec_dropout = 0.3
enc = Encoder(input_dim, enc_emb_dim, hidden_dim, nlayers, enc_dropout)
dec = Decoder(out_dim, dec_emb_dim, hidden_dim, nlayers, dec_dropout)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
pad_idx = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [43]:
epoch = 2
clip = 1
savedir = 'models'
model_save_path = os.path.join(savedir, 's2smodel.pt')
best_valid_loss = float('inf')

if not os.path.isdir(f'{savedir}'):
    os.makedirs(f'{savedir}')
for ep in range(epoch):
    train_loss = train(model, train_it, optimizer, criterion, clip)
    valid_loss = evaluate(model, valid_it, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
    
    print (f'epoch: {ep+1:03} | train loss: {train_loss: .3f} | train_ppl: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

model.load_state_dict(torch.load(model_save_path))
test_loss = evaluate(model, test_it, criterion)
print(f'|test loss: {test_loss: .3f} | test_ppl: {math.exp(test_loss):7.3f}|')


5it [00:01,  2.55it/s]


IndexError: ignored

In [None]:
model.load_state_dict(torch.load(model_save_path))
test_loss = evaluate(model, test_it, criterion)
print(f'|test loss: {test_loss: .3f} | test_ppl: {math.exp(test_loss):7.3f}|')

In [None]:
def translate_sentence(sentence):
    tokenized = tokenize_jp(sentence)
    numericalised = [SRC.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(numericalised).unsqueeze(1).to(device)
    translation_tensor_probs = model(tensor, None, 0).squeeze(1)
    translation_tensor = torch.argmax(translation_tensor_probs, 1)
    translation = [TRG.vocab.itos[t] for t in translation_tensor][1:]
    return translation

In [None]:
# candidate = ' '.join(vars(valid_dt.examples[2])['jp'])
candidate = 'I'm hungry, help me'
candidate_translation = ' '.join(vars(valid_dt.examples[2])['en'])
print (candidate)
print (candidate_translation)
print (translate_sentence(candidate))
tokenized = tokenize_jp(candidate)
numericalised = [SRC.vocab.stoi[t] for t in tokenized]
back_to_candidate = [SRC.vocab.itos[n] for n in numericalised][1:]
print (back_to_candidate)