<a href="https://colab.research.google.com/github/DmitryKutsev/eng_to_jap_translator/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install tinysegmenter



In [44]:
import sys
import os
import math
from tqdm import tqdm

import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np

import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import random
import spacy
import tinysegmenter

import torch
import torch.nn as nn
import random


In [29]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [30]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
spacy_en = spacy.load('en')

In [17]:
segmenter = tinysegmenter.TinySegmenter()

In [21]:
my_frame = pd.read_excel(
'http://nlp.ist.i.kyoto-u.ac.jp/EN/?plugin=attach&refer=JEC%20Basic%20Sentence%20Data&openfile=JEC_basic_sentence_v1-2.xls')

In [22]:
#remove Chineese column
my_frame = my_frame.drop(['难道不会是X吗，我实在是感到怀疑。'], axis=1)
my_frame.columns = ['index', 'jp', 'en']
my_frame = my_frame.drop(['index'], axis=1)

In [23]:
my_frame

Unnamed: 0,jp,en
0,Xがいいなといつも思います,I always think X would be nice.
1,それがあるようにいつも思います,It always seems like it is there.
2,それが多すぎないかと正直思う,I honestly feel like there is too much.
3,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.
4,〜と誰かが思った,Someone thought that 〜
...,...,...
5298,チームが４人のメンバーで構成されています,The team consists of four members.
5299,彼が実際に動画を再生する,He actually plays the video.
5300,政府が銀行に公的資金をどんどん投入しました,The government injected massive public funds i...
5301,レベル１の機能に下記の機能をプラスする,The following will be added to the level 1 fun...


In [24]:
segmenter.tokenize(my_frame['jp'][1])

['それ', 'が', 'ある', 'よう', 'にいつも', '思い', 'ます']

In [31]:
[tok.text for tok in spacy_en.tokenizer(my_frame['en'][1])]

['It', 'always', 'seems', 'like', 'it', 'is', 'there', '.']

In [33]:
my_frame.to_csv('my_frame.csv', index=False)  

In [34]:
!ls

my_frame.csv  sample_data


In [36]:
def tokenize_jp(text):
    """
    Tokenizes JP text from a string into a list of strings
    """
    return segmenter.tokenize(text)

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [37]:
SRC = Field(tokenize=tokenize_jp, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

In [38]:
dataset = TabularDataset(path='my_frame.csv', 
                         format='csv', 
                         fields=[('en', SRC), ('jp', TRG)],
                         skip_header=True)

In [39]:
train_dt, valid_dt, test_dt = dataset.split(split_ratio=[0.7, 0.1, 0.2], 
                                            random_state=random.getstate())

In [40]:
SRC.build_vocab(train_dt, min_freq=2)
TRG.build_vocab(train_dt, min_freq=2)

In [41]:
print (len(SRC.vocab), len(TRG.vocab))
print (SRC.vocab.freqs.most_common(10))
print (TRG.vocab.freqs.most_common(10))

2399 2406
[('が', 2865), ('の', 2473), ('を', 2320), ('に', 2011), ('ます', 1082), ('た', 1079), ('彼', 927), ('し', 680), ('は', 668), ('、', 524)]
[('.', 3477), ('the', 1641), ('He', 872), ('of', 648), ('to', 628), ('a', 615), ('in', 535), ('The', 498), ('I', 485), ('will', 435)]


In [None]:
gpu = False
device = torch.device('cuda' if gpu and torch.cuda.is_available() else 'cpu')

In [43]:
batch_size = 32
train_it, valid_it, test_it = BucketIterator.splits((train_dt, valid_dt, test_dt),
                                                    batch_size=batch_size, 
                                                    sort_key=lambda x: len(x.jp), 
                                                    sort_within_batch=False, 
                                                    device=device)


In [45]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, seq):
        #seq = [seq len, batch size]
        embedded = self.dropout(self.embedding(seq)) # out shape [seq len, batch size, emb dim]
        outputs, hidden = self.gru(embedded)
        #outputs = [seq len, batch size, hid dim*n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        return hidden

In [46]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(emb_dim+hid_dim, hid_dim) #emb_dim is current input (target)+hid_dim is context vector from encoder
        self.out = nn.Linear(emb_dim+hid_dim*2, output_dim) #hid dim*2 = context vector encoder + prev decoder hidden states 
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, context):
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #context = [n layers * n directions, batch size, hid dim]
        input = input.unsqueeze(0) # [1, batch size]

        embedded = self.dropout(self.embedding(input)) # [1, batch size, emb dim]
        emb_concat = torch.cat((embedded, context), dim=2)
        
        output, hidden = self.gru(emb_concat, hidden) # decoder's first hidden state h0 is from encoder last hidden state
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1)
        pred = self.out(output)
        return pred, hidden

In [47]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hid_dim == decoder.hid_dim, "hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg=None, teacher_forcing_ratio=0.5):
        if trg is None:
            trg = torch.zeros((25, src.shape[1])).fill_(2).long().to(src.device)
            assert teacher_forcing_ratio == 0, 'techer forcing must be 0 during inference, i.e. use the y prediction'

        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device) #tensor to store decoder outputs
        context = self.encoder(src) #last hidden state of the encoder. same for all time step
        hidden = context #initial hidden state of the decoder is the last hidden state of encoder
        
        input = trg[0,:] #first input to the decoder is the <sos> token which is the first row in trg
        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden, context)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1] #the prediction of the next output
            input = (trg[t] if teacher_force else top1) #the next input for decoder is either the real y or the prediction
        return outputs

TRAINING PROCESS


In [54]:
def train(model, train_it, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(train_it)):
        src = batch.jp
        trg = batch.en
        optimizer.zero_grad()
        output = model(src, trg)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss/ len(train_it)

In [55]:
def evaluate(model, data_it, criterion):
    model.eval()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(data_it)):
        src = batch.jp
        trg = batch.en
        output = model(src, trg, 0)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        epoch_loss += loss.item()
    return epoch_loss/ len(data_it)

In [56]:
input_dim = len(SRC.vocab)
out_dim = len(TRG.vocab)
enc_emb_dim = 128
dec_emb_dim = 128
hidden_dim = 256
nlayers = 2
enc_dropout = 0.3
dec_dropout = 0.3
enc = Encoder(input_dim, enc_emb_dim, hidden_dim,  enc_dropout)
dec = Decoder(out_dim, dec_emb_dim, hidden_dim, dec_dropout)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
pad_idx = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [57]:
epoch = 2
clip = 1
savedir = 'models'
model_save_path = os.path.join(savedir, 's2smodel.pt')
best_valid_loss = float('inf')

if not os.path.isdir(f'{savedir}'):
    os.makedirs(f'{savedir}')
for ep in range(epoch):
    train_loss = train(model, train_it, optimizer, criterion, clip)
    valid_loss = evaluate(model, valid_it, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
    
    print (f'epoch: {ep+1:03} | train loss: {train_loss: .3f} | \
    train_ppl: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | \
     Val. PPL: {math.exp(valid_loss):7.3f} |')


22it [00:01, 21.44it/s]


RuntimeError: ignored

In [None]:
model.load_state_dict(torch.load(model_save_path))
test_loss = evaluate(model, test_it, criterion)
print(f'|test loss: {test_loss: .3f} | test_ppl: {math.exp(test_loss):7.3f}|')

In [None]:
def translate_sentence(sentence):
    tokenized = tokenize_jp(sentence)
    numericalised = [SRC.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(numericalised).unsqueeze(1).to(device)
    translation_tensor_probs = model(tensor, None, 0).squeeze(1)
    translation_tensor = torch.argmax(translation_tensor_probs, 1)
    translation = [TRG.vocab.itos[t] for t in translation_tensor][1:]
    return translation

In [None]:
# candidate = ' '.join(vars(valid_dt.examples[2])['jp'])
candidate = 'I'm hungry, help me'
candidate_translation = ' '.join(vars(valid_dt.examples[2])['en'])
print (candidate)
print (candidate_translation)
print (translate_sentence(candidate))
tokenized = tokenize_jp(candidate)
numericalised = [SRC.vocab.stoi[t] for t in tokenized]
back_to_candidate = [SRC.vocab.itos[n] for n in numericalised][1:]
print (back_to_candidate)