In [None]:
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader
from collections import defaultdict

import torch.nn as nn
import torch.nn.functional as F
import torch

import gensim

import math
import time
import tqdm
import pickle
import random

In [None]:
def vi_preprocessor(text):
    """Tokenize a Vietnamese sentence.

    Every '.' and ',' character is deleted, the remainder is split on
    whitespace, and each token is lower-cased.

    Args:
        text: the raw sentence (str).

    Returns:
        A list of lower-cased token strings.
    """
    without_punct = text.translate(str.maketrans('', '', '.,'))
    return list(map(str.lower, without_punct.split()))

def ja_preprocessor(text):
    """Tokenize a (pre-segmented) Japanese sentence.

    Removes the ASCII characters '.' and ',', splits on whitespace and
    lower-cases each token. Only those two ASCII marks are removed; other
    punctuation is left in place.

    Args:
        text: the raw sentence (str).

    Returns:
        A list of lower-cased token strings.
    """
    cleaned = text
    for mark in ('.', ','):
        cleaned = cleaned.replace(mark, '')

    tokens = []
    for piece in cleaned.split():
        tokens.append(piece.lower())
    return tokens

## Collator function


In [None]:
def collator(batch, PAD_IDX, max_src_len, max_trg_len):
    """Turn a list of (src_ids, trg_ids) pairs into padded batch tensors.

    Each side is padded with ``PAD_IDX`` up to the longest sequence in the
    batch, capped at ``max_src_len`` / ``max_trg_len``; longer sequences are
    truncated to the cap.

    Args:
        batch: list of ``(src_ids, trg_ids)`` pairs, each a list of ints.
        PAD_IDX: index used for padding.
        max_src_len: upper bound on the padded source width.
        max_trg_len: upper bound on the padded target width.

    Returns:
        ``((X, X_len), Y)`` where ``X`` and ``Y`` are 2-D tensors
        (one row per pair) and ``X_len`` holds the unpadded (post-truncation)
        source lengths.
    """
    src_width = min(max((len(src) for src, _ in batch), default=0), max_src_len)
    trg_width = min(max((len(trg) for _, trg in batch), default=0), max_trg_len)

    def fit(seq, width):
        # Truncate to `width`, then right-pad with PAD_IDX if the sequence is shorter.
        return seq[:width] + [PAD_IDX] * max(width - len(seq), 0)

    src_rows = [fit(src, src_width) for src, _ in batch]
    src_lengths = [min(len(src), src_width) for src, _ in batch]
    trg_rows = [fit(trg, trg_width) for _, trg in batch]

    Y = torch.tensor(trg_rows).contiguous()
    X = torch.tensor(src_rows).contiguous()
    X_len = torch.tensor(src_lengths)
    return (X, X_len), Y

# Vocab

In [None]:
class Vocab:
    """Token/index lookup tables for the source and target languages.

    ``src_stoi`` / ``trg_stoi`` map token -> index; ``src_itos`` / ``trg_itos``
    hold the inverse mappings.
    """

    def __init__(self, src_dic=None, trg_dic=None):
        """Store the optional token->index dictionaries and derive their inverses."""
        self.src_stoi = src_dic
        self.src_itos = {}
        self.word_vec = {}  # kept for compatibility; nothing in this class fills it
        if self.src_stoi is not None:
            self.src_itos = {idx: token for token, idx in self.src_stoi.items()}
        self.trg_stoi = trg_dic
        self.trg_itos = {}
        if self.trg_stoi is not None:
            self.trg_itos = {idx: token for token, idx in self.trg_stoi.items()}

    def ret_z(self):
        """Return 0 (kept for backward compatibility with existing callers)."""
        return 0

    def build_dic(self, path, preprocessor, vocab_size, lang='ja'):
        """Build a frequency-ranked token->index dictionary from a text file.

        Indices 0-3 are reserved for ``<UNK>``, ``<sos>``, ``<eos>`` and
        ``<pad>``. Remaining slots are filled with the most frequent tokens
        (ties broken by descending token string) until the dictionary holds
        ``vocab_size`` entries.

        Args:
            path: UTF-8 text file, one sentence per line.
            preprocessor: callable mapping a line to a list of tokens.
            vocab_size: maximum number of entries, special tokens included.
            lang: language code forwarded to :meth:`build_vector`.

        Returns:
            ``(dic, vec_dic)``: the token->index dict and the index->vector
            dict produced by :meth:`build_vector`.
        """
        dic = {'<UNK>': 0, '<sos>': 1, '<eos>': 2, '<pad>': 3}
        freq_dic = defaultdict(int)

        with open(path, 'r', encoding='utf-8') as corpus:
            for line in corpus:
                for token in preprocessor(line):
                    freq_dic[token] += 1

        ctr = len(dic)
        ranked = sorted(freq_dic.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        for token, _count in ranked:
            # Check the budget *before* adding so a vocab_size at or below the
            # number of special tokens does not admit the whole corpus.
            if ctr >= vocab_size:
                break
            if token not in dic:
                dic[token] = ctr
                ctr += 1

        vec_dic = self.build_vector(dic, lang)
        return dic, vec_dic

    def build_vector(self, dic, lang='ja'):
        """Look up a pretrained vector for every token in ``dic``.

        Tokens missing from the pretrained table get a ``torch.rand(300)``
        vector. For ``lang`` values other than ``'ja'`` and ``'vi'`` an empty
        dict is returned.

        Args:
            dic: token->index dictionary.
            lang: ``'ja'`` reads a tab-separated text file; ``'vi'`` loads a
                binary word2vec model through gensim.

        Returns:
            Dict mapping token index -> 1-D ``torch.Tensor``.
        """
        vec_dic = {}
        if lang == 'ja':
            full_dic = {}
            with open('/content/drive/MyDrive/JaVi_Translation/w2v_pretrained/ja_w2v.txt', encoding='utf-8') as f:
                for line in f:
                    fields = line.split('\t')
                    try:
                        full_dic[fields[0]] = torch.Tensor([float(x) for x in fields[1].split()])
                    except (IndexError, ValueError):
                        # Line without a tab-separated vector column or with
                        # non-numeric values: report it and move on.
                        print(line)
            for w, idx in dic.items():
                vec_dic[idx] = full_dic[w] if w in full_dic else torch.rand(300)
        elif lang == 'vi':
            wv_model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/JaVi_Translation/w2v_pretrained/baomoi.window2.vn.model.bin', binary=True)
            for w, idx in dic.items():
                # Membership test / indexing on the KeyedVectors object itself
                # avoids the `.wv.vocab` attribute, which gensim 4 removed.
                if w in wv_model:
                    vec_dic[idx] = torch.Tensor(wv_model[w].tolist())
                else:
                    vec_dic[idx] = torch.rand(300)
        return vec_dic

    def add_src_dic(self, dic):
        """Install ``dic`` as the source token->index map and rebuild its inverse."""
        self.src_stoi = dic
        # Rebuilt from scratch so entries from a previous dictionary do not linger.
        self.src_itos = {idx: token for token, idx in self.src_stoi.items()}

    def add_trg_dic(self, dic):
        """Install ``dic`` as the target token->index map and rebuild its inverse."""
        self.trg_stoi = dic
        self.trg_itos = {idx: token for token, idx in self.trg_stoi.items()}

# Data Reader

In [None]:
class DataReader(IterableDataset):
    """Streams aligned (source, target) sentences as lists of token indices.

    The source file is treated as Japanese and the target file as Vietnamese
    when the vocabulary is built here (``lang='ja'`` / ``lang='vi'``).
    """

    def __init__(self, args, paths, preprocessors, vocab_sizes=(100,100), DIC=None):
        """
        Args:
            args: configuration object; not read by this class.
            paths: ``(src_path, trg_path)`` of line-aligned UTF-8 text files.
            preprocessors: ``(src_preprocessor, trg_preprocessor)`` tokenizers.
            vocab_sizes: ``(src_vocab_size, trg_vocab_size)`` limits used when
                the vocabulary is built from the files.
            DIC: an existing ``Vocab`` to reuse; when given, no vocabulary or
                pretrained-vector table is built.
        """
        self.src_path = paths[0]
        self.trg_path = paths[1]

        self.src_preprocessor = preprocessors[0]
        self.trg_preprocessor = preprocessors[1]

        self.src_vocab_size = vocab_sizes[0]
        self.trg_vocab_size = vocab_sizes[1]

        # Defined up front so the attributes exist even when DIC is supplied.
        self.src_vec_dic = None
        self.trg_vec_dic = None

        if DIC is None:
            self.vocab = Vocab()
            src_dic, self.src_vec_dic = self.vocab.build_dic(self.src_path, self.src_preprocessor, self.src_vocab_size, lang='ja')
            trg_dic, self.trg_vec_dic = self.vocab.build_dic(self.trg_path, self.trg_preprocessor, self.trg_vocab_size, lang='vi')

            self.vocab.add_src_dic(src_dic)
            self.vocab.add_trg_dic(trg_dic)
            self.src_vocab_size = len(self.vocab.src_stoi)
            self.trg_vocab_size = len(self.vocab.trg_stoi)
        else:
            self.vocab = DIC

    def line_mapper(self, line, is_src):
        """Convert one text line to ``[<sos>, token ids..., <eos>]``.

        Tokens missing from the vocabulary map to index 0.

        Args:
            line: raw text line.
            is_src: True to use the source tokenizer/vocabulary, False for target.
        """
        if is_src:
            stoi, preprocessor = self.vocab.src_stoi, self.src_preprocessor
        else:
            stoi, preprocessor = self.vocab.trg_stoi, self.trg_preprocessor

        tokens = [stoi['<sos>']]
        tokens.extend(stoi.get(token, 0) for token in preprocessor(line))
        tokens.append(stoi['<eos>'])
        return tokens

    def __iter__(self):
        """Yield ``(src_ids, trg_ids)`` for each aligned line pair.

        Iteration stops at the end of the shorter file. Both files are closed
        when iteration finishes or the iterator is discarded.
        """
        with open(self.src_path, encoding='utf-8') as src_file, \
                open(self.trg_path, encoding='utf-8') as trg_file:
            for src_line, trg_line in zip(src_file, trg_file):
                yield self.line_mapper(src_line, True), self.line_mapper(trg_line, False)

# MultiHeadAttention

In [None]:
class MultiHeadAttentionLayer(nn.Module):
    """Scaled dot-product attention split across ``n_heads`` heads."""

    def __init__(self, hid_dim, n_heads, dropout, device):
        """
        Args:
            hid_dim: model width; must be divisible by ``n_heads``.
            n_heads: number of attention heads.
            dropout: dropout probability applied to the attention weights.
            device: accepted for interface compatibility; the layer follows
                the device of its inputs and parameters.

        Raises:
            ValueError: if ``hid_dim`` is not a multiple of ``n_heads``.
        """
        super(MultiHeadAttentionLayer, self).__init__()

        if hid_dim % n_heads != 0:
            raise ValueError(
                f'hid_dim ({hid_dim}) must be divisible by n_heads ({n_heads})')

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        # A plain Python float works on any device, unlike a tensor attribute
        # that is not registered as a buffer and so is not moved by .to().
        self.scale = math.sqrt(self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query, key, value: tensors whose first dimension is the batch and
                whose last dimension is ``hid_dim``.
            mask: optional tensor broadcastable to the energy tensor; positions
                where it equals 0 are excluded from attention.

        Returns:
            ``(x, attention)``: the projected output with last dimension
            ``hid_dim`` and the per-head attention weights.
        """
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # [batch, len, hid] -> [batch, heads, len, head_dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)

        energy = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale

        if mask is not None:
            # Large negative value so masked positions get ~0 weight after softmax.
            energy = energy.masked_fill(mask==0, -1e10)

        attention = torch.softmax(energy, dim=-1)

        x = torch.matmul(self.dropout(attention), V)
        # Back to [batch, len, heads, head_dim], then merge the heads.
        x = x.permute(0,2,1,3).contiguous()

        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)
        return x, attention

In [None]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, hid_dim, pf_dim, dropout):
        """
        Args:
            hid_dim: input and output feature size.
            pf_dim: size of the inner (expanded) layer.
            dropout: dropout probability applied after the ReLU.
        """
        super(PositionwiseFeedforwardLayer, self).__init__()

        self.fc_1 = nn.Linear(in_features=hid_dim, out_features=pf_dim)
        self.fc_2 = nn.Linear(in_features=pf_dim, out_features=hid_dim)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply the block to ``x`` (last dimension ``hid_dim``) and return the result."""
        expanded = self.fc_1(x)
        activated = torch.relu(expanded)
        return self.fc_2(self.dropout(activated))

# Encoder

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation."""

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        """
        Args:
            hid_dim: model width.
            n_heads: number of attention heads.
            pf_dim: inner width of the feed-forward sub-layer.
            dropout: dropout probability used throughout the block.
            device: forwarded to the attention layer.
        """
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Run the block on ``src`` using ``src_mask`` for the self-attention."""
        # Sub-layer 1: self-attention + residual + norm.
        attended, _weights = self.self_attention(src, src, src, src_mask)
        after_attention = self.self_attn_layer_norm(src + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward + residual + norm.
        transformed = self.positionwise_feedforward(after_attention)
        return self.ff_layer_norm(after_attention + self.dropout(transformed))


In [None]:
class Encoder(nn.Module):
    """Transformer encoder: token + learned position embeddings followed by a
    stack of ``EncoderLayer`` blocks."""

    def __init__(self, 
                 input_vocab_size, 
                 input_emb_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 max_sent_len,
                 dropout,
                 device,
                 emb_weight=None):
        """
        Args:
            input_vocab_size: number of rows in the token embedding table.
            input_emb_dim: embedding size, also used as the model width.
            n_layers: number of encoder blocks.
            n_heads: attention heads per block.
            pf_dim: inner width of each feed-forward sub-layer.
            max_sent_len: number of rows in the position embedding table.
            dropout: dropout probability.
            device: stored on the instance and forwarded to the layers.
            emb_weight: accepted but not used; the token embedding is a
                freshly initialised ``nn.Embedding``.
        """
        super(Encoder, self).__init__()
        self.device = device
        # Python float instead of a device-bound tensor, so the module keeps
        # working after being moved with .to().
        self.scale = math.sqrt(input_emb_dim)
        self.embedding = nn.Embedding(input_vocab_size, input_emb_dim)
        self.pos_embedding = nn.Embedding(max_sent_len, input_emb_dim)
        self.layers = nn.ModuleList([EncoderLayer(input_emb_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device) for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, mask):
        """Encode a batch of token indices.

        Args:
            input: integer tensor indexed as ``[batch, position]``; the number
                of positions must not exceed ``max_sent_len``.
            mask: attention mask passed to every layer.

        Returns:
            The output of the last layer (last dimension ``input_emb_dim``).
        """
        batch_size = input.shape[0]
        input_len = input.shape[1]

        # Positions are created directly on the input's device.
        pos = torch.arange(0, input_len, device=input.device).unsqueeze(0).repeat(batch_size, 1)

        word_emb = self.embedding(input)
        pos_emb = self.pos_embedding(pos)

        src = self.dropout((word_emb * self.scale) + pos_emb)

        for layer in self.layers:
            src = layer(src, mask)

        return src

# Decoder

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention and a
    feed-forward sub-layer, each followed by dropout, a residual connection
    and layer normalisation."""

    def __init__(self,
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout,
                 device):
        """
        Args:
            hid_dim: model width.
            n_heads: number of attention heads.
            pf_dim: inner width of the feed-forward sub-layer.
            dropout: dropout probability used throughout the block.
            device: forwarded to the attention layers.
        """
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)

        self.dropout = nn.Dropout(dropout) 

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Run the block.

        Args:
            trg: decoder-side hidden states.
            enc_src: encoder output attended to by the second sub-layer.
            trg_mask: mask for the self-attention.
            src_mask: mask for the encoder attention.

        Returns:
            ``(trg, attention)``: the updated hidden states and the encoder
            attention weights.
        """
        # Sub-layer 1: self-attention over the target.
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))

        # Sub-layer 2: attention over the encoder output. The residual + norm
        # is applied exactly once; applying it a second time would add the
        # same attention output to the stream twice.
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))

        # Sub-layer 3: position-wise feed-forward.
        _trg = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))

        return trg, attention

In [None]:
class Decoder(nn.Module):
    """Transformer decoder: token + learned position embeddings, a stack of
    ``DecoderLayer`` blocks and a final projection to the output vocabulary."""

    def __init__(self, 
                 output_vocab_size, 
                 output_emb_dim, 
                 n_layers,
                 n_heads,
                 pf_dim,
                 max_sent_len,
                 dropout,
                 device,
                 emb_weight=None):
        """
        Args:
            output_vocab_size: size of the target vocabulary.
            output_emb_dim: embedding size, also used as the model width.
            n_layers: number of decoder blocks.
            n_heads: attention heads per block.
            pf_dim: inner width of each feed-forward sub-layer.
            max_sent_len: number of rows in the position embedding table.
            dropout: dropout probability.
            device: stored on the instance and forwarded to the layers.
            emb_weight: accepted but not used; the token embedding is a
                freshly initialised ``nn.Embedding``.
        """
        super(Decoder, self).__init__()
        self.device = device 

        # Python float instead of a device-bound tensor, so the module keeps
        # working after being moved with .to().
        self.scale = math.sqrt(output_emb_dim)
        self.embedding= nn.Embedding(output_vocab_size, output_emb_dim)
        self.pos_embedding = nn.Embedding(max_sent_len, output_emb_dim)

        self.layers = nn.ModuleList([DecoderLayer(output_emb_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device) for _ in range(n_layers)])

        self.fc_out = nn.Linear(output_emb_dim, output_vocab_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, enc_src, trg_mask, src_mask):
        """Decode a batch of target-side token indices.

        Args:
            input: integer tensor indexed as ``[batch, position]``; the number
                of positions must not exceed ``max_sent_len``.
            enc_src: encoder output.
            trg_mask: mask for the decoder self-attention.
            src_mask: mask for the encoder attention.

        Returns:
            ``(output, attention)``: vocabulary logits for every position and
            the encoder attention of the last layer (``None`` when the decoder
            has no layers).
        """
        batch_size = input.shape[0]
        input_len = input.shape[1]

        # Positions are created directly on the input's device.
        pos = torch.arange(0, input_len, device=input.device).unsqueeze(0).repeat(batch_size, 1)

        word_emb = self.embedding(input)
        pos_emb = self.pos_embedding(pos)

        trg = self.dropout(word_emb * self.scale + pos_emb)

        attention = None  # stays None only if there are no layers
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        output = self.fc_out(trg)

        return output, attention

# Seq2Seq

src = [batch size, src sent len]

src_len = [batch size]

trg = [batch size, trg sent len]

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder Transformer with padding and look-ahead masking."""

    def __init__(self, 
                 ags, 
                 input_vocab_size, 
                 output_vocab_size, 
                 pad_idx, 
                 sos_idx, 
                 eos_idx, 
                 src_vec_emb=None, 
                 trg_vec_emb=None):
        """
        Args:
            ags: configuration object providing ``input_embedding_dim``,
                ``output_embedding_dim``, ``n_layers``, ``n_heads``,
                ``pf_dim``, ``max_sent_len`` and ``dropout``. (The parameter
                name is kept as-is for keyword-argument compatibility.)
            input_vocab_size: source vocabulary size.
            output_vocab_size: target vocabulary size.
            pad_idx: padding index used to build the masks.
            sos_idx: start-of-sentence index.
            eos_idx: end-of-sentence index.
            src_vec_emb: forwarded to the encoder as ``emb_weight``.
            trg_vec_emb: forwarded to the decoder as ``emb_weight``.
        """
        super(Seq2Seq, self).__init__()

        # Use the configuration that was actually passed in rather than
        # whatever a module-level `args` happens to contain.
        args = ags

        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size

        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.encoder = Encoder(input_vocab_size, 
                               args.input_embedding_dim, 
                               args.n_layers, 
                               args.n_heads,
                               args.pf_dim, 
                               args.max_sent_len, 
                               args.dropout, 
                               self.device, 
                               src_vec_emb)
        self.decoder = Decoder(output_vocab_size, 
                               args.output_embedding_dim,
                               args.n_layers, 
                               args.n_heads,
                               args.pf_dim,
                               args.max_sent_len, 
                               args.dropout, 
                               self.device, 
                               trg_vec_emb)

    def make_src_mask(self, src):
        """Return a boolean mask of shape ``[batch, 1, 1, src_len]`` that is
        False at padding positions of ``src`` (``[batch, src_len]``)."""
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return the combined padding and lower-triangular (no look-ahead)
        mask for ``trg`` (``[batch, trg_len]``), broadcast to
        ``[batch, 1, trg_len, trg_len]``."""
        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        # Built on trg's own device so both operands of `&` always match.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()

        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        """Run the encoder and decoder.

        Args:
            src: source indices, ``[batch, src_len]``.
            trg: target indices, ``[batch, trg_len]``; when ``None`` a tensor
                filled with ``sos_idx`` of the same shape as ``src`` is used.

        Returns:
            ``(output, attention)`` as produced by the decoder.
        """
        if trg is None:
            # Batch-first placeholder; the rest of the model indexes dim 0 as batch.
            trg = torch.full((src.shape[0], src.shape[1]), self.sos_idx,
                             dtype=torch.long, device=src.device)

        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)

        return output, attention

# Train function

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        ``(minutes, seconds)``, both truncated to ints.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [None]:
def train_iter(model, iterator, epoch, optimizer, criterion, clip, args, checkpoint=None):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: callable as ``model(src, trg_input)`` returning ``(logits, _)``.
        iterator: yields ``((src, src_len), trg)`` batches.
        epoch: current epoch index, stored in checkpoints.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss taking flattened logits and flattened targets.
        clip: maximum gradient norm.
        args: configuration with ``save_checkpoint`` and ``checkpoint_path``.
        checkpoint: optional dict with ``'batch'`` (number of batches already
            completed in this epoch) and ``'epoch_loss'`` (their summed loss);
            those batches are skipped.

    Returns:
        Mean loss per batch, or NaN when no batch was processed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()

    epoch_loss = 0
    batch_ctr = 0  # number of batches completed so far in this epoch

    if checkpoint is not None:
        batch_ctr = checkpoint['batch']
        epoch_loss = checkpoint['epoch_loss']
    already_done = batch_ctr

    for current_batch_ctr, batch in enumerate(iterator):
        if current_batch_ctr < already_done:
            continue

        if device.type == 'cuda':
            torch.cuda.empty_cache()

        src, src_len = batch[0]
        trg = batch[1]
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        # Teacher forcing: feed every target token but the last, predict every
        # token but the first.
        output, _ = model(src, trg[:,:-1])

        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()
        batch_ctr += 1

        # Checkpoint after batch indices 0, 100, 200, ... The stored 'batch'
        # is the count of completed batches, so a resumed run continues with
        # the next batch instead of repeating (and double-counting) this one.
        if current_batch_ctr % 100 == 0 and args.save_checkpoint:
            torch.save({
                'epoch': epoch,
                'batch': batch_ctr,
                'model': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch_loss': epoch_loss,
                }, args.checkpoint_path)

    if batch_ctr == 0:
        return float('nan')
    return epoch_loss / batch_ctr

In [None]:
def evaluate_iter(model, iterator, criterion, args):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode and gradients are disabled.

    Args:
        model: callable as ``model(src, trg_input)`` returning ``(logits, _)``.
        iterator: yields ``((src, src_len), trg)`` batches.
        criterion: loss taking flattened logits and flattened targets.
        args: configuration object; not read by this function.

    Returns:
        Summed batch loss divided by the number of batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    total_loss = 0
    n_batches = 0
    with torch.no_grad():
        for batch in iterator:
            src, _src_len = batch[0]
            src = src.to(device)
            trg = batch[1].to(device)

            # Feed all target tokens but the last; score all but the first.
            logits, _ = model(src, trg[:, :-1])

            flat_logits = logits.contiguous().view(-1, logits.shape[-1])
            flat_targets = trg[:, 1:].contiguous().view(-1)

            total_loss += criterion(flat_logits, flat_targets).item()
            n_batches += 1
    return total_loss / n_batches

In [None]:
def train(args):
    """Build the datasets and model, then run the training loop.

    After every epoch the model weights are written to
    ``args.save_model_path`` and the vocabulary is pickled to
    ``args.save_dic_path``; the model is saved every epoch, not only when the
    validation loss improves. When ``args.load_checkpoint`` is set, model and
    optimizer state are restored from ``args.checkpoint_path`` and training
    resumes from the stored epoch.

    Args:
        args: parsed configuration (data paths, vocabulary sizes, model
            hyper-parameters, checkpoint options).
    """
    TRG_MAX_LEN = args.trg_max_len
    SRC_MAX_LEN = args.src_max_len

    # DataReader applies element 0 to the source (Japanese) file and element 1
    # to the target (Vietnamese) file.
    preprocessors = (ja_preprocessor, vi_preprocessor)

    vocab_size = (args.input_vocab, args.output_vocab)

    training_dataset = DataReader(args, args.training_data, preprocessors, vocab_size)
    validation_dataset = DataReader(args, args.validation_data, preprocessors, DIC=training_dataset.vocab)

    src_vec_dic = torch.stack([v for k, v in training_dataset.src_vec_dic.items()])
    trg_vec_dic = torch.stack([v for k, v in training_dataset.trg_vec_dic.items()])

    INPUT_DIM = len(training_dataset.vocab.src_stoi)
    OUTPUT_DIM = len(training_dataset.vocab.trg_stoi)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    PAD_IDX = training_dataset.vocab.src_stoi['<pad>']
    SOS_IDX = training_dataset.vocab.src_stoi['<sos>']
    EOS_IDX = training_dataset.vocab.src_stoi['<eos>']

    collate = lambda b: collator(b, PAD_IDX, SRC_MAX_LEN, TRG_MAX_LEN)
    training_dataloader = DataLoader(training_dataset, batch_size = args.batch, drop_last=True, collate_fn=collate)
    validation_dataloader = DataLoader(validation_dataset, batch_size = args.batch, drop_last=True, collate_fn=collate)

    model = Seq2Seq(args, INPUT_DIM, OUTPUT_DIM, PAD_IDX, SOS_IDX, EOS_IDX, src_vec_dic, trg_vec_dic).to(device)

    N_EPOCHS = args.epochs
    CLIP = 1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9,0.98))

    criterion = nn.CrossEntropyLoss(ignore_index= PAD_IDX)

    start_epoch = 0
    checkpoint = None
    if args.load_checkpoint:
        # map_location lets a checkpoint written on GPU be resumed on CPU.
        checkpoint = torch.load(args.checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']

    for epoch in range(start_epoch, N_EPOCHS):
        start_time = time.time()

        train_loss = train_iter(model, training_dataloader, epoch, optimizer, criterion, CLIP, args, checkpoint)
        # The resume position only describes the first, partially completed
        # epoch; later epochs must start from batch 0 with a fresh loss total.
        checkpoint = None
        valid_loss = evaluate_iter(model, validation_dataloader, criterion, args)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        torch.save(model.state_dict(), args.save_model_path)
        with open(args.save_dic_path, 'wb') as dic_file:
            pickle.dump(training_dataset.vocab, dic_file)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
        print('-----------------------------------------')

# Inference function

In [None]:
def translate_sentence(model, vocab, sent, args, max_len=30):
    """Greedily decode a translation for one source sentence.

    Args:
        model: object exposing ``encoder``, ``decoder``, ``make_src_mask`` and
            ``make_trg_mask``.
        vocab: vocabulary with ``src_stoi``, ``trg_stoi`` and ``trg_itos``.
        sent: raw source sentence; tokenized with ``ja_preprocessor``.
        args: configuration object; not read by this function.
        max_len: maximum number of tokens to generate.

    Returns:
        ``(tokens, attention)``: the generated tokens without the leading
        ``<sos>`` (the trailing ``<eos>`` is included when it was produced)
        and the attention from the last decoding step (``None`` if no step
        ran).
    """
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenized = ['<sos>'] + ja_preprocessor(sent) + ['<eos>']
    # Out-of-vocabulary tokens map to index 0.
    numericalized = [vocab.src_stoi.get(t, 0) for t in tokenized]

    tensor = torch.LongTensor(numericalized).unsqueeze(0).to(device)
    mask = model.make_src_mask(tensor)

    eos_idx = vocab.trg_stoi['<eos>']
    trg_indexes = [vocab.trg_stoi['<sos>']]
    attention = None

    with torch.no_grad():
        enc_src = model.encoder(tensor, mask)
        for _ in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, mask)
            # Greedy choice: most likely token at the newest position.
            pred_token = output.argmax(2)[:,-1].item()
            trg_indexes.append(pred_token)
            if pred_token == eos_idx:
                break

    trg_tokens = [vocab.trg_itos[int(i)] for i in trg_indexes]
    return trg_tokens[1:], attention

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def display_attention(candidate, translation, attention):
    """Plot an attention matrix with source tokens on x and output tokens on y.

    Args:
        candidate: raw source sentence; tokenized with ``ja_preprocessor`` to
            label the x axis (wrapped in ``<sos>`` / ``<eos>``).
        translation: list of output tokens labelling the y axis.
        attention: attention tensor.
            NOTE(review): ``squeeze(1)`` only removes dimension 1 when it has
            size 1, and ``matshow`` needs a 2-D array — confirm the shape the
            caller passes.
    """
    figure = plt.figure(figsize=(10,10))
    axes = figure.add_subplot(111)

    weights = attention.squeeze(1).cpu().detach().numpy()
    axes.matshow(weights, cmap='bone')

    axes.tick_params(labelsize=15)
    # The leading '' leaves the first tick label empty.
    source_labels = [''] + ['<sos>'] + [t.lower() for t in ja_preprocessor(candidate)] + ['<eos>']
    axes.set_xticklabels(source_labels, rotation=45)
    axes.set_yticklabels([''] + translation)

    # One tick per token on both axes.
    axes.xaxis.set_major_locator(ticker.MultipleLocator(1))
    axes.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [None]:
def inference(args, sent):
    """Load the saved vocabulary and model, translate ``sent`` and print it.

    Args:
        args: configuration providing ``load_dic_path``, ``load_model_path``
            and the model hyper-parameters read by ``Seq2Seq``.
        sent: raw source sentence.

    Returns:
        The list of translated tokens (also printed).
    """
    # NOTE: pickle.load can execute arbitrary code; only point load_dic_path
    # at files you produced yourself.
    with open(args.load_dic_path, 'rb') as dic_file:
        vocab = pickle.load(dic_file)

    INPUT_DIM = len(vocab.src_stoi)
    OUTPUT_DIM = len(vocab.trg_stoi)
    PAD_IDX = vocab.src_stoi['<pad>']
    SOS_IDX = vocab.src_stoi['<sos>']
    EOS_IDX = vocab.src_stoi['<eos>']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The optional embedding arguments of Seq2Seq default to None and every
    # weight is restored from the saved state dict below, so the training
    # corpus and pretrained word vectors do not need to be re-read here.
    model = Seq2Seq(args, INPUT_DIM, OUTPUT_DIM, PAD_IDX, SOS_IDX, EOS_IDX).to(device)
    model.load_state_dict(torch.load(args.load_model_path, map_location=device))

    translation, attention = translate_sentence(model, vocab, sent, args)
    print(translation)
    return translation

# Configs

In [None]:
import argparse

def str2bool(v):
    """Parse a command-line string as a boolean.

    Returns True only when ``v`` equals ``'true'`` ignoring case; every other
    string, including the empty string and fragments such as ``'t'``, is False.
    """
    # Equality, not `in ('true')`: the latter is a substring test on a str.
    return v.lower() == 'true'

def str2dict(v):
    """Wrap ``v``, converted to ``str``, in a dict under the key ``'run'``."""
    run_id = str(v)
    return dict(run=run_id)

def str2tuple(v):
    """Split ``v`` on the separator ``'`!`!`'`` and return its first two parts.

    Raises:
        IndexError: if ``v`` does not contain the separator.
    """
    pieces = v.split('`!`!`')
    first = pieces[0]
    second = pieces[1]
    return first, second

# Command-line configuration. Every option has a default, and unrecognised
# arguments are collected in `unparsed` instead of causing an error.
parser = argparse.ArgumentParser()

# Batch size and vocabulary limits
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--input_vocab', type=int, default=30000)
parser.add_argument('--output_vocab', type=int, default=30000)

# Model dimensions
parser.add_argument('--input_embedding_dim', type=int, default=128)
parser.add_argument('--output_embedding_dim', type=int, default=128)
parser.add_argument('--n_layers', type=int, default=3)
parser.add_argument('--n_heads', type=int, default=8)
parser.add_argument('--pf_dim', type=int, default=512)
parser.add_argument('--max_sent_len', type=int, default=50)

# Optimisation
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--epochs', type=int, default=100)

# Run identifier; the default is derived from the current time
parser.add_argument(
    '--exec_id',
    type=str2dict,
    default={'run': str(time.time()).replace('.', '')},
)

# Parallel corpora as (source, target) path pairs. The validation default
# points at the same files as the testing default.
parser.add_argument(
    '--training_data',
    type=str2tuple,
    default=(
        '/content/drive/MyDrive/JaVi_Translation/data/train_tokenized.ja-vi.ja',
        '/content/drive/MyDrive/JaVi_Translation/data/train_tokenized.ja-vi.vi',
    ),
)
parser.add_argument(
    '--testing_data',
    type=str2tuple,
    default=(
        '/content/drive/MyDrive/JaVi_Translation/data/test_tokenized.ja-vi.ja',
        '/content/drive/MyDrive/JaVi_Translation/data/test_tokenized.ja-vi.vi',
    ),
)
parser.add_argument(
    '--validation_data',
    type=str2tuple,
    default=(
        '/content/drive/MyDrive/JaVi_Translation/data/test_tokenized.ja-vi.ja',
        '/content/drive/MyDrive/JaVi_Translation/data/test_tokenized.ja-vi.vi',
    ),
)

# Saving and checkpointing
parser.add_argument(
    '--save_model_path',
    type=str,
    default='/content/drive/MyDrive/JaVi_Translation/trained_models/seq2seq_trans.pt',
)
parser.add_argument(
    '--save_dic_path',
    type=str,
    default='/content/drive/MyDrive/JaVi_Translation/trained_models/dictionary.pkl',
)
parser.add_argument('--save_checkpoint', type=str2bool, default=True)
parser.add_argument('--load_checkpoint', type=str2bool, default=False)
parser.add_argument(
    '--checkpoint_path',
    type=str,
    default='/content/drive/MyDrive/JaVi_Translation/trained_models/checkpoint_trans.pt',
)

# Inference inputs and outputs
parser.add_argument(
    '--load_model_path',
    type=str,
    default='/content/drive/MyDrive/JaVi_Translation/trained_models/seq2seq_trans.pt',
)
parser.add_argument(
    '--load_dic_path',
    type=str,
    default='/content/drive/MyDrive/JaVi_Translation/trained_models/dictionary.pkl',
)
parser.add_argument('--src_max_len', type=int, default=50)
parser.add_argument('--trg_max_len', type=int, default=50)
parser.add_argument('--output_file', type=str, default='./translation_out.txt')

args, unparsed = parser.parse_known_args()

In [None]:
# Run training with the configuration parsed above.
train(args)



Epoch: 01 | Time: 5m 9s
	Train Loss: 5.853 | Train PPL: 348.225
	 Val. Loss: 5.351 |  Val. PPL: 210.791
-----------------------------------------
Epoch: 02 | Time: 5m 12s
	Train Loss: 5.403 | Train PPL: 222.101
	 Val. Loss: 5.186 |  Val. PPL: 178.790
-----------------------------------------
Epoch: 03 | Time: 5m 13s
	Train Loss: 5.256 | Train PPL: 191.758
	 Val. Loss: 5.081 |  Val. PPL: 160.998
-----------------------------------------
Epoch: 04 | Time: 5m 15s
	Train Loss: 5.157 | Train PPL: 173.702
	 Val. Loss: 5.018 |  Val. PPL: 151.174
-----------------------------------------
Epoch: 05 | Time: 5m 16s
	Train Loss: 5.094 | Train PPL: 163.013
	 Val. Loss: 4.974 |  Val. PPL: 144.670
-----------------------------------------
Epoch: 06 | Time: 5m 7s
	Train Loss: 5.051 | Train PPL: 156.165
	 Val. Loss: 4.957 |  Val. PPL: 142.211
-----------------------------------------
Epoch: 07 | Time: 4m 59s
	Train Loss: 5.023 | Train PPL: 151.796
	 Val. Loss: 4.926 |  Val. PPL: 137.802
---------------

In [None]:
# NOTE(review): `tqdm` is brought in with `import tqdm` (the module, not the
# progress-bar class) and no notebook-level `iterator` is defined in the
# visible cells — this looks like leftover scratch; confirm before running.
tqdm(iterator)

In [None]:
# Translate a sample (space-segmented) Japanese sentence with the saved model.
a = inference(args, 'アンドレア ・ マージ が 開始 4 分 後 の トライ で イタリア に と っ て 最初 の 得点 を 入れ た 。')

In [None]:
# Translate a second, shorter sample sentence.
a = inference(args, '科学 オタク だっ た ん で す ね ')

In [None]:
# Step-by-step version of the inference pipeline that leaves `vocab`, `model`,
# `translation` and `attention` available at notebook level.
sent = 'アンドレア ・ マージ が 開始 の 得点 を 入れ た 。'

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The handle is deliberately not named `F`, which at notebook level would
# overwrite the `torch.nn.functional as F` import.
with open(args.load_dic_path, 'rb') as dic_file:
    vocab = pickle.load(dic_file)

INPUT_DIM = len(vocab.src_stoi)
OUTPUT_DIM = len(vocab.trg_stoi)
PAD_IDX = vocab.src_stoi['<pad>']
SOS_IDX = vocab.src_stoi['<sos>']
EOS_IDX = vocab.src_stoi['<eos>']

TRG_MAX_LEN = args.trg_max_len
SRC_MAX_LEN = args.src_max_len

# DataReader applies element 0 to the source (Japanese) file and element 1 to
# the target (Vietnamese) file.
preprocessors = (ja_preprocessor, vi_preprocessor)

lengths = (SRC_MAX_LEN, TRG_MAX_LEN)
vocab_size = (args.input_vocab, args.output_vocab)

training_dataset = DataReader(args, args.training_data, preprocessors, vocab_size)

src_vec_dic = torch.stack([v for k, v in training_dataset.src_vec_dic.items()])
trg_vec_dic = torch.stack([v for k, v in training_dataset.trg_vec_dic.items()])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Seq2Seq(args,INPUT_DIM,OUTPUT_DIM, PAD_IDX, SOS_IDX, EOS_IDX, src_vec_dic, trg_vec_dic).to(device)
model.load_state_dict(torch.load(args.load_model_path,map_location=torch.device(device)))

sentence = sent
translation, attention = translate_sentence(model, vocab, sentence, args)

In [None]:
# Display the tokens produced by the translate_sentence call above.
translation

In [None]:
# Rebuild the datasets and data loaders at notebook level so individual
# batches can be inspected.
TRG_MAX_LEN = args.trg_max_len
SRC_MAX_LEN = args.src_max_len

# DataReader applies element 0 to the source (Japanese) file and element 1 to
# the target (Vietnamese) file.
preprocessors = (ja_preprocessor, vi_preprocessor)

lengths = (SRC_MAX_LEN, TRG_MAX_LEN)
vocab_size = (args.input_vocab, args.output_vocab)

training_dataset = DataReader(args, args.training_data, preprocessors, vocab_size)
validation_dataset = DataReader(args, args.validation_data, preprocessors, DIC=training_dataset.vocab)

src_vec_dic = torch.stack([v for k, v in training_dataset.src_vec_dic.items()])
trg_vec_dic = torch.stack([v for k, v in training_dataset.trg_vec_dic.items()])


INPUT_DIM = len(training_dataset.vocab.src_stoi)
OUTPUT_DIM = len(training_dataset.vocab.trg_stoi)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PAD_IDX = training_dataset.vocab.src_stoi['<pad>']
SOS_IDX = training_dataset.vocab.src_stoi['<sos>']
EOS_IDX = training_dataset.vocab.src_stoi['<eos>']

training_dataloader = DataLoader(training_dataset, batch_size = args.batch, drop_last=True, collate_fn=lambda b: collator(b,PAD_IDX,SRC_MAX_LEN,TRG_MAX_LEN))
validation_dataloader = DataLoader(validation_dataset, batch_size = args.batch, drop_last=True, collate_fn=lambda b: collator(b,PAD_IDX,SRC_MAX_LEN,TRG_MAX_LEN))


In [None]:
# Peek at the first target sentence of the first validation batch.
for i, e in enumerate(validation_dataloader):
    src, src_len = e[0]
    trg = e[1]
    # The collator returns batch-first tensors, so row 0 is one whole
    # sentence; `trg.T[0]` would instead pick the first token of every sentence.
    print([vocab.trg_itos[int(tok)] for tok in trg[0]])
    print(trg[0])
    break

In [None]:
# Display the most recent translation again for comparison with the batch above.
translation

In [None]:
# Show which device will be selected: CUDA when available, otherwise CPU.
x = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x

In [None]:
# Report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Shell out to nvidia-smi to list the GPUs on this machine.
!nvidia-smi

In [None]:
# Download the gzip-compressed Vietnamese word2vec model (baomoi, window 2).
!wget https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.window2.vn.model.bin.gz

In [None]:
!cp baomoi.window2.vn.model.bin /content/drive/MyDrive/JaVi_Translation/w2v_pretrained