<a href="https://colab.research.google.com/github/DmitryKutsev/eng_to_jap_translator/blob/main/attn_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tinysegmenter

Collecting tinysegmenter
  Downloading https://files.pythonhosted.org/packages/9c/70/488895cb11e160b548c9ba5847c171b65b86a8ca1e54d206d55b2976bf7b/tinysegmenter-0.4.tar.gz
Building wheels for collected packages: tinysegmenter
  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Created wheel for tinysegmenter: filename=tinysegmenter-0.4-cp36-none-any.whl size=13536 sha256=2b791a0b760ee35444e11bcfe19b24ccb409e670cdb1a4addc88e1e5f669edbe
  Stored in directory: /root/.cache/pip/wheels/68/71/2b/6402196bf28012826e507ef7b99df6ebd98cce78bd99023471
Successfully built tinysegmenter
Installing collected packages: tinysegmenter
Successfully installed tinysegmenter-0.4


In [2]:
import sys
import os
import math
from tqdm import tqdm

import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np

import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import random
import spacy
import tinysegmenter

import torch
import torch.nn as nn
import random


In [3]:
# One seed for every random number generator the notebook relies on:
# Python's `random` (dataset split, teacher forcing), NumPy and PyTorch.
SEED = 1234

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Restrict cuDNN to deterministic kernels.
torch.backends.cudnn.deterministic = True

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is read as a global by get_datasets() and create_model(), so it
# has to be defined before either of them is called.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# English spaCy pipeline; its tokenizer is used further down to split
# English sentences into tokens.
spacy_en = spacy.load('en')

In [6]:
# TinySegmenter instance used to split Japanese text into tokens
# (called by tokenize_jp inside get_datasets).
segmenter = tinysegmenter.TinySegmenter()

In [7]:
# Download the "JEC Basic Sentence Data" spreadsheet directly from its URL
# and parse it into a DataFrame.
# NOTE(review): no `header=` argument is passed, so pandas uses the first
# spreadsheet row as column names — confirm the sheet really has a header row.
my_frame = pd.read_excel(
'http://nlp.ist.i.kyoto-u.ac.jp/EN/?plugin=attach&refer=JEC%20Basic%20Sentence%20Data&openfile=JEC_basic_sentence_v1-2.xls')

In [8]:
# Remove the Chinese column. It is addressed by its column label, which is
# itself a sentence.
# NOTE(review): a sentence used as a column label suggests that the first data
# row was taken as the header when the sheet was read — confirm that no
# sentence pair is lost this way.
# NOTE: this cell reassigns `my_frame`; running it a second time fails because
# the dropped column no longer exists.
my_frame = my_frame.drop(['难道不会是X吗，我实在是感到怀疑。'], axis=1)
# Give the three remaining columns short names, then drop the 'index' column
# so only the Japanese/English pair is left.
my_frame.columns = ['index', 'jp', 'en']
my_frame = my_frame.drop(['index'], axis=1)

In [9]:
# Display the cleaned frame of Japanese / English sentence pairs.
my_frame

Unnamed: 0,jp,en
0,Xがいいなといつも思います,I always think X would be nice.
1,それがあるようにいつも思います,It always seems like it is there.
2,それが多すぎないかと正直思う,I honestly feel like there is too much.
3,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.
4,〜と誰かが思った,Someone thought that 〜
...,...,...
5298,チームが４人のメンバーで構成されています,The team consists of four members.
5299,彼が実際に動画を再生する,He actually plays the video.
5300,政府が銀行に公的資金をどんどん投入しました,The government injected massive public funds i...
5301,レベル１の機能に下記の機能をプラスする,The following will be added to the level 1 fun...


In [10]:
# Sanity check: segment one Japanese sentence with TinySegmenter.
segmenter.tokenize(my_frame['jp'][1])

['それ', 'が', 'ある', 'よう', 'にいつも', '思い', 'ます']

In [11]:
# Sanity check: tokenize the matching English sentence with spaCy.
[tok.text for tok in spacy_en.tokenizer(my_frame['en'][1])]

['It', 'always', 'seems', 'like', 'it', 'is', 'there', '.']

In [12]:
# Write the sentence pairs to disk without the row index; get_datasets()
# reads this file back through torchtext's TabularDataset.
my_frame.to_csv('my_frame.csv', index=False)  

In [22]:
def get_datasets(batch_size=128):
    """Build torchtext iterators and vocabularies for EN -> JP translation.

    Reads ``my_frame.csv`` (columns in file order: ``jp``, ``en``), tokenizes
    both languages, splits the examples and wraps every split in a
    ``BucketIterator``.

    Relies on the module-level ``segmenter`` (Japanese tokenizer) and
    ``device`` (where the batches are placed).

    Args:
        batch_size: number of sentence pairs per batch.

    Returns:
        Tuple ``(train_iterator, valid_iterator, test_iterator, source, target)``
        where ``source`` is the English ``Field`` and ``target`` the Japanese
        ``Field``. Each batch exposes the tensors as ``batch.src`` and
        ``batch.trg``.
    """
    # English tokenizer model (the unused German model is no longer loaded).
    spacy_en = spacy.load('en')

    def tokenize_jp(text):
        """
        Tokenizes JP text from a string into a list of strings
        """
        return segmenter.tokenize(text)

    def tokenize_en(text):
        """
        Tokenizes English text from a string into a list of strings
        """
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Create the torchtext Fields; both sides get <sos>/<eos> markers.
    source = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')
    target = Field(tokenize=tokenize_jp, init_token='<sos>', eos_token='<eos>')

    # With a list, fields are matched to CSV columns by position, so the
    # Japanese column comes first. The attribute names `src`/`trg` are the
    # ones that sort_key below and the training loop read from each example.
    dataset = TabularDataset(path='my_frame.csv',
                             format='csv',
                             fields=[('trg', target), ('src', source)],
                             skip_header=True)

    # Split the data into train, validation and test sets.
    # NOTE(review): torchtext documents a list `split_ratio` as the relative
    # sizes of train, test and valid, while the returned tuple is ordered
    # (train, valid, test). With [0.7, 0.1, 0.2] validation therefore receives
    # the 0.2 share and test the 0.1 share — confirm this is intended.
    train_data, valid_data, test_data = dataset.split(split_ratio=[0.7, 0.1, 0.2],
                                                      random_state=random.getstate())

    # Build the vocabulary for both languages from the training split only.
    source.build_vocab(train_data, min_freq=1)
    target.build_vocab(train_data, min_freq=1)

    # Bucket examples of similar source length together to limit padding.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data),
                                                                          batch_size=batch_size,
                                                                          sort_within_batch=True,
                                                                          sort_key=lambda x: len(x.src),
                                                                          device=device)
    return train_iterator, valid_iterator, test_iterator, source, target

In [23]:
class Encoder(nn.Module):
    """Embeds source tokens and encodes them with a GRU.

    Args:
        vocab_len: size of the source vocabulary.
        embedding_dim: size of the token embeddings.
        encoder_hidden_dim: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        dropout_prob: dropout applied to the embeddings and, when
            ``n_layers > 1``, between the stacked GRU layers.
    """

    def __init__(self, vocab_len, embedding_dim, encoder_hidden_dim, n_layers=1, dropout_prob=0.5):
        super().__init__()

        self.embedding = nn.Embedding(vocab_len, embedding_dim)

        # nn.GRU only applies its own dropout between stacked layers. Passing a
        # non-zero value with a single layer has no effect and merely triggers
        # a UserWarning, so it is forwarded only when it can act.
        rnn_dropout = dropout_prob if n_layers > 1 else 0.0
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, n_layers, dropout=rnn_dropout)

        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_batch):
        """Encode a batch of token indices.

        Args:
            input_batch: integer tensor laid out ``[src len, batch size]``
                (the GRU is used with its default ``batch_first=False``).

        Returns:
            ``(outputs, hidden)`` as produced by ``nn.GRU``: ``outputs`` is
            ``[src len, batch size, encoder_hidden_dim]`` and ``hidden`` is
            ``[n_layers, batch size, encoder_hidden_dim]``.
        """
        embedded = self.dropout(self.embedding(input_batch))
        outputs, hidden = self.rnn(embedded)

        return outputs, hidden

In [24]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Args:
        encoder_hidden_dim: feature size of each encoder output vector.
        decoder_hidden_dim: feature size of the decoder hidden state.
    """

    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()

        # The input is the concatenation of the decoder hidden state
        # (decoder_hidden_dim) and one encoder output (encoder_hidden_dim).
        self.attn_hidden_vector = nn.Linear(encoder_hidden_dim + decoder_hidden_dim, decoder_hidden_dim)

        # Projects every [source len, batch size, decoder hidden dim] vector
        # to a single score; after squeezing the last dimension the scores
        # have shape [source len, batch size].
        self.attn_scoring_fn = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights for one decoding step.

        Args:
            hidden: decoder hidden state, ``[1, batch size, decoder hidden dim]``.
            encoder_outputs: ``[source len, batch size, encoder hidden dim]``.

        Returns:
            Tensor ``[batch size, source len]`` whose rows sum to 1.
        """
        src_len = encoder_outputs.shape[0]

        # Instead of looping over the source positions, duplicate the hidden
        # state src_len times so it lines up with the encoder outputs.
        hidden = hidden.repeat(src_len, 1, 1)

        # Calculate the attention hidden values.
        attn_hidden = torch.tanh(self.attn_hidden_vector(torch.cat((hidden, encoder_outputs), dim=2)))

        # Score every source position and remove the trailing size-1 dimension.
        attn_scoring_vector = self.attn_scoring_fn(attn_hidden).squeeze(2)

        # [source len, batch size] -> [batch size, source len] so that the
        # softmax normalises over the source positions of each record.
        attn_scoring_vector = attn_scoring_vector.permute(1, 0)

        # torch.softmax is used because the notebook never imports the
        # `torch.nn.functional as F` alias.
        return torch.softmax(attn_scoring_vector, dim=1)

In [25]:
class OneStepDecoder(nn.Module):
    """Decodes a single target time step using attention over the encoder outputs.

    Args:
        input_output_dim: size of the target vocabulary (embedding input and
            prediction output).
        embedding_dim: size of the target token embeddings.
        encoder_hidden_dim: feature size of the encoder outputs.
        decoder_hidden_dim: hidden size of the decoder GRU.
        attention: module called as ``attention(hidden, encoder_outputs)`` that
            returns weights of shape ``[batch size, source len]``.
        dropout_prob: dropout applied to the token embeddings.
    """

    def __init__(self, input_output_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, attention, dropout_prob=0.5):
        super().__init__()

        self.output_dim = input_output_dim
        self.attention = attention

        self.embedding = nn.Embedding(input_output_dim, embedding_dim)

        # The GRU consumes the attention context joined with the embedding.
        context_plus_embedding = encoder_hidden_dim + embedding_dim
        self.rnn = nn.GRU(context_plus_embedding, decoder_hidden_dim)
        # The classifier sees the GRU output, the context and the embedding.
        self.fc = nn.Linear(context_plus_embedding + decoder_hidden_dim, input_output_dim)

        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input, hidden, encoder_outputs):
        """Predict the next token.

        Args:
            input: token indices of the previous step, one per batch element.
            hidden: current decoder hidden state.
            encoder_outputs: encoder outputs, sequence dimension first.

        Returns:
            ``(predicted_token, hidden, attention_weights)``: vocabulary
            scores per batch element, the updated hidden state, and the
            weights returned by the attention module.
        """
        # Embed the tokens with a leading length-1 sequence dimension.
        token_embedding = self.dropout(self.embedding(input.unsqueeze(0)))

        attn_weights = self.attention(hidden, encoder_outputs)

        # Batch-wise weighted sum of the encoder outputs: bmm needs the batch
        # dimension first, afterwards it is moved back behind the sequence one.
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.permute(1, 0, 2))
        context = context.permute(1, 0, 2)

        output, hidden = self.rnn(torch.cat((token_embedding, context), dim=2), hidden)

        # Join all features, then drop the length-1 sequence dimension.
        features = torch.cat((output, context, token_embedding), dim=2).squeeze(0)
        predicted_token = self.fc(features)

        return predicted_token, hidden, attn_weights

In [26]:
class Decoder(nn.Module):
    """Unrolls a one-step decoder over a whole target sequence.

    Args:
        one_step_decoder: module returning ``(scores, hidden, attention)`` for
            one time step and exposing ``output_dim``.
        device: device on which the output buffer is placed.
    """

    def __init__(self, one_step_decoder, device):
        super().__init__()
        self.one_step_decoder = one_step_decoder
        self.device = device

    def forward(self, target, encoder_outputs, hidden, teacher_forcing_ratio=0.5):
        """Decode step by step, optionally feeding back the ground truth.

        Args:
            target: target token indices, ``[target len, batch size]``.
            encoder_outputs: passed unchanged to the one-step decoder.
            hidden: initial decoder hidden state (the encoder's final state).
            teacher_forcing_ratio: probability, per step, of feeding the true
                token instead of the model's own prediction.

        Returns:
            Tensor ``[target len, batch size, vocab size]``; row 0 is never
            written and stays zero.
        """
        trg_len, batch_size = target.shape[0], target.shape[1]
        vocab_size = self.one_step_decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # The first decoder input is row 0 of the target.
        next_input = target[0, :]
        for step in range(1, trg_len):
            step_scores, hidden, _ = self.one_step_decoder(next_input, hidden, encoder_outputs)
            outputs[step] = step_scores

            # Draw once per step: either feed the reference token or the
            # highest-scoring prediction into the next step.
            use_reference = random.random() < teacher_forcing_ratio
            next_input = target[step] if use_reference else step_scores.argmax(1)

        return outputs

In [29]:
class EncodeDecoder(nn.Module):
    """Sequence-to-sequence wrapper that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder, self.decoder = encoder, decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Encode ``source`` and decode against ``target``.

        The encoder's two return values are handed to the decoder as its
        ``encoder_outputs`` and initial ``hidden`` arguments; the decoder's
        result is returned unchanged.
        """
        memory, initial_state = self.encoder(source)
        return self.decoder(target, memory, initial_state, teacher_forcing_ratio)

In [30]:
def create_model(source, target):
    """Assemble the attention seq2seq model with its optimizer and loss.

    Reads the module-level ``device`` to place the model.

    Args:
        source: source-language field; ``len(source.vocab)`` sizes the encoder
            embedding.
        target: target-language field; ``len(target.vocab)`` sizes the decoder
            embedding/output and ``target.pad_token`` identifies padding.

    Returns:
        ``(model, optimizer, criterion)``: the ``EncodeDecoder`` moved to
        ``device``, an Adam optimizer over its parameters, and a
        cross-entropy loss that ignores target padding.
    """
    # Define the required dimensions and hyper parameters
    embedding_dim = 256
    hidden_dim = 1024
    dropout = 0.5

    # Instantiate the models. `dropout` is now actually forwarded; it was
    # previously defined but unused, so the sub-modules silently relied on
    # their own defaults.
    attention_model = Attention(hidden_dim, hidden_dim)
    encoder = Encoder(len(source.vocab), embedding_dim, hidden_dim, dropout_prob=dropout)
    one_step_decoder = OneStepDecoder(len(target.vocab), embedding_dim, hidden_dim, hidden_dim,
                                      attention_model, dropout_prob=dropout)
    decoder = Decoder(one_step_decoder, device)

    model = EncodeDecoder(encoder, decoder)

    model = model.to(device)

    # Define the optimizer
    optimizer = optim.Adam(model.parameters())

    # Makes sure the CrossEntropyLoss ignores the padding tokens.
    TARGET_PAD_IDX = target.vocab.stoi[target.pad_token]
    criterion = nn.CrossEntropyLoss(ignore_index=TARGET_PAD_IDX)

    return model, optimizer, criterion
    

In [31]:
def train(train_iterator, valid_iterator, source, target, epochs=10):
    """Train a freshly created model and report the loss after every epoch.

    Args:
        train_iterator: iterable of batches exposing ``.src`` and ``.trg``.
        valid_iterator: iterable of validation batches with the same layout.
        source: source-language field, forwarded to ``create_model``.
        target: target-language field, forwarded to ``create_model``.
        epochs: number of passes over ``train_iterator``.

    Returns:
        The trained model.
    """
    model, optimizer, criterion = create_model(source, target)

    max_grad_norm = 1

    def batch_loss(batch, *forward_args):
        """Run the model on one batch and score all positions after the first.

        Row 0 of the model output is never written by the decoder and row 0
        of the target is the <sos> token, so both are dropped before the
        tensors are flattened for the criterion.
        """
        scores = model(batch.src, batch.trg, *forward_args)
        vocab_size = scores.shape[-1]
        return criterion(scores[1:].view(-1, vocab_size), batch.trg[1:].view(-1))

    def mean(values):
        """Arithmetic mean of a non-empty list of floats."""
        return sum(values) / len(values)

    for epoch in range(1, epochs + 1):
        pbar = tqdm(total=len(train_iterator), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', unit=' batches', ncols=200)

        training_loss = []
        model.train()

        # ---- training pass (model's default teacher forcing ratio) ----
        for batch in train_iterator:
            optimizer.zero_grad()

            loss = batch_loss(batch)
            loss.backward()

            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()

            training_loss.append(loss.item())

            pbar.set_postfix(
                epoch=f" {epoch}, train loss= {round(mean(training_loss), 4)}", refresh=True)
            pbar.update()

        # ---- validation pass: no gradients, teacher forcing switched off ----
        model.eval()
        with torch.no_grad():
            validation_loss = [batch_loss(batch, 0).item() for batch in valid_iterator]

        pbar.set_postfix(
            epoch=f" {epoch}, train loss= {round(mean(training_loss), 4)}, val loss= {round(mean(validation_loss), 4)}",
            refresh=False)
        pbar.close()

    return model

In [271]:
# if __name__ == '__main__':
# Build the iterators and vocabularies, then train for 25 epochs.
# (test_iterator is created here but not used in the cells visible in this view.)
train_iterator, valid_iterator, test_iterator, source, target = get_datasets(batch_size=256)
model = train(train_iterator, valid_iterator, source, target, epochs=25)

# Store the vocabularies of both fields next to the model weights in one dict.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'source': source.vocab,
    'target': target.vocab
}

# Serialize the whole dict to a single checkpoint file.
torch.save(checkpoint, 'nmt-model-gru-attention-25.pth')