In [1]:
import pandas as pd
from chameleon.base_dataset import Vocabulary, TranslationDataset, TranslationCollator
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


def read_data(file_path, src_tgt):
    """Load tokenized source/target sentences from a pickled DataFrame.

    Args:
        file_path: Path of the pickle file, read with ``pd.read_pickle``.
            Unpickling can execute arbitrary code, so only load trusted files.
        src_tgt: Language-pair string. The first two characters name the
            source language and the remainder names the target language
            (e.g. ``"enko"`` -> ``"en"`` and ``"ko"``).

    Returns:
        A ``(srcs, tgts)`` tuple of lists taken from the ``tok_<src>`` and
        ``tok_<tgt>`` columns of the DataFrame.

    Raises:
        KeyError: If either ``tok_<lang>`` column is missing.
    """
    frame = pd.read_pickle(file_path)

    # Split the pair string after its second character.
    source_code = src_tgt[:2]
    target_code = src_tgt[2:]

    # Tokenized text lives in columns named "tok_<language code>".
    source_sentences = frame[f"tok_{source_code}"].tolist()
    target_sentences = frame[f"tok_{target_code}"].tolist()
    return source_sentences, target_sentences

In [2]:
# Load the English -> Korean training pairs.
train_srcs, train_tgts = read_data("./data/chameleon.train.tok.pickle", "enko")

# Build the dataset and the collator separately so each can be inspected on its own.
train_dataset = TranslationDataset(train_srcs, train_tgts, with_text=True)
train_collator = TranslationCollator(
    pad_idx=Vocabulary.PAD, max_length=256, with_text=True
)

# Small shuffled batches are enough for checking the forward pass.
train_loader = DataLoader(
    train_dataset,
    batch_size=5,
    shuffle=True,
    collate_fn=train_collator,
)

[32m2023-07-22 00:30:24.557[0m | [1mINFO    [0m | [35mNone[0m | [36mchameleon.base_dataset[0m:[36mbuild_vocab[0m:[36m67[0m - [1mNumber of vocabularies: 30488[0m
[32m2023-07-22 00:30:28.697[0m | [1mINFO    [0m | [35mNone[0m | [36mchameleon.base_dataset[0m:[36mbuild_vocab[0m:[36m67[0m - [1mNumber of vocabularies: 53430[0m


## Check the model's forward pass
- Data: uses the first batch of the input and output data
- `Encoder` Test
- `Decoder` Test
- `Attention` Test
- `Generator` Test

In [3]:
# Draw a single batch from the training loader. The loader was built with
# shuffle=True, so re-running this cell yields a different batch.
batch = next(iter(train_loader))

In [4]:
# Source-side and target-side entries of the batch.
x = batch["input_ids"]
y = batch["output_ids"]

In [5]:
import torch

# Note that the batch has not been sorted yet.
# x_length is only bound when x arrives as a tuple; later cells (the encoder
# call and generate_mask) read x_length.
if isinstance(x, tuple):
    x, x_length = x
# For the target only the first tuple element is kept.
if isinstance(y, tuple):
    y = y[0]

In [6]:
# Vocabulary sizes taken from the dataset wrapped by the loader.
input_size = len(train_loader.dataset.src_vocab)
output_size = len(train_loader.dataset.tgt_vocab)
# Embedding dimension shared by the source and target embedding layers.
word_vec_size = 512

In [8]:
import torch.nn as nn

# Embedding layers: one lookup table per language, each mapping a token id
# to a word_vec_size-dimensional vector.
emb_src = nn.Embedding(input_size, word_vec_size)
emb_tgt = nn.Embedding(output_size, word_vec_size)

### Encoder Layer Debugging

In [9]:
# Embed the source ids; the result gains a trailing word_vec_size dimension.
emb_x = emb_src(x)
emb_x.size()

torch.Size([5, 55, 512])

In [10]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack


class Encoder(nn.Module):
    """Bidirectional, multi-layer LSTM encoder over embedded source tokens.

    Args:
        word_vec_size: Number of features of each input embedding vector.
        hidden_size: Size of the concatenated (forward + backward) output.
            Each direction gets ``int(hidden_size / 2)`` units, so an odd
            value yields ``hidden_size - 1`` output features.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the outputs of every LSTM
            layer except the last one.
    """

    def __init__(self, word_vec_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()

        # batch_first=True: inputs and outputs are (batch, seq, feature).
        # This does not apply to the returned hidden/cell states.
        self.encoder = nn.LSTM(
            input_size=word_vec_size,
            hidden_size=int(hidden_size / 2),
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, emb):
        """Encode a batch of embedded sequences.

        Args:
            emb: Either a padded tensor of shape
                (batch_size, length, word_vec_size), or a tuple
                ``(padded_tensor, lengths)``. With the tuple form the batch
                is packed so padding positions are skipped; ``lengths`` may
                be a tensor or any iterable of ints.

        Returns:
            ``(y, h)`` where ``y`` holds the per-position outputs with shape
            (batch_size, length, hidden_size) and ``h`` is the LSTM's final
            ``(hidden_state, cell_state)`` tuple, each of shape
            (num_layers * 2, batch_size, hidden_size / 2).
        """
        is_packed = isinstance(emb, tuple)

        if is_packed:
            x, x_lengths = emb
            lengths = (
                x_lengths.tolist() if hasattr(x_lengths, "tolist") else list(x_lengths)
            )
            # enforce_sorted=False: the batch does not need to be sorted by length.
            x = pack(x, lengths, batch_first=True, enforce_sorted=False)
        else:
            # Plain tensor input: feed it to the LSTM as-is. Previously `x`
            # was never bound on this path and forward() raised.
            x = emb

        y, h = self.encoder(x)
        if is_packed:
            # Back to a zero-padded tensor, padded to the longest sequence.
            y, _ = unpack(y, batch_first=True)
        # |y| = (batch_size, length, hidden_size)

        return y, h

In [11]:
# hidden_size=2 gives each LSTM direction int(2 / 2) = 1 unit; presumably kept
# tiny so the tensors printed below stay readable.
encoder = Encoder(word_vec_size=512, hidden_size=2, num_layers=2, dropout=0.2)

In [12]:
# Pass (embeddings, lengths) as a tuple so the encoder packs the padded batch.
# h_src: per-position encoder outputs; h_0_tgt: the LSTM's final (hidden, cell) tuple.
h_src, h_0_tgt = encoder((emb_x, x_length))

In [13]:
# Check the size of h_src and of the hidden state in h_0_tgt = (hidden_state, cell_state).
print(h_src.size())
print(h_0_tgt[0].size())

torch.Size([5, 55, 2])
torch.Size([4, 5, 1])


### Transform the shape of the hidden and cell states returned by the Encoder

In [16]:
# Hidden state: first element of the (hidden_state, cell_state) tuple from the encoder.
h_0_tgt[0]

tensor([[[-9.4700e-04],
         [-1.0582e-03],
         [-6.1422e-04],
         [-1.3532e-03],
         [-1.4081e-03]],

        [[ 1.5024e-09],
         [-7.0232e-15],
         [ 6.8798e-06],
         [ 1.8581e-07],
         [-5.4890e-08]],

        [[-6.1933e-01],
         [-5.5100e-01],
         [-6.5158e-01],
         [-5.7692e-01],
         [-5.6602e-01]],

        [[-1.3714e-01],
         [-1.6879e-01],
         [-1.3475e-01],
         [-1.4646e-01],
         [-1.5944e-01]]], grad_fn=<IndexSelectBackward0>)

In [17]:
# First slice of the hidden state along dim 0.
h_0_tgt[0][0]

tensor([[-0.0009],
        [-0.0011],
        [-0.0006],
        [-0.0014],
        [-0.0014]], grad_fn=<SelectBackward0>)

In [18]:
# Second slice of the hidden state along dim 0.
h_0_tgt[0][1]

tensor([[ 1.5024e-09],
        [-7.0232e-15],
        [ 6.8798e-06],
        [ 1.8581e-07],
        [-5.4890e-08]], grad_fn=<SelectBackward0>)

In [19]:
# Join the first two slices on the feature axis, the same pairing that
# merge_encoder_hiddens applies to every (i, i + 1) pair.
torch.cat([h_0_tgt[0][0], h_0_tgt[0][1]], dim=-1)

tensor([[-9.4700e-04,  1.5024e-09],
        [-1.0582e-03, -7.0232e-15],
        [-6.1422e-04,  6.8798e-06],
        [-1.3532e-03,  1.8581e-07],
        [-1.4081e-03, -5.4890e-08]], grad_fn=<CatBackward0>)

In [20]:
# |h_0_tgt| = (num_layers*2, batch_size, hidden_size/2) for each of hidden and cell
# Goal: reshape each tensor to (num_layers, batch_size, hidden_size).
# This is the step-by-step version of what merge_encoder_hiddens wraps in a function.

new_hiddens = []
new_cells = []

hiddens, cells = h_0_tgt
n_layers_double = hiddens.size(0)  # num_layers * 2

# Pair consecutive dim-0 entries (i, i + 1) and join them on the feature axis.
for i in range(0, n_layers_double, 2):
    new_hiddens += [torch.cat([hiddens[i], hiddens[i + 1]], dim=-1)]
    new_cells += [torch.cat([cells[i], cells[i + 1]], dim=-1)]

In [21]:
# Merged hidden states, still a Python list with one tensor per pair.
new_hiddens

[tensor([[-9.4700e-04,  1.5024e-09],
         [-1.0582e-03, -7.0232e-15],
         [-6.1422e-04,  6.8798e-06],
         [-1.3532e-03,  1.8581e-07],
         [-1.4081e-03, -5.4890e-08]], grad_fn=<CatBackward0>),
 tensor([[-0.6193, -0.1371],
         [-0.5510, -0.1688],
         [-0.6516, -0.1347],
         [-0.5769, -0.1465],
         [-0.5660, -0.1594]], grad_fn=<CatBackward0>)]

In [22]:
# Stack the list into a single tensor, one entry per merged pair on dim 0.
new_hiddens = torch.stack(new_hiddens)
new_hiddens.size()

torch.Size([2, 5, 2])

In [23]:
def merge_encoder_hiddens(encoder_hiddens):
    """Merge the two directions of a bidirectional LSTM state, pair by pair.

    Args:
        encoder_hiddens: ``(hiddens, cells)`` tuple, each of shape
            (n_layers * 2, batch_size, hidden_size / 2).

    Returns:
        ``(new_hiddens, new_cells)`` tuple, each of shape
        (n_layers, batch_size, hidden_size).
    """
    hiddens, cells = encoder_hiddens
    pair_starts = range(0, hiddens.size(0), 2)

    def _join_pairs(states):
        # Entries k and k + 1 are the opposite directions of one layer, each
        # half the hidden size; concatenating them gives one full-size layer.
        joined = [
            torch.cat([states[k], states[k + 1]], dim=-1) for k in pair_starts
        ]
        return torch.stack(joined)

    merged_hiddens = _join_pairs(hiddens)
    merged_cells = _join_pairs(cells)
    # |merged_hiddens| = (n_layers, batch_size, hidden_size)
    # |merged_cells| = (n_layers, batch_size, hidden_size)

    return (merged_hiddens, merged_cells)

In [24]:
# Replace the raw encoder state with its merged form.
# NOTE: this rebinds h_0_tgt, so re-running this cell without first re-running
# the encoder cell would merge an already merged state.
h_0_tgt = merge_encoder_hiddens(h_0_tgt)

### Decoder Layer Debugging

In [25]:
class Decoder(nn.Module):
    """Unidirectional LSTM decoder with input feeding.

    At every step the current target embedding is concatenated with the
    previous step's attentional hidden state, so the LSTM input has
    ``word_vec_size + hidden_size`` features.

    Args:
        word_vec_size: Number of features of each target embedding vector.
        hidden_size: Size of the LSTM hidden state.
        dropout: Dropout probability applied to the outputs of every LSTM
            layer except the last one.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(
        self,
        word_vec_size,
        hidden_size,
        dropout,
        n_layers,
    ):
        super(Decoder, self).__init__()

        self.rnn = nn.LSTM(
            input_size=word_vec_size + hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=False,
            batch_first=True,
        )

    def forward(self, emb_t, h_t_1_tilde, h_t_1):
        """Run one decoding step.

        Args:
            emb_t: Embedding of the current target token,
                shape (batch_size, 1, word_vec_size).
            h_t_1_tilde: Attentional hidden state of the previous step,
                shape (batch_size, 1, hidden_size), or ``None`` on the first
                step, in which case zeros are used.
            h_t_1: Previous LSTM state ``(hidden, cell)``, each of shape
                (n_layers, batch_size, hidden_size).

        Returns:
            ``(y, h)``: the step output of shape (batch_size, 1, hidden_size)
            and the updated ``(hidden, cell)`` tuple.
        """
        batch_size = emb_t.size(0)
        hidden_size = h_t_1[0].size(-1)

        if h_t_1_tilde is None:
            # First step: nothing to feed back yet. Use explicit zeros;
            # Tensor.new(...) would return uninitialised memory.
            h_t_1_tilde = emb_t.new_zeros(batch_size, 1, hidden_size)

        # input feeding
        x = torch.cat([emb_t, h_t_1_tilde], dim=-1)
        y, h = self.rnn(x, h_t_1)
        return y, h

In [26]:
# Embed the target ids with the target-side embedding layer.
emb_y = emb_tgt(y)

In [27]:
# Compare the encoder output size with the merged hidden-state size.
print(h_src.size(), h_0_tgt[0].size())

torch.Size([5, 55, 2]) torch.Size([2, 5, 2])


In [28]:
# hidden_size and n_layers match the encoder's settings, so the merged encoder
# state can be passed in as the decoder LSTM's initial state.
decoder = Decoder(word_vec_size=512, hidden_size=2, dropout=0.2, n_layers=2)

In [29]:
# Size of the embedded target batch.
emb_y.size()

torch.Size([5, 39, 512])

In [30]:
# Take the first target time step and restore a length-1 time axis.
emb_y[:, 0, :].unsqueeze(1).size()

torch.Size([5, 1, 512])

In [32]:
# First decoding step: first target embedding, no previous attentional state
# (None), and the merged encoder state as the initial LSTM state.
# h_1_tgt: decoder output for this step; h_c_1_tgt: updated (hidden, cell) tuple.
h_1_tgt, h_c_1_tgt = decoder(emb_y[:, 0, :].unsqueeze(1), None, h_0_tgt)

In [33]:
# Sizes of the step output and of the updated hidden state.
print(h_1_tgt.size(), h_c_1_tgt[0].size())

torch.Size([5, 1, 2]) torch.Size([2, 5, 2])


In [34]:
# Decoder output for the first step.
h_1_tgt

tensor([[[-0.3703, -0.1409]],

        [[-0.3955, -0.1939]],

        [[-0.5010, -0.0566]],

        [[-0.3858, -0.1498]],

        [[-0.3876, -0.1760]]], grad_fn=<TransposeBackward0>)

In [35]:
# Updated hidden state after the first step.
h_c_1_tgt[0]

tensor([[[-1.2283e-04,  9.3552e-01],
         [-1.1302e-04,  7.3618e-01],
         [-1.2514e-04,  7.4923e-01],
         [-1.0626e-04,  7.3663e-01],
         [-1.0453e-04,  7.3633e-01]],

        [[-3.7028e-01, -1.4095e-01],
         [-3.9554e-01, -1.9392e-01],
         [-5.0102e-01, -5.6645e-02],
         [-3.8585e-01, -1.4980e-01],
         [-3.8760e-01, -1.7595e-01]]], grad_fn=<StackBackward0>)

### Attention Layer Debugging

In [57]:
class Attention(nn.Module):
    """Dot-product attention with a learned linear projection of the query.

    Args:
        hidden_size: Feature size shared by the encoder outputs and the
            decoder output.
    """

    def __init__(self, hidden_size):
        super().__init__()

        self.linear = nn.Linear(hidden_size, hidden_size, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, h_src, h_t_tgt, mask=None):
        """Compute the context vector for one decoder step.

        Args:
            h_src: Encoder outputs, (batch_size, src_length, hidden_size).
            h_t_tgt: Current decoder output, (batch_size, 1, hidden_size).
            mask: Optional boolean tensor, (batch_size, src_length); positions
                set to True are excluded from the attention.

        Returns:
            Context vector of shape (batch_size, 1, hidden_size).
        """
        # Project the decoder output into the query space.
        projected_query = self.linear(h_t_tgt)
        # |projected_query| = (batch_size, 1, hidden_size)

        # Score every source position against the query.
        keys_transposed = h_src.transpose(1, 2)
        scores = torch.bmm(projected_query, keys_transposed)
        # |scores| = (batch_size, 1, src_length)

        if mask is not None:
            # -inf scores become zero weight after the softmax.
            scores.masked_fill_(mask.unsqueeze(1), -float("inf"))

        attention_weights = self.softmax(scores)

        # Weighted sum of the encoder outputs.
        # |result| = (batch_size, 1, hidden_size)
        return torch.bmm(attention_weights, h_src)

In [58]:
# hidden_size=2 matches the encoder and decoder instantiated earlier.
attention = Attention(hidden_size=2)

In [59]:
# Size of the encoder outputs that the attention attends over.
h_src.size()

torch.Size([5, 55, 2])

In [60]:
def generate_mask(x, length):
    """Build a boolean padding mask from per-sample sequence lengths.

    Args:
        x: Tensor whose device is used for the mask.
        length: Per-sample lengths, as a 1-D integer tensor or a sequence
            of ints.

    Returns:
        Boolean tensor of shape (batch_size, max(length)). An entry is True
        where the position lies beyond the sample's length (padding) and
        False where it holds a real token. An empty ``length`` gives an
        empty (0, 0) mask.
    """
    lengths = torch.as_tensor(length, dtype=torch.long, device=x.device)

    if lengths.numel() == 0:
        # Empty batch: nothing to mask.
        return torch.zeros((0, 0), dtype=torch.bool, device=x.device)

    max_length = int(lengths.max())
    positions = torch.arange(max_length, device=x.device)

    # Compare (1, max_length) positions against (batch_size, 1) lengths:
    # position >= length marks padding. One broadcasted comparison replaces
    # the per-sample loop that built every row from two small tensors.
    mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)

    return mask


# Padding mask for the source batch, built from the per-sample lengths.
mask = generate_mask(x, x_length)

In [61]:
# Context vector for the first decoding step. Positions that are True in the
# mask are filled with -inf before the softmax, so they get zero weight.
context_vector = attention(h_src, h_1_tgt, mask)

In [63]:
# Size of the context vector.
context_vector.size()

torch.Size([5, 1, 2])

### Make h_t_tilde

In [65]:
# The literal 2 is the hidden_size used for the encoder, decoder and attention
# in this notebook.
concat_linear = nn.Linear(2 * 2, 2)
# |concat_linear| = (hidden_size*2, hidden_size)
tanh = nn.Tanh()
# activation layer

# Concatenate the decoder output and the context vector on the feature axis,
# project back to hidden_size, then apply tanh.
h_t_tilde = tanh(concat_linear(torch.cat([h_1_tgt, context_vector], dim=-1)))

### Generator Debugging

In [66]:
class Generator(nn.Module):
    """Project attentional hidden states to log-probabilities.

    Args:
        hidden_size: Feature size of the incoming hidden states.
        output_size: Number of output classes, i.e. the target vocabulary
            size in a translation model. Defaults to ``hidden_size`` so that
            existing ``Generator(hidden_size)`` calls keep their behaviour.
    """

    def __init__(self, hidden_size, output_size=None):
        super(Generator, self).__init__()

        if output_size is None:
            # Backward-compatible default: square projection.
            output_size = hidden_size

        self.output = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, h_t_tildes):
        """Return log-probabilities over the output classes.

        Args:
            h_t_tildes: Tensor of shape (batch_size, tgt_length, hidden_size).

        Returns:
            Tensor of shape (batch_size, tgt_length, output_size) holding
            log-softmax values over the last dimension.
        """
        # |h_t_tildes| = (batch_size, tgt_length, hidden_size)
        y_hat = self.softmax(self.output(h_t_tildes))
        # |y_hat| = (batch_size, tgt_length, output_size)

        return y_hat

In [67]:
generator = Generator(2)

# In a real implementation the input should be the h_t_tilde of every time
# step collected together, so the generator runs once over all steps
# instead of once per step.
y_hat = generator(h_t_tilde)

In [69]:
# Size of the generator output.
y_hat.size()

torch.Size([5, 1, 2])