In [35]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [54]:
# Read the raw parallel corpus; each line holds tab-separated fields.
with open("cmn.txt", encoding='utf8') as f:
    raw_lines = f.readlines()

# Punctuation marks (ASCII and full-width) that become stand-alone tokens.
sep = ".!?,。？！，"

# Surround every punctuation mark with spaces so that whitespace splitting
# isolates it from the neighbouring word.
spaced_lines = []
for raw_line in raw_lines:
    pieces = [(" " + char + " ") if char in sep else char for char in raw_line]
    spaced_lines.append("".join(pieces))
print(spaced_lines[0])

# Keep only the first two tab-separated fields (source sentence, target
# sentence) and split each one on whitespace.
lines = []
for spaced_line in spaced_lines:
    fields = spaced_line.split("\t")[:2]
    lines.append((fields[0].split(), fields[1].split()))

# Source sentences stay as whitespace tokens; every target token is broken
# down into its individual characters.
source_sequence = [pair[0] for pair in lines]
target_sequence = [[char for token in pair[1] for char in token] for pair in lines]
print(source_sequence[10], target_sequence[10])

# Wrap every sentence in begin/end-of-sequence markers.
source_sequence = [["<bos>", *sequence, "<eos>"] for sequence in source_sequence]
target_sequence = [["<bos>", *sequence, "<eos>"] for sequence in target_sequence]

def padding_sequence(padding_size, sequence, pad_token="<pad>"):
    """Return a copy of ``sequence`` that is exactly ``padding_size`` tokens long.

    Longer sequences are truncated on the right (so a trailing end marker may
    be cut off); shorter ones are right-padded with ``pad_token``.

    The input is never modified: a new list is returned in every case, so the
    caller's list cannot be changed behind its back.

    Args:
        padding_size: Desired length of the returned list.
        sequence: List of tokens to fit.
        pad_token: Token used to fill the missing positions.

    Returns:
        A new list holding the truncated or padded tokens.
    """
    fitted = list(sequence[:padding_size])
    # A non-positive repeat count yields an empty list, so nothing is added
    # when the sequence is already long enough.
    fitted.extend([pad_token] * (padding_size - len(fitted)))
    return fitted
    
# Every sentence is truncated or padded to this many tokens (markers included).
MAX_SEQUENCE_LENGTH = 20
# Tokens that occur fewer than this many times are left out of the vocabulary.
MIN_WORD_COUNT = 2

source_sequence = [padding_sequence(MAX_SEQUENCE_LENGTH, sequence) for sequence in source_sequence]
target_sequence = [padding_sequence(MAX_SEQUENCE_LENGTH, sequence) for sequence in target_sequence]

# Token -> number of occurrences. The first occurrence counts as 1; the
# previous version started at 0, so every count was one too low and the
# frequency cut-off below silently required three occurrences instead of two.
source_count_dict = {}
target_count_dict = {}

for sequence in source_sequence:
    for word in sequence:
        source_count_dict[word] = source_count_dict.get(word, 0) + 1
for sequence in target_sequence:
    for word in sequence:
        target_count_dict[word] = target_count_dict.get(word, 0) + 1

# Index -> token lists and token -> index maps; the four special tokens
# always occupy indices 0-3.
target_words = ["<unknown>", "<pad>", "<eos>", "<bos>"]
source_words = ["<unknown>", "<pad>", "<eos>", "<bos>"]
target_dict = {"<unknown>":0, "<pad>":1, "<eos>":2, "<bos>":3}
source_dict = {"<unknown>":0, "<pad>":1, "<eos>":2, "<bos>":3}
print(target_dict["<bos>"], target_dict["<eos>"])
for k in source_count_dict:
    # Skip rare tokens and tokens that already have an index (the specials).
    if source_count_dict[k] < MIN_WORD_COUNT or k in source_dict:
        continue
    source_dict[k] = len(source_words)
    source_words.append(k)
for k in target_count_dict:
    if target_count_dict[k] < MIN_WORD_COUNT or k in target_dict:
        continue
    target_dict[k] = len(target_words)
    target_words.append(k)

# Sanity check: the special-token indices must be unchanged by the loops above.
print(target_dict["<bos>"], target_dict["<eos>"])

Hi . 	嗨 。 	CC-BY 2 . 0 (France) Attribution: tatoeba . org #538123 (CM) & #891077 (Martha)

['Oh', 'no', '!'] ['不', '会', '吧', '。']
3 2
3 2


In [67]:
class SeqDataset(Dataset):
    """Paired source/target token sequences served as index tensors.

    Each item is a ``(source_indices, target_indices)`` pair of 1-D tensors.
    Tokens missing from the corresponding dictionary are mapped to index 0.
    """

    def __init__(self, source_lines, target_lines, source_words, target_words, source_dict, target_dict) -> None:
        super().__init__()
        # Token sequences, one list of tokens per sentence.
        self.source_lines, self.target_lines = source_lines, target_lines
        # Index -> token lists.
        self.source_words, self.target_words = source_words, target_words
        # Token -> index maps.
        self.source_dict, self.target_dict = source_dict, target_dict

    @staticmethod
    def _encode(tokens, vocabulary):
        """Convert ``tokens`` to indices, using 0 for tokens not in ``vocabulary``."""
        return [vocabulary.get(token, 0) for token in tokens]

    def __getitem__(self, index):
        source_index = self._encode(self.source_lines[index], self.source_dict)
        target_index = self._encode(self.target_lines[index], self.target_dict)
        # Return tensors rather than plain lists so that the DataLoader's
        # default collation stacks samples into [batch_size, sequence_length]
        # tensors instead of producing a list of per-position tensors.
        return torch.tensor(source_index), torch.tensor(target_index)

    def __len__(self):
        return len(self.source_lines)
# Smoke test: build the dataset, show the first encoded pair, then show one
# shuffled mini-batch to confirm that samples collate into 2-D tensors.
seq_data_set = SeqDataset(source_lines=source_sequence,
                          target_lines=target_sequence,
                          source_words=source_words,
                          target_words=target_words,
                          source_dict=source_dict,
                          target_dict=target_dict)
print(seq_data_set[0])
train_data_loader = DataLoader(dataset=seq_data_set, batch_size=2, shuffle=True)
# The loop variable is deliberately not called `iter`: at notebook top level
# that name would shadow the builtin `iter` for every later cell.
for batch in train_data_loader:
    print(batch)
    break


(tensor([3, 4, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([3, 4, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
[tensor([[   3,  297,  213,   94,   64,  149,   25,  123,  428,   17,    2,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [   3,  489,  857,  430, 1014,   20,  529, 2896, 2994, 1022, 3373,    5,
            2,    1,    1,    1,    1,    1,    1,    1]]), tensor([[   3,    6,  670,  266,  576,   68,   47,   10,  723,   33,    2,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [   3,  159,  479,   12, 1152,  127,  128,   19,  747,  518,   15, 1070,
          501, 1421, 1374,    7,   10, 2312, 2450,    5]])]


In [68]:
class Encoder(nn.Module):
    """Abstract encoder interface; concrete encoders override ``forward``."""

    def forward(self, x):
        """Encode ``x``. Not implemented in the base class."""
        raise NotImplementedError

class Decoder(nn.Module):
    """Abstract decoder interface; concrete decoders override both methods."""

    def init_state(self, state):
        """Turn the encoder output ``state`` into the decoder's initial state.

        Not implemented in the base class.
        """
        raise NotImplementedError

    def forward(self, x, state):
        """Decode ``x`` starting from ``state``. Not implemented in the base class."""
        raise NotImplementedError

class Seq2Seq(nn.Module):
    """Glue module that chains an encoder and a decoder.

    The encoder output is handed to ``decoder.init_state`` and the result is
    used as the state for the decoder call.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder_ = encoder
        self.decoder_ = decoder

    def forward(self, x, y):
        """Encode ``x`` and decode ``y`` from the resulting state.

        Returns whatever the decoder's ``forward`` returns.
        """
        initial_state = self.decoder_.init_state(self.encoder_(x))
        return self.decoder_(y, initial_state)

    # TODO: add a ``predict(self, x)`` method for inference; it is not implemented yet.

In [69]:
class GRUEncoder(Encoder):
    """Embeds source token indices and summarises them with a multi-layer GRU.

    Args:
        vocab_size: Number of entries in the source vocabulary.
        embedding_dim: Size of each token embedding; the GRU hidden size is
            twice this value.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers.
    """

    def __init__(self, vocab_size, embedding_dim, num_layers=3, dropout=0.5):
        super().__init__()
        self.hidden_dim = embedding_dim * 2
        self.num_layers = num_layers
        self.word2vec = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=self.hidden_dim,
                          num_layers=num_layers, bias=True, dropout=dropout)

    def forward(self, x):
        """Encode a batch of token indices.

        Args:
            x: Integer tensor of shape ``[sequence_length, batch]`` (the GRU
                is built with the default ``batch_first=False``).

        Returns:
            The final hidden state of every GRU layer, shape
            ``[num_layers, batch, hidden_dim]``.
        """
        embedding = self.word2vec(x)
        # No explicit initial state is passed: nn.GRU then starts from zeros
        # of the correct [num_layers, batch, hidden_dim] shape, on the right
        # device. The previous hand-built 2-D state had the wrong rank and
        # was sized by sequence length, so the GRU call failed.
        _, hidden = self.gru(embedding)
        return hidden

class GRUDecoder(Decoder):
    """Embeds target token indices, runs a GRU and projects onto the vocabulary.

    Args:
        vocab_size: Number of entries in the target vocabulary.
        embedding_dim: Size of each token embedding; the GRU hidden size is
            twice this value.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers.
    """

    def __init__(self, vocab_size, embedding_dim, num_layers=2, dropout=0.5):
        super().__init__()
        self.hidden_dim = embedding_dim * 2
        self.num_layers = num_layers
        # The GRU consumes float vectors of size embedding_dim, so the integer
        # token indices have to be embedded first.
        self.word2vec = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=self.hidden_dim,
                          num_layers=num_layers, bias=True, dropout=dropout)
        self.linear = nn.Linear(self.hidden_dim, vocab_size)

    def init_state(self, state):
        """Build the decoder's initial hidden state from the encoder's.

        Args:
            state: Hidden state of shape ``[layers, batch, hidden_dim]``.

        Returns:
            The hidden states of the last ``num_layers`` layers of ``state``,
            so an encoder with more layers than the decoder can still be used.

        Raises:
            ValueError: If ``state`` has fewer layers than the decoder.
        """
        if state.shape[0] < self.num_layers:
            raise ValueError(
                f"encoder state has {state.shape[0]} layers, "
                f"decoder needs at least {self.num_layers}"
            )
        return state[-self.num_layers:].contiguous()

    def forward(self, y, state):
        """Decode a batch of target token indices.

        Args:
            y: Integer tensor of shape ``[sequence_length, batch]``.
            state: Initial hidden state, ``[num_layers, batch, hidden_dim]``.

        Returns:
            Unnormalised scores (logits) of shape
            ``[sequence_length, batch, vocab_size]``. No sigmoid/softmax is
            applied, because cross-entropy losses expect raw logits.
        """
        embedding = self.word2vec(y)
        # output.shape = [sequence_length, batch, hidden_dim]
        output, _ = self.gru(embedding, state)
        return self.linear(output)

In [None]:
def train(data_loader, seq2seq, epoch, cross_loss, lr=0.1):
    """Train ``seq2seq`` with teacher forcing using the Adam optimizer.

    Args:
        data_loader: Iterable of ``(x, y)`` batches, each an integer tensor of
            shape ``[batch, sequence_length]``.
        seq2seq: Module called as ``seq2seq(x, decoder_input)`` with
            sequence-first tensors; expected to return scores of shape
            ``[sequence_length - 1, batch, vocab_size]``.
        epoch: Number of passes over ``data_loader`` (an int), or any iterable
            with one element per pass.
        cross_loss: Loss called as ``cross_loss(scores, labels)`` with
            ``scores`` of shape ``[N, vocab_size]`` and ``labels`` of shape
            ``[N]`` (e.g. ``nn.CrossEntropyLoss``).
        lr: Learning rate handed to Adam.

    Returns:
        A list with the mean batch loss of every epoch (``nan`` for an epoch
        without batches).
    """
    optimizer = torch.optim.Adam(seq2seq.parameters(), lr=lr)
    # An int cannot be iterated directly; turn it into that many passes.
    epochs = range(epoch) if isinstance(epoch, int) else epoch
    seq2seq.train()
    epoch_losses = []
    for _ in epochs:
        total_loss = 0.0
        batch_count = 0
        for x, y in data_loader:
            # x, y shape=[batch, sequence length]; permute to [sequence length, batch]
            x = x.permute(1, 0)
            y = y.permute(1, 0)
            # Teacher forcing: the decoder sees tokens 0..n-2 and must predict
            # tokens 1..n-1, i.e. the next token at every position.
            decoder_input = y[:-1]
            y_label = y[1:]
            # y_inference shape=[sequence length - 1, batch, vocab_size]
            y_inference = seq2seq(x, decoder_input)
            # Flatten the time and batch axes so the loss sees [N, vocab_size]
            # scores and [N] labels.
            loss = cross_loss(y_inference.reshape(-1, y_inference.shape[-1]),
                              y_label.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batch_count += 1
        epoch_losses.append(total_loss / batch_count if batch_count else float("nan"))
    return epoch_losses
        

In [40]:
# Training-size data pipeline: the same dataset, served in shuffled batches of 32.
seq_data_set = SeqDataset(
    source_lines=source_sequence,
    target_lines=target_sequence,
    source_words=source_words,
    target_words=target_words,
    source_dict=source_dict,
    target_dict=target_dict,
)
train_data_loader = DataLoader(dataset=seq_data_set, batch_size=32, shuffle=True)

# Encoder and decoder use one embedding size; each vocabulary size is taken
# from the matching index -> token list.
encoder = GRUEncoder(len(source_words), 96)
decoder = GRUDecoder(len(target_words), 96)
seq2seq = Seq2Seq(encoder, decoder)

In [65]:
# Toy experiment: check how DataLoader collates tensor samples into batches.
embedding = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])


class EmbeddingDataset(Dataset):
    """Dataset whose samples are the rows of the module-level ``embedding`` tensor."""

    def __getitem__(self, index):
        return embedding[index]

    def __len__(self):
        # Number of rows, i.e. the size of the first dimension.
        return embedding.shape[0]


data_loader = DataLoader(dataset=EmbeddingDataset(), batch_size=2)
# Three rows with batch_size=2 give one full batch followed by a batch of one.
for data in data_loader:
    print(data)

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[7, 8, 9]])


In [29]:
import math

# Hand-computed softmax cross-entropy for three scores (0, 1, 1) where the
# class with score 0 is the correct one: -log(e^0 / (2 * e^1 + e^0)).
correct_score = math.exp(0)
other_score = math.exp(1)
-math.log(correct_score / (other_score * 2 + correct_score))

1.861994804058251