In [74]:
# Implement an RNN encoder-decoder network for sequence-to-sequence (seq2seq) translation
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

### DATA

In [11]:
# To keep things simple, the sequences are represented with one-hot encodings later on.
# Each entry pairs a source sentence (X, English) with its translation (Y, Vietnamese).
data = [
	["i like to eat fish", "toi thit an ca"],
	["have you ate yet", "co an com chua"],
	["we are going to church tomorrow", "ngay may minh di le"],
]


def _add_special_tokens(sentence):
	"""Surround a sentence with the start/end-of-sequence marker tokens."""
	return "<SOS> " + sentence + " <EOS>"


# Most tokenizers add special tokens such as <SOS>, <EOS>, <SEP>, <MASK>, etc.
# Padding could also be applied at this stage.
data = [list(map(_add_special_tokens, entry)) for entry in data]
print(data)

[['<SOS> i like to eat fish <EOS>', '<SOS> toi thit an ca <EOS>'], ['<SOS> have you ate yet <EOS>', '<SOS> co an com chua <EOS>'], ['<SOS> we are going to church tomorrow <EOS>', '<SOS> ngay may minh di le <EOS>']]


In [21]:
def _build_vocab(sentences):
	"""Assign every distinct token an integer id in first-seen order.

	Args:
		sentences: iterable of strings; each is tokenised by splitting on single spaces.

	Returns:
		dict mapping token -> id, where ids run from 0 to len(vocab) - 1.
	"""
	vocab = {}
	for sentence in sentences:
		for token in sentence.split(" "):
			# setdefault only inserts unseen tokens; len(vocab) is the next free id
			vocab.setdefault(token, len(vocab))
	return vocab


# One vocabulary per language: entry[0] is the English sentence, entry[1] the Vietnamese one.
english_dict = _build_vocab(entry[0] for entry in data)
vietnamese_dict = _build_vocab(entry[1] for entry in data)

EMBEDDING_SIZE = 25  # width of every one-hot vector; must cover the largest vocabulary id
MAX_SEQ_LEN = 10  # every sequence is zero-padded to this many time steps


def _one_hot_sequences(sentences, vocab, embedding_size, max_seq_len):
	"""One-hot encode and zero-pad a collection of sentences.

	Args:
		sentences: iterable of strings, tokenised by splitting on single spaces.
		vocab: dict mapping token -> integer id.
		embedding_size: length of each one-hot vector.
		max_seq_len: number of time steps per encoded sentence; shorter sentences
			are padded with all-zero vectors.

	Returns:
		float32 tensor of shape (number of sentences, max_seq_len, embedding_size).

	Raises:
		ValueError: if a vocabulary id does not fit in ``embedding_size`` or a
			sentence has more than ``max_seq_len`` tokens.
		KeyError: if a token is missing from ``vocab``.
	"""
	if max(vocab.values(), default=-1) >= embedding_size:
		raise ValueError(
			"vocabulary ids do not fit in one-hot vectors of width %d" % embedding_size
		)

	encoded = []
	for sentence in sentences:
		tokens = sentence.split(" ")
		if len(tokens) > max_seq_len:
			raise ValueError(
				"sentence has %d tokens, more than max_seq_len=%d" % (len(tokens), max_seq_len)
			)
		# start from an all-zero (fully padded) sequence, then switch on one entry per token
		sequence = [[0] * embedding_size for _ in range(max_seq_len)]
		for position, token in enumerate(tokens):
			sequence[position][vocab[token]] = 1
		encoded.append(sequence)

	return torch.tensor(encoded, dtype=torch.float32)


english = _one_hot_sequences((entry[0] for entry in data), english_dict, EMBEDDING_SIZE, MAX_SEQ_LEN)
vietnamese = _one_hot_sequences((entry[1] for entry in data), vietnamese_dict, EMBEDDING_SIZE, MAX_SEQ_LEN)

print(english.size(), vietnamese.size()) # batch size, sequence length, embedding size

torch.Size([3, 10, 25]) torch.Size([3, 10, 25])


In [None]:
# using pytorch dataset and dataloader can be helpful
class EN_VN_dataset(Dataset):
	"""Map-style dataset of aligned (source, target) examples.

	Args:
		params: a ``(source, target)`` pair of equally long indexable collections
			(e.g. two tensors whose first dimension is the number of examples).

	Raises:
		ValueError: if the two collections differ in length.
	"""

	def __init__(self, params):
		source, target = params
		if len(source) != len(target):
			raise ValueError(
				"source and target must contain the same number of examples "
				"(got %d and %d)" % (len(source), len(target))
			)
		self.source = source
		self.target = target

	def __len__(self):
		"""Return the number of (source, target) pairs."""
		return len(self.source)

	def __getitem__(self, index):
		"""Return the ``(source, target)`` pair stored at ``index``."""
		return self.source[index], self.target[index]

### MODEL

In [33]:
class Encoder(nn.Module):
	"""Encoder half of the seq2seq model: a thin wrapper around ``nn.RNN``.

	All constructor arguments are passed straight through to ``nn.RNN``.
	"""

	def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, batch_first=True):
		super().__init__()
		rnn_config = dict(
			input_size=input_size,
			hidden_size=hidden_size,
			num_layers=num_layers,
			bidirectional=bidirectional,
			batch_first=batch_first,
		)
		self.rnn = nn.RNN(**rnn_config)

	def forward(self, x):
		"""Run the whole input sequence through the RNN.

		Returns:
			The ``(output, hn)`` tuple produced by ``nn.RNN``: the per-step
			outputs and the final hidden state.
		"""
		return self.rnn(x)

# See the diagram in the README.
# If bidirectional=True:
# 	according to the torch LSTM docs (the same layout applies to nn.RNN, which is used here), hn contains the final hidden states of the forward and backward directions
#	output contains the forward and backward outputs, concatenated, at each time step t

In [64]:
class Decoder(nn.Module):
	"""Greedy RNN decoder unrolled for a fixed number of steps.

	The decoder is seeded with the encoder's final hidden state and a one-hot
	start token. At every step the highest-scoring class is one-hot encoded
	and fed back as the next input.

	Args:
		input_size: feature width expected by the RNN. The fed-back one-hot
			vectors have ``output_size`` entries, so the two must be equal.
		hidden_size: RNN hidden width (must match the encoder's hidden state).
		num_layers, bidirectional, batch_first: forwarded to ``nn.RNN``. The
			per-step inputs are built as (batch, 1, features), i.e. batch-first.
		output_size: number of output classes. Defaults to the module-level
			``EMBEDDING_SIZE`` (read at construction time).
		max_seq_len: number of decoding steps. Defaults to the module-level
			``MAX_SEQ_LEN`` (read on every forward call).
		start_index: class id of the first decoder input. Defaults to
			``vietnamese_dict["<EOS>"]`` (read on every forward call).
	"""

	def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, batch_first=True,
				output_size=None, max_seq_len=None, start_index=None):
		super(Decoder, self).__init__()
		self.rnn = nn.RNN(input_size=input_size,
					hidden_size=hidden_size,
					num_layers=num_layers,
					bidirectional=bidirectional,
					batch_first=batch_first) # common to have decoder architecture similar to encoder

		if output_size is None:
			output_size = EMBEDDING_SIZE
		# a bidirectional RNN concatenates both directions, doubling the feature width
		num_directions = 2 if bidirectional else 1
		self.linear = nn.Linear(hidden_size * num_directions, output_size) # classification head
		self.softmax = nn.Softmax(dim=-1)

		# None means "fall back to the module-level default when forward() runs"
		self.max_seq_len = max_seq_len
		self.start_index = start_index

	def forward(self, encoder_hidden):
		"""Greedily decode a fixed-length sequence from the encoder's hidden state.

		Args:
			encoder_hidden: initial hidden state; its dimension 1 is taken as
				the batch size.

		Returns:
			Tuple ``(decoder_outputs, decoder_hidden)``: ``decoder_outputs`` has
			shape (batch, steps, output_size) and holds softmax probabilities;
			``decoder_hidden`` is the RNN hidden state after the last step.

		Note:
			The outputs are already normalised with softmax. Losses that expect
			raw logits (such as ``nn.CrossEntropyLoss``) would normalise them a
			second time.
		"""
		decoder_hidden = encoder_hidden
		batch_size = encoder_hidden.size(1)
		num_classes = self.linear.out_features
		steps = MAX_SEQ_LEN if self.max_seq_len is None else self.max_seq_len

		# The translation is unravelled backwards, so the first input is <EOS>
		# unless an explicit start_index was supplied.
		start_index = vietnamese_dict["<EOS>"] if self.start_index is None else self.start_index
		start = torch.full((batch_size, 1), start_index, dtype=torch.long, device=encoder_hidden.device)
		# sequence length is one: the input changes after every single step
		decoder_input = nn.functional.one_hot(start, num_classes).to(encoder_hidden.dtype)

		decoder_outputs = []
		for _ in range(steps):
			decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
			decoder_outputs.append(decoder_output)
			# use the decoder's own prediction as the next input
			_, topidx = decoder_output.topk(1) # the index of the highest value
			topidx = topidx.squeeze(-1) # (batch, 1)
			# one_hot yields an integer tensor, so no gradient flows through the feedback
			decoder_input = nn.functional.one_hot(topidx, num_classes).to(decoder_output.dtype)

		# each step contributed (batch, 1, classes); join them along the time axis
		decoder_outputs = torch.cat(decoder_outputs, dim=1)
		decoder_outputs = self.softmax(decoder_outputs)
		return decoder_outputs, decoder_hidden

	def forward_step(self, x, hn):
		"""Advance the RNN by one step and project its output to class scores.

		Returns:
			Tuple ``(scores, hn)``: unnormalised class scores from the linear
			head and the updated hidden state.
		"""
		x, hn = self.rnn(x, hn)
		x = self.linear(x)
		return x, hn
		

In [65]:
# The decoder is initialised with the encoder's final hidden state, so both
# networks must share the same hidden width.
HIDDEN_SIZE = 50

encoder = Encoder(EMBEDDING_SIZE, HIDDEN_SIZE)
decoder = Decoder(EMBEDDING_SIZE, HIDDEN_SIZE)

# Smoke test: one forward pass through the untrained networks.
encoder_output, encoder_hn = encoder(english)
decoder_output, decoder_hn = decoder(encoder_hn)

# The decoder emits one distribution per time step for every example in the batch.
assert decoder_output.shape == (english.size(0), MAX_SEQ_LEN, EMBEDDING_SIZE)

### TRAINING

In [73]:
# NOTE(review): nn.CrossEntropyLoss expects unnormalised logits, while
# Decoder.forward returns softmax probabilities. If decoder outputs are passed
# to this loss directly they get normalised twice; confirm against the training loop.
loss_fn = nn.CrossEntropyLoss()

# One parameter group per network, so each learning rate can be tuned on its own.
param_groups = [
    {'params': module.parameters(), 'lr': 0.0001}
    for module in (encoder, decoder)
]
optimizer = optim.Adam(param_groups)

