In [1]:
from letters_dataset import LettersDataset
from words_dataset import WordsDataset
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import random
import math
import time
from train_collections import *

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
class Encoder(nn.Module):
    """Embed a batch of token ids and run it through an LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Width of each token embedding.
        hidden_dim: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        dropout_probability: Dropout applied to the embeddings and, when
            ``num_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim=128, hidden_dim=256, num_layers=1, dropout_probability=0.1):
        super().__init__()
        # TODO: replace with one hot encoding
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # nn.LSTM only applies `dropout` between stacked layers and emits a
        # UserWarning when a non-zero value is combined with a single layer,
        # so forward the probability only when it can take effect.
        lstm_dropout = dropout_probability if num_layers > 1 else 0.0
        self.lstm_layer = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, dropout=lstm_dropout, batch_first=True)

        # Regularisation on the embeddings: randomly zeroes elements with
        # probability p while the module is in training mode.
        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, inputs):
        """Encode `inputs` and return the LSTM's final ``(hidden, cell)`` state.

        Args:
            inputs: Integer token ids, batch-first: [batch size, inputs len].

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            [n layers, batch size, hid dim].
        """
        # inputs = [batch size, inputs len]  (the LSTM is batch_first)
        embeddings = self.dropout(self.embedding(inputs))

        # embeddings = [batch size, inputs len, emb dim]
        # The per-step outputs are not needed; only the final state is kept.
        _, (hidden, cell) = self.lstm_layer(embeddings)

        # hidden = [n layers, batch size, hid dim]
        # cell = [n layers, batch size, hid dim]
        # (the state tensors are not batch-first, even with batch_first=True)
        return hidden, cell

In [4]:
class Decoder(nn.Module):
    """Embedding -> LSTM -> linear projection over every time step.

    Args:
        input_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        hidden_size: Width of the LSTM state and input width of the linear layer.
        output_size: Number of scores produced per time step.
        device: Accepted for call-site compatibility; not used by this class.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, device='cuda'):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0, c0):
        """Score every time step of `x`, starting the LSTM from ``(h0, c0)``.

        Args:
            x: Integer ids, batch-first: [batch size, seq len].
            h0: Initial LSTM hidden state.
            c0: Initial LSTM cell state.

        Returns:
            Scores of shape [batch size, seq len, output_size].
        """
        initial_state = (h0, c0)
        # The LSTM's final state is not needed here; only per-step outputs are projected.
        step_outputs, _final_state = self.lstm(self.embedding(x), initial_state)
        return self.fc(step_outputs)

In [5]:
class Seq2Seq(nn.Module):
    """Chain an encoder and a decoder.

    The encoder's returned ``(hidden, cell)`` pair is handed to the decoder as
    its initial state.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, encoder_inputs, decoder_inputs):
        """Encode `encoder_inputs`, then decode `decoder_inputs` from that state.

        `decoder_inputs` is passed to the decoder unchanged; any start token
        must already be part of it.
        """
        h_n, c_n = self.encoder(encoder_inputs)
        return self.decoder(decoder_inputs, h_n, c_n)

In [6]:
# NOTE(review): `P` is not referenced in any of the visible cells; this looks
# like an accidental editor auto-import, and it adds a dependency on the
# third-party `regex` package — confirm and remove.
from regex import P


# Mini-batch size shared by the data loaders built in this notebook.
batch_size = 64
# Training data. PAD_TOKEN / UNK_TOKEN / START_TOKEN / END_TOKEN are presumably
# provided by `from train_collections import *` — TODO confirm.
dataset = LettersDataset(
    "./clean_out/X.csv", "./clean_out/Y.csv", device=device, special_tokens=[PAD_TOKEN,UNK_TOKEN,START_TOKEN,END_TOKEN])
# Shuffled mini-batches of `batch_size` samples drawn from `dataset`.
loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# NOTE(review): `w` is not read by any visible cell — confirm it is unused.
w = 495


In [7]:
# Vocabulary sizes reported by the training dataset. They size the encoder's
# embedding table (n_chars) and the decoder's embedding table and output
# layer (n_harakat).
# `device` is intentionally not re-created here: it is defined once near the
# top of the notebook and `dataset` has already been built with it.
n_chars = dataset.get_input_vocab_size()
n_harakat = dataset.get_output_vocab_size()

In [8]:
# Width of the encoder LSTM state (passed as `hidden_dim` to Encoder).
hidden_dim = 128
# Number of mini-batches in one pass over `loader`.
num_batches = len(loader)
# Number of passes over `loader` made by the training loop.
n_epochs = 10

In [11]:
# The decoder LSTM is started from the encoder's final (hidden, cell) state,
# so both LSTMs must use the same state width. Both take it from `hidden_dim`
# so that changing the hyper-parameter cannot produce a shape mismatch.
enc_model = Encoder(
    n_chars, hidden_dim=hidden_dim, num_layers=1, dropout_probability=0)

# The decoder consumes and predicts ids from the same output vocabulary, hence
# n_harakat for both the embedding table and the output layer.
dec_model = Decoder(n_harakat, embedding_size=128,
                    hidden_size=hidden_dim, output_size=n_harakat, device=device.type)


model = Seq2Seq(encoder=enc_model, decoder=dec_model).to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): no `ignore_index` is set, so every target position contributes
# to the loss — including padding, if the targets are padded. Confirm whether
# the pad id should be excluded.
loss_fn = nn.CrossEntropyLoss()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(64, 128)
    (lstm_layer): LSTM(128, 128, batch_first=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(19, 128)
    (lstm): LSTM(128, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=19, bias=True)
  )
)


In [12]:
print("Number of batches:", num_batches)

# The start-token id is constant, so look it up once instead of per batch.
start_token_id = dataset.harakat_encoder.get_id_by_token(START_TOKEN)


def _shift_right(targets, start_id):
    """Build decoder inputs: `targets` shifted right by one step, `start_id` first.

    The last target column is dropped so the result has the same shape as
    `targets`. The start column matches the dtype and device of `targets`.
    """
    start_column = torch.full(
        (targets.shape[0], 1), start_id, dtype=targets.dtype, device=targets.device)
    return torch.cat((start_column, targets[:, :-1]), dim=1)


best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for i, (X_batch, Y_batch) in enumerate(loader):
        decoder_input = _shift_right(Y_batch, start_token_id)
        # CrossEntropyLoss expects the class dimension second:
        # [batch, seq, classes] -> [batch, classes, seq].
        y_pred = model(X_batch, decoder_input).transpose(1, 2)
        loss = loss_fn(y_pred, Y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss.item()))

    # Validation
    # NOTE(review): this pass iterates `loader`, i.e. the training data, so the
    # figure below is the training loss in eval mode, not a held-out
    # validation loss. Switch to a validation loader once one is available here.
    model.eval()
    epoch_loss = 0.0  # sum of the per-batch mean losses, as a plain float
    with torch.no_grad():
        for (X, Y) in loader:
            decoder_input = _shift_right(Y, start_token_id)
            y_pred = model(X, decoder_input).transpose(1, 2)
            epoch_loss += loss_fn(y_pred, Y).item()

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        # state_dict() returns references to the live parameters, which the
        # optimizer keeps updating in place; clone them so the snapshot really
        # holds the weights of the best epoch.
        best_model = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, epoch_loss))

Number of batches: 2590


Epoch 0, batch 0: Loss = 2.9877
Epoch 0, batch 100: Loss = 0.2287
Epoch 0, batch 200: Loss = 0.1569
Epoch 0, batch 300: Loss = 0.2097
Epoch 0, batch 400: Loss = 0.1721
Epoch 0, batch 500: Loss = 0.2070
Epoch 0, batch 600: Loss = 0.1966
Epoch 0, batch 700: Loss = 0.2254
Epoch 0, batch 800: Loss = 0.1565
Epoch 0, batch 900: Loss = 0.1386
Epoch 0, batch 1000: Loss = 0.1160
Epoch 0, batch 1100: Loss = 0.1680
Epoch 0, batch 1200: Loss = 0.1701
Epoch 0, batch 1300: Loss = 0.2105
Epoch 0, batch 1400: Loss = 0.1905
Epoch 0, batch 1500: Loss = 0.1376
Epoch 0, batch 1600: Loss = 0.2238
Epoch 0, batch 1700: Loss = 0.1395
Epoch 0, batch 1800: Loss = 0.1621
Epoch 0, batch 1900: Loss = 0.1435
Epoch 0, batch 2000: Loss = 0.1305
Epoch 0, batch 2100: Loss = 0.1589
Epoch 0, batch 2200: Loss = 0.1449
Epoch 0, batch 2300: Loss = 0.1458
Epoch 0, batch 2400: Loss = 0.1502
Epoch 0, batch 2500: Loss = 0.1277
Epoch 0: Cross-entropy: 410.3089
Epoch 1, batch 0: Loss = 0.2354
Epoch 1, batch 100: Loss = 0.1273
Epo

In [None]:
val_dataset = LettersDataset(
    'clean_out/X_val.csv', 'clean_out/y_val.csv', device=device, special_tokens=[PAD_TOKEN,UNK_TOKEN,START_TOKEN,END_TOKEN])

# Batch order does not change an accuracy total, so do not shuffle.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# NOTE(review): the start id comes from the training dataset's output encoder
# while the pad id comes from the validation dataset's input encoder. This
# presumably relies on both datasets assigning identical ids — TODO confirm.
start_token_id = dataset.harakat_encoder.get_id_by_token(START_TOKEN)
pad_id = val_dataset.char_encoder.get_pad_id()

# NOTE(review): this evaluates the weights left by the last training epoch;
# `best_model` is not loaded anywhere in the visible cells.
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for (X, Y) in val_loader:
        # Decoder input is the target shifted right by one with the start
        # token in front. NOTE(review): the ground-truth labels are fed to the
        # decoder, so this is teacher-forced accuracy, which can overstate
        # accuracy when labels are not available at inference time.
        start_column = torch.full(
            (Y.shape[0], 1), start_token_id, dtype=Y.dtype, device=Y.device)
        decoder_input = torch.cat((start_column, Y[:, :-1]), dim=1)

        # Positions holding real (non-padding) input characters.
        is_real_char = X != pad_id

        # Scores are [batch, seq, classes]; take the best class per position.
        predicted = model(X, decoder_input).argmax(dim=-1)

        # Count only non-padding characters
        total += is_real_char.sum().item()

        # Count correct predictions
        correct += ((predicted == Y) & is_real_char).sum().item()

# Guard against an empty or all-padding validation set.
if total:
    print("Accuracy: %.2f%%" % (100 * correct / total))
else:
    print("Accuracy: n/a (no non-padding characters found)")