In [1]:
from letters_dataset import LettersDataset
from words_dataset import WordsDataset
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import random
import math
import time
from train_collections import *

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim=128, hidden_dim=256, num_layers=1, dropout_probability=0.1):
        super().__init__()
        # TODO: replace with one hot encoding
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm_layer = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, dropout=dropout_probability, batch_first=True)

        # Dropout layer to prevent over fitting (regularization)
        # it randomly zeroes some of the elements of the input tensor with probability p using samples from a Bernoulli distribution.
        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, inputs):
        # inputs = [inputs len, batch size]
        embeddings = self.dropout(self.embedding(inputs))

        # embedded = [inputs len, batch size, emb dim]
        outputs, (hidden, cell) = self.lstm_layer(embeddings)

        # outputs = [inputs len, batch size, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]
        # cell = [n layers * n directions, batch size, hid dim]
        # outputs are always from the top hidden layer
        return hidden, cell

In [4]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, device='cuda'):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0, c0):
        # print("from decoder forward")
        # print(x.shape)
        embeddings = self.embedding(x)
        # print("from decoder forward after embedding")
        # print(embeddings.shape)
        outs, _ = self.lstm(embeddings, (h0, c0))
        # h is the output of the RNN
        # hn is the hidden state of the last timestep
        # cn is the cell state of the last timestep
        scores = self.fc(outs)
        return scores

In [5]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, encoder_inputs, decoder_inputs):
        encoder_hidden, encoder_cell = self.encoder(encoder_inputs)
        # insert start token at the beginning of the decoder inputs
        decoder_output = self.decoder(
            decoder_inputs, encoder_hidden, encoder_cell)
        return decoder_output

In [25]:
class AppendStartDataset(Dataset):
    def __init__(self, letters_dataset: LettersDataset):
        self.letters_dataset = letters_dataset


    def __len__(self):
        return len(self.letters_dataset)

    def __getitem__(self, idx):
        (X_enc,Y) = self.letters_dataset[idx]
        
        X_dec = None
        # add start token to the beginning of the decoder input
        start_token = torch.tensor([self.letters_dataset.harakat_encoder.get_id_by_token(START_TOKEN)]).to(device)
        X_dec = torch.cat((start_token, Y[:-1]))
        
        return X_enc,X_dec,Y
        

In [26]:
from train_collections import *



batch_size = 64
letters_dataset = LettersDataset(
    "./clean_out/X.csv", "./clean_out/Y.csv", device=device, special_tokens=[PAD_TOKEN,UNK_TOKEN,START_TOKEN,END_TOKEN])
seq2seqDataset = AppendStartDataset(letters_dataset)

loader = DataLoader(seq2seqDataset, shuffle=True, batch_size=batch_size)

w = 495


In [27]:
sample = next(iter(loader))
print(sample[0].shape)
print(sample[1].shape)
print(sample[2].shape)
print(sample[2])

torch.Size([64, 495])
torch.Size([64, 495])
torch.Size([64, 495])
tensor([[ 0,  3,  1,  ..., 15, 15, 15],
        [14,  0,  3,  ..., 15, 15, 15],
        [14,  0, 14,  ..., 15, 15, 15],
        ...,
        [14,  2,  3,  ..., 15, 15, 15],
        [14,  0,  8,  ..., 15, 15, 15],
        [14,  0,  3,  ..., 15, 15, 15]], device='cuda:0')


In [28]:
n_chars = seq2seqDataset.letters_dataset.get_input_vocab_size()
n_harakat = seq2seqDataset.letters_dataset.get_output_vocab_size()
# encoder_dim_vocab = #tokens
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [29]:
hidden_dim = 265
num_batches = len(loader)
n_epochs = 5

In [30]:
enc_model = Encoder(
    n_chars, hidden_dim=hidden_dim, num_layers=1, dropout_probability=0)

dec_model = Decoder(n_harakat, embedding_size=128,
                    hidden_size=128, output_size=n_harakat, device=device.type)


model = Seq2Seq(encoder=enc_model, decoder=dec_model).to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(64, 128)
    (lstm_layer): LSTM(128, 128, batch_first=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(19, 128)
    (lstm): LSTM(128, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=19, bias=True)
  )
)


In [33]:
print("Number of batches:", num_batches)
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for i, (X_enc,X_dec, Y_batch) in enumerate(loader):
        y_pred = ''
        curr_batch_size = X_enc.shape[0]
        y_pred = model(X_enc, X_dec)
        y_pred = y_pred.transpose(1, 2)
        # print(y_pred.shape)
        # print(y_batch.shape)
        loss = loss_fn(y_pred, Y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss))

    # Validation
    model.eval()
    loss = 0
    with torch.no_grad():
        for (X_enc,X_dec, Y_batch) in loader:
            y_pred = model(X_enc, X_dec)
            y_pred = y_pred.transpose(1, 2)

            loss += loss_fn(y_pred, Y_batch)
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))

Number of batches: 2590
Epoch 0, batch 0: Loss = 0.1520
Epoch 0, batch 100: Loss = 0.1935
Epoch 0, batch 200: Loss = 0.0951
Epoch 0, batch 300: Loss = 0.1827
Epoch 0, batch 400: Loss = 0.1850
Epoch 0, batch 500: Loss = 0.1220
Epoch 0, batch 600: Loss = 0.2171
Epoch 0, batch 700: Loss = 0.1047
Epoch 0, batch 800: Loss = 0.0954
Epoch 0, batch 900: Loss = 0.0957
Epoch 0, batch 1000: Loss = 0.1243
Epoch 0, batch 1100: Loss = 0.1355
Epoch 0, batch 1200: Loss = 0.1530
Epoch 0, batch 1300: Loss = 0.0983
Epoch 0, batch 1400: Loss = 0.1152
Epoch 0, batch 1500: Loss = 0.1214
Epoch 0, batch 1600: Loss = 0.1689
Epoch 0, batch 1700: Loss = 0.0885
Epoch 0, batch 1800: Loss = 0.1492
Epoch 0, batch 1900: Loss = 0.1890
Epoch 0, batch 2000: Loss = 0.0853
Epoch 0, batch 2100: Loss = 0.1730
Epoch 0, batch 2200: Loss = 0.1194
Epoch 0, batch 2300: Loss = 0.2258
Epoch 0, batch 2400: Loss = 0.1607
Epoch 0, batch 2500: Loss = 0.1681
Epoch 0: Cross-entropy: 376.1655
Epoch 1, batch 0: Loss = 0.1440
Epoch 1, batc

In [35]:
val_dataset = LettersDataset(
    'clean_out/X_val.csv', 'clean_out/y_val.csv', device=device, special_tokens=[PAD_TOKEN,UNK_TOKEN,START_TOKEN,END_TOKEN])
val_dataset = AppendStartDataset(val_dataset)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)



model.eval()
correct = 0
total = 0

with torch.no_grad():
    for (X_enc,X_dec, Y_batch) in val_loader:
        is_padding = (X_enc == val_dataset.letters_dataset.char_encoder.get_pad_id())
        y_pred = model(X_enc, X_dec)
        y_pred = y_pred.transpose(1, 2)
        _, predicted = torch.max(y_pred.data, 1)
        # Count only non-padding characters
        total += torch.sum(~is_padding).item()

        # Count correct predictions
        correct += torch.sum((predicted == Y_batch) & (~is_padding)).item()
print("Accuracy: %.2f%%" % (100 * correct / total))

w = 500
Accuracy: 60.36%
