In [27]:
import copy

# NOTE: the relative order below is kept as-is on purpose: the star import can
# shadow or be shadowed by neighbouring names, so reordering could change
# which binding wins.
import torch
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
import numpy as np
from tqdm import tqdm
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# autoreload notebook
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Hyperparameters shared by the cells below.
batch_size = 64                    # samples per DataLoader mini-batch
n_epochs = 1                       # passes over the training loader
embedding_dim, n_hidden = 64, 256  # embedding width / LSTM hidden-state width

In [31]:


# Training data. No file paths are passed, so LettersDataset presumably falls
# back to its default (training) files -- confirm in letters_dataset.
dataset = LettersDataset(device=device)
# Mini-batch iterator over the training data, reshuffled on every pass.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# load val data
# da = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv')

w = 495


In [32]:
# Vocabulary sizes as reported by the dataset: input symbols and output classes.
n_chars, n_harakat = dataset.get_input_vocab_size(), dataset.get_output_vocab_size()
n_harakat  # bare expression so the notebook displays the value

17

In [33]:
from models.baseline import BaseLineModel

class CharModel(nn.Module):
    """Unidirectional LSTM tagger producing one score vector per input position.

    Pipeline: embedding -> single-layer LSTM -> dropout -> linear projection.

    Args:
        vocab_size: Number of rows in the embedding table. ``None`` falls back
            to the notebook global ``n_chars``.
        n_classes: Size of the output score vector. ``None`` falls back to the
            notebook global ``n_harakat``.
        embed_dim: Embedding width. ``None`` falls back to the notebook global
            ``embedding_dim``.
        hidden_size: LSTM hidden-state width. ``None`` falls back to the
            notebook global ``n_hidden``.
        dropout: Dropout probability applied to the LSTM outputs.
    """

    def __init__(self, vocab_size=None, n_classes=None, embed_dim=None,
                 hidden_size=None, dropout=0.2):
        super().__init__()

        # Explicit arguments win; the notebook globals are only read when an
        # argument is omitted, so ``CharModel()`` behaves exactly as before.
        vocab_size = n_chars if vocab_size is None else vocab_size
        n_classes = n_harakat if n_classes is None else n_classes
        embed_dim = embedding_dim if embed_dim is None else embed_dim
        hidden_size = n_hidden if hidden_size is None else hidden_size

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size,
                            num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, n_classes)

    def forward(self, x):
        """Map a batch of index sequences to per-position class scores.

        Args:
            x: Integer index tensor laid out batch-first (the LSTM is built
               with ``batch_first=True``).

        Returns:
            Unnormalised scores with the class axis last.
        """
        # pass thru embedding layer
        x = self.embedding(x)
        # keep only the per-step outputs; the final (h, c) state is unused
        x, _ = self.lstm(x)
        x = self.linear(self.dropout(x))
        return x

class BiLSTM(nn.Module):
    """Bidirectional LSTM tagger producing one score vector per input position.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> linear projection.
    The projection takes ``2 * hidden_size`` features because the forward and
    backward LSTM outputs are concatenated.

    Args:
        vocab_size: Number of rows in the embedding table. ``None`` falls back
            to the notebook global ``n_chars``.
        n_classes: Size of the output score vector. ``None`` falls back to the
            notebook global ``n_harakat``.
        embed_dim: Embedding width. ``None`` falls back to the notebook global
            ``embedding_dim``.
        hidden_size: Per-direction LSTM hidden-state width. ``None`` falls back
            to the notebook global ``n_hidden``.
        dropout: Dropout probability applied to the LSTM outputs.
    """

    def __init__(self, vocab_size=None, n_classes=None, embed_dim=None,
                 hidden_size=None, dropout=0.2):
        super(BiLSTM, self).__init__()

        # Explicit arguments win; the notebook globals are only read when an
        # argument is omitted, so ``BiLSTM()`` behaves exactly as before.
        vocab_size = n_chars if vocab_size is None else vocab_size
        n_classes = n_harakat if n_classes is None else n_classes
        embed_dim = embedding_dim if embed_dim is None else embed_dim
        hidden_size = n_hidden if hidden_size is None else hidden_size

        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size,
                            bidirectional=True, batch_first=True)
        # both directions are concatenated, hence twice the hidden width
        self.linear = nn.Linear(2 * hidden_size, n_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch of index sequences to per-position class scores.

        Args:
            x: Integer index tensor laid out batch-first (the LSTM is built
               with ``batch_first=True``).

        Returns:
            Unnormalised scores with the class axis last.
        """
        # pass thru embedding layer
        x = self.embedding(x)
        # keep only the per-step outputs; the final (h, c) state is unused
        x, _ = self.lstm(x)
        x = self.linear(self.dropout(x))
        return x

# --- Model, optimiser and loss ------------------------------------------------
model = BaseLineModel(n_chars, n_harakat,embedding_dim).to(device)
# model = BiLSTM().to(device)
optimizer = optim.Adam(model.parameters())
# Targets equal to the pad id are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.char_encoder.get_pad_id())
num_batches = len(loader)
print("Number of batches:", num_batches)

# Snapshot of the weights with the lowest end-of-epoch loss seen so far.
best_model = None
best_loss = np.inf

# --- Training -----------------------------------------------------------------
for epoch in range(n_epochs):
    model.train()
    for i, (X_batch,y_batch) in tqdm(enumerate(loader)):
        y_pred = model(X_batch)['diacritics']
        # CrossEntropyLoss expects the class axis at dim 1, so swap the last
        # two axes of the model output before computing the loss.
        y_pred = y_pred.transpose(1, 2)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss))

    # End-of-epoch evaluation.
    # NOTE(review): this pass iterates `loader`, i.e. the *training* data, so
    # the figure below is a training loss measured in eval mode, not a true
    # validation loss. It is the sum of per-batch losses, not their mean.
    model.eval()
    loss = 0
    with torch.no_grad():
        for (X_batch,y_batch) in loader:
            y_pred = model(X_batch)['diacritics']
            y_pred = y_pred.transpose(1, 2)

            loss += loss_fn(y_pred, y_batch)
        if loss < best_loss:
            best_loss = loss
            # state_dict() hands back references to the live parameters, so it
            # must be deep-copied; otherwise later optimiser steps would
            # silently overwrite the "best" snapshot.
            best_model = copy.deepcopy(model.state_dict())
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))


Number of batches: 2590


0it [00:00, ?it/s]

1it [00:00,  1.18it/s]

Epoch 0, batch 0: Loss = 2.8278


101it [01:22,  1.25it/s]

Epoch 0, batch 100: Loss = 0.1615


201it [02:43,  1.21it/s]

Epoch 0, batch 200: Loss = 0.1075


301it [04:03,  1.28it/s]

Epoch 0, batch 300: Loss = 0.0653


401it [05:21,  1.29it/s]

Epoch 0, batch 400: Loss = 0.0681


501it [06:40,  1.26it/s]

Epoch 0, batch 500: Loss = 0.0617


601it [08:00,  1.18it/s]

Epoch 0, batch 600: Loss = 0.0544


701it [09:21,  1.14it/s]

Epoch 0, batch 700: Loss = 0.0268


801it [10:42,  1.23it/s]

Epoch 0, batch 800: Loss = 0.0213


901it [12:01,  1.09it/s]

Epoch 0, batch 900: Loss = 0.0362


1001it [13:19,  1.29it/s]

Epoch 0, batch 1000: Loss = 0.0245


1101it [14:38,  1.27it/s]

Epoch 0, batch 1100: Loss = 0.0254


1201it [15:56,  1.30it/s]

Epoch 0, batch 1200: Loss = 0.0228


1301it [17:15,  1.26it/s]

Epoch 0, batch 1300: Loss = 0.0252


1401it [18:35,  1.24it/s]

Epoch 0, batch 1400: Loss = 0.0208


1501it [19:55,  1.25it/s]

Epoch 0, batch 1500: Loss = 0.0288


1601it [21:13,  1.30it/s]

Epoch 0, batch 1600: Loss = 0.0162


1701it [22:32,  1.29it/s]

Epoch 0, batch 1700: Loss = 0.0161


1801it [23:50,  1.30it/s]

Epoch 0, batch 1800: Loss = 0.0169


1901it [25:09,  1.24it/s]

Epoch 0, batch 1900: Loss = 0.0179


2001it [26:28,  1.27it/s]

Epoch 0, batch 2000: Loss = 0.0161


2101it [27:48,  1.17it/s]

Epoch 0, batch 2100: Loss = 0.0137


2201it [29:09,  1.27it/s]

Epoch 0, batch 2200: Loss = 0.0179


2301it [30:27,  1.30it/s]

Epoch 0, batch 2300: Loss = 0.0203


2401it [31:44,  1.29it/s]

Epoch 0, batch 2400: Loss = 0.0135


2501it [33:07,  1.20it/s]

Epoch 0, batch 2500: Loss = 0.0177


2590it [34:21,  1.26it/s]


Epoch 0: Cross-entropy: 39.2408


In [38]:
# Validation split.
val_dataset = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv', device=device)

# The aggregate counts below do not depend on sample order, so the loader is
# not shuffled.
val_loader = data.DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# Evaluate per-position accuracy on the validation set.
# The pad id is loop-invariant, so look it up once.
val_pad_id = val_dataset.char_encoder.get_pad_id()

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for (X_batch,y_batch) in val_loader:
        # Padding is identified on the *input* side.
        is_padding = X_batch == val_pad_id
        y_pred = model(X_batch)['diacritics']
        # Index of the highest score along the class axis (dim 2 of the raw
        # model output); equivalent to transposing and taking the max over dim 1.
        predicted = y_pred.argmax(dim=2)
        # Count only non-padding characters
        total += torch.sum(~is_padding).item()

        # Count correct predictions
        correct += torch.sum((predicted == y_batch) & (~is_padding)).item()
print("Accuracy: %.2f%%" % (100 * correct / total))



w = 500
Accuracy: 95.84%


In [39]:
# Error rate = 100% minus the accuracy, from the `correct` / `total` counters
# of the preceding evaluation cell. Printed with two decimals: the previous
# `%d` conversion truncated the fractional part (e.g. 4.16 -> 4).
print('DER of the network on the validation set: %.2f %%' % (100 * (1 - correct / total)))


DER of the network on the validation set: 4 %


In [40]:
# Test split.
# NOTE(review): the `val_*` names are kept so that any later cell relying on
# them keeps working, but from here on they refer to the *test* data.
# NOTE(review): the label file is spelled 'Y_test.csv' (capital Y) while the
# validation labels use 'y_val.csv' -- confirm the file name on disk.
val_dataset = LettersDataset('clean_out/X_test.csv', 'clean_out/Y_test.csv', device=device)

# The aggregate counts below do not depend on sample order, so the loader is
# not shuffled.
val_loader = data.DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# Evaluate per-position accuracy on the test set.
# The pad id is loop-invariant, so look it up once.
test_pad_id = val_dataset.char_encoder.get_pad_id()

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for (X_batch,y_batch) in val_loader:
        # Padding is identified on the *input* side.
        is_padding = X_batch == test_pad_id
        y_pred = model(X_batch)['diacritics']
        # Index of the highest score along the class axis (dim 2 of the raw
        # model output); equivalent to transposing and taking the max over dim 1.
        predicted = y_pred.argmax(dim=2)
        # Count only non-padding characters
        total += torch.sum(~is_padding).item()

        # Count correct predictions
        correct += torch.sum((predicted == y_batch) & (~is_padding)).item()
print("Accuracy: %.2f%%" % (100 * correct / total))



w = 499
