In [2]:
# to reload modules automatically without having to restart the kernel
%load_ext autoreload
%autoreload 2
import torch
import torch.optim as optim
import torch.utils.data as data
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
import numpy as np
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def save_checkpoint(model, optimizer, epoch, loss, filename):
    """Write the current training state to ``filename`` via ``torch.save``.

    The saved object is a dict with four entries: ``'epoch'`` and ``'loss'``
    (stored exactly as passed in) plus ``'model_state_dict'`` and
    ``'optimizer_state_dict'`` (the results of ``model.state_dict()`` and
    ``optimizer.state_dict()``).
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
    )
    torch.save(checkpoint, filename)


def load_checkpoint(model, optimizer, filename, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold a dict with the keys ``'epoch'``,
    ``'model_state_dict'``, ``'optimizer_state_dict'`` and ``'loss'``
    (the layout written by ``save_checkpoint``).

    Args:
        model: object whose ``load_state_dict`` receives the stored model state.
        optimizer: object whose ``load_state_dict`` receives the stored
            optimizer state.
        filename: forwarded unchanged to ``torch.load``.
        map_location: forwarded to ``torch.load``. The default ``None`` keeps
            the previous behaviour (tensors are restored to the device they
            were saved from). Pass e.g. ``'cpu'`` or a ``torch.device`` to
            load a checkpoint written on a GPU on a machine without one.

    Returns:
        tuple: ``(epoch, loss)`` where ``epoch`` is the stored epoch plus one
        (i.e. the next epoch to run) and ``loss`` is the stored loss value.

    Raises:
        KeyError: if one of the expected keys is missing from the checkpoint.
    """
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The stored value is the epoch that just finished; resume from the next one.
    epoch = checkpoint['epoch'] + 1
    loss = checkpoint['loss']
    return epoch, loss

In [4]:
# Training hyper-parameters and the device the tensors/model are placed on.
n_epochs = 20
batch_size = 64
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Build the training dataset and a shuffling mini-batch loader on top of it.
dataset = LettersDataset(device=device)
loader = data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)
# Vocabulary sizes as reported by the dataset; they are used below as the
# model's input and output sizes.
n_chars, n_harakat = dataset.get_input_vocab_size(), dataset.get_output_vocab_size()
print("n_chars: ", n_chars)
print("n_harakat: ", n_harakat)

w = 415
n_chars:  41
n_harakat:  15


In [8]:
from accio import Accio
from bilstm import BiLSTM
# Alternative architecture, currently disabled:
# model = Accio(input_size=n_chars, output_size=n_harakat, device=device).to(device)
model = BiLSTM(n_chars, n_harakat).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# Targets equal to `ignore_index` are excluded from the loss.
# NOTE(review): the pad id is taken from the *char* encoder while the loss is
# computed against the label tensor -- confirm both encoders share the same pad id.
loss_fn = nn.CrossEntropyLoss(
    ignore_index=dataset.char_encoder.get_pad_id(),
)

In [9]:
# Train for `n_epochs` epochs. After each epoch: measure the loss in eval mode,
# remember the best weights seen so far, and write a checkpoint.
num_batches = len(loader)
print("Number of batches:", num_batches)
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    torch.cuda.empty_cache()  # Clear CUDA cache to avoid memory error
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can read
    # len(loader) and display a real progress bar with total and ETA.
    for i, (X_batch, y_batch) in enumerate(tqdm(loader, desc="Epoch %d" % epoch)):
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss.item()))

    # Evaluation pass.
    # NOTE(review): this iterates the same `loader` that was used for training,
    # so the number below is the training-set loss measured in eval mode, not a
    # held-out validation loss.
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for (X_batch, y_batch) in loader:
            y_pred = model(X_batch).transpose(1, 2)
            # .item() keeps the running sum as a Python float instead of a device tensor.
            total_loss += loss_fn(y_pred, y_batch).item()
    # Mean of the per-batch losses, so the value is comparable to the
    # per-batch numbers printed during training.
    loss = total_loss / max(num_batches, 1)

    if loss < best_loss:
        best_loss = loss
        # state_dict() hands back references to the live parameters; clone them,
        # otherwise later optimizer steps would silently overwrite this snapshot.
        best_model = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}

    # save model after each epoch, together with the loss measured above
    save_checkpoint(model, optimizer, epoch, loss, f'models/bilstm_epoch_{epoch}.pth')
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))


Number of batches: 3150


2it [00:01,  1.87it/s]

Epoch 0, batch 0: Loss = 2.6062


102it [00:14,  7.86it/s]

Epoch 0, batch 100: Loss = 0.0981


202it [00:26,  7.71it/s]

Epoch 0, batch 200: Loss = 0.0616


302it [00:40,  7.62it/s]

Epoch 0, batch 300: Loss = 0.0484


402it [00:53,  7.69it/s]

Epoch 0, batch 400: Loss = 0.0420


502it [01:06,  7.65it/s]

Epoch 0, batch 500: Loss = 0.0422


602it [01:19,  7.19it/s]

Epoch 0, batch 600: Loss = 0.0505


702it [01:33,  7.21it/s]

Epoch 0, batch 700: Loss = 0.0303


802it [01:47,  7.33it/s]

Epoch 0, batch 800: Loss = 0.0476


902it [02:00,  7.74it/s]

Epoch 0, batch 900: Loss = 0.0261


1002it [02:13,  7.38it/s]

Epoch 0, batch 1000: Loss = 0.0329


1102it [02:27,  7.65it/s]

Epoch 0, batch 1100: Loss = 0.0252


1202it [02:41,  7.24it/s]

Epoch 0, batch 1200: Loss = 0.0305


1302it [02:54,  7.36it/s]

Epoch 0, batch 1300: Loss = 0.0251


1402it [03:08,  7.61it/s]

Epoch 0, batch 1400: Loss = 0.0292


1502it [03:21,  7.61it/s]

Epoch 0, batch 1500: Loss = 0.0405


1602it [03:34,  7.67it/s]

Epoch 0, batch 1600: Loss = 0.0237


1702it [03:48,  7.58it/s]

Epoch 0, batch 1700: Loss = 0.0221


1802it [04:01,  7.71it/s]

Epoch 0, batch 1800: Loss = 0.0196


1902it [04:14,  7.72it/s]

Epoch 0, batch 1900: Loss = 0.0295


2002it [04:27,  7.64it/s]

Epoch 0, batch 2000: Loss = 0.0245


2102it [04:41,  7.63it/s]

Epoch 0, batch 2100: Loss = 0.0155


2202it [04:55,  7.59it/s]

Epoch 0, batch 2200: Loss = 0.0231


2302it [05:08,  7.15it/s]

Epoch 0, batch 2300: Loss = 0.0200


2402it [05:23,  6.88it/s]

Epoch 0, batch 2400: Loss = 0.0175


2502it [05:36,  7.58it/s]

Epoch 0, batch 2500: Loss = 0.0285


2602it [05:49,  7.59it/s]

Epoch 0, batch 2600: Loss = 0.0116


2702it [06:03,  7.42it/s]

Epoch 0, batch 2700: Loss = 0.0193


2802it [06:17,  7.12it/s]

Epoch 0, batch 2800: Loss = 0.0214


2902it [06:31,  7.14it/s]

Epoch 0, batch 2900: Loss = 0.0274


3002it [06:45,  7.26it/s]

Epoch 0, batch 3000: Loss = 0.0267


3102it [06:59,  7.50it/s]

Epoch 0, batch 3100: Loss = 0.0237


3150it [07:05,  7.40it/s]


In [None]:
# Build the validation dataset from the cleaned CSV pair and wrap it in a loader.
val_dataset = LettersDataset(
    'clean_out/X_val.csv',
    'clean_out/y_val.csv',
    val_mode=True,
    device=device,
)
# No shuffling: the predictions are later compared row by row with the gold
# file, so the sample order must stay fixed.
val_loader = data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
print(val_dataset.char_encoder.word2idx)

In [None]:
# Run the model over the validation set and export one (letter, predicted label)
# row per Arabic letter to ./results/letter_haraka.csv.
import pandas as pd

model.eval()
# Looked up once instead of once per character.
pad_id = val_dataset.char_encoder.get_pad_id()
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in val_loader:
        # The class scores sit on dim 2 of the model output, so argmax over that
        # dimension yields one predicted label id per (sample, position).
        predicted = model(X_batch).argmax(dim=2)
        # Convert each batch to plain Python lists in one go; calling .item() on
        # every element forces a separate device-to-host transfer per character.
        char_rows = X_batch.tolist()
        label_rows = predicted.tolist()
        # Count only non-padding characters
        for char_row, label_row in zip(char_rows, label_rows):
            for char_id, label_id in zip(char_row, label_row):
                # we reached the end of the sentence
                if char_id == pad_id:
                    break
                # A truthy return value is stored as the 'letter' column;
                # ids with a falsy result are skipped.
                letter = val_dataset.char_encoder.is_arabic_letter(char_id)
                if letter:
                    letter_haraka.append([letter, label_id])

# save ID,Label pairs in a csv file
df = pd.DataFrame(letter_haraka, columns=['letter', 'label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')

In [None]:
# Compare the exported predictions with the gold labels, row by row.
gold_val = pd.read_csv('clean_out/val_gold.csv', index_col=0)
sys_val = pd.read_csv('results/letter_haraka.csv', index_col=0)
# Accuracy per letter
total = len(gold_val)
if len(sys_val) < total:
    # Fail up front with a clear message instead of an IndexError mid-loop.
    raise ValueError(
        "results/letter_haraka.csv has %d rows but the gold file has %d"
        % (len(sys_val), total)
    )
# Positional comparison of the first `total` rows of the 'label' columns,
# done in one vectorised step instead of a Python loop over .iloc.
gold_labels = gold_val['label'].to_numpy()
sys_labels = sys_val['label'].to_numpy()[:total]
correct = int((gold_labels == sys_labels).sum())

print("Accuracy: %.2f%%" % (100.0 * correct / total))

In [None]:
# Error rate = 100 * (1 - accuracy), using `correct` and `total` from the
# accuracy cell. Printed with two decimals (same precision as the accuracy
# line); the previous %d silently truncated the fractional part.
print('DER of the network on the validation set: %.2f %%' % (100 * (1 - correct / total)))