In [1]:
# to reload modules automatically without having to restart the kernel
%load_ext autoreload
%autoreload 2

import torch
import torch.optim as optim
import torch.utils.data as data
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
from tqdm import tqdm

In [2]:
# Hyper-parameters and compute device used by the cells that follow.
n_epochs = 20     # number of iterations of the outer training loop
batch_size = 64   # samples per mini-batch handed to the DataLoader
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Training data: the dataset is built with its default arguments (only the
# device is supplied) and served in shuffled mini-batches.
dataset = LettersDataset(device=device)
loader = data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

w = 415


In [4]:
# Query the dataset for its input (character) and output (harakat) vocabulary
# sizes; they are passed to the model constructor as its input/output sizes.
n_chars, n_harakat = (
    dataset.get_input_vocab_size(),
    dataset.get_output_vocab_size(),
)
print("n_chars: ", n_chars)
print("n_harakat: ", n_harakat)

n_chars:  41
n_harakat:  15


In [5]:
from models.Accio import Accio

# Build the network on the selected device and resume from the saved checkpoint.
model = Accio(input_size=n_chars, output_size=n_harakat, device=device).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU machine still loads when only the CPU is available.
model.load_state_dict(torch.load("models/Accio.pth", map_location=device))
# Adam with its default hyper-parameters; plain cross-entropy over the classes.
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [6]:
num_batches = len(loader)
print("Number of batches:", num_batches)

# Snapshot of the weights that achieved the lowest end-of-epoch loss so far.
best_model = None
best_loss = float('inf')

for epoch in range(n_epochs):
    # ---- training pass -----------------------------------------------------
    model.train()
    # Wrap the loader itself (not the enumerate generator) so tqdm knows the
    # number of batches and can render a real progress bar with an ETA.
    for i, (X_batch, y_batch) in enumerate(tqdm(loader, total=num_batches)):
        y_pred = model(X_batch)
        # CrossEntropyLoss expects the class dimension second, so go from
        # (batch_size, seq_len, n_classes) to (batch_size, n_classes, seq_len).
        y_pred = y_pred.transpose(1, 2)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss.item()))

    torch.cuda.empty_cache()  # Clear CUDA cache

    # ---- end-of-epoch evaluation -------------------------------------------
    model.eval()

    # Checkpoint the latest weights every epoch so an interrupted run can resume.
    torch.save(model.state_dict(), 'models/Accio.pth')

    # NOTE(review): this pass iterates `loader`, i.e. the training data, so
    # `best_loss` tracks training loss rather than a held-out validation loss.
    # The reported value is the SUM of the per-batch losses, not their mean.
    epoch_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch).transpose(1, 2)
            # .item() keeps the running total as a Python float instead of
            # accumulating device tensors.
            epoch_loss += loss_fn(y_pred, y_batch).item()

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place; clone them so the snapshot
        # really preserves the best weights.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, epoch_loss))


Number of batches: 3150


1it [00:01,  1.11s/it]

Epoch 0, batch 0: Loss = 0.0029


101it [00:26,  3.90it/s]

Epoch 0, batch 100: Loss = 0.0038


201it [00:52,  3.88it/s]

Epoch 0, batch 200: Loss = 0.0064


301it [01:20,  3.69it/s]

Epoch 0, batch 300: Loss = 0.0030


401it [01:46,  3.92it/s]

Epoch 0, batch 400: Loss = 0.0081


501it [02:11,  3.86it/s]

Epoch 0, batch 500: Loss = 0.0036


601it [02:36,  3.93it/s]

Epoch 0, batch 600: Loss = 0.0060


701it [03:00,  4.00it/s]

Epoch 0, batch 700: Loss = 0.0071


801it [03:25,  3.73it/s]

Epoch 0, batch 800: Loss = 0.0037


901it [03:51,  3.99it/s]

Epoch 0, batch 900: Loss = 0.0061


1001it [04:15,  4.04it/s]

Epoch 0, batch 1000: Loss = 0.0059


1101it [04:40,  4.00it/s]

Epoch 0, batch 1100: Loss = 0.0044


1201it [05:05,  3.74it/s]

Epoch 0, batch 1200: Loss = 0.0076


1301it [05:32,  3.52it/s]

Epoch 0, batch 1300: Loss = 0.0052


1401it [05:59,  3.79it/s]

Epoch 0, batch 1400: Loss = 0.0056


1501it [06:25,  3.74it/s]

Epoch 0, batch 1500: Loss = 0.0028


1601it [06:53,  3.31it/s]

Epoch 0, batch 1600: Loss = 0.0048


1701it [07:20,  3.83it/s]

Epoch 0, batch 1700: Loss = 0.0056


1801it [07:46,  3.93it/s]

Epoch 0, batch 1800: Loss = 0.0041


1901it [08:12,  3.62it/s]

Epoch 0, batch 1900: Loss = 0.0058


2001it [08:40,  3.69it/s]

Epoch 0, batch 2000: Loss = 0.0060


2101it [09:08,  3.37it/s]

Epoch 0, batch 2100: Loss = 0.0072


2201it [09:35,  3.76it/s]

Epoch 0, batch 2200: Loss = 0.0052


2301it [10:03,  3.36it/s]

Epoch 0, batch 2300: Loss = 0.0047


2401it [10:30,  3.76it/s]

Epoch 0, batch 2400: Loss = 0.0051


2501it [10:57,  3.77it/s]

Epoch 0, batch 2500: Loss = 0.0077


2601it [11:24,  3.56it/s]

Epoch 0, batch 2600: Loss = 0.0040


2701it [11:52,  3.62it/s]

Epoch 0, batch 2700: Loss = 0.0031


2801it [12:18,  3.79it/s]

Epoch 0, batch 2800: Loss = 0.0038


2901it [12:44,  3.86it/s]

Epoch 0, batch 2900: Loss = 0.0073


3001it [13:11,  3.46it/s]

Epoch 0, batch 3000: Loss = 0.0049


3101it [13:38,  3.90it/s]

Epoch 0, batch 3100: Loss = 0.0038


3150it [13:50,  3.79it/s]


Epoch 0: Cross-entropy: 14.9400


2it [00:00,  4.73it/s]

Epoch 1, batch 0: Loss = 0.0057


102it [00:21,  4.78it/s]

Epoch 1, batch 100: Loss = 0.0059


202it [00:42,  4.84it/s]

Epoch 1, batch 200: Loss = 0.0037


301it [01:02,  4.84it/s]

Epoch 1, batch 300: Loss = 0.0050


386it [01:21,  4.43it/s]

In [None]:
import pandas as pd
from pathlib import Path

val_dataset = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv', val_mode=True, device=device)

# No shuffling: predictions are written in dataset order and later compared
# row by row against the gold file.
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)
print(val_dataset.char_encoder.word2idx)

# Predict a label for every Arabic letter of the validation set.
model.eval()
char_encoder = val_dataset.char_encoder
pad_id = char_encoder.get_pad_id()  # loop-invariant, so look it up once
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in val_loader:
        y_pred = model(X_batch)
        # The model emits (batch_size, seq_len, n_classes); the arg-max over
        # the last axis is the predicted class id for each position.
        predicted = y_pred.argmax(dim=-1)
        # Convert to Python lists once per batch instead of calling .item()
        # on every element, which would force one device sync per character.
        for char_ids, haraka_ids in zip(X_batch.tolist(), predicted.tolist()):
            for char_id, haraka_id in zip(char_ids, haraka_ids):
                # The first padding id marks the end of the sentence.
                if char_id == pad_id:
                    break
                # Only positions for which the encoder returns a truthy value
                # are kept; that returned value is stored as the letter.
                letter = char_encoder.is_arabic_letter(char_id)
                if letter:
                    letter_haraka.append([letter, haraka_id])

# Save (ID, letter, label) rows as CSV, creating the output folder if needed.
output_path = Path('./results/letter_haraka.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(letter_haraka, columns=['letter', 'label'])
df.to_csv(output_path, index=True, index_label='ID')



w = 1129
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


In [None]:
gold_val = pd.read_csv('clean_out/val_gold.csv', index_col=0)
sys_val = pd.read_csv('results/letter_haraka.csv', index_col=0)

# Per-letter accuracy: rows are compared positionally, so the prediction file
# must contain at least as many rows as the gold file.
total = len(gold_val)
if total == 0:
    raise ValueError("Gold file 'clean_out/val_gold.csv' contains no rows")
if len(sys_val) < total:
    raise ValueError(
        "Prediction file has %d rows but the gold file has %d"
        % (len(sys_val), total)
    )
if len(sys_val) > total:
    print("Warning: %d extra prediction rows are ignored" % (len(sys_val) - total))

# Vectorised comparison of the first `total` labels instead of a Python loop
# over .iloc, which is very slow on large frames.
gold_labels = gold_val['label'].to_numpy()
sys_labels = sys_val['label'].to_numpy()[:total]
correct = int((gold_labels == sys_labels).sum())

print("Accuracy: %.2f%%" % (100.0 * correct / total))

Accuracy: 97.15%


In [None]:
# Persist only the learned weights (the state dict) rather than the whole
# module object. To restore them later, rebuild the model and call
# model.load_state_dict(torch.load('models/Accio.pth')).
final_weights = model.state_dict()
torch.save(final_weights, 'models/Accio.pth')

In [None]:
# Error rate on the validation set: the complement of the correct/total
# ratio, expressed as a percentage.
error_rate_pct = 100.0 * (1 - correct / total)
print('DER of the network on the validation set: %f %%' % error_rate_pct)


DER of the network on the validation set: 2.850187 %
