In [3]:
# to reload modules automatically without having to restart the kernel
%load_ext autoreload
%autoreload 2

# NOTE: the original import order is kept on purpose — `from train_collections import *`
# can rebind names, so everything imported after it must stay after it.
import torch
import torch.optim as optim
import torch.utils.data as data
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
import numpy as np
import pandas as pd
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# --- Hyperparameters and runtime configuration -------------------------------
# Training schedule
n_epochs = 1      # number of full passes over the training loader
batch_size = 64   # sequences per mini-batch, shared by every DataLoader in this notebook

# Model sizes (not referenced by the visible cells; presumably consumed by model
# definitions elsewhere — TODO confirm they are still needed)
embedding_dim = 64
n_hidden = 256

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Training data: the dataset is built with its default arguments (only the target
# device is passed), then wrapped in a DataLoader that reshuffles every epoch.
dataset = LettersDataset(device=device)
loader = data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

w = 415


In [6]:
# Vocabulary sizes reported by the dataset: inputs are characters, outputs are
# harakat (diacritic) classes. Both are passed to the model constructor later.
n_chars, n_harakat = (
    dataset.get_input_vocab_size(),
    dataset.get_output_vocab_size(),
)
print("n_chars: ", n_chars)
print("n_harakat: ", n_harakat)

n_chars:  41
n_harakat:  15


In [13]:
from models.Accio import Accio

# Build the network on the selected device, sized by the dataset vocabularies.
model = Accio(input_size=n_chars, output_size=n_harakat, device=device)
model = model.to(device)

# Loss over harakat classes; positions whose target equals the pad id are skipped.
# NOTE(review): the ignored index is taken from the *character* encoder while the
# targets are harakat ids — confirm the dataset pads targets with this same id,
# otherwise padded positions still contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.char_encoder.get_pad_id())

# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [14]:
# Train for `n_epochs` epochs. After each epoch the model is scored and, when the
# score improves, a snapshot of its weights is kept in `best_model`.
num_batches = len(loader)
print("Number of batches:", num_batches)
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    # Wrap the loader itself (not `enumerate(...)`) so tqdm knows the total and
    # can display progress and an ETA instead of a bare iteration counter.
    for i, (X_batch, y_batch) in enumerate(tqdm(loader, total=num_batches)):
        # The model emits classes on the last dim; CrossEntropyLoss expects them
        # on dim 1, i.e. (batch_size, n_classes, seq_len).
        y_pred = model(X_batch).transpose(1, 2)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss.item()))

    # Evaluation pass.
    # NOTE(review): this iterates the *training* loader, so the number reported
    # below is a training loss, not a held-out validation loss. Swap in a
    # validation DataLoader once one is available at this point in the notebook.
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch).transpose(1, 2)
            # .item() keeps the accumulator a plain float instead of a device tensor.
            total_loss += loss_fn(y_pred, y_batch).item()
    # Report the mean per-batch loss so the value is comparable across dataset sizes.
    epoch_loss = total_loss / max(num_batches, 1)
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        # state_dict() returns references to the live parameters; clone them so
        # later optimizer steps cannot overwrite the saved "best" weights.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, epoch_loss))


Number of batches: 3150


0it [00:00, ?it/s]

1it [00:02,  2.03s/it]

Epoch 0, batch 0: Loss = 2.7121


101it [01:07,  1.38it/s]

Epoch 0, batch 100: Loss = 0.1027


201it [02:27,  1.15it/s]

Epoch 0, batch 200: Loss = 0.0710


301it [03:57,  1.09it/s]

Epoch 0, batch 300: Loss = 0.0409


401it [05:35,  1.02it/s]

Epoch 0, batch 400: Loss = 0.0456


501it [07:18,  1.10s/it]

Epoch 0, batch 500: Loss = 0.0283


601it [09:05,  1.14s/it]

Epoch 0, batch 600: Loss = 0.0237


701it [10:57,  1.06s/it]

Epoch 0, batch 700: Loss = 0.0217


801it [12:48,  1.14s/it]

Epoch 0, batch 800: Loss = 0.0207


901it [14:42,  1.12s/it]

Epoch 0, batch 900: Loss = 0.0182


1001it [16:34,  1.12s/it]

Epoch 0, batch 1000: Loss = 0.0164


1101it [18:28,  1.16s/it]

Epoch 0, batch 1100: Loss = 0.0135


1201it [20:20,  1.15s/it]

Epoch 0, batch 1200: Loss = 0.0187


1301it [22:14,  1.10s/it]

Epoch 0, batch 1300: Loss = 0.0183


1401it [24:10,  1.15s/it]

Epoch 0, batch 1400: Loss = 0.0202


1501it [25:57,  1.10s/it]

Epoch 0, batch 1500: Loss = 0.0198


1601it [27:41,  1.08s/it]

Epoch 0, batch 1600: Loss = 0.0147


1701it [29:27,  1.06s/it]

Epoch 0, batch 1700: Loss = 0.0138


1801it [31:11,  1.03s/it]

Epoch 0, batch 1800: Loss = 0.0152


1901it [32:56,  1.03s/it]

Epoch 0, batch 1900: Loss = 0.0171


2001it [34:41,  1.03s/it]

Epoch 0, batch 2000: Loss = 0.0148


2101it [36:27,  1.06s/it]

Epoch 0, batch 2100: Loss = 0.0148


2201it [38:11,  1.05s/it]

Epoch 0, batch 2200: Loss = 0.0232


2301it [39:55,  1.07s/it]

Epoch 0, batch 2300: Loss = 0.0095


2401it [41:43,  1.06s/it]

Epoch 0, batch 2400: Loss = 0.0108


2501it [43:30,  1.07s/it]

Epoch 0, batch 2500: Loss = 0.0127


2601it [45:17,  1.05s/it]

Epoch 0, batch 2600: Loss = 0.0096


2701it [47:05,  1.09s/it]

Epoch 0, batch 2700: Loss = 0.0116


2801it [48:54,  1.01s/it]

Epoch 0, batch 2800: Loss = 0.0155


2901it [50:41,  1.07s/it]

Epoch 0, batch 2900: Loss = 0.0172


3001it [52:26,  1.07s/it]

Epoch 0, batch 3000: Loss = 0.0149


3101it [54:12,  1.04s/it]

Epoch 0, batch 3100: Loss = 0.0127


3150it [55:01,  1.05s/it]


KeyboardInterrupt: 

In [16]:
# Run the trained model over the validation split and write one
# (letter, predicted haraka id) row per Arabic letter to a CSV file.
val_dataset = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv', val_mode=True, device=device)

# No shuffling: the accuracy cell compares this output with the gold file row by
# row, so the order of predictions must be deterministic.
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)
print(val_dataset.char_encoder.word2idx)

# Loop-invariant lookups, hoisted out of the per-character loop.
val_char_encoder = val_dataset.char_encoder
val_pad_id = val_char_encoder.get_pad_id()

# Predict a haraka for every character of the validation set.
model.eval()
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in val_loader:
        # Model output has classes on dim 2, so argmax there yields the predicted
        # class id per position (same result as transposing and taking max over dim 1).
        predicted = model(X_batch).argmax(dim=2)
        # Convert each batch to Python lists once; calling .item() per character
        # would force a device-to-host sync for every single element.
        for char_ids, haraka_ids in zip(X_batch.tolist(), predicted.tolist()):
            for char_id, haraka_id in zip(char_ids, haraka_ids):
                # First pad id marks the end of the sentence.
                if char_id == val_pad_id:
                    break
                # Only positions the encoder recognises as Arabic letters are
                # recorded; the encoder's truthy return value is stored as the letter.
                ll = val_char_encoder.is_arabic_letter(char_id)
                if ll:
                    letter_haraka.append([ll, haraka_id])

# Save (ID, letter, label) rows; ID is the running row index.
df = pd.DataFrame(letter_haraka, columns=['letter','label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')



w = 1129
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


OutOfMemoryError: CUDA out of memory. Tried to allocate 916.00 MiB (GPU 0; 4.00 GiB total capacity; 2.06 GiB already allocated; 0 bytes free; 3.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [21]:
# Per-letter accuracy: compare the predicted labels with the gold labels, row by
# row (positionally), and report the share of matching rows.
gold_val = pd.read_csv('clean_out/val_gold.csv', index_col=0)
sys_val = pd.read_csv('results/letter_haraka.csv', index_col=0)

total = len(gold_val)
# The comparison is positional, so a length mismatch means the two files are not
# aligned and any accuracy computed from them would be meaningless.
if len(sys_val) != total:
    raise ValueError(
        "gold file has %d rows but system output has %d rows; cannot align them"
        % (total, len(sys_val))
    )

# Vectorized element-wise comparison instead of a Python loop over .iloc rows.
correct = int((gold_val['label'].to_numpy() == sys_val['label'].to_numpy()).sum())

print("Accuracy: %.2f%%" % (100.0 * correct / total))

Accuracy: 95.32%


In [23]:
# Persist only the learned weights (the state dict), not the whole module object.
# To restore them later, rebuild the network with the same constructor arguments
# and load the saved weights into it:
#   model = Accio(input_size=n_chars, output_size=n_harakat, device=device).to(device)
#   model.load_state_dict(torch.load('models/Accio.pth'))
torch.save(obj=model.state_dict(), f='models/Accio.pth')

In [16]:
# Diacritic error rate = 100% - accuracy, from the `correct` / `total` counts of
# the accuracy cell. Printed with two decimals: `%d` would truncate the value
# (e.g. 4.68 -> 4) and disagree with the precision of the accuracy report.
print('DER of the network on the validation set: %.2f %%' % (100 * (1 - correct / total)))


DER of the network on the validation set: 8 %


In [17]:
# val_dataset = LettersDataset('clean_out/X_test.csv', 'clean_out/Y_test.csv', device=device)   

# val_loader = data.DataLoader(val_dataset, shuffle=True, batch_size=batch_size)

# # evaluate accuracy on validation set


# model.eval()
# correct = 0
# total = 0

# with torch.no_grad():
#     for (X_batch,y_batch) in val_loader:
#         is_padding = X_batch == val_dataset.char_encoder.get_pad_id()
#         # y_pred = model(X_batch)['diacritics']
#         y_pred = model(X_batch)
#         y_pred = y_pred.transpose(1, 2) 
#         _, predicted = torch.max(y_pred.data, 1)
#         # Count only non-padding characters
#         total += torch.sum(~is_padding).item()
        
#         # Count correct predictions
#         correct += torch.sum((predicted == y_batch) & (~is_padding)).item()
# print("Accuracy: %.2f%%" % (100 * correct / total))

