In [19]:
# to reload modules automatically without having to restart the kernel
%load_ext autoreload
%autoreload 2

import torch
import torch.optim as optim
import torch.utils.data as data
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
from tqdm import tqdm
import pandas as pd
import numpy as np
from nltk.stem.isri import ISRIStemmer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Load the merged corpus as one string.
with open("clean_out/merged_unsplited.txt", "r", encoding="utf8") as f:
    text = f.read()

stemmer = ISRIStemmer()

# Arabic commas and hyphens are deleted outright (not replaced by a space);
# the text is then cut into lines and each line into whitespace-separated tokens.
text = text.replace("،", "").replace("-", "")
text = [line.split() for line in text.split("\n")]

# Running (cumulative) token count at the end of every line.
lengths = np.cumsum([len(tokens) for tokens in text])

# Replace every token by its ISRI stem.
text = [[stemmer.stem(token) for token in tokens] for tokens in text]

# Vocabulary = all distinct stems plus the three special markers.
vocab = set([stem for tokens in text for stem in tokens] + ["<S>", "</S>", "<UNK>"])
vocab_size = len(vocab)

In [21]:
# Two-way lookup between vocabulary entries and integer ids.
# NOTE(review): ids follow the iteration order of the `vocab` set, which for strings
# can differ between interpreter runs (hash randomization) — sort the vocabulary first
# if the ids ever need to be stable across sessions.
idx2word = dict(enumerate(vocab))
word2idx = {word: i for i, word in idx2word.items()}


In [22]:
# Training hyper-parameters and compute device (CUDA when available, otherwise CPU)
n_epochs = 20
batch_size = 64
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [24]:
# Load the training split (val_mode=False) and wrap it in a shuffling DataLoader.
# The dataset is constructed exactly once; a second construction whose result is
# immediately rebound would only repeat the loading work.
dataset = LettersDataset('clean_out/X_train.csv', 'clean_out/y_train.csv', val_mode=False, device=device)
loader = data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Sanity check: pull one batch and show it.
sample = next(iter(loader))
print(sample)

w = 417


In [None]:
# Input / output vocabulary sizes as reported by the dataset; these size the model below.
n_chars, n_harakat = dataset.get_input_vocab_size(), dataset.get_output_vocab_size()
print("n_chars: ", n_chars)
print("n_harakat: ", n_harakat)

n_chars:  41
n_harakat:  15


In [None]:
def save_checkpoint(model, optimizer, epoch, loss, filename):
    """Write a training checkpoint to ``filename`` with ``torch.save``.

    The saved object is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss', holding the given epoch and loss together
    with the current state dicts of ``model`` and ``optimizer``.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
    )
    torch.save(checkpoint, filename)
    
    
def load_checkpoint(model, optimizer, filename, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss' (the layout written by ``save_checkpoint``).

    Args:
        model: module whose parameters are overwritten via ``load_state_dict``.
        optimizer: optimizer whose state is overwritten via ``load_state_dict``.
        filename: path of the checkpoint file.
        map_location: forwarded to ``torch.load``. When None (the default) the
            checkpoint is mapped onto the device of the model's first parameter,
            so a checkpoint saved on a GPU can still be restored when the model
            lives on the CPU.

    Returns:
        (epoch, loss): the epoch to resume from (saved epoch + 1) and the saved loss.
    """
    if map_location is None:
        # Default to wherever the model currently lives; a parameter-less model
        # leaves map_location as None (plain torch.load behaviour).
        first_param = next(model.parameters(), None)
        if first_param is not None:
            map_location = first_param.device

    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch'] + 1
    loss = checkpoint['loss']
    return epoch, loss

In [None]:
from models.Accio import Accio

# Build the network with the dataset's vocabulary sizes and place it on the chosen device.
model = Accio(input_size=n_chars, output_size=n_harakat, device=device)
model = model.to(device)
# model.load_state_dict(torch.load("models/Accio_deep_9.pth"))

# Adam with its default settings; plain cross-entropy as the training criterion.
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

# Restore weights and optimizer state from a previous run.
# The returned (epoch, loss) pair is deliberately discarded.
_ = load_checkpoint(model, optimizer, "models/Accio_deep_19.pth")

In [13]:
num_batches = len(loader)
print("Number of batches:", num_batches)
best_model = None
best_loss = float('inf')
for epoch in range(n_epochs):
    torch.cuda.empty_cache()  # Clear CUDA cache

    # ---- training pass ----
    model.train()
    # tqdm wraps the loader itself (not the enumerate generator) so it knows the
    # number of batches and can draw a real progress bar.
    for i, (X_batch, y_batch) in enumerate(tqdm(loader)):
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss.item()))

    # ---- evaluation pass ----
    # NOTE(review): this iterates over `loader`, i.e. the same (training) data used above,
    # not a held-out split.
    model.eval()
    epoch_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            # y_pred = model(X_batch)['diacritics']
            y_pred = model(X_batch).transpose(1, 2)
            # .item() keeps the running total a plain Python float instead of a tensor
            epoch_loss += loss_fn(y_pred, y_batch).item()

    # Checkpoint after evaluating so the file records the measured loss rather than 0.
    # torch.save(model.state_dict(), f'models/Accio_deept_{epoch}.pth')
    save_checkpoint(model, optimizer, epoch, epoch_loss, f'models/Accio_deep_{epoch}.pth')

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        # state_dict() hands back references to the live parameters; clone them so
        # later epochs cannot overwrite the stored best weights.
        best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    # The reported value is the sum of the per-batch losses over the whole loader.
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, epoch_loss))


Number of batches: 176


1it [00:00,  2.18it/s]

Epoch 0, batch 0: Loss = 0.0014


101it [00:25,  4.05it/s]

Epoch 0, batch 100: Loss = 0.0010


176it [00:44,  3.94it/s]


Epoch 0: Cross-entropy: 0.0719


1it [00:00,  3.90it/s]

Epoch 1, batch 0: Loss = 0.0005


101it [00:25,  3.78it/s]

Epoch 1, batch 100: Loss = 0.0004


176it [00:44,  3.92it/s]


Epoch 1: Cross-entropy: 0.0350


1it [00:00,  3.60it/s]

Epoch 2, batch 0: Loss = 0.0001


101it [00:25,  3.83it/s]

Epoch 2, batch 100: Loss = 0.0002


176it [00:44,  3.92it/s]


Epoch 2: Cross-entropy: 0.0213


1it [00:00,  3.70it/s]

Epoch 3, batch 0: Loss = 0.0002


101it [00:26,  3.46it/s]

Epoch 3, batch 100: Loss = 0.0000


176it [00:47,  3.73it/s]


Epoch 3: Cross-entropy: 0.0129


1it [00:00,  3.71it/s]

Epoch 4, batch 0: Loss = 0.0001


101it [00:26,  3.81it/s]

Epoch 4, batch 100: Loss = 0.0000


176it [00:46,  3.82it/s]


Epoch 4: Cross-entropy: 0.0134


1it [00:00,  3.93it/s]

Epoch 5, batch 0: Loss = 0.0001


101it [00:25,  3.82it/s]

Epoch 5, batch 100: Loss = 0.0001


176it [00:44,  3.93it/s]


Epoch 5: Cross-entropy: 0.0333


1it [00:00,  3.60it/s]

Epoch 6, batch 0: Loss = 0.0002


101it [00:25,  3.80it/s]

Epoch 6, batch 100: Loss = 0.0028


176it [00:45,  3.90it/s]


Epoch 6: Cross-entropy: 0.3146


1it [00:00,  3.45it/s]

Epoch 7, batch 0: Loss = 0.0018


101it [00:26,  3.83it/s]

Epoch 7, batch 100: Loss = 0.0019


176it [00:45,  3.83it/s]


Epoch 7: Cross-entropy: 0.1292


1it [00:00,  3.90it/s]

Epoch 8, batch 0: Loss = 0.0004


101it [00:25,  3.76it/s]

Epoch 8, batch 100: Loss = 0.0011


176it [00:44,  3.91it/s]


Epoch 8: Cross-entropy: 0.0521


1it [00:00,  3.84it/s]

Epoch 9, batch 0: Loss = 0.0004


101it [00:26,  3.66it/s]

Epoch 9, batch 100: Loss = 0.0006


176it [00:45,  3.83it/s]


Epoch 9: Cross-entropy: 0.0276


1it [00:00,  3.90it/s]

Epoch 10, batch 0: Loss = 0.0001


101it [00:25,  4.03it/s]

Epoch 10, batch 100: Loss = 0.0001


176it [00:45,  3.87it/s]


Epoch 10: Cross-entropy: 0.0134


1it [00:00,  3.85it/s]

Epoch 11, batch 0: Loss = 0.0000


101it [00:26,  3.91it/s]

Epoch 11, batch 100: Loss = 0.0000


176it [00:45,  3.84it/s]


Epoch 11: Cross-entropy: 0.0090


1it [00:00,  3.73it/s]

Epoch 12, batch 0: Loss = 0.0000


101it [00:26,  4.03it/s]

Epoch 12, batch 100: Loss = 0.0000


176it [00:45,  3.84it/s]


Epoch 12: Cross-entropy: 0.0067


1it [00:00,  3.61it/s]

Epoch 13, batch 0: Loss = 0.0000


101it [00:26,  3.88it/s]

Epoch 13, batch 100: Loss = 0.0000


176it [00:46,  3.82it/s]


Epoch 13: Cross-entropy: 0.0063


1it [00:00,  3.90it/s]

Epoch 14, batch 0: Loss = 0.0000


101it [00:26,  3.87it/s]

Epoch 14, batch 100: Loss = 0.0000


176it [00:45,  3.84it/s]


Epoch 14: Cross-entropy: 0.0064


1it [00:00,  3.69it/s]

Epoch 15, batch 0: Loss = 0.0000


101it [00:26,  3.90it/s]

Epoch 15, batch 100: Loss = 0.0002


176it [00:46,  3.82it/s]


Epoch 15: Cross-entropy: 0.0056


1it [00:00,  3.60it/s]

Epoch 16, batch 0: Loss = 0.0000


101it [00:26,  3.92it/s]

Epoch 16, batch 100: Loss = 0.0001


176it [00:46,  3.79it/s]


Epoch 16: Cross-entropy: 0.0056


1it [00:00,  3.68it/s]

Epoch 17, batch 0: Loss = 0.0001


101it [00:25,  3.85it/s]

Epoch 17, batch 100: Loss = 0.0001


176it [00:45,  3.85it/s]


Epoch 17: Cross-entropy: 0.0058


1it [00:00,  3.94it/s]

Epoch 18, batch 0: Loss = 0.0000


101it [00:25,  3.94it/s]

Epoch 18, batch 100: Loss = 0.0000


176it [00:45,  3.90it/s]


Epoch 18: Cross-entropy: 0.0056


1it [00:00,  3.77it/s]

Epoch 19, batch 0: Loss = 0.0000


101it [00:26,  3.85it/s]

Epoch 19, batch 100: Loss = 0.0001


176it [00:45,  3.84it/s]


Epoch 19: Cross-entropy: 0.0059


In [None]:
# Validation split, loaded with val_mode=True and served without shuffling.
val_dataset = LettersDataset(
    'clean_out/X_val.csv',
    'clean_out/y_val.csv',
    val_mode=True,
    device=device,
)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

# Show the character -> id table held by the dataset's encoder.
print(val_dataset.char_encoder.word2idx)

w = 1129
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


In [None]:
# Predict one diacritic class per Arabic letter of the validation set and dump the
# (letter, label) pairs to a CSV file.
model.eval()
char_encoder = val_dataset.char_encoder
pad_id = char_encoder.get_pad_id()  # loop-invariant, so it is looked up once
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in val_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # class scores sit on the last axis of the model output: pick the best class
        # id for every character position
        predicted = y_pred.argmax(dim=2)
        # Convert each batch to nested Python lists once, instead of calling .item()
        # on every single character / prediction.
        for char_ids, class_ids in zip(X_batch.tolist(), predicted.tolist()):
            for char_id, class_id in zip(char_ids, class_ids):
                # the first padding id marks the end of the sentence
                if char_id == pad_id:
                    break
                # is_arabic_letter returns a truthy value (stored as the 'letter' column)
                # for ids that should be scored; everything else is skipped
                letter = char_encoder.is_arabic_letter(char_id)
                if letter:
                    letter_haraka.append([letter, class_id])

# save ID,Label pairs in a csv file
df = pd.DataFrame(letter_haraka, columns=['letter', 'label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')



In [None]:
# Per-letter accuracy of the saved predictions against the validation gold labels.
gold_val = pd.read_csv('clean_out/val_gold.csv', index_col=0)
sys_val = pd.read_csv('results/letter_haraka.csv', index_col=0)

# Rows are compared by position, so differing lengths mean the two files are
# misaligned and any accuracy figure would be meaningless.
if len(gold_val) != len(sys_val):
    raise ValueError(
        "gold and system files have different lengths: %d vs %d" % (len(gold_val), len(sys_val))
    )

total = len(gold_val)
# Compare the two label columns in one pass instead of one .iloc lookup per row.
correct = sum(
    gold_label == sys_label
    for gold_label, sys_label in zip(gold_val['label'].tolist(), sys_val['label'].tolist())
)

print("Accuracy: %.2f%%" % (100.0 * correct / total))

Accuracy: 98.51%


In [None]:
# Persist only the learned parameters (the state dict), not the whole pickled module.
# To reload: build the same network again, then call
#   model.load_state_dict(torch.load('models/Accio_4.pth'))
trained_weights = model.state_dict()
torch.save(trained_weights, 'models/Accio_4.pth')

In [None]:
# DER is reported as the share of scored letters whose predicted label differs from gold,
# using `correct` and `total` from the accuracy cell.
error_fraction = 1 - correct / total
print('DER of the network on the validation set: %f %%' % (100.0 * error_fraction))


DER of the network on the validation set: 1.493498 %


In [14]:
# Test split, loaded with val_mode=True and served without shuffling.
# It gets its own loader name so the validation loader is not silently rebound to test data.
test_dataset = LettersDataset('clean_out/X_test.csv', 'clean_out/y_test.csv', val_mode=True, device=device)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size)
print(test_dataset.char_encoder.word2idx)

# Predict one diacritic class per Arabic letter of the test set.
model.eval()
char_encoder = test_dataset.char_encoder
pad_id = char_encoder.get_pad_id()  # loop-invariant, so it is looked up once
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in test_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # class scores sit on the last axis of the model output: pick the best class
        # id for every character position
        predicted = y_pred.argmax(dim=2)
        # Convert each batch to nested Python lists once, instead of calling .item()
        # on every single character / prediction.
        for char_ids, class_ids in zip(X_batch.tolist(), predicted.tolist()):
            for char_id, class_id in zip(char_ids, class_ids):
                # the first padding id marks the end of the sentence
                if char_id == pad_id:
                    break
                # is_arabic_letter returns a truthy value (stored as the 'letter' column)
                # for ids that should be scored; everything else is skipped
                letter = char_encoder.is_arabic_letter(char_id)
                if letter:
                    letter_haraka.append([letter, class_id])

# save ID,Label pairs in a csv file
# NOTE: this reuses the path written by the validation cell, so that file is overwritten.
df = pd.DataFrame(letter_haraka, columns=['letter', 'label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')



w = 1174
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


In [15]:
# Per-letter accuracy of the saved predictions against the test gold labels.
gold_test = pd.read_csv('clean_out/test_gold.csv', index_col=0)
sys_test = pd.read_csv('results/letter_haraka.csv', index_col=0)

# Rows are compared by position, so differing lengths mean the two files are
# misaligned and any accuracy figure would be meaningless.
if len(gold_test) != len(sys_test):
    raise ValueError(
        "gold and system files have different lengths: %d vs %d" % (len(gold_test), len(sys_test))
    )

total = len(gold_test)
# Compare the two label columns in one pass instead of one .iloc lookup per row.
correct = sum(
    gold_label == sys_label
    for gold_label, sys_label in zip(gold_test['label'].tolist(), sys_test['label'].tolist())
)

print("Accuracy: %.2f%%" % (100.0 * correct / total))

Accuracy: 99.88%


In [16]:
# NOTE(review): the same file is passed as both the input and the label path; the labels
# (y_batch) are never read in this cell, so it presumably only serves as a placeholder — confirm.
test_dataset = LettersDataset('clean_out/X_test_no_harakat.csv', 'clean_out/X_test_no_harakat.csv', val_mode=True, device=device)

# Own loader name, so the validation loader is not silently rebound to test data.
test_loader = data.DataLoader(test_dataset, batch_size=batch_size)
print(test_dataset.char_encoder.word2idx)

# Predict one diacritic class per Arabic letter.
model.eval()
char_encoder = test_dataset.char_encoder
pad_id = char_encoder.get_pad_id()  # loop-invariant, so it is looked up once
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in test_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # class scores sit on the last axis of the model output: pick the best class
        # id for every character position
        predicted = y_pred.argmax(dim=2)
        # Convert each batch to nested Python lists once, instead of calling .item()
        # on every single character / prediction.
        for char_ids, class_ids in zip(X_batch.tolist(), predicted.tolist()):
            for char_id, class_id in zip(char_ids, class_ids):
                # the first padding id marks the end of the sentence
                if char_id == pad_id:
                    break
                # is_arabic_letter returns a truthy value (stored as the 'letter' column)
                # for ids that should be scored; everything else is skipped
                letter = char_encoder.is_arabic_letter(char_id)
                if letter:
                    letter_haraka.append([letter, class_id])

# save ID,Label pairs in a csv file
df = pd.DataFrame(letter_haraka, columns=['letter', 'label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')


# Per-letter accuracy against the test gold labels, listing every mismatch.
gold_test = pd.read_csv('clean_out/test_gold.csv', index_col=0)
sys_test = pd.read_csv('results/letter_haraka.csv', index_col=0)

# Rows are compared by position, so differing lengths mean the two files are
# misaligned and any accuracy figure would be meaningless.
if len(gold_test) != len(sys_test):
    raise ValueError(
        "gold and system files have different lengths: %d vs %d" % (len(gold_test), len(sys_test))
    )

correct = 0
total = len(gold_test)
# Iterate over plain label lists instead of doing one .iloc lookup per row.
label_pairs = zip(gold_test['label'].tolist(), sys_test['label'].tolist())
for i, (gold_label, sys_label) in enumerate(label_pairs):
    if gold_label == sys_label:
        correct += 1
    else:
        # row position followed by "gold predicted"
        print(i)
        print(gold_label, sys_label)

print("Accuracy: %.2f%%" % (100.0 * correct / total))

w = 1174
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}
904
5 3
1244
8 0
1247
0 6
1248
0 2
3277
2 14
6215
14 4
7310
0 2
7510
2 14
11403
14 0
11579
14 0
11972
2 14
19336
0 2
19344
0 6
21971
4 2
24401
0 4
39404
2 0
39405
4 6
39406
0 5
40093
4 2
46534
12 8
46535
0 5
47570
4 14
47878
4 14
47890
0 6
47892
8 0
47945
0 6
47946
0 4
48009
1 0
48061
2 4
53051
4 5
53065
0 6
53066
6 2
53161
2 0
53169
6 14
53293
2 0
53352
2 0
53369
0 4
53370
0 2
53390
0 6
54629
3 1
57516
2 4
57517
2 4
57536
0 2
57538
2 4
57668
4 0
57742
2 0
57743
4 0
57763
4 2
57827
4 0
57865
4 0
57866
4 2
57922
6 0
57923
4 0
57949
1 0
57969
2 0
57971
4 0
57976
0 2
58065
2 0
58080
0 6
58082
4 2
59330
2 0
59372
5 4
59

In [18]:
# Undiacritized test split, loaded with val_mode=True and served without shuffling.
# It gets its own loader name so the validation loader is not silently rebound to test data.
test_dataset = LettersDataset('clean_out/X_test_no_diacritics.csv', 'clean_out/Y_test_no_diacritics.csv', val_mode=True, device=device)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size)
print(test_dataset.char_encoder.word2idx)

# Predict one diacritic class per Arabic letter; only the label is kept this time.
model.eval()
char_encoder = test_dataset.char_encoder
pad_id = char_encoder.get_pad_id()  # loop-invariant, so it is looked up once
letter_haraka = []
with torch.no_grad():
    for (X_batch, y_batch) in test_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # class scores sit on the last axis of the model output: pick the best class
        # id for every character position
        predicted = y_pred.argmax(dim=2)
        # Convert each batch to nested Python lists once, instead of calling .item()
        # on every single character / prediction.
        for char_ids, class_ids in zip(X_batch.tolist(), predicted.tolist()):
            for char_id, class_id in zip(char_ids, class_ids):
                # the first padding id marks the end of the sentence
                if char_id == pad_id:
                    break
                # only ids for which is_arabic_letter returns a truthy value are scored
                if char_encoder.is_arabic_letter(char_id):
                    letter_haraka.append(class_id)

# save ID,Label pairs in a csv file
df = pd.DataFrame(letter_haraka, columns=['label'])
df.to_csv('./results/letter_diacritic.csv', index=True, index_label='ID')



w = 1184
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}
