In [5]:
%load_ext autoreload
%autoreload 2

import torch
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
import numpy as np
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# autoreload notebook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
embedding_dim = 32
n_epochs = 10
n_hidden = 128
batch_size = 64

In [7]:


dataset = LettersDataset(device=device)
loader = data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

# load val data
# da = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv')

w = 415


In [8]:
n_chars = dataset.get_input_vocab_size()
n_harakat = dataset.get_output_vocab_size()
n_harakat

15

In [9]:
def save_checkpoint(model, optimizer, epoch, loss, filename):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, filename)
    
    
def load_checkpoint(model, optimizer, filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch'] + 1
    loss = checkpoint['loss']
    return epoch, loss

In [13]:
from models.baseline import BaseLineModel

class CharModel(nn.Module):
    def __init__(self):
        super().__init__()

        # embedding and LSTM layers
        self.embedding = nn.Embedding(n_chars, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden,
                            num_layers=1, batch_first=True )
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(n_hidden, n_harakat)

    def forward(self, x):
        # pass thru embedding layer
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(self.dropout(x))
        return x

class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(n_chars, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2*n_hidden, n_harakat)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # pass thru embedding layer
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(self.dropout(x))
        return x
    
class DiacritizationModel(nn.Module):
    def __init__(self, hidden_size=256,embedding_dim=256,in_vocab=25,out_vocab=25):
        super(DiacritizationModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=in_vocab, embedding_dim=embedding_dim)
        
        self.blstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, bidirectional=True, batch_first=True)
        self.dropout1 = nn.Dropout(0.5)
        
        self.blstm2 = nn.LSTM(input_size=hidden_size*2, hidden_size=hidden_size, bidirectional=True, batch_first=True)
        self.dropout2 = nn.Dropout(0.5)
        
        self.dense1 = nn.Linear(hidden_size*2, 512)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(512,out_vocab)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.blstm1(x)
        x = self.dropout1(x)
        x, _ = self.blstm2(x)
        x = self.dropout2(x)
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dense2(x)
        return x


In [14]:

# model = BaseLineModel(n_chars, n_harakat,embedding_dim).to(device)
# model = BiLSTM().to(device)
# load model
# model_d = torch.load('models/base.pth')

# model = BaseLineModel(n_chars, n_harakat,embedding_dim,use_batch_norm=True).to(device)
# model.load_state_dict(model_d)
model = DiacritizationModel(in_vocab=n_chars,out_vocab=n_harakat).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()
num_batches = len(loader)

In [15]:

print("Number of batches:", num_batches)
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for i, (X_batch,y_batch) in tqdm(enumerate(loader)):
        y_pred = ''
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2) 
        # print(y_pred.shape)
        # print(y_batch.shape)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss))
        
    # Validation
    model.eval()
    torch.save(model.state_dict(), 'models/base.pth')
    
    loss = 0
    with torch.no_grad():
        for (X_batch,y_batch) in loader:
            # y_pred = model(X_batch)['diacritics']
            y_pred = model(X_batch)
            y_pred = y_pred.transpose(1, 2) 
            loss += loss_fn(y_pred, y_batch)
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))


Number of batches: 3150


1it [00:05,  5.27s/it]

Epoch 0, batch 0: Loss = 2.7231


101it [05:40,  3.44s/it]

Epoch 0, batch 100: Loss = 0.1140


127it [07:16,  3.43s/it]


KeyboardInterrupt: 

: 

In [None]:
val_dataset = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv',val_mode=True, device=device)   

val_loader = data.DataLoader(val_dataset,  batch_size=batch_size)
print(val_dataset.char_encoder.word2idx)
# evaluaate accuracy on validation set

model.eval()
letter_haraka = []
with torch.no_grad():
    for (X_batch,y_batch) in val_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2) 
        _, predicted = torch.max(y_pred.data, 1)
        # Count only non-padding characters
        for x,y in zip(X_batch,predicted):
            for xx,yy in zip(x,y):
                # we reached the end of the sentence
                # print(xx.item())
                # print(val_dataset.char_encoder.get_pad_id())
                # print(val_dataset.char_encoder.get_id_by_token(UNK_TOKEN))
                if xx.item() == val_dataset.char_encoder.get_pad_id():
                    break
                ll = val_dataset.char_encoder.is_arabic_letter(xx.item())
                if ll:
                    letter_haraka.append([ll,yy.item()])

# save ID,Label pairs in a csv file
import pandas as pd
df = pd.DataFrame(letter_haraka, columns=['letter','label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')



w = 1129
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


KeyboardInterrupt: 

In [None]:
gold_val = pd.read_csv('clean_out/val_gold.csv',index_col=0)
sys_val = pd.read_csv('results/letter_haraka.csv',index_col=0)
# Accuracy per letter
# print(gold_val.head())
# print(sys_val.head())   
# print(gold_val.iloc[0]['label'])

correct = 0
total = len(gold_val)
for i in range(total):
    # print(gold_val[i][0], sys_val[i][0])
    correct +=( gold_val.iloc[i]['label'] == sys_val.iloc[i]['label'])
    
print("Accuracy: %.2f%%" % (100.0 * correct / total))

Accuracy: 97.17%


In [None]:
# save model 
# torch.save(model, 'models/lstm.pth')
# save model state dict
torch.save(model.state_dict(), 'models/bilstm.pth')
# load model state dict
# model = BiLSTM()
# model.load_state_dict(torch.load('models/bilstm.pth'))
# load model
# model = torch.load('models/___.pth')

In [None]:
print('DER of the network on the validation set: %d %%' % (100 * (1 - correct / total)))


DER of the network on the validation set: 2 %


In [None]:
test_dataset = LettersDataset('clean_out/X_test.csv', 'clean_out/y_test.csv',val_mode=True, device=device)   

val_loader = data.DataLoader(test_dataset,  batch_size=batch_size)
print(test_dataset.char_encoder.word2idx)
# evaluaate accuracy on validation set

model.eval()
letter_haraka = []
with torch.no_grad():
    for (X_batch,y_batch) in val_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2) 
        _, predicted = torch.max(y_pred.data, 1)
        # Count only non-padding characters
        for x,y in zip(X_batch,predicted):
            for xx,yy in zip(x,y):
                # we reached the end of the sentence
                # print(xx.item())
                # print(test_dataset.char_encoder.get_pad_id())
                # print(test_dataset.char_encoder.get_id_by_token(UNK_TOKEN))
                if xx.item() == test_dataset.char_encoder.get_pad_id():
                    break
                ll = test_dataset.char_encoder.is_arabic_letter(xx.item())
                if ll:
                    letter_haraka.append([ll,yy.item()])

# save ID,Label pairs in a csv file
import pandas as pd
df = pd.DataFrame(letter_haraka, columns=['letter','label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')



w = 1174
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


In [None]:
gold_test = pd.read_csv('clean_out/test_gold.csv',index_col=0)
sys_test = pd.read_csv('results/letter_haraka.csv',index_col=0)
# Accuracy per letter
# print(gold_test.head())
# print(sys_test.head())   
# print(gold_test.iloc[0]['label'])

correct = 0
total = len(gold_test)
for i in range(total):
    # print(gold_test[i][0], sys_test[i][0])
    if gold_test.iloc[i]['label'] == sys_test.iloc[i]['label']:
        correct +=1
    else:
        print(i)
        print(gold_test.iloc[i]['label'], sys_test.iloc[i]['label'])
    
print("Accuracy: %.2f%%" % (100.0 * correct / total))

1247
0 6
1248
0 4
1256
0 2
1635
2 3
6214
14 4
7307
0 2
11233
4 14
11896
4 0
13275
14 2
13901
4 2
14313
2 0
14314
14 0
14315
0 8
14316
2 0
14659
8 6
15954
14 2
15959
14 0
19253
2 0
19330
0 2
19332
4 0
22111
0 8
24393
0 4
25907
8 12
26646
2 10
26975
0 4
27527
5 3
29481
14 4
31622
0 2
33133
3 1
35369
2 0
35423
0 4
37618
6 2
38085
2 4
38171
14 2
39346
2 4
39396
2 0
39397
4 6
39398
0 5
40085
4 2
41697
4 2
42617
8 14
44731
2 0
44732
4 0
46330
0 2
46532
12 8
46553
2 0
46554
8 0
47295
14 2
47867
4 14
47880
2 0
47934
0 6
47935
0 2
47941
0 2
48050
2 4
50158
0 2
53032
2 0
53054
0 6
53055
6 2
53150
2 0
53161
6 0
53264
2 4
53282
2 4
53301
2 6
53333
4 0
53399
0 2
53400
2 3
54616
3 1
57500
2 4
57501
2 4
57516
10 12
57648
0 8
57726
2 0
57727
4 0
57737
2 0
57747
4 0
57805
0 2
57860
2 0
57881
6 0
57906
6 0
57907
4 0
57933
1 0
57953
2 0
57955
4 0
57960
0 4
58029
6 0
58049
2 0
58064
0 6
58065
8 4
58066
4 2
59314
2 0
59356
5 4
59416
2 0
59418
0 4
59422
2 0
60619
4 2
63535
0 8
63547
5 4
64649
2 0
66036
0 2
