In [7]:
%load_ext autoreload
%autoreload 2

import torch
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
from letters_dataset import LettersDataset
import torch.nn as nn
from train_collections import *
import numpy as np
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# autoreload notebook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
embedding_dim = 32
n_epochs = 5
n_hidden = 128
batch_size = 64

In [10]:


dataset = LettersDataset(device=device)
loader = data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

# load val data
# da = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv')

w = 415


In [11]:
n_chars = dataset.get_input_vocab_size()
n_harakat = dataset.get_output_vocab_size()
n_harakat

15

In [19]:
from models.baseline import BaseLineModel

class CharModel(nn.Module):
    def __init__(self):
        super().__init__()

        # embedding and LSTM layers
        self.embedding = nn.Embedding(n_chars, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden,
                            num_layers=1, batch_first=True )
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(n_hidden, n_harakat)

    def forward(self, x):
        # pass thru embedding layer
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(self.dropout(x))
        return x

class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(n_chars, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2*n_hidden, n_harakat)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # pass thru embedding layer
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(self.dropout(x))
        return x

#model = BaseLineModel(n_chars, n_harakat,embedding_dim).to(device)
model = BiLSTM().to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.char_encoder.get_pad_id())
num_batches = len(loader)
print("Number of batches:", num_batches)
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for i, (X_batch,y_batch) in tqdm(enumerate(loader)):
        y_pred = ''
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2) 
        # print(y_pred.shape)
        # print(y_batch.shape)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Epoch %d, batch %d: Loss = %.4f" % (epoch, i, loss))
        
    # Validation
    model.eval()
    loss = 0
    with torch.no_grad():
        for (X_batch,y_batch) in loader:
            # y_pred = model(X_batch)['diacritics']
            y_pred = model(X_batch)
            y_pred = y_pred.transpose(1, 2) 
            loss += loss_fn(y_pred, y_batch)
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))


Number of batches: 3150


3it [00:00,  8.96it/s]

Epoch 0, batch 0: Loss = 0.0279


103it [00:06, 16.21it/s]

Epoch 0, batch 100: Loss = 0.0329


203it [00:12, 17.45it/s]

Epoch 0, batch 200: Loss = 0.0212


303it [00:18, 16.41it/s]

Epoch 0, batch 300: Loss = 0.0228


403it [00:24, 16.58it/s]

Epoch 0, batch 400: Loss = 0.0289


503it [00:30, 17.62it/s]

Epoch 0, batch 500: Loss = 0.0315


603it [00:36, 16.56it/s]

Epoch 0, batch 600: Loss = 0.0153


703it [00:41, 16.93it/s]

Epoch 0, batch 700: Loss = 0.0294


803it [00:47, 17.16it/s]

Epoch 0, batch 800: Loss = 0.0202


903it [00:53, 17.03it/s]

Epoch 0, batch 900: Loss = 0.0217


1003it [00:59, 16.10it/s]

Epoch 0, batch 1000: Loss = 0.0209


1103it [01:05, 16.46it/s]

Epoch 0, batch 1100: Loss = 0.0328


1203it [01:11, 17.19it/s]

Epoch 0, batch 1200: Loss = 0.0159


1303it [01:17, 16.87it/s]

Epoch 0, batch 1300: Loss = 0.0183


1403it [01:23, 16.55it/s]

Epoch 0, batch 1400: Loss = 0.0127


1503it [01:29, 15.55it/s]

Epoch 0, batch 1500: Loss = 0.0254


1603it [01:35, 17.64it/s]

Epoch 0, batch 1600: Loss = 0.0210


1703it [01:41, 17.08it/s]

Epoch 0, batch 1700: Loss = 0.0161


1803it [01:47, 17.60it/s]

Epoch 0, batch 1800: Loss = 0.0272


1903it [01:53, 16.80it/s]

Epoch 0, batch 1900: Loss = 0.0307


2003it [01:59, 15.98it/s]

Epoch 0, batch 2000: Loss = 0.0164


2103it [02:05, 16.86it/s]

Epoch 0, batch 2100: Loss = 0.0228


2203it [02:11, 15.80it/s]

Epoch 0, batch 2200: Loss = 0.0244


2303it [02:17, 17.15it/s]

Epoch 0, batch 2300: Loss = 0.0235


2403it [02:23, 17.04it/s]

Epoch 0, batch 2400: Loss = 0.0309


2503it [02:28, 17.50it/s]

Epoch 0, batch 2500: Loss = 0.0207


2603it [02:34, 15.58it/s]

Epoch 0, batch 2600: Loss = 0.0256


2703it [02:40, 17.35it/s]

Epoch 0, batch 2700: Loss = 0.0412


2803it [02:46, 17.07it/s]

Epoch 0, batch 2800: Loss = 0.0261


2903it [02:52, 17.25it/s]

Epoch 0, batch 2900: Loss = 0.0248


3003it [02:58, 17.80it/s]

Epoch 0, batch 3000: Loss = 0.0148


3103it [03:04, 17.81it/s]

Epoch 0, batch 3100: Loss = 0.0142


3150it [03:06, 16.85it/s]


Epoch 0: Cross-entropy: 54.0439


2it [00:00, 14.69it/s]

Epoch 1, batch 0: Loss = 0.0134


104it [00:06, 16.45it/s]

Epoch 1, batch 100: Loss = 0.0135


204it [00:12, 16.43it/s]

Epoch 1, batch 200: Loss = 0.0120


304it [00:18, 16.90it/s]

Epoch 1, batch 300: Loss = 0.0166


404it [00:24, 17.13it/s]

Epoch 1, batch 400: Loss = 0.0191


504it [00:30, 16.04it/s]

Epoch 1, batch 500: Loss = 0.0152


604it [00:36, 16.54it/s]

Epoch 1, batch 600: Loss = 0.0144


704it [00:42, 15.13it/s]

Epoch 1, batch 700: Loss = 0.0195


804it [00:48, 16.95it/s]

Epoch 1, batch 800: Loss = 0.0185


904it [00:54, 17.14it/s]

Epoch 1, batch 900: Loss = 0.0207


1004it [01:00, 17.13it/s]

Epoch 1, batch 1000: Loss = 0.0199


1104it [01:06, 16.39it/s]

Epoch 1, batch 1100: Loss = 0.0170


1204it [01:12, 17.02it/s]

Epoch 1, batch 1200: Loss = 0.0145


1304it [01:18, 16.62it/s]

Epoch 1, batch 1300: Loss = 0.0232


1404it [01:24, 16.83it/s]

Epoch 1, batch 1400: Loss = 0.0274


1504it [01:30, 17.08it/s]

Epoch 1, batch 1500: Loss = 0.0220


1604it [01:36, 15.83it/s]

Epoch 1, batch 1600: Loss = 0.0210


1704it [01:42, 16.98it/s]

Epoch 1, batch 1700: Loss = 0.0203


1804it [01:48, 16.73it/s]

Epoch 1, batch 1800: Loss = 0.0171


1904it [01:54, 16.12it/s]

Epoch 1, batch 1900: Loss = 0.0222


2004it [02:00, 15.81it/s]

Epoch 1, batch 2000: Loss = 0.0146


2104it [02:06, 16.71it/s]

Epoch 1, batch 2100: Loss = 0.0188


2204it [02:12, 16.79it/s]

Epoch 1, batch 2200: Loss = 0.0143


2304it [02:18, 16.55it/s]

Epoch 1, batch 2300: Loss = 0.0143


2404it [02:24, 16.80it/s]

Epoch 1, batch 2400: Loss = 0.0205


2504it [02:31, 15.90it/s]

Epoch 1, batch 2500: Loss = 0.0099


2604it [02:37, 17.12it/s]

Epoch 1, batch 2600: Loss = 0.0199


2704it [02:43, 16.77it/s]

Epoch 1, batch 2700: Loss = 0.0227


2804it [02:49, 17.27it/s]

Epoch 1, batch 2800: Loss = 0.0139


2904it [02:55, 16.93it/s]

Epoch 1, batch 2900: Loss = 0.0177


3004it [03:01, 16.39it/s]

Epoch 1, batch 3000: Loss = 0.0197


3104it [03:07, 17.01it/s]

Epoch 1, batch 3100: Loss = 0.0155


3150it [03:10, 16.57it/s]


Epoch 1: Cross-entropy: 46.8705


2it [00:00, 14.52it/s]

Epoch 2, batch 0: Loss = 0.0176


104it [00:06, 16.09it/s]

Epoch 2, batch 100: Loss = 0.0167


204it [00:12, 16.67it/s]

Epoch 2, batch 200: Loss = 0.0147


304it [00:18, 16.59it/s]

Epoch 2, batch 300: Loss = 0.0102


404it [00:24, 16.97it/s]

Epoch 2, batch 400: Loss = 0.0170


504it [00:30, 16.74it/s]

Epoch 2, batch 500: Loss = 0.0149


604it [00:36, 17.19it/s]

Epoch 2, batch 600: Loss = 0.0123


702it [00:42, 16.09it/s]

Epoch 2, batch 700: Loss = 0.0145


804it [00:48, 16.47it/s]

Epoch 2, batch 800: Loss = 0.0119


904it [00:54, 16.84it/s]

Epoch 2, batch 900: Loss = 0.0135


1004it [01:00, 17.27it/s]

Epoch 2, batch 1000: Loss = 0.0211


1104it [01:06, 16.73it/s]

Epoch 2, batch 1100: Loss = 0.0218


1204it [01:12, 16.95it/s]

Epoch 2, batch 1200: Loss = 0.0133


1304it [01:18, 15.83it/s]

Epoch 2, batch 1300: Loss = 0.0203


1404it [01:24, 16.57it/s]

Epoch 2, batch 1400: Loss = 0.0106


1504it [01:31, 16.56it/s]

Epoch 2, batch 1500: Loss = 0.0134


1604it [01:37, 16.85it/s]

Epoch 2, batch 1600: Loss = 0.0107


1704it [01:43, 17.11it/s]

Epoch 2, batch 1700: Loss = 0.0159


1804it [01:49, 16.71it/s]

Epoch 2, batch 1800: Loss = 0.0163


1904it [01:55, 16.51it/s]

Epoch 2, batch 1900: Loss = 0.0169


2004it [02:01, 17.29it/s]

Epoch 2, batch 2000: Loss = 0.0123


2104it [02:07, 17.09it/s]

Epoch 2, batch 2100: Loss = 0.0158


2202it [02:13, 15.66it/s]

Epoch 2, batch 2200: Loss = 0.0240


2304it [02:19, 16.72it/s]

Epoch 2, batch 2300: Loss = 0.0102


2404it [02:25, 16.47it/s]

Epoch 2, batch 2400: Loss = 0.0105


2504it [02:31, 16.35it/s]

Epoch 2, batch 2500: Loss = 0.0191


2604it [02:37, 17.09it/s]

Epoch 2, batch 2600: Loss = 0.0175


2704it [02:43, 16.75it/s]

Epoch 2, batch 2700: Loss = 0.0162


2804it [02:49, 17.26it/s]

Epoch 2, batch 2800: Loss = 0.0177


2904it [02:55, 17.21it/s]

Epoch 2, batch 2900: Loss = 0.0233


3004it [03:01, 17.34it/s]

Epoch 2, batch 3000: Loss = 0.0159


3104it [03:07, 16.96it/s]

Epoch 2, batch 3100: Loss = 0.0145


3150it [03:10, 16.55it/s]


Epoch 2: Cross-entropy: 42.6212


2it [00:00, 14.65it/s]

Epoch 3, batch 0: Loss = 0.0159


104it [00:06, 16.48it/s]

Epoch 3, batch 100: Loss = 0.0224


204it [00:12, 16.27it/s]

Epoch 3, batch 200: Loss = 0.0177


304it [00:18, 17.25it/s]

Epoch 3, batch 300: Loss = 0.0100


404it [00:24, 15.95it/s]

Epoch 3, batch 400: Loss = 0.0222


504it [00:30, 16.77it/s]

Epoch 3, batch 500: Loss = 0.0152


604it [00:36, 16.99it/s]

Epoch 3, batch 600: Loss = 0.0116


704it [00:42, 16.67it/s]

Epoch 3, batch 700: Loss = 0.0113


804it [00:48, 17.30it/s]

Epoch 3, batch 800: Loss = 0.0132


904it [00:54, 17.00it/s]

Epoch 3, batch 900: Loss = 0.0192


1004it [01:00, 16.04it/s]

Epoch 3, batch 1000: Loss = 0.0115


1104it [01:06, 16.48it/s]

Epoch 3, batch 1100: Loss = 0.0148


1204it [01:12, 16.24it/s]

Epoch 3, batch 1200: Loss = 0.0165


1304it [01:18, 15.23it/s]

Epoch 3, batch 1300: Loss = 0.0148


1402it [01:24, 15.69it/s]

Epoch 3, batch 1400: Loss = 0.0137


1504it [01:31, 16.24it/s]

Epoch 3, batch 1500: Loss = 0.0126


1602it [01:37, 15.96it/s]

Epoch 3, batch 1600: Loss = 0.0095


1704it [01:43, 16.31it/s]

Epoch 3, batch 1700: Loss = 0.0155


1804it [01:49, 16.79it/s]

Epoch 3, batch 1800: Loss = 0.0136


1902it [01:56, 13.79it/s]

Epoch 3, batch 1900: Loss = 0.0135


2002it [02:03, 14.39it/s]

Epoch 3, batch 2000: Loss = 0.0172


2104it [02:10, 13.80it/s]

Epoch 3, batch 2100: Loss = 0.0114


2202it [02:17, 14.71it/s]

Epoch 3, batch 2200: Loss = 0.0141


2304it [02:23, 16.65it/s]

Epoch 3, batch 2300: Loss = 0.0163


2404it [02:29, 16.66it/s]

Epoch 3, batch 2400: Loss = 0.0144


2502it [02:36, 15.32it/s]

Epoch 3, batch 2500: Loss = 0.0132


2604it [02:42, 16.17it/s]

Epoch 3, batch 2600: Loss = 0.0153


2702it [02:48, 15.74it/s]

Epoch 3, batch 2700: Loss = 0.0150


2804it [02:54, 15.86it/s]

Epoch 3, batch 2800: Loss = 0.0182


2904it [03:01, 14.69it/s]

Epoch 3, batch 2900: Loss = 0.0177


3002it [03:07, 14.10it/s]

Epoch 3, batch 3000: Loss = 0.0134


3104it [03:14, 15.40it/s]

Epoch 3, batch 3100: Loss = 0.0129


3150it [03:17, 15.95it/s]


Epoch 3: Cross-entropy: 40.1426


2it [00:00, 13.94it/s]

Epoch 4, batch 0: Loss = 0.0130


104it [00:06, 15.71it/s]

Epoch 4, batch 100: Loss = 0.0120


202it [00:12, 13.69it/s]

Epoch 4, batch 200: Loss = 0.0127


302it [00:19, 14.24it/s]

Epoch 4, batch 300: Loss = 0.0143


404it [00:26, 16.72it/s]

Epoch 4, batch 400: Loss = 0.0220


504it [00:32, 16.90it/s]

Epoch 4, batch 500: Loss = 0.0119


604it [00:38, 17.20it/s]

Epoch 4, batch 600: Loss = 0.0194


704it [00:44, 16.71it/s]

Epoch 4, batch 700: Loss = 0.0128


804it [00:51, 15.57it/s]

Epoch 4, batch 800: Loss = 0.0121


904it [00:57, 15.78it/s]

Epoch 4, batch 900: Loss = 0.0160


1002it [01:04, 12.75it/s]

Epoch 4, batch 1000: Loss = 0.0138


1104it [01:11, 16.72it/s]

Epoch 4, batch 1100: Loss = 0.0165


1204it [01:17, 16.60it/s]

Epoch 4, batch 1200: Loss = 0.0129


1304it [01:23, 15.88it/s]

Epoch 4, batch 1300: Loss = 0.0135


1404it [01:30, 15.77it/s]

Epoch 4, batch 1400: Loss = 0.0162


1504it [01:36, 16.05it/s]

Epoch 4, batch 1500: Loss = 0.0108


1602it [01:43, 13.71it/s]

Epoch 4, batch 1600: Loss = 0.0182


1702it [01:50, 13.08it/s]

Epoch 4, batch 1700: Loss = 0.0113


1804it [01:58, 13.93it/s]

Epoch 4, batch 1800: Loss = 0.0151


1902it [02:05, 12.88it/s]

Epoch 4, batch 1900: Loss = 0.0121


2004it [02:13, 14.63it/s]

Epoch 4, batch 2000: Loss = 0.0179


2102it [02:19, 13.73it/s]

Epoch 4, batch 2100: Loss = 0.0213


2204it [02:26, 16.74it/s]

Epoch 4, batch 2200: Loss = 0.0178


2302it [02:32, 15.13it/s]

Epoch 4, batch 2300: Loss = 0.0144


2404it [02:39, 15.26it/s]

Epoch 4, batch 2400: Loss = 0.0159


2502it [02:45, 14.58it/s]

Epoch 4, batch 2500: Loss = 0.0165


2604it [02:53, 15.37it/s]

Epoch 4, batch 2600: Loss = 0.0146


2704it [03:00, 14.81it/s]

Epoch 4, batch 2700: Loss = 0.0138


2802it [03:06, 13.04it/s]

Epoch 4, batch 2800: Loss = 0.0089


2902it [03:14, 12.44it/s]

Epoch 4, batch 2900: Loss = 0.0128


3004it [03:21, 16.97it/s]

Epoch 4, batch 3000: Loss = 0.0204


3104it [03:28, 16.59it/s]

Epoch 4, batch 3100: Loss = 0.0151


3150it [03:31, 14.92it/s]


Epoch 4: Cross-entropy: 38.3271


In [20]:
val_dataset = LettersDataset('clean_out/X_val.csv', 'clean_out/y_val.csv',val_mode=True, device=device)   

val_loader = data.DataLoader(val_dataset,  batch_size=batch_size)
print(val_dataset.char_encoder.word2idx)
# evaluaate accuracy on validation set

model.eval()
letter_haraka = []
with torch.no_grad():
    for (X_batch,y_batch) in val_loader:
        # y_pred = model(X_batch)['diacritics']
        y_pred = model(X_batch)
        # we transpose because the loss function expects the second dimension to be the classes
        # y_pred is now (batch_size, n_classes, seq_len)
        y_pred = y_pred.transpose(1, 2) 
        _, predicted = torch.max(y_pred.data, 1)
        # Count only non-padding characters
        for x,y in zip(X_batch,predicted):
            for xx,yy in zip(x,y):
                # we reached the end of the sentence
                # print(xx.item())
                # print(val_dataset.char_encoder.get_pad_id())
                # print(val_dataset.char_encoder.get_id_by_token(UNK_TOKEN))
                if xx.item() == val_dataset.char_encoder.get_pad_id():
                    break
                ll = val_dataset.char_encoder.is_arabic_letter(xx.item())
                if ll:
                    letter_haraka.append([ll,yy.item()])

# save ID,Label pairs in a csv file
import pandas as pd
df = pd.DataFrame(letter_haraka, columns=['letter','label'])
df.to_csv('./results/letter_haraka.csv', index=True, index_label='ID')



w = 1129
{'ا': 0, 'ب': 1, 'ت': 2, 'ث': 3, 'ج': 4, 'ح': 5, 'خ': 6, 'د': 7, 'ذ': 8, 'ر': 9, 'ز': 10, 'س': 11, 'ش': 12, 'ص': 13, 'ض': 14, 'ط': 15, 'ظ': 16, 'ع': 17, 'غ': 18, 'ف': 19, 'ق': 20, 'ك': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'ي': 27, 'ى': 28, 'ة': 29, 'آ': 30, 'أ': 31, 'إ': 32, 'ء': 33, 'ؤ': 34, 'ئ': 35, ' ': 36, '،': 37, '-': 38, '<pad>': 39, '<unk>': 40}


In [21]:
gold_val = pd.read_csv('clean_out/val_gold.csv',index_col=0)
sys_val = pd.read_csv('results/letter_haraka.csv',index_col=0)
# Accuracy per letter
# print(gold_val.head())
# print(sys_val.head())   
# print(gold_val.iloc[0]['label'])

correct = 0
total = len(gold_val)
for i in range(total):
    # print(gold_val[i][0], sys_val[i][0])
    correct +=( gold_val.iloc[i]['label'] == sys_val.iloc[i]['label'])
    
print("Accuracy: %.2f%%" % (100.0 * correct / total))

Accuracy: 95.32%


In [23]:
# save model 
# torch.save(model, 'models/lstm.pth')
# save model state dict
torch.save(model.state_dict(), 'models/bilstm.pth')
# load model state dict
# model = BiLSTM()
# model.load_state_dict(torch.load('models/bilstm.pth'))
# load model
# model = torch.load('models/___.pth')

In [16]:
print('DER of the network on the validation set: %d %%' % (100 * (1 - correct / total)))


DER of the network on the validation set: 8 %


In [17]:
# val_dataset = LettersDataset('clean_out/X_test.csv', 'clean_out/Y_test.csv', device=device)   

# val_loader = data.DataLoader(val_dataset, shuffle=True, batch_size=batch_size)

# # evaluaate accuracy on validation set


# model.eval()
# correct = 0
# total = 0

# with torch.no_grad():
#     for (X_batch,y_batch) in val_loader:
#         is_padding = X_batch == val_dataset.char_encoder.get_pad_id()
#         # y_pred = model(X_batch)['diacritics']
#         y_pred = model(X_batch)
#         y_pred = y_pred.transpose(1, 2) 
#         _, predicted = torch.max(y_pred.data, 1)
#         # Count only non-padding characters
#         total += torch.sum(~is_padding).item()
        
#         # Count correct predictions
#         correct += torch.sum((predicted == y_batch) & (~is_padding)).item()
# print("Accuracy: %.2f%%" % (100 * correct / total))

