In [1]:
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
import re

In [2]:
class ArabicDataset(Dataset):
    """Character-level dataset for Arabic diacritic restoration.

    Every sample is a pair of equal-length int64 tensors: the character ids
    of a sentence with its diacritics removed, and the diacritic id attached
    to each of those characters. ``len()`` and indexing expose the training
    split; the validation split is available through ``val_data_X`` and
    ``val_data_Y``.

    Inputs *and* labels are padded with ``char2id["<PAD>"]``, so that single
    id marks padding in both tensors.
    """

    def __init__(self, data_dir='../data'):
        """Load the vocabularies and encode the train/validation splits.

        Args:
            data_dir: Directory containing ``utils/arabic_letters.pkl``,
                ``utils/diacritics.pkl``, ``utils/diacritic2id.pkl``,
                ``train.txt`` and ``val.txt``. Defaults to the location the
                notebook has always used.

        Raises:
            ValueError: If the padding id is also a diacritic id, if a split
                contains no usable sentence, or if the inputs and labels of
                a sentence end up with different lengths.
        """
        # NOTE(security): allow_pickle=True unpickles arbitrary objects, which
        # can execute code. Only point data_dir at files you trust.
        utils_dir = f'{data_dir}/utils'
        self.arabic_letters = sorted(np.load(
            f'{utils_dir}/arabic_letters.pkl', allow_pickle=True))
        self.diacritics = sorted(np.load(
            f'{utils_dir}/diacritics.pkl', allow_pickle=True))
        self.punctuations = {".", "،", ":", "؛", "؟", "!", '"', "-"}

        self.valid_chars = set(self.arabic_letters).union(
            set(self.diacritics)).union(self.punctuations).union({" "})

        # Letters first, then the space, then the padding symbol.
        self.char2id = {char: idx for idx,
                        char in enumerate(self.arabic_letters)}
        self.char2id[" "] = len(self.arabic_letters)
        self.char2id["<PAD>"] = len(self.arabic_letters) + 1
        self.id2char = {idx: char for char, idx in self.char2id.items()}
        self.diacritic2id = np.load(
            f'{utils_dir}/diacritic2id.pkl', allow_pickle=True)
        self.id2diacritic = {idx: diacritic for diacritic,
                             idx in self.diacritic2id.items()}

        # Labels are padded with the character padding id; if that id were
        # also a real diacritic class, that class would be indistinguishable
        # from padding.
        if self.char2id["<PAD>"] in self.id2diacritic:
            raise ValueError(
                'char2id["<PAD>"] collides with a diacritic id; '
                'label padding would be ambiguous')

        self.train_data_X, self.train_data_Y = self._encode_split(
            f'{data_dir}/train.txt')
        self.val_data_X, self.val_data_Y = self._encode_split(
            f'{data_dir}/val.txt')

    def __len__(self):
        """Return the number of training sentences."""
        return len(self.train_data_X)

    def __getitem__(self, idx):
        """Return the ``(inputs, labels)`` tensors of training sentence ``idx``."""
        return self.train_data_X[idx], self.train_data_Y[idx]

    def _encode_split(self, file_p):
        """Load one text file and return its padded ``(inputs, labels)`` tensors.

        Both splits go through this single path so that the alignment check
        applies to validation data as well as training data.
        """
        sentences = self.load_data(file_p)
        if len(sentences) == 0:
            raise ValueError(f"No usable sentences found in {file_p}")

        inputs = [self._encode_chars(sentence) for sentence in sentences]
        labels = [self.extract_diacritics(sentence) for sentence in sentences]
        for sentence, x, y in zip(sentences, inputs, labels):
            if len(x) != len(y):
                raise ValueError(
                    f"Mismatch between input and output lengths: {len(x)} vs {len(y)}, \nsentence: {sentence}, \ninput: {x}, \noutput: {y}")

        pad_id = self.char2id["<PAD>"]
        return (self._pad_sequences(inputs, pad_id),
                self._pad_sequences(labels, pad_id))

    def _encode_chars(self, sentence):
        """Strip every diacritic from ``sentence`` and map the rest to ids."""
        stripped = str(sentence)
        for diacritic in self.diacritic2id:
            stripped = stripped.replace(diacritic, '')
        return [self.char2id[ch] for ch in stripped if ch in self.char2id]

    @staticmethod
    def _pad_sequences(sequences, pad_value):
        """Right-pad id lists to a common length and return an int64 tensor."""
        width = max(len(seq) for seq in sequences)
        padded = np.full((len(sequences), width), pad_value, dtype=np.int64)
        for row, seq in enumerate(sequences):
            if seq:
                padded[row, :len(seq)] = seq
        return torch.tensor(padded, dtype=torch.int64)

    def load_data(self, file_p):
        """Read a UTF-8 text file and return its sentences as a NumPy array.

        Each non-empty line has every character outside ``valid_chars``
        removed, runs of whitespace collapsed to one space, and is then split
        on the punctuation marks; empty pieces are discarded.
        """
        # Build both character classes once instead of once per line.
        invalid_chars = re.compile(
            f'[^{re.escape("".join(self.valid_chars))}]')
        sentence_breaks = re.compile(
            f'[{re.escape("".join(self.punctuations))}]')

        data = []
        with open(file_p, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                line = invalid_chars.sub('', line)
                line = re.sub(r'\s+', ' ', line)
                pieces = (piece.strip()
                          for piece in sentence_breaks.split(line))
                data.extend(piece for piece in pieces if piece)

        return np.array(data)

    def extract_diacritics(self, sentence):
        """Return one diacritic id per base character of ``sentence``.

        A base character is any key of ``char2id`` that is not itself a
        diacritic. The marks that follow it, up to the next base character,
        form its label; a character without marks gets ``diacritic2id['']``.
        Marks that precede the first base character are ignored, so the
        result always has as many entries as there are base characters.
        """
        marks = set(self.diacritics)

        def is_base(ch):
            return ch in self.char2id and ch not in marks

        labels = []
        pos, end = 0, len(sentence)
        while pos < end:
            if not is_base(sentence[pos]):
                # Orphan mark or unknown symbol: there is nothing to label.
                pos += 1
                continue
            run = []
            pos += 1
            while pos < end and not is_base(sentence[pos]):
                if sentence[pos] in marks:
                    run.append(sentence[pos])
                pos += 1
            labels.append(self._diacritic_label(''.join(run)))
        return labels

    def _diacritic_label(self, marks):
        """Map the marks attached to one character to a single diacritic id."""
        if not marks:
            return self.diacritic2id['']
        if marks in self.diacritic2id:
            return self.diacritic2id[marks]
        pair = marks[:2]
        if pair in self.diacritic2id:
            return self.diacritic2id[pair]
        # Unicode canonical ordering can place the vowel mark before the
        # shadda, so also try the stacked pair in the opposite order.
        if pair[::-1] in self.diacritic2id:
            return self.diacritic2id[pair[::-1]]
        # Marks that do not combine: keep the first and drop the rest so the
        # character still receives exactly one label.
        return self.diacritic2id[marks[0]]

In [15]:
class ArabicModel(nn.Module):
    """Per-character diacritic classifier: embedding -> BiLSTM -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each character embedding.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of scores produced for every position.
        PAD: Index passed to the embedding as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, PAD):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=PAD,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # The two LSTM directions are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(
            in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of id sequences to per-position scores.

        The input is embedded, run through the batch-first bidirectional
        LSTM, and every time step is projected to ``output_dim`` scores.
        """
        hidden_states, _ = self.lstm(self.embedding(x))
        return self.fc(hidden_states)

In [16]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [17]:
# Constructing the dataset reads and encodes both text splits; the loader
# below serves shuffled mini-batches of the training split.
dataset = ArabicDataset()
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

In [18]:
# Wrap the pre-encoded validation tensors so they can be batched; the order
# is kept fixed (no shuffling) for evaluation.
val_dataset = torch.utils.data.TensorDataset(
    dataset.val_data_X,
    dataset.val_data_Y,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

In [19]:
# The model is sized from the dataset vocabularies: one embedding row per
# entry of char2id and one output score per entry of diacritic2id.
model = ArabicModel(
    vocab_size=len(dataset.char2id),
    embedding_dim=128,
    hidden_dim=256,
    output_dim=len(dataset.diacritic2id),
    PAD=dataset.char2id["<PAD>"],
)
model = model.to(device)

In [None]:

learning_rate = 0.001
# Inputs and labels are both padded with this id, so it serves as the loss
# ignore_index and as the mask for accuracy.
PAD_ID = dataset.char2id["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(5):
    # ---- training pass ----
    model.train()
    for X, Y in tqdm(data_loader, desc=f"Training Epoch {epoch+1}"):
        X = X.to(device)
        Y = Y.to(device)

        outputs = model(X)
        # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), Y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    total_correct = 0
    total_tokens = 0
    total_loss = 0.0
    model.eval()
    with torch.no_grad():
        for X, Y in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
            X = X.to(device)
            Y = Y.to(device)

            mask = (Y != PAD_ID)
            batch_tokens = mask.sum().item()
            if batch_tokens == 0:
                # A batch of pure padding has no targets; the mean loss over
                # zero tokens would be NaN, so skip it.
                continue

            outputs = model(X)
            batch_loss = criterion(
                outputs.reshape(-1, outputs.size(-1)), Y.reshape(-1))
            predictions = outputs.argmax(dim=-1)

            # The criterion averages over non-padding tokens; weight by the
            # token count so the epoch figure is a per-token mean.
            total_loss += batch_loss.item() * batch_tokens
            total_correct += ((predictions == Y) & mask).sum().item()
            total_tokens += batch_tokens

    if total_tokens == 0:
        # Avoid ZeroDivisionError when the validation set is empty.
        print("Validation skipped: no non-padding tokens")
    else:
        avg_loss = total_loss / total_tokens
        avg_acc = total_correct / total_tokens * 100
        print(
            f"Validation Loss: {avg_loss:.4f}, Validation Accuracy: {avg_acc:.2f}%")

Training Epoch 1: 100%|██████████| 2912/2912 [02:55<00:00, 16.61it/s]
100%|██████████| 142/142 [00:02<00:00, 47.66it/s]


Validation Loss: 0.1129, Validation Accuracy: 96.46%


Training Epoch 2: 100%|██████████| 2912/2912 [02:54<00:00, 16.65it/s]
100%|██████████| 142/142 [00:02<00:00, 48.14it/s]


Validation Loss: 0.1112, Validation Accuracy: 96.53%


Training Epoch 3: 100%|██████████| 2912/2912 [02:54<00:00, 16.68it/s]
100%|██████████| 142/142 [00:02<00:00, 47.97it/s]


Validation Loss: 0.1094, Validation Accuracy: 96.59%


Training Epoch 4: 100%|██████████| 2912/2912 [02:55<00:00, 16.58it/s]
100%|██████████| 142/142 [00:02<00:00, 48.27it/s]


Validation Loss: 0.1110, Validation Accuracy: 96.54%


Training Epoch 5: 100%|██████████| 2912/2912 [02:55<00:00, 16.61it/s]
100%|██████████| 142/142 [00:02<00:00, 48.12it/s]

Validation Loss: 0.1096, Validation Accuracy: 96.62%



