In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import os
import csv
import numpy as np
import re
from typing import List, Any


In [None]:
# Filesystem locations shared by the training and inference cells below.
# These are plain string literals: none of them interpolates a value, so the
# f-prefix is unnecessary.
# Checkpoint path: train() saves the state_dict here, infer() loads it.
model_path = "../models/ArabicBiLSTMModel.pth"
# Undiacritized text that infer() reads line by line.
input_path = "../input/dataset_no_diacritics.txt"
# Diacritized text written by infer(); a sibling .csv with the same stem is
# written next to it.
output_path = "../output/output.txt"

In [3]:

# Configurations
# Device that batches and prediction inputs are moved to: CUDA when available,
# otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Global registries
# Name -> class lookup tables. They are filled by the @register_dataset /
# @register_model decorators and read by generate_dataset / generate_model.
DATASET_REGISTRY: dict[str, Any] = {}
MODEL_REGISTRY: dict[str, Any] = {}

# Model hyperparameters
# EMBEDDING_DIM / HIDDEN_DIM size the embedding and the per-direction LSTM state,
# NUM_LAYERS / DROPOUT are passed to nn.LSTM, and BATCH_SIZE / NUM_EPOCHS /
# LEARNING_RATE drive the DataLoader, the epoch loop and Adam in train().
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
BATCH_SIZE = 32
NUM_EPOCHS = 5
LEARNING_RATE = 0.001
NUM_LAYERS = 3
DROPOUT = 0.2

# Data parameters
# NOTE(review): np.load(..., allow_pickle=True) unpickles these files, and
# unpickling can execute arbitrary code. Only load .pkl files from a trusted source.
ARABIC_LETTERS = sorted(
    np.load('../data/utils/arabic_letters.pkl', allow_pickle=True))
DIACRITICS = sorted(np.load(
    '../data/utils/diacritics.pkl', allow_pickle=True))
# Sentence delimiters: ArabicDataset.load_data splits each line on these.
PUNCTUATIONS = {".", "،", ":", "؛", "؟", "!", '"', "-"}

# Every character load_data keeps; anything outside this set is removed.
VALID_CHARS = set(ARABIC_LETTERS).union(
    set(DIACRITICS)).union(PUNCTUATIONS).union({" "})

# Input vocabulary: letters take ids 0..len-1 in sorted order, followed by the
# space character and finally the padding token.
CHAR2ID = {char: id for id, char in enumerate(ARABIC_LETTERS)}
CHAR2ID[" "] = len(ARABIC_LETTERS)
CHAR2ID["<PAD>"] = len(ARABIC_LETTERS) + 1
# PAD is also used as the fill value for label rows and as the loss ignore_index.
PAD = CHAR2ID["<PAD>"]
SPACE = CHAR2ID[" "]
ID2CHAR = {id: char for char, id in CHAR2ID.items()}

# Label vocabulary: diacritic string -> class id, plus the inverse mapping.
# Presumably contains '' (no diacritic) and two-character stacked marks, since
# extract_diacritics looks both up -- verify against the pickle.
DIACRITIC2ID = np.load('../data/utils/diacritic2id.pkl', allow_pickle=True)
ID2DIACRITIC = {id: diacritic for diacritic, id in DIACRITIC2ID.items()}


In [4]:

def register_dataset(name):
    """Build a class decorator that records the class in DATASET_REGISTRY.

    The decorated class is stored under ``name`` (replacing any entry already
    using that key) and returned unchanged, so decorating a class does not
    alter it.
    """
    def _add_to_registry(dataset_cls):
        DATASET_REGISTRY.update({name: dataset_cls})
        return dataset_cls

    return _add_to_registry


def generate_dataset(dataset_name: str, *args, **kwargs):
    """Instantiate the dataset class registered under ``dataset_name``.

    Args:
        dataset_name: Key the class was registered with in DATASET_REGISTRY.
        *args: Positional arguments forwarded to the class constructor.
        **kwargs: Keyword arguments forwarded to the class constructor.

    Returns:
        A new instance of the registered class.

    Raises:
        ValueError: If nothing is registered under ``dataset_name``. The
            originating ``KeyError`` is attached as ``__cause__``.
    """
    try:
        dataset_cls = DATASET_REGISTRY[dataset_name]
    except KeyError as err:
        # Chain explicitly so the traceback reads as a deliberate translation
        # of the lookup failure rather than a second, unrelated error.
        raise ValueError(
            f"Dataset '{dataset_name}' is not recognized.") from err
    return dataset_cls(*args, **kwargs)



def register_model(name):
    """Build a class decorator that records the class in MODEL_REGISTRY.

    The decorated class is stored under ``name`` (replacing any entry already
    using that key) and returned unchanged, so decorating a class does not
    alter it.
    """
    def _add_to_registry(model_cls):
        MODEL_REGISTRY.update({name: model_cls})
        return model_cls

    return _add_to_registry


def generate_model(model_name: str, *args, **kwargs):
    """Instantiate the model class registered under ``model_name``.

    Args:
        model_name: Key the class was registered with in MODEL_REGISTRY.
        *args: Positional arguments forwarded to the class constructor.
        **kwargs: Keyword arguments forwarded to the class constructor.

    Returns:
        A new instance of the registered class.

    Raises:
        ValueError: If nothing is registered under ``model_name``. The
            originating ``KeyError`` is attached as ``__cause__``.
    """
    try:
        model_cls = MODEL_REGISTRY[model_name]
    except KeyError as err:
        # Chain explicitly so the traceback reads as a deliberate translation
        # of the lookup failure rather than a second, unrelated error.
        raise ValueError(f"Model '{model_name}' is not recognized.") from err
    return model_cls(*args, **kwargs)


In [5]:

@register_dataset("ArabicDataset")
class ArabicDataset(Dataset):
    """Character-level dataset pairing undiacritized text with diacritic labels.

    Each item is ``(char_ids, diacritic_ids)``: two int64 tensors of the same
    length, right-padded with ``PAD`` to the longest sentence in the file.
    """

    def __init__(self, file_path: str):
        """Read ``file_path`` and encode every sentence into tensors up front."""
        self.data_X, self.data_Y = self.generate_tensor_data(file_path)

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.data_X)

    def __getitem__(self, idx):
        """Return the ``(char_ids, diacritic_ids)`` pair at ``idx``."""
        return self.data_X[idx], self.data_Y[idx]

    def generate_tensor_data(self, data_path: str):
        """Run load -> strip diacritics -> encode/pad and return two tensors.

        Returns:
            ``(data_X, data_Y)`` int64 tensors with one row per sentence.
        """
        data_Y = self.load_data(data_path)
        data_X = self.extract_text_without_diacritics(data_Y)

        encoded_data_X, encoded_data_Y = self.encode_data(data_X, data_Y)
        data_X = torch.tensor(encoded_data_X, dtype=torch.int64)
        data_Y = torch.tensor(encoded_data_Y, dtype=torch.int64)

        return data_X, data_Y

    def load_data(self, file_path: str):
        """Read a UTF-8 text file and split it into cleaned sentences.

        Per non-empty line: drop every character outside ``VALID_CHARS``,
        collapse whitespace runs to one space, split on ``PUNCTUATIONS`` and
        keep the non-empty, stripped pieces.

        Returns:
            1-D numpy unicode array with one (still diacritized) sentence per
            element; empty when the file yields no sentences.
        """
        # The patterns do not depend on the line, so build them once per file
        # instead of re-escaping and re-joining the character sets every line.
        invalid_chars = re.compile(f'[^{re.escape("".join(VALID_CHARS))}]')
        delimiters = re.compile(f'[{re.escape("".join(PUNCTUATIONS))}]')
        whitespace_run = re.compile(r'\s+')

        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # NOTE(review): filtering runs before whitespace collapsing, so
                # tabs and other non-space whitespace are deleted rather than
                # turned into word separators -- confirm this is intended.
                line = invalid_chars.sub('', line)
                line = whitespace_run.sub(' ', line)
                for piece in delimiters.split(line):
                    piece = piece.strip()
                    if piece:
                        data.append(piece)

        # dtype=str keeps an empty result a unicode array; np.array([]) would
        # be float64 and break the string operations applied afterwards.
        return np.array(data, dtype=str)

    def extract_text_without_diacritics(self, dataY):
        """Return a copy of ``dataY`` with every DIACRITIC2ID key removed."""
        dataX = dataY.copy()
        for diacritic in DIACRITIC2ID:
            # '' is the "no diacritic" label; there is nothing to strip for it.
            if diacritic:
                dataX = np.char.replace(dataX, diacritic, '')
        return dataX

    def encode_data(self, dataX: List[str], dataY: List[str]):
        """Map sentences to id sequences and pad them to a common width.

        Args:
            dataX: Undiacritized sentences; characters missing from
                ``CHAR2ID`` are skipped.
            dataY: The same sentences with diacritics, used to derive labels.

        Returns:
            ``(padded_dataX, padded_dataY)`` int64 arrays of shape
            ``(n_sentences, max_sentence_len)`` filled with ``PAD``.
        """
        encoded_data_X = [
            [CHAR2ID[char] for char in sentence if char in CHAR2ID]
            for sentence in dataX
        ]
        encoded_data_Y = [
            self.extract_diacritics(sentence) for sentence in dataY
        ]

        # default=0 lets an empty dataset produce (0, 0) arrays instead of
        # raising from max() on an empty iterable.
        max_sentence_len = max(
            (len(seq) for seq in encoded_data_X), default=0)

        padded_dataX = self._pad(encoded_data_X, max_sentence_len)
        padded_dataY = self._pad(encoded_data_Y, max_sentence_len)

        return padded_dataX, padded_dataY

    @staticmethod
    def _pad(sequences, width):
        """Right-pad ``sequences`` with ``PAD`` into an int64 matrix."""
        padded = np.full((len(sequences), width), PAD, dtype=np.int64)
        for row, seq in enumerate(sequences):
            padded[row, :len(seq)] = seq
        return padded

    def extract_diacritics(self, sentence: str):
        """Return one diacritic id per encodable character of ``sentence``.

        Every character found in ``CHAR2ID`` receives exactly one label: the
        id of the diacritic (or two-character stacked diacritic) that follows
        it, or ``DIACRITIC2ID['']`` when none does. A diacritic with no
        unlabeled character before it (at the start of the sentence, or after
        a character that already has its label) is ignored, so the result
        always lines up one-to-one with the encoded input characters.
        """
        result = []
        i = 0
        n = len(sentence)
        # True while the most recent base character still awaits its label.
        on_char = False

        while i < n:
            ch = sentence[i]
            if ch in DIACRITICS:
                if not on_char:
                    # Orphan mark: emitting a label here would shift every
                    # later label by one and could overflow the padded row.
                    i += 1
                    continue
                on_char = False
                # check if next char forms a stacked diacritic
                if i + 1 < n and sentence[i + 1] in DIACRITICS:
                    combined = ch + sentence[i + 1]
                    if combined in DIACRITIC2ID:
                        result.append(DIACRITIC2ID[combined])
                        i += 2
                        continue
                result.append(DIACRITIC2ID[ch])
            elif ch in CHAR2ID:
                if on_char:
                    # Previous character had no diacritic before this one.
                    result.append(DIACRITIC2ID[''])
                on_char = True
            i += 1
        if on_char:
            # Sentence ended on a character without a diacritic.
            result.append(DIACRITIC2ID[''])
        return result


In [6]:

@register_model("LSTMArabicModel")
class LSTMArabicModel(nn.Module):
    """Embedding -> stacked bidirectional LSTM -> linear per-character classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each character embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output classes per character.
        PAD: Id of the padding token; its embedding is kept at zero via
            ``padding_idx``.
        num_layers: Number of stacked LSTM layers. ``None`` (default) uses the
            notebook-level ``NUM_LAYERS`` at construction time.
        dropout: Dropout between LSTM layers. ``None`` (default) uses the
            notebook-level ``DROPOUT`` at construction time.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, PAD,
                 num_layers=None, dropout=None):
        super().__init__()
        # Resolve defaults at call time so edits to the config cell still take
        # effect without re-running this class definition.
        if num_layers is None:
            num_layers = NUM_LAYERS
        if dropout is None:
            dropout = DROPOUT

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        # Attribute names are part of the checkpoint format (state_dict keys),
        # so they must stay `embedding`, `lstm` and `fc`.
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=PAD)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=num_layers,
            # Inter-layer dropout has nothing to act on with a single layer;
            # pass 0 there to avoid the warning nn.LSTM would emit.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return per-character class logits.

        Args:
            x: Integer character ids shaped ``(batch, seq_len)``.

        Returns:
            Tensor shaped ``(batch, seq_len, output_dim)``.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        return self.fc(lstm_out)


In [7]:
def train(model: nn.Module, train_dataset: Dataset, model_path: str):
    """Train ``model`` on ``train_dataset`` and save its weights.

    Runs ``NUM_EPOCHS`` epochs of Adam on a cross-entropy loss that ignores
    positions labelled ``PAD``, printing the summed batch loss and the
    token-level accuracy after each epoch.

    Args:
        model: Network mapping ``(batch, seq_len)`` ids to
            ``(batch, seq_len, n_classes)`` logits. It is moved to ``DEVICE``
            in place.
        train_dataset: Dataset yielding ``(inputs, labels)`` pairs.
        model_path: Destination of the saved ``state_dict``; missing parent
            directories are created.
    """
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True
    )

    # Move the model before building the optimizer, as the torch.optim
    # documentation recommends, and use DEVICE so the model and the batches
    # below always end up on the same device.
    model = model.to(DEVICE)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCHS):
        total_correct = 0
        total_tokens = 0
        epoch_loss = 0

        model.train()
        for train_X, train_Y in tqdm(train_data_loader, desc=f"Training Epoch {epoch + 1}"):
            train_X = train_X.to(DEVICE)
            train_Y = train_Y.to(DEVICE)

            output = model(train_X)

            # Flatten (batch, seq) so every character is one classification.
            loss = criterion(output.view(-1, output.size(-1)),
                             train_Y.view(-1))
            epoch_loss += loss.item()

            # Accuracy only over real (non-padding) positions.
            mask = (train_Y != PAD)
            prediction = output.argmax(dim=-1)
            total_correct += ((prediction == train_Y) & mask).sum().item()
            total_tokens += mask.sum().item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Guard against an empty dataset (no tokens seen at all).
        epoch_acc = (total_correct / total_tokens) * 100 if total_tokens else 0.0
        # Single-line message: the previous backslash continuation inside the
        # string literal leaked the source indentation into the printed text.
        print(
            f'Epochs: {epoch + 1} | Train Loss: {epoch_loss} '
            f'| Train Accuracy: {epoch_acc}\n')

    # Make sure the target directory exists so the save cannot fail after
    # the whole training run has completed.
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), model_path)

In [8]:
def evaluate(model: torch.nn.Module, val_dataset: torch.utils.data.Dataset):
    """Print diacritization accuracy of ``model`` on ``val_dataset``.

    Three token-level accuracies are reported, all excluding positions whose
    label is ``PAD``:

    * overall;
    * "without last character": positions whose next input is neither
      ``SPACE`` nor ``PAD``;
    * "last character": positions whose next input is ``SPACE`` or ``PAD``,
      or that sit in the final column of the batch.

    A metric with no qualifying positions is printed as ``nan``.

    Args:
        model: Network mapping ``(batch, seq_len)`` ids to per-position
            logits. It is moved to ``DEVICE`` in place.
        val_dataset: Dataset yielding ``(inputs, labels)`` pairs.
    """

    def _percent(correct, total):
        # Avoid ZeroDivisionError when a category never occurs.
        return (correct / total) * 100 if total else float("nan")

    val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Use DEVICE so the model and the batches always share a device.
    model = model.to(DEVICE)

    total_correct_without_ending = 0
    total_tokens_without_ending = 0
    total_correct_ending = 0
    total_tokens_ending = 0
    total_correct = 0
    total_tokens = 0
    model.eval()
    with torch.no_grad():

        for val_X, val_Y in tqdm(val_data_loader):
            val_X = val_X.to(DEVICE)
            val_Y = val_Y.to(DEVICE)

            output = model(val_X)
            prediction = output.argmax(dim=-1)
            is_correct = (prediction == val_Y)

            padding_mask = (val_Y == PAD)
            # Character that follows each position. The last column is filled
            # with PAD explicitly: torch.roll would wrap the first character
            # of the row around, so the final character of a full-length row
            # would never be recognized as a word ending.
            next_char = torch.cat(
                [val_X[:, 1:], torch.full_like(val_X[:, :1], PAD)], dim=1)
            end_of_word_mask = (next_char == SPACE) | (next_char == PAD)

            last_char_mask = end_of_word_mask & (~padding_mask)
            rest_of_word_mask = (~end_of_word_mask) & (~padding_mask)
            everything_mask = ~padding_mask

            total_correct_ending += (is_correct & last_char_mask).sum().item()
            total_tokens_ending += last_char_mask.sum().item()

            total_correct_without_ending += (
                is_correct & rest_of_word_mask).sum().item()
            total_tokens_without_ending += rest_of_word_mask.sum().item()

            total_correct += (is_correct & everything_mask).sum().item()
            total_tokens += everything_mask.sum().item()

        val_accuracy = _percent(total_correct, total_tokens)
        val_accuracy_without_ending = _percent(
            total_correct_without_ending, total_tokens_without_ending)
        val_accuracy_ending = _percent(
            total_correct_ending, total_tokens_ending)
        print(
            f"Validation Accuracy (Overall): {val_accuracy:.2f}%\n" +
            f"Validation Accuracy (Without Last Character): {val_accuracy_without_ending:.2f}%\n" +
            f"Validation Accuracy (Last Character): {val_accuracy_ending:.2f}%\n")


In [9]:
def predict(model, encoded_sentence):
    """Return the predicted class id for every position of one sentence.

    Args:
        model: Callable mapping a ``(1, seq_len)`` int64 tensor on ``DEVICE``
            to ``(1, seq_len, n_classes)`` scores.
        encoded_sentence: Sequence of integer character ids.

    Returns:
        1-D numpy int64 array with one class id per input position; empty
        when ``encoded_sentence`` is empty.
    """
    # An empty sentence (e.g. a blank input line) has nothing to label, and
    # PyTorch recurrent layers are expected to reject zero-length sequences,
    # so answer directly without calling the model.
    if len(encoded_sentence) == 0:
        return np.empty(0, dtype=np.int64)

    # Wrap in a list to add the batch dimension the model expects.
    input_tensor = torch.tensor(
        [encoded_sentence], dtype=torch.int64).to(DEVICE)
    with torch.no_grad():
        outputs = model(input_tensor)
    # argmax over classes, drop the batch dimension, hand back a numpy array.
    return outputs.argmax(dim=-1).squeeze(0).cpu().numpy()

In [None]:
def infer(model, model_path, input_path, output_path):
    """Diacritize a text file with a saved checkpoint.

    Loads the ``state_dict`` at ``model_path`` into ``model``, predicts a
    diacritic for every encodable character of each line in ``input_path``
    and writes two files:

    * ``output_path``: one diacritized line per input line;
    * the same path with a ``.csv`` extension: ``ID,Label`` rows, one per
      Arabic letter, numbered consecutively across the whole input.

    Args:
        model: Network whose architecture matches the checkpoint. It is
            moved to ``DEVICE`` in place.
        model_path: Path of the saved ``state_dict``.
        input_path: UTF-8 text file to diacritize.
        output_path: Destination text file; missing parent directories are
            created.
    """
    # NOTE(review): torch.load unpickles the checkpoint; only load files from
    # a trusted source.
    model_state_dict = torch.load(model_path, map_location=DEVICE)
    model.load_state_dict(model_state_dict)
    # predict() places its input on DEVICE, so the model must be there too;
    # otherwise this fails on a GPU machine unless an earlier cell moved it.
    model.to(DEVICE)

    with open(input_path, 'r', encoding='utf-8') as f:
        input_data = f.readlines()

    # Set membership instead of scanning the letter list for every character.
    arabic_letters = set(ARABIC_LETTERS)

    output_list = []
    output_csv = [["ID", "Label"]]
    current_id = 0

    model.eval()
    for sentence in input_data:
        # NOTE(review): characters missing from CHAR2ID (punctuation, digits,
        # the newline) are dropped from the written text -- confirm intended.
        encoded_sentence = [CHAR2ID[char]
                            for char in sentence if char in CHAR2ID]

        if not encoded_sentence:
            # Blank / non-Arabic line: keep the output aligned line-by-line
            # with the input instead of feeding an empty sequence to the model.
            output_list.append("")
            continue

        predictions = predict(model, encoded_sentence)

        pieces = []
        for char_id, diacritic_id in zip(encoded_sentence, predictions):
            # Plain int so dictionary lookup and CSV output do not depend on
            # numpy scalar behavior.
            diacritic_id = int(diacritic_id)
            char = ID2CHAR[char_id]
            if char in arabic_letters:
                # Only letters are scored; spaces get no CSV row.
                output_csv.append([current_id, diacritic_id])
                current_id += 1
            pieces.append(char + ID2DIACRITIC[diacritic_id])

        output_list.append("".join(pieces))

    # Both output files live in the same directory; create it if needed.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for line in output_list:
            f.write(line + '\n')

    output_path_csv = os.path.splitext(output_path)[0] + ".csv"
    with open(output_path_csv, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(output_csv)

In [11]:
# Build the training set: the constructor reads ../data/train.txt and encodes
# every sentence into padded tensors up front.
train_dataset = generate_dataset("ArabicDataset", "../data/train.txt")

In [12]:
# Build the validation set from ../data/val.txt with the same preprocessing.
val_dataset = generate_dataset("ArabicDataset", "../data/val.txt")

In [13]:
# Instantiate the registered model. The input vocabulary is CHAR2ID (letters,
# space and <PAD>); there is one output class per entry of DIACRITIC2ID.
model = generate_model(
    model_name="LSTMArabicModel",
    vocab_size=len(CHAR2ID),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=len(DIACRITIC2ID),
    PAD=PAD
)

In [14]:
# Load previously saved weights into the model. This cell needs the checkpoint
# at model_path to exist already; map_location remaps its tensors onto DEVICE.
model_state_dict = torch.load(model_path, map_location=DEVICE)
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [None]:
# Train for NUM_EPOCHS epochs and write the resulting weights to model_path.
train(model, train_dataset, model_path)

In [15]:
# Report overall, within-word and word-ending accuracy on the validation set.
evaluate(model, val_dataset)

100%|██████████| 284/284 [00:14<00:00, 19.25it/s]

Validation Accuracy (Overall): 97.73%
Validation Accuracy (Without Last Character): 98.26%
Validation Accuracy (Last Character): 95.69%






In [None]:
# Reload the checkpoint, diacritize input_path, and write output_path plus its
# .csv companion.
infer(model, model_path, input_path, output_path)