### Imports

In [115]:
import pickle
import torch
import torch.nn as nn
import csv
import pandas as pd

%run preprocessing.ipynb

474
['ق', 'و', 'ل', 'ه', 'ل', 'ع', 'د', 'م', 'م', 'ا', 'ت', 'ت', 'ع', 'ل', 'ق', 'إ', 'ل', 'خ', 'أ', 'ي', 'ا', 'ل', 'و', 'ص', 'ي', 'ة', 'ق', 'و', 'ل', 'ه', 'م', 'ا', 'م', 'ر', 'أ', 'ي', 'ق', 'ب', 'ي', 'ل', 'ق', 'و', 'ل', 'ا', 'ل', 'م', 'ت', 'ن', 'ل', 'غ', 'ت', 'و', 'ل', 'و', 'ا', 'ق', 'ت', 'ص', 'ر', 'ع', 'ل', 'ى', 'أ', 'و', 'ص', 'ي', 'ت', 'ل', 'ه', 'ب', 'ش', 'ا', 'ة', 'أ', 'و', 'أ', 'ع', 'ط', 'و', 'ه', 'ش', 'ا', 'ة', 'و', 'ل', 'ا', 'غ', 'ن', 'م', 'ل', 'ه', 'ع', 'ن', 'د', 'ا', 'ل', 'م', 'و', 'ت', 'ه', 'ل', 'ت', 'ب', 'ط', 'ل', 'ا', 'ل', 'و', 'ص', 'ي', 'ة', 'أ', 'و', 'ي', 'ش', 'ت', 'ر', 'ى', 'ل', 'ه', 'ش', 'ا', 'ة', 'و', 'ي', 'ؤ', 'خ', 'ذ', 'م', 'ن', 'ق', 'و', 'ل', 'ه', 'ا', 'ل', 'آ', 'ت', 'ي', 'ك', 'م', 'ا', 'ل', 'و', 'ل', 'م', 'ي', 'ق', 'ل', 'م', 'ن', 'م', 'ا', 'ل', 'ي', 'و', 'ل', 'ا', 'م', 'ن', 'غ', 'ن', 'م', 'ي', 'أ', 'ن', 'ه', 'ا', 'ل', 'ا', 'ت', 'ب', 'ط', 'ل', 'و', 'ع', 'ب', 'ا', 'ر', 'ة', 'ا', 'ل', 'ك', 'ن', 'ز', 'و', 'ل', 'و', 'ل', 'م', 'ي', 'ق', 'ل', 'م', 'ن', 'م', 'ا', 'ل', 'ي', 

### Constants

In [116]:
# Architecture hyperparameters. They become the default arguments of RNN below,
# so they have to describe the network whose weights are stored in the
# checkpoint that is loaded later in this notebook.
EMBEDDING_DIM = 300
HIDDEN_SIZE = 512
NUM_LAYERS = 1
# Training-time settings; none of the cells shown in this notebook read them.
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 256
# basic_arabic_letters and DIACRITICS are not defined in this notebook;
# presumably they come from the `%run preprocessing.ipynb` in the imports cell.
# NOTE(review): the "+1" looks like one extra index reserved on top of the
# letters (e.g. padding) -- TODO confirm against preprocessing.ipynb.
VOCAB_SIZE = len(basic_arabic_letters) + 1
LABELS_SIZE = len(DIACRITICS)

# Input file for inference: test sentences without diacritics.
TEST_PATH = "./test/test_no_diacritics.txt"

# Lookup used in the Testing cell to turn a predicted diacritic into the
# integer label written to the output CSV.
# SECURITY: pickle.load can execute arbitrary code from the file it reads;
# only load a pickle that comes from a trusted source.
with open("./utils/diacritic2id.pickle", 'rb') as file:
    diacritic2id = pickle.load(file)

### Define and load the model

In [117]:
class RNN(nn.Module):
    """Bidirectional LSTM tagger: embedding -> BiLSTM -> linear class scores."""

    def __init__(
        self,
        vocab_size=VOCAB_SIZE,
        n_classes=LABELS_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
    ):
        """
        Build the three layers of the network.

        Inputs:
        - vocab_size: number of rows in the embedding table
        - n_classes: number of scores produced per position (diacritic classes)
        - embedding_dim: width of each embedding vector
        - hidden_size: LSTM hidden size per direction
        - num_layers: number of stacked LSTM layers
        """
        super().__init__()

        # The attribute names below double as the parameter-name prefixes of the
        # module's state_dict, so they must stay as they are for saved weights
        # to load.

        # (1) Index -> dense vector lookup
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

        # (2) Bidirectional LSTM over the embedded sequence (batch dimension first)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # (3) Per-position classifier; the input is twice hidden_size because the
        #     forward and backward LSTM outputs are concatenated
        self.linear = nn.Linear(
            in_features=2 * hidden_size,
            out_features=n_classes,
        )

    def forward(self, sentences):
        """
        Run the forward pass and return unnormalised class scores.

        Inputs:
        - sentences: tensor of indices of shape (batch_size, max_length)

        Returns:
        - tensor of shape (batch_size, max_length, n_classes)

        The Testing cell of this notebook also calls the model with a single
        1-D sequence, in which case the batch dimension is absent from the
        result as well.
        """
        char_vectors = self.embedding(sentences)
        # Only the per-position outputs are needed; the final hidden/cell
        # states returned alongside them are discarded.
        contextual, _ = self.lstm(char_vectors)
        return self.linear(contextual)

In [118]:
# Instantiate the model with the default hyperparameters of the RNN class
model = RNN()

# Load the saved model weights onto the CPU.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
model_path = "./models/rnn.pth"
state_dict = torch.load(
    model_path,
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(state_dict)

# Put the module in evaluation mode before running inference
model.eval()

print(model)

RNN(
  (embedding): Embedding(37, 300)
  (lstm): LSTM(300, 512, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=15, bias=True)
)


### Prepare the testing dataset

In [119]:
# Read the undiacritized test file and turn every line into its own list of
# characters, so X_test holds one character list per sentence.
test_corpus = readFile(TEST_PATH)

X_test = [separate_words_to_char(line.strip()) for line in test_corpus]

print(X_test)
# A batched variant was sketched here but is not used: map each character
# through char_to_index into a tensor, then pad_sequence(..., batch_first=True).
# The Testing cell feeds the model one sentence at a time instead.

[['ل', 'ي', 'س', 'ل', 'ل', 'و', 'ك', 'ي', 'ل', 'ب', 'ا', 'ل', 'ق', 'ب', 'ض', 'أ', 'ن', 'ي', 'ب', 'ر', 'أ', 'ا', 'ل', 'م', 'د', 'ي', 'ن', 'أ', 'و', 'ي', 'ه', 'ب', 'ا', 'ل', 'د', 'ي', 'ن', 'ل', 'ه', 'أ', 'و', 'ي', 'أ', 'خ', 'ذ', 'ر', 'ه', 'ن', 'ا', 'م', 'ن', 'ا', 'ل', 'م', 'د', 'ي', 'ن', 'ف', 'ي', 'م', 'ق', 'ا', 'ب', 'ل', 'ا', 'ل', 'د', 'ي', 'ن', 'أ', 'و', 'ي', 'ق', 'ب', 'ل', 'إ', 'ح', 'ا', 'ل', 'ت', 'ه', 'ع', 'ل', 'ى', 'ش', 'خ', 'ص', 'آ', 'خ', 'ر', 'ل', 'ك', 'ن', 'ل', 'ه', 'أ', 'ن', 'ي', 'أ', 'خ', 'ذ', 'ك', 'ف', 'ي', 'ل', 'ا', 'ل', 'ك', 'ن', 'ل', 'ي', 'س', 'ل', 'ه', 'أ', 'ن', 'ي', 'أ', 'خ', 'ذ', 'ك', 'ف', 'ي', 'ل', 'ا', 'ب', 'ش', 'ر', 'ط', 'ب', 'ر', 'ا', 'ء', 'ة', 'ا', 'ل', 'أ', 'ص', 'ي', 'ل', 'ا', 'ن', 'ظ', 'ر', 'ا', 'ل', 'م', 'ا', 'د', 'ة', 'ا', 'ل', 'أ', 'ن', 'ق', 'ر', 'و', 'ي', 'ا', 'ل', 'ط', 'ح', 'ط', 'ا', 'و', 'ي', 'و', 'ص', 'ر', 'ة', 'ا', 'ل', 'ف', 'ت', 'ا', 'و', 'ى', 'ا', 'ل', 'ب', 'ح', 'ر']]


In [None]:
# NOTE(review): this cell has no execution count in the saved notebook, so it
# apparently was not run for the recorded results. Running it rebinds
# test_corpus, X_test and total_len, replacing the data prepared from TEST_PATH.
test_corpus = readFile("../dataset/test.txt")

X_test = []
total_len = 0

for line in test_corpus:
    # The second return value (the diacritics) is not needed here.
    char_list, _ = separate_words_and_diacritics(line.strip())

    # Flatten: every element of char_list becomes its own entry of X_test.
    X_test.extend(char_list)

### Testing

In [120]:
# Predict one label id per character, sentence by sentence, and concatenate the
# results in input order into final_predictions.
final_predictions = []
total_len = 0  # running count of characters fed to the model

for chars in X_test:
    # An empty sentence has no characters to label, so it contributes nothing.
    # It also has to be skipped: torch.tensor([]) infers a float dtype, which
    # the embedding layer rejects.
    if len(chars) == 0:
        continue

    # Explicit integer dtype: embedding lookups require integer indices.
    char_ids = torch.tensor(
        [char_to_index[char] for char in chars],
        dtype=torch.long,
    )
    total_len += len(char_ids)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = model(char_ids)

    # Highest-scoring class per character. The class axis is the last one, so
    # dim=-1 is used rather than a position that depends on the input's rank.
    class_ids = scores.argmax(dim=-1).tolist()

    # class index -> diacritic -> output label id. A " " diacritic is looked
    # up under the "" key of diacritic2id.
    diacritics = [index_to_diacritic[class_id] for class_id in class_ids]
    label_ids = [
        diacritic2id["" if diacritic == " " else diacritic]
        for diacritic in diacritics
    ]

    final_predictions += label_ids

In [121]:
# One row per prediction; the ID is the prediction's position in
# final_predictions.
data = [
    {"ID": position, "label": label}
    for position, label in enumerate(final_predictions)
]

# newline="" lets the csv module control line endings itself.
with open("./test/output.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["ID", "label"])
    writer.writeheader()
    writer.writerows(data)

### Get the accuracy

In [122]:
# Load your output and gold output CSV files
output_file = "./test/output.csv"
gold_output_file = "./test/test_set_gold.csv"

output_df = pd.read_csv(output_file)
gold_output_df = pd.read_csv(gold_output_file)

# Merge DataFrames on 'ID', keeping every gold row (how="right").
# An inner merge would silently drop gold IDs that have no prediction and
# report accuracy over the remaining subset only, overstating the score.
# validate="one_to_one" raises if either file contains a duplicated ID, which
# would otherwise multiply rows in the merge.
merged_df = pd.merge(
    output_df,
    gold_output_df,
    on="ID",
    how="right",
    suffixes=('_your', '_gold'),
    validate="one_to_one",
)

# Make coverage gaps visible instead of hiding them in the score.
missing_predictions = int(merged_df['label_your'].isna().sum())
if missing_predictions:
    print(
        f"Warning: {missing_predictions} gold IDs have no prediction "
        "and are counted as incorrect"
    )

# Check if the labels match (a missing prediction never equals the gold label)
merged_df['correct'] = merged_df['label_your'] == merged_df['label_gold']

# Calculate accuracy over all gold rows
accuracy = merged_df['correct'].mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 90.11%
