### Imports

In [157]:
import pickle
import torch
import torch.nn as nn
import csv
import pandas as pd

%run preprocessing.ipynb

### Constants

In [158]:
# --- Model / training hyperparameters ---------------------------------------
EMBEDDING_DIM = 300    # width of each character embedding vector
HIDDEN_SIZE = 512      # LSTM hidden units per direction
NUM_LAYERS = 3         # default number of stacked LSTM layers
NUM_EPOCHS = 30        # training-time settings; not referenced by the cells visible here
LEARNING_RATE = 1e-3
BATCH_SIZE = 200

# --- Vocabulary and label-set sizes -----------------------------------------
# `basic_arabic_letters` and `DIACRITICS` are not defined in this notebook;
# presumably they are provided by the `%run preprocessing.ipynb` above.
# NOTE(review): the extra `1` is presumably a padding/unknown index -- confirm.
VOCAB_SIZE = 1 + len(basic_arabic_letters)
LABELS_SIZE = len(DIACRITICS)

# --- Saved model checkpoints ------------------------------------------------
RNN_PATH = "./models/rnn.pth"
CNN_PATH = "./models/cnn.pth"

# --- Dataset files ----------------------------------------------------------
TEST_PATH = "../dataset/test.txt"
TEST2_PATH = "../dataset/test2.txt"
TEST3_PATH = "../dataset/test3.txt"
TEST4_PATH = "../dataset/test4.txt"
VAL_PATH = "../dataset/val.txt"

# --- Diacritic -> numeric id lookup used when writing predictions -----------
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open("./utils/diacritic2id.pickle", "rb") as mapping_file:
    diacritic2id = pickle.load(mapping_file)

### 3-Layer-LSTM

In [159]:
class RNN(nn.Module):
    """Per-character classifier: embedding -> stacked bidirectional LSTM -> linear."""

    def __init__(
        self,
        vocab_size=VOCAB_SIZE,
        n_classes=LABELS_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
    ):
        """
        Build the three layers of the network.

        Inputs:
        - vocab_size: number of entries in the embedding table
        - n_classes: size of the output layer (one score per diacritic class)
        - embedding_dim: size of each embedding vector
        - hidden_size: LSTM hidden size per direction
        - num_layers: number of stacked LSTM layers
        """
        super().__init__()

        # Index -> dense vector lookup
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )

        # Bidirectional, batch-first recurrent stack
        recurrent_options = dict(
            num_layers=num_layers, batch_first=True, bidirectional=True
        )
        self.lstm = nn.LSTM(embedding_dim, hidden_size, **recurrent_options)

        # Both directions are concatenated, hence twice the hidden size as input
        self.linear = nn.Linear(
            in_features=2 * hidden_size, out_features=n_classes
        )

    def forward(self, sentences):
        """
        Run the forward pass.

        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - tensor of shape (batch_size, max_length, n_classes)
        """
        embedded = self.embedding(sentences)
        # Only the per-step outputs are needed; the final (h, c) state is dropped
        hidden_states = self.lstm(embedded)[0]
        return self.linear(hidden_states)

### Conv Layer + 2-Layer LSTM

In [127]:
class CNN(nn.Module):
    """Per-character classifier: embedding -> 1-D convolution -> bidirectional LSTM -> linear."""

    def __init__(
        self,
        vocab_size=VOCAB_SIZE,
        num_classes=LABELS_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
    ):
        """
        Build the layers of the network.

        Inputs:
        - vocab_size: number of entries in the embedding table
        - num_classes: size of the output layer
        - embedding_dim: size of each embedding vector (conv input channels)
        - hidden_size: LSTM hidden size per direction
        - num_layers: number of stacked LSTM layers
        """
        super().__init__()

        # Index -> dense vector lookup
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )

        # kernel_size=3 with padding=1 keeps the sequence length unchanged
        self.conv1d_1 = nn.Conv1d(
            in_channels=embedding_dim, out_channels=256, kernel_size=3, padding=1
        )

        # Recurrent stack fed by the 256 convolution channels
        self.lstm = nn.LSTM(
            input_size=256,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Both directions are concatenated, hence twice the hidden size as input
        self.linear = nn.Linear(
            in_features=hidden_size * 2, out_features=num_classes
        )

    def forward(self, sentences):
        """Return per-position class scores for a batch of index sequences."""
        # (batch, length, embedding_dim) -> (batch, embedding_dim, length):
        # Conv1d expects the channel axis before the length axis.
        channels_first = self.embedding(sentences).permute(0, 2, 1)
        features = nn.functional.relu(self.conv1d_1(channels_first))

        # Back to (batch, length, channels) for the batch-first LSTM;
        # the final (h, c) state is not needed.
        hidden_states = self.lstm(features.permute(0, 2, 1))[0]

        return self.linear(hidden_states)

### Read CNN model

In [128]:
# Instantiate the model.
# The CNN checkpoint was trained with a 2-layer LSTM (see this notebook's
# "Conv Layer + 2-Layer LSTM" section and the architecture printed below),
# while the shared NUM_LAYERS constant is 3 for the RNN. Relying on the default
# would build a 3-layer LSTM, and the strict load_state_dict below would then
# fail on the missing third-layer weights, so the depth is pinned explicitly.
model = CNN(num_layers=2)

# Load the saved model weights onto the CPU so no GPU is needed for inference
model.load_state_dict(torch.load(CNN_PATH, map_location=torch.device('cpu')))

# Inference mode
model.eval()

# NOTE: the RNN loading cell further down rebinds `model`; run only the
# load/predict pair for the network you want to evaluate.
print(model)

CNN(
  (embedding): Embedding(37, 300)
  (conv1d_1): Conv1d(300, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=15, bias=True)
)


### Read RNN model

In [160]:
# Build the network with its default hyperparameters
model = RNN()

# Read the checkpoint onto the CPU, then copy its weights into the network
rnn_state = torch.load(RNN_PATH, map_location=torch.device("cpu"))
model.load_state_dict(rnn_state)

# Inference mode
model.eval()

print(model)

RNN(
  (embedding): Embedding(37, 300)
  (lstm): LSTM(300, 512, num_layers=3, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=15, bias=True)
)


### Prepare the sample one-line test

In [119]:
# Read the sample file; `readFile` and `separate_words_to_char` are not defined
# in this notebook (presumably they come from preprocessing.ipynb).
test_corpus = readFile(TEST_PATH)

# One entry per input line, each built from the stripped line
X_test = [separate_words_to_char(line.strip()) for line in test_corpus]

[['ل', 'ي', 'س', 'ل', 'ل', 'و', 'ك', 'ي', 'ل', 'ب', 'ا', 'ل', 'ق', 'ب', 'ض', 'أ', 'ن', 'ي', 'ب', 'ر', 'أ', 'ا', 'ل', 'م', 'د', 'ي', 'ن', 'أ', 'و', 'ي', 'ه', 'ب', 'ا', 'ل', 'د', 'ي', 'ن', 'ل', 'ه', 'أ', 'و', 'ي', 'أ', 'خ', 'ذ', 'ر', 'ه', 'ن', 'ا', 'م', 'ن', 'ا', 'ل', 'م', 'د', 'ي', 'ن', 'ف', 'ي', 'م', 'ق', 'ا', 'ب', 'ل', 'ا', 'ل', 'د', 'ي', 'ن', 'أ', 'و', 'ي', 'ق', 'ب', 'ل', 'إ', 'ح', 'ا', 'ل', 'ت', 'ه', 'ع', 'ل', 'ى', 'ش', 'خ', 'ص', 'آ', 'خ', 'ر', 'ل', 'ك', 'ن', 'ل', 'ه', 'أ', 'ن', 'ي', 'أ', 'خ', 'ذ', 'ك', 'ف', 'ي', 'ل', 'ا', 'ل', 'ك', 'ن', 'ل', 'ي', 'س', 'ل', 'ه', 'أ', 'ن', 'ي', 'أ', 'خ', 'ذ', 'ك', 'ف', 'ي', 'ل', 'ا', 'ب', 'ش', 'ر', 'ط', 'ب', 'ر', 'ا', 'ء', 'ة', 'ا', 'ل', 'أ', 'ص', 'ي', 'ل', 'ا', 'ن', 'ظ', 'ر', 'ا', 'ل', 'م', 'ا', 'د', 'ة', 'ا', 'ل', 'أ', 'ن', 'ق', 'ر', 'و', 'ي', 'ا', 'ل', 'ط', 'ح', 'ط', 'ا', 'و', 'ي', 'و', 'ص', 'ر', 'ة', 'ا', 'ل', 'ف', 'ت', 'ا', 'و', 'ى', 'ا', 'ل', 'ب', 'ح', 'ر']]


### Prepare the evaluation set (read from `VAL_PATH`)

In [161]:
# Read the evaluation file; `readFile` and `separate_words_and_diacritics` are
# not defined in this notebook (presumably they come from preprocessing.ipynb).
test_corpus = readFile(VAL_PATH)

X_test = []
total_len = 0

for line in test_corpus:
    # Only the first returned element is used; the second is discarded
    char_sequences, _ = separate_words_and_diacritics(line.strip())

    # Flatten: every sequence of this line becomes its own X_test entry
    X_test.extend(char_sequences)

### Get CNN predictions

In [130]:
# NOTE: this cell runs whichever network is currently bound to `model`.
final_predictions = []
total_len = 0
for chars in X_test:
    # Map characters to their integer indices
    sentence = torch.tensor([char_to_index[ch] for ch in chars])
    total_len += len(sentence)

    # Add a leading batch axis of size 1 and run the network without gradients
    with torch.no_grad():
        batch_scores = model(sentence.view(1, sentence.shape[0]))

    # One iteration per batch element (a single one here)
    for scores in batch_scores:
        # Highest-scoring class index at every position
        best_indices = scores.argmax(dim=1)
        diacritics = [index_to_diacritic[idx.item()] for idx in best_indices]

        # A " " prediction is looked up under the "" key of diacritic2id
        label_ids = [diacritic2id["" if d == " " else d] for d in diacritics]

        final_predictions.extend(label_ids)

### Get RNN predictions

In [162]:
# Run the model over every character sequence in X_test and collect one
# numeric label id per character in `final_predictions`.
final_predictions = []
total_len = 0
for chars in X_test:
    # An empty sequence yields no predictions; feeding it to the model would
    # only raise (empty index tensor / zero-length LSTM input), so skip it.
    if len(chars) == 0:
        continue

    sentence = torch.tensor([char_to_index[char] for char in chars])
    total_len += len(sentence)

    with torch.no_grad():
        # RNN.forward documents its input as (batch_size, max_length), so add
        # a leading batch axis of size 1 instead of passing a bare 1-D tensor
        # (unbatched LSTM input is not accepted by every PyTorch version).
        predictions = model(sentence.view(1, -1))

    # Drop the batch axis, then take the highest-scoring class per position
    predictions = predictions[0].argmax(dim=1)
    predictions = [index_to_diacritic[index.item()] for index in predictions]

    # A " " prediction is looked up under the "" key of diacritic2id
    predictions = [diacritic2id["" if diacritic == " " else diacritic] for diacritic in predictions]

    final_predictions += predictions

### Write Output File

In [163]:
# One row per prediction; the running position in final_predictions is the ID
data = [
    {"ID": row_id, "label": label}
    for row_id, label in enumerate(final_predictions)
]

fieldnames = ["ID", "label"]

# newline="" lets the csv module control line endings itself
with open("./test/output.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for record in data:
        writer.writerow(record)

### Calculate accuracy

In [164]:
# Compare the predicted labels with the gold labels and report accuracy.
output_file = "./test/output.csv"
gold_output_file = "./test/test_set_gold.csv"

output_df = pd.read_csv(output_file)
gold_output_df = pd.read_csv(gold_output_file)

# Join on 'ID', keeping every gold row (how="right"). With the default inner
# join, gold IDs that have no prediction would be silently dropped and the
# accuracy would be computed over the matching subset only; here they appear
# with a missing `label_your` and are therefore counted as incorrect.
# validate="one_to_one" raises if either file repeats an ID, which would
# otherwise duplicate rows and distort the score.
merged_df = pd.merge(
    output_df,
    gold_output_df,
    on="ID",
    how="right",
    suffixes=('_your', '_gold'),
    validate="one_to_one",
)

# A row is correct when both labels are equal (a missing prediction never is)
merged_df['correct'] = merged_df['label_your'] == merged_df['label_gold']

# Fraction of gold rows predicted correctly
accuracy = merged_df['correct'].mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 96.44%
