### Imports

In [1]:
import pickle
import torch
import torch.nn as nn
import csv
import pandas as pd
import torch.nn as nn
from TorchCRF import CRF

%run preprocessing.ipynb

  from .autonotebook import tqdm as notebook_tqdm


### Constants

In [2]:
EMBEDDING_DIM = 300
HIDDEN_SIZE = 512
NUM_LAYERS = 2
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 128
VOCAB_SIZE = len(basic_arabic_letters) + 1
LABELS_SIZE = len(DIACRITICS)
PAD = 15
CONTEXTUAL_EMBEDDING_DIM=100

RNN_PATH = "./models/rnn.pth"
CNN_PATH = "./models/cnn.pth"
CRF_PATH = "./models/crf_300_64/crf.pth"

TEST_PATH = "../dataset/test.txt"

with open("./utils/diacritic2id.pickle", 'rb') as file:
    diacritic2id = pickle.load(file)

### 1-Layer-LSTM

In [36]:
class RNN(nn.Module):
    def __init__(
        self,
        vocab_size=VOCAB_SIZE,
        n_classes=LABELS_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
    ):
        """
        The constructor of our RNN model
        Inputs:
        - vacab_size: the number of unique characters
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (diacritics)
        """
        super(RNN, self).__init__()

        # (1) Create an embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # (3) Create a linear layer with number of neorons = n_classes
        self.linear = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, sentences):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)

        return output

### Conv Layer + 2-Layer LSTM

In [37]:
class CNN(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, num_classes=LABELS_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        super(CNN, self).__init__()
        
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Convolutional Layer
        self.conv1d = nn.Conv1d(embedding_dim, 256, kernel_size=3, padding=1)
        
        # LSTM Layer
        self.lstm = nn.LSTM(256, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        
        # Linear Layer
        self.linear = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, sentences):
        embeddings = self.embedding(sentences)
        
        # Convolutional Layer
        conv_out = self.conv1d(embeddings.permute(0, 2, 1))
        conv_out = nn.functional.relu(conv_out)
        
        # LSTM Layer
        lstm_out, _ = self.lstm(conv_out.permute(0, 2, 1))
        
        # Linear Layer
        output = self.linear(lstm_out)

        return output

# CRF and 2Lstm

In [3]:
class RNN_CRF(nn.Module):
    def __init__(self,  vocab_size=VOCAB_SIZE, num_classes=LABELS_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=0.5):
        super(RNN_CRF, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Linear layer
        self.linear = nn.Linear(hidden_size * 2, num_classes)
        self.dropout = nn.Dropout(dropout)  # Apply dropout before the linear layer

        # CRF layer
        self.crf = CRF(num_classes)  # Place the CRF layer after the linear layer

    def forward(self, word):
        embeddings = self.embedding(word)
        lstm_out, _ = self.lstm(embeddings)
        dropout_out = self.dropout(lstm_out)  # Apply dropout
        output = self.linear(dropout_out)
        return output  # Return raw output for CRF loss calculation

    def predict(self, word):
        output = self.forward(word)
        predictions = self.crf.decode(output)
        return predictions

### Read CNN model

In [24]:
# Instantiate the model
model = CNN()

# Load the saved model weights
model.load_state_dict(torch.load(CNN_PATH, map_location=torch.device('cpu')))
model.eval()

print(model)

RuntimeError: Error(s) in loading state_dict for CNN:
	size mismatch for embedding.weight: copying a param with shape torch.Size([121, 300]) from checkpoint, the shape in current model is torch.Size([37, 300]).

In [4]:
# Instantiate the model
model = RNN_CRF()

# Load the saved model weights
model.load_state_dict(torch.load(CRF_PATH, map_location=torch.device('cpu')))
model.eval()

print(model)

RNN_CRF(
  (embedding): Embedding(37, 300)
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=15, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (crf): CRF()
)


### Read RNN model

In [6]:
# Instantiate the model
model = RNN()

# Load the saved model weights
model.load_state_dict(torch.load(RNN_PATH, map_location=torch.device('cpu')))
model.eval()

print(model)

NameError: name 'RNN' is not defined

### Prepare the sample one-line test

In [26]:
test_corpus = readFile(TEST_PATH)

X_test = []

for sentence in test_corpus:
    sentence = separate_words_to_char(sentence.strip())

    X_test.append(sentence)

### Prepare test.txt

In [5]:
test_corpus = readFile(TEST_PATH)

X_test = []
total_len = 0

for sentence in test_corpus:
    char_list, _ = separate_words_and_diacritics(sentence.strip())

    for i in range(len(char_list)):
        X_test.append(char_list[i])

### Get CNN predictions

In [None]:
final_predictions = []
total_len = 0
for i in range(len(X_test)):
    sentence = X_test[i]
    sentence = torch.tensor([char_to_index[char] for char in sentence])
    total_len += len(sentence)

    with torch.no_grad():
        predictions = model(sentence.view(1, sentence.shape[0]))
    
    # Get the predictions
    for prediction in predictions:
        # Get the predictions
        prediction = prediction.argmax(dim=1)
        prediction = [index_to_diacritic[index.item()] for index in prediction]
        prediction = [diacritic2id["" if diacritic == " " else diacritic] for diacritic in prediction]
        
        final_predictions += prediction

### Get RNN predictions

In [6]:
final_predictions = []
total_len = 0
for i in range(len(X_test)):
    sentence = X_test[i]
    sentence = torch.tensor([char_to_index[char] for char in sentence])
    total_len += len(sentence)

    with torch.no_grad():
        predictions = model(sentence)
    
    # Get the predictions
    predictions = predictions.argmax(dim=1)
    predictions = [index_to_diacritic[index.item()] for index in predictions]
    predictions = [diacritic2id["" if diacritic == " " else diacritic] for diacritic in predictions]

    final_predictions += predictions

### Write Output File

In [7]:
data = []
for i in range(len(final_predictions)):
    data.append({"ID": i, "label": final_predictions[i]})

with open("./test/output.csv", "w", newline="") as csvfile:
    fieldnames = ["ID", "label"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for row in data:
        writer.writerow(row)

### Calculate accuracy

In [8]:
# Load your output and gold output CSV files
output_file = "./test/output.csv"
gold_output_file = "./test/test_set_gold.csv"

output_df = pd.read_csv(output_file)
gold_output_df = pd.read_csv(gold_output_file)

# Merge DataFrames on 'ID'
merged_df = pd.merge(output_df, gold_output_df, on="ID", suffixes=('_your', '_gold'))

# Check if the labels match
merged_df['correct'] = merged_df['label_your'] == merged_df['label_gold']

# Calculate accuracy
accuracy = merged_df['correct'].mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 20.96%
