### Imports

In [59]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from TorchCRF import CRF
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split

%run updatePreprocessing.ipynb

[['ق', 'و', 'ل', 'ه'], ['ل', 'ع', 'د', 'م'], ['م', 'ا'], ['ت', 'ت', 'ع', 'ل', 'ق'], ['إ', 'ل', 'خ'], ['أ', 'ي'], ['ا', 'ل', 'و', 'ص', 'ي', 'ة'], ['ق', 'و', 'ل', 'ه'], ['م', 'ا'], ['م', 'ر'], ['أ', 'ي'], ['ق', 'ب', 'ي', 'ل'], ['ق', 'و', 'ل'], ['ا', 'ل', 'م', 'ت', 'ن'], ['ل', 'غ', 'ت'], ['و', 'ل', 'و'], ['ا', 'ق', 'ت', 'ص', 'ر'], ['ع', 'ل', 'ى'], ['أ', 'و', 'ص', 'ي', 'ت'], ['ل', 'ه'], ['ب', 'ش', 'ا', 'ة'], ['أ', 'و'], ['أ', 'ع', 'ط', 'و', 'ه'], ['ش', 'ا', 'ة'], ['و', 'ل', 'ا'], ['غ', 'ن', 'م'], ['ل', 'ه'], ['ع', 'ن', 'د'], ['ا', 'ل', 'م', 'و', 'ت'], ['ه', 'ل'], ['ت', 'ب', 'ط', 'ل'], ['ا', 'ل', 'و', 'ص', 'ي', 'ة'], ['أ', 'و'], ['ي', 'ش', 'ت', 'ر', 'ى'], ['ل', 'ه'], ['ش', 'ا', 'ة'], ['و', 'ي', 'ؤ', 'خ', 'ذ'], ['م', 'ن'], ['ق', 'و', 'ل', 'ه'], ['ا', 'ل', 'آ', 'ت', 'ي'], ['ك', 'م', 'ا'], ['ل', 'و'], ['ل', 'م'], ['ي', 'ق', 'ل'], ['م', 'ن'], ['م', 'ا', 'ل', 'ي'], ['و', 'ل', 'ا'], ['م', 'ن'], ['غ', 'ن', 'م', 'ي'], ['أ', 'ن', 'ه', 'ا'], ['ل', 'ا'], ['ت', 'ب', 'ط', 'ل'], ['و', 'ع', 'ب', 'ا', 'ر',

### Constants

In [60]:
# ---- Network architecture ----
EMBEDDING_DIM = 300  # size of each character embedding vector
HIDDEN_SIZE = 512    # hidden units per LSTM direction (the models below are bidirectional)
NUM_LAYERS = 3       # number of stacked LSTM layers

# ---- Optimisation ----
BATCH_SIZE = 256
NUM_EPOCHS = 10
LEARNING_RATE = 0.001

# ---- Vocabulary / label space ----
# `basic_arabic_letters` and `DIACRITICS` come from the preprocessing notebook run above.
# NOTE(review): the "+ 1" presumably reserves an extra index (e.g. for padding) — confirm
# against `char_to_index`.
VOCAB_SIZE = len(basic_arabic_letters) + 1
LABELS_SIZE = len(DIACRITICS)
# Label value used to pad diacritic sequences; the training loss ignores it.
PAD = 15

# ---- Dataset locations ----
TRAIN_PATH = "../dataset/train.txt"
VAL_PATH = "../dataset/val.txt"

# ---- Checkpoint locations ----
LSTM_PATH = "./models/lstm.pth"
RNN_PATH = "./models/rnn.pth"
CNN_PATH = "./models/cnn.pth"
CRF_PATH = "./models/crf.pth"

### Model building

### RNN

In [61]:
class RNN(nn.Module):
    """Per-character tagger: embedding -> stacked bidirectional LSTM -> linear classifier."""

    def __init__(self, vocab_size=VOCAB_SIZE, n_classes=LABELS_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        """
        Build the layers of the model.

        Inputs:
        - vocab_size: the number of rows in the character embedding table
        - n_classes: the number of output classes (diacritics)
        - embedding_dim: the embedding dimension
        - hidden_size: the hidden size of each LSTM direction
        - num_layers: the number of stacked LSTM layers
        """
        super().__init__()

        # (1) Character embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # (2) Bidirectional LSTM; batch_first=True so tensors are (batch, length, features)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # (3) Linear layer with n_classes neurons; its input is hidden_size * 2
        #     because the forward and backward LSTM outputs are concatenated
        self.linear = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, sentences):
        """
        Run the forward pass of the model.

        Inputs:
        - sentences: integer tensor of shape (batch_size, max_length)

        Returns:
        - tensor of shape (batch_size, max_length, n_classes) holding the raw
          (un-normalised) class scores; no softmax is applied here
        """
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        return self.linear(lstm_out)

### CNN

In [62]:
class CNN(nn.Module):
    """Per-character tagger: embedding -> two 1-D convolutions -> bidirectional LSTM -> linear."""

    def __init__(self, vocab_size=VOCAB_SIZE, num_classes=LABELS_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        """
        Build the layers of the model.

        Inputs:
        - vocab_size: the number of rows in the character embedding table
        - num_classes: the number of output classes
        - embedding_dim: the embedding dimension
        - hidden_size: the hidden size of each LSTM direction
        - num_layers: the number of stacked LSTM layers
        """
        super().__init__()

        # Character embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Two convolutions; kernel_size=3 with padding=1 keeps the sequence length unchanged
        self.conv1d_1 = nn.Conv1d(embedding_dim, 256, kernel_size=3, padding=1)
        self.conv1d_2 = nn.Conv1d(256, 128, kernel_size=3, padding=1)

        # Recurrent layer over the 128 convolutional channels
        self.lstm = nn.LSTM(128, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Dropout applied to the convolutional features before the LSTM
        self.dropout = nn.Dropout(p=0.5)

        # Classifier over the concatenated forward/backward LSTM outputs
        self.linear = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, sentences):
        """
        Run the forward pass of the model.

        Inputs:
        - sentences: integer tensor of shape (batch_size, max_length)

        Returns:
        - tensor of shape (batch_size, max_length, num_classes) with raw class scores
        """
        # Conv1d expects (batch, channels, length), so move the embedding axis to position 1
        channels_first = self.embedding(sentences).transpose(1, 2)

        # Both convolutions operate channels-first, each followed by a ReLU
        features = F.relu(self.conv1d_1(channels_first))
        features = F.relu(self.conv1d_2(features))

        # Back to (batch, length, channels) for the batch_first LSTM
        sequence_first = features.transpose(1, 2)
        recurrent_out, _ = self.lstm(self.dropout(sequence_first))

        return self.linear(recurrent_out)

### CRF

In [63]:
class LSTM_CRF(nn.Module):
    """Per-character tagger: embedding -> bidirectional LSTM -> dropout -> linear, plus a CRF layer used for decoding."""

    def __init__(self, vocab_size=VOCAB_SIZE, n_classes=LABELS_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=0.6):
        """
        Build the layers of the model.

        `vocab_size` and `n_classes` default to the notebook-wide constants, as in
        the RNN and CNN models, so the model can be built with no arguments.

        Inputs:
        - vocab_size: the number of rows in the character embedding table
        - n_classes: the number of output classes (also the number of CRF labels)
        - embedding_dim: the embedding dimension
        - hidden_size: the hidden size of each LSTM direction
        - num_layers: the number of stacked LSTM layers
        - dropout: dropout probability applied to the LSTM output
        """
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Linear layer over the concatenated forward/backward LSTM outputs
        self.linear = nn.Linear(hidden_size * 2, n_classes)
        self.dropout = nn.Dropout(dropout)  # Applied before the linear layer

        # CRF layer, placed after the linear layer
        self.crf = CRF(n_classes)

    def forward(self, sentences):
        """
        Compute the per-character emission scores.

        Inputs:
        - sentences: integer tensor of shape (batch_size, max_length)

        Returns:
        - tensor of shape (batch_size, max_length, n_classes) with raw scores;
          the CRF layer is not applied here
        """
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        dropout_out = self.dropout(lstm_out)
        output = self.linear(dropout_out)
        return output

    def predict(self, sentences):
        """
        Decode label sequences for `sentences` by running the CRF layer on the emission scores.

        Inputs:
        - sentences: integer tensor of shape (batch_size, max_length)

        Returns:
        - whatever the CRF layer's decoder returns for the emissions
        """
        output = self.forward(sentences)
        # NOTE(review): this relies on the imported CRF class exposing `decode(emissions)`.
        # The TorchCRF package documents `viterbi_decode(h, mask)` instead — confirm
        # against the installed version before relying on this method.
        predictions = self.crf.decode(output)
        return predictions

### Train

In [72]:
def train(model, path, train_dataset, train_labels, val_dataset, val_labels, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, learning_rate=LEARNING_RATE):
    """
    Train `model` with cross-entropy loss and keep the checkpoint with the best validation accuracy.

    Inputs:
    - model: the model to be trained; it must map a (batch, length) input tensor
      to (batch, length, n_classes) scores
    - path: file path where the best model's state_dict is saved
    - train_dataset: tensor holding the training inputs
    - train_labels: tensor holding the training labels (padded with PAD)
    - val_dataset: tensor holding the validation inputs
    - val_labels: tensor holding the validation labels (padded with PAD)
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer

    After every epoch the mean training loss of that epoch and the validation
    accuracy (computed over non-PAD label positions only) are printed. The
    function returns nothing; its result is the checkpoint written to `path`.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    print(f'Number of Iterations: {epochs}, Batch Size: {batch_size}, LR: {learning_rate}')

    # (1) create the dataloaders (training batches are shuffled, validation batches are not)
    tensor_train_dataset = TensorDataset(train_dataset, train_labels)
    train_dataloader = DataLoader(tensor_train_dataset, batch_size=batch_size, shuffle=True)

    tensor_val_dataset = TensorDataset(val_dataset, val_labels)
    val_dataloader = DataLoader(tensor_val_dataset, batch_size=batch_size, shuffle=False)

    # GPU configuration: move the model before building the optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # (2) cross entropy loss; positions labelled PAD do not contribute
    criterion = nn.CrossEntropyLoss(ignore_index=PAD).to(device)

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # (4) learning rate scheduler: multiply the LR by 0.1 every 5 epochs
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    # Create the checkpoint directory up front so a missing folder does not
    # make torch.save fail after a full epoch of training
    checkpoint_dir = os.path.dirname(path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    best_acc = 0.0

    for epoch_num in range(epochs):

        # Make sure dropout etc. are active, whatever mode the model arrived in
        model.train()

        running_loss = 0.0
        num_batches = 0

        for train_input, train_label in tqdm(train_dataloader):
            # Zero your gradients
            optimizer.zero_grad()

            # Move the train input and label to the device
            train_input = train_input.to(device)
            train_label = train_label.to(device)

            # Do the forward pass
            output = model(train_input).float()

            # Loss calculation: flatten to (batch * length, n_classes) vs (batch * length,)
            batch_loss = criterion(output.reshape(-1, output.shape[-1]), train_label.reshape(-1))

            # Do the backward pass
            batch_loss.backward()

            # Update the weights with your optimizer
            optimizer.step()

            # Accumulate so the reported loss covers the whole epoch, not just the last batch
            running_loss += batch_loss.item()
            num_batches += 1

        # Step the learning rate scheduler
        scheduler.step()

        loss = running_loss / num_batches if num_batches else 0.0

        # ================== Validation ===================

        model.eval() # Set the model to evaluation mode

        correct_predictions = 0
        actual_predictions = 0

        with torch.no_grad():
            for val_input, val_label in tqdm(val_dataloader):
                val_label = val_label.to(device)
                val_input = val_input.to(device)

                output = model(val_input)

                predictions = output.argmax(dim=2)
                labels_without_pad = (val_label != PAD)

                correct_predictions += ((predictions == val_label) & labels_without_pad).sum().item()
                actual_predictions += labels_without_pad.sum().item()

        # Guard against a validation set with no non-PAD labels
        acc = correct_predictions / actual_predictions if actual_predictions else 0.0

        print(f'\nEpoch: {epoch_num + 1} | Loss: {loss} | Accuracy: {acc}\n')

        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), path)
            print(f'Saved the best model with validation accuracy: {best_acc} to {path}')

        # Leave the model in training mode once the epoch is done
        model.train()

### Prepare Training Data

In [65]:
corpus = readFile(TRAIN_PATH)

x_train, y_train = [], []

# Only the first 100 sentences of the training corpus are processed here.
for sentence in corpus[:100]:
    # Split the stripped sentence into its characters (grouped per word)
    # and the diacritics that go with them.
    char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())

    x_train.append(char_list)
    y_train.append(diacritics_list)

# One index tensor per word (words of all sentences are flattened into a single
# list), padded to a common length with pad_sequence's default padding value.
X_train_padded = pad_sequence(
    [
        torch.tensor([char_to_index[char] for char in word])
        for sentence in x_train
        for word in sentence
    ],
    batch_first=True,
)

# Same layout for the labels, padded with PAD instead.
y_train_padded = pad_sequence(
    [
        torch.tensor([diacritic_to_index[char] for char in word])
        for sentence in y_train
        for word in sentence
    ],
    batch_first=True,
    padding_value=PAD,
)

### Prepare Validation Data

In [66]:
valid_corpus = readFile(VAL_PATH)

x_val, y_val = [], []

# Only the first 50 sentences of the validation corpus are processed here.
for sentence in valid_corpus[:50]:
    # Split the stripped sentence into its characters (grouped per word)
    # and the diacritics that go with them.
    char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())

    x_val.append(char_list)
    y_val.append(diacritics_list)

# One index tensor per word (words of all sentences are flattened into a single
# list), padded to a common length with pad_sequence's default padding value.
x_val_padded = pad_sequence(
    [
        torch.tensor([char_to_index[char] for char in word])
        for sentence in x_val
        for word in sentence
    ],
    batch_first=True,
)

# Same layout for the labels, padded with PAD instead.
y_val_padded = pad_sequence(
    [
        torch.tensor([diacritic_to_index[char] for char in word])
        for sentence in y_val
        for word in sentence
    ],
    batch_first=True,
    padding_value=PAD,
)

### Split test and val

In [67]:
# Row positions of the padded validation tensor
indices = list(range(x_val_padded.shape[0]))

# Halve the positions into a validation part and a test part (fixed seed for repeatability)
indices_val, indices_test = train_test_split(indices, test_size=0.5, random_state=42)

# Select the matching rows of inputs and labels for each part.
# Note: x_val / y_val are rebound here from Python lists to tensors.
x_val, y_val = x_val_padded[indices_val], y_val_padded[indices_val]
x_test, y_test = x_val_padded[indices_test], y_val_padded[indices_test]

### Execute

In [68]:
def run_RNN():
    """Build an RNN model with default settings, print it, and train it; the best weights go to LSTM_PATH."""
    network = RNN()
    print(network)
    train(
        model=network,
        path=LSTM_PATH,
        train_dataset=X_train_padded,
        train_labels=y_train_padded,
        val_dataset=x_val,
        val_labels=y_val,
    )
    
def run_CNN():
    """Build a CNN model with default settings, print it, and train it; the best weights go to CNN_PATH."""
    network = CNN()
    print(network)
    train(
        model=network,
        path=CNN_PATH,
        train_dataset=X_train_padded,
        train_labels=y_train_padded,
        val_dataset=x_val,
        val_labels=y_val,
    )

def run_CRF():
    """Build an LSTM_CRF model, print it, and train it; the best weights go to CRF_PATH."""
    # Pass the vocabulary and label sizes explicitly: the constructor takes
    # them as its first two parameters, and calling it with no arguments fails.
    model = LSTM_CRF(VOCAB_SIZE, LABELS_SIZE)
    print(model)
    train(model, CRF_PATH, X_train_padded, y_train_padded, x_val, y_val)

In [None]:
# Uncomment the call for the model to train; each call builds a fresh model
# and trains it on the tensors prepared above.
# run_RNN()
run_CNN()
# run_CRF()