In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from TorchCRF import CRF
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
import gensim
import numpy as np
from utilities import *
import time

# %run train.ipynb
%run updatePreprocessing.ipynb

  from .autonotebook import tqdm as notebook_tqdm


[['ق', 'و', 'ل', 'ه'], ['ل', 'ع', 'د', 'م'], ['م', 'ا'], ['ت', 'ت', 'ع', 'ل', 'ق'], ['إ', 'ل', 'خ'], ['أ', 'ي'], ['ا', 'ل', 'و', 'ص', 'ي', 'ة'], ['ق', 'و', 'ل', 'ه'], ['م', 'ا'], ['م', 'ر'], ['أ', 'ي'], ['ق', 'ب', 'ي', 'ل'], ['ق', 'و', 'ل'], ['ا', 'ل', 'م', 'ت', 'ن'], ['ل', 'غ', 'ت'], ['و', 'ل', 'و'], ['ا', 'ق', 'ت', 'ص', 'ر'], ['ع', 'ل', 'ى'], ['أ', 'و', 'ص', 'ي', 'ت'], ['ل', 'ه'], ['ب', 'ش', 'ا', 'ة'], ['أ', 'و'], ['أ', 'ع', 'ط', 'و', 'ه'], ['ش', 'ا', 'ة'], ['و', 'ل', 'ا'], ['غ', 'ن', 'م'], ['ل', 'ه'], ['ع', 'ن', 'د'], ['ا', 'ل', 'م', 'و', 'ت'], ['ه', 'ل'], ['ت', 'ب', 'ط', 'ل'], ['ا', 'ل', 'و', 'ص', 'ي', 'ة'], ['أ', 'و'], ['ي', 'ش', 'ت', 'ر', 'ى'], ['ل', 'ه'], ['ش', 'ا', 'ة'], ['و', 'ي', 'ؤ', 'خ', 'ذ'], ['م', 'ن'], ['ق', 'و', 'ل', 'ه'], ['ا', 'ل', 'آ', 'ت', 'ي'], ['ك', 'م', 'ا'], ['ل', 'و'], ['ل', 'م'], ['ي', 'ق', 'ل'], ['م', 'ن'], ['م', 'ا', 'ل', 'ي'], ['و', 'ل', 'ا'], ['م', 'ن'], ['غ', 'ن', 'م', 'ي'], ['أ', 'ن', 'ه', 'ا'], ['ل', 'ا'], ['ت', 'ب', 'ط', 'ل'], ['و', 'ع', 'ب', 'ا', 'ر',

In [2]:
EMBEDDING_DIM = 300
HIDDEN_SIZE = 512
NUM_LAYERS = 2
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 100
VOCAB_SIZE = len(basic_arabic_letters) + 1
LABELS_SIZE = len(DIACRITICS)
PAD = 15
WINDOW_SIZE_BEFORE = 10
WINDOW_SIZE_AFTER = 3
CONTEXTUAL_EMBEDDING_DIM=100

TRAIN_PATH = "./dataset/train.txt"
VAL_PATH = "./dataset/val.txt"
TEST_PATH = "./dataset/test.txt"
LSTM_PATH="./models/lstm.pth"
RNN_PATH="./models/rnn.pth"
CNN_PATH = "./models/cnn.pth"
CRF_PATH = "./models/crf.pth"

In [3]:
class CNN(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, num_classes=LABELS_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS
                , pretrained_embedding=None, freeze_embedding=False,contextual_embedding_dim=CONTEXTUAL_EMBEDDING_DIM):
        super(CNN, self).__init__()
        
        # Embedding layer
        if pretrained_embedding is not None:
            self.embedding  = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding)
        else:
            self.embedding =nn.Embedding(vocab_size, embedding_dim)
        
        # Convolutional Layer
        self.conv1d = nn.Conv1d(contextual_embedding_dim, 256, kernel_size=3, padding=1).double()
        
        # LSTM Layer
        self.lstm = nn.LSTM(256, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True).double()
        
        # Linear Layer
        self.linear = nn.Linear(2 * hidden_size, num_classes).float()

    def forward(self, sentences):
        embeddings = self.embedding(sentences)
        # if self.contextual_embedding is not None:
        #     embeddings = torch.cat([embeddings, self.contextual_embedding(sentences)], dim=2)  # Concatenate embeddings
        # Convolutional Layer
        conv_out = self.conv1d(embeddings.permute(0, 2, 1))
        conv_out = F.relu(conv_out)
        
        # LSTM Layer
        lstm_out, _ = self.lstm(conv_out.permute(0, 2, 1))
        
        # Linear Layer
        output = self.linear(lstm_out.float())

        return output

In [4]:
t_model = gensim.models.Word2Vec.load('models/full_grams_cbow_100_twitter.mdl')
embedding_dim = t_model.vector_size


In [5]:
def validate(model, val_dataset, val_labels, batch_size=BATCH_SIZE):
    """
    This function implements the validation logic
    Inputs:
    - model: the trained model
    - val_dataset: the validation set
    - batch_size: integer representing the number of examples per step
    """

    # (1) create the dataloader for the validation set (make shuffle=False)
    tensor_val_dataset = TensorDataset(val_dataset, val_labels)
    val_dataloader = DataLoader(tensor_val_dataset, batch_size=batch_size, shuffle=False)

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    total_acc_val = 0

    with torch.no_grad():
        for val_input, val_label in tqdm(val_dataloader):

            # Move the validation input to the device
            val_label = val_label.to(device)

            # Move the validation label to the device
            val_input = val_input.to(device)

            # Do the forward pass
            output = model(val_input).float()

            # Calculate the batch accuracy
            correct_predictions = (output.argmax(dim=2) == val_label)
            acc = correct_predictions.sum().item()
            total_acc_val += acc

    # Calculate metrics for the entire validation set
    val_accuracy = total_acc_val / (len(val_dataset) * len(val_dataset[0]))

    print(f'Validation Accuracy: {val_accuracy} | DER: {1 - val_accuracy}\n')

In [6]:
aravec_embeddings_val_test = []

valid_corpus = readFile(VAL_PATH)

X_val = []
y_val = []

for sentence in valid_corpus:
	# Clean each sentence in the corpus
	# Get the char list for each word in the sentence and its corresponding diacritics
  char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())
  words = [''.join(sublist) for sublist in char_list]
  windows=get_all_windows(' '.join(words), WINDOW_SIZE_BEFORE, WINDOW_SIZE_AFTER)
  for window in windows:
    embeddings = [t_model.wv[clean_str(word)] if clean_str(word) in t_model.wv else np.zeros(embedding_dim) for word in window]
    aravec_embeddings_val_test.append(np.mean(embeddings, axis=0))
  X_val.append(char_list)
  y_val.append(diacritics_list)

X_val_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in X_val for word in sentence ]
X_val_padded = pad_sequence(X_val_padded, batch_first=True)

y_val_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_val for word in sentence ]
y_val_padded = pad_sequence(y_val_padded, batch_first=True, padding_value=PAD)


In [7]:
aravec_embeddings_train = []
corpus=  readFile(TRAIN_PATH)

x_train = []
y_train = []

for sentence in corpus:
	# Clean each sentence in the corpus
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())
	words = [''.join(sublist) for sublist in char_list]
	windows=get_all_windows(' '.join(words), WINDOW_SIZE_BEFORE, WINDOW_SIZE_AFTER)
	for window in windows:
		embeddings = [t_model.wv[clean_str(word)] if clean_str(word) in t_model.wv else np.zeros(embedding_dim) for word in window]
		aravec_embeddings_train.append(np.mean(embeddings, axis=0))
	x_train.append(char_list)
	y_train.append(diacritics_list)

X_train_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in x_train for word in sentence]
X_train_padded = pad_sequence(X_train_padded, batch_first=True)

y_train_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_train for word in sentence]
y_train_padded = pad_sequence(y_train_padded, batch_first=True, padding_value=PAD)

In [8]:
# Load the saved RNN model for inference
loaded_CNN_model = CNN(pretrained_embedding=torch.tensor(aravec_embeddings_val_test+aravec_embeddings_train), freeze_embedding=True)
loaded_CNN_model.load_state_dict(torch.load("./models/cnn.pth", map_location=torch.device('cpu')))
loaded_CNN_model.eval()
validate(loaded_CNN_model, X_val_padded, y_val_padded)

  loaded_CNN_model = CNN(pretrained_embedding=torch.tensor(aravec_embeddings_val_test+aravec_embeddings_train), freeze_embedding=True)
100%|██████████| 1061/1061 [04:40<00:00,  3.78it/s]

Validation Accuracy: 0.32879897895596705 | DER: 0.6712010210440329




