### Imports

In [91]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from TorchCRF import CRF
from sklearn.model_selection import train_test_split

%run updatePreprocessing.ipynb

7
379
قَوْلُهُ لِعَدَمِ مَا تَتَعَلَّقُ إلَخْ أَيْ الْوَصِيَّةُ قَوْلُهُ مَا مَرَّ أَيْ قُبَيْلَ قَوْلِ الْمَتْنِ لَغَتْ وَلَوْ اقْتَصَرَ عَلَى أَوْصَيْت لَهُ بِشَاةٍ أَوْ أَعْطُوهُ شَاةً وَلَا غَنَمَ لَهُ عِنْدَ الْمَوْتِ هَلْ تَبْطُلُ الْوَصِيَّةُ أَوْ يُشْتَرَى لَهُ شَاةٌ وَيُؤْخَذُ مِنْ قَوْلِهِ الْآتِي كَمَا لَوْ لَمْ يَقُلْ مِنْ مَالِي وَلَا مِنْ غَنَمِي أَنَّهَا لَا تَبْطُلُ 
----------------------------
234
 وَعِبَارَةُ الْكَنْزِ وَلَوْ لَمْ يَقُلْ مِنْ مَالِي وَلَا مِنْ غَنَمِي لَمْ يَتَعَيَّنْ غَنَمُهُ إنْ كَانَتْ انْتَهَتْ ا ه سم قَوْلُهُ فَيُعْطَى وَاحِدَةً مِنْهَا إلَخْ كَمَا لَوْ كَانَتْ مَوْجُودَةً عِنْدَ الْوَصِيَّةِ وَالْمَوْتِ 
----------------------------
96
 وَلَا يَجُوزُ أَنْ يُعْطَى وَاحِدَةً مِنْ غَيْرِ غَنَمِهِ فِي الصُّورَتَيْنِ وَإِنْ تَرَاضَيَا 
----------------------------
106
 لِأَنَّهُ صُلْحٌ عَلَى مَجْهُولٍ مُغْنِي وَنِهَايَةٌ قَالَ ع ش قَوْلُهُ وَاحِدَةً مِنْهَا أَيْ كَامِلَةً 
----------------------------
54
 وَلَا يَجُوزُ أَنْ يُعْطَى نِصْفَيْنِ مِنْ ش

### Constants

In [92]:
EMBEDDING_DIM = 200
HIDDEN_SIZE = 512
NUM_LAYERS = 1
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 256
VOCAB_SIZE = len(basic_arabic_letters) + 1
LABELS_SIZE = len(DIACRITICS)

TRAIN_PATH = "../dataset/train.txt"
VAL_PATH = "../dataset/val.txt"
LSTM_PATH="./models/lstm.pth"
RNN_PATH="./models/rnn.pth"
CNN_PATH = "./models/cnn.pth"
CRF_Val_PATH="./models/crf_val.pth"
CRF_PATH="./models/crf.pth"
CNN_val_PATH="./models/cnn_val.pth"

### Model building

### RNN

In [93]:
class RNN(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        """
        The constructor of our RNN model
        Inputs:
        - vacab_size: the number of unique characters
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (diacritics)
        """
        super(RNN, self).__init__()

        # (1) Create an embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        
        # (3) Create a linear layer with number of neorons = n_classes
        self.linear = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, sentences):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        final_output = None
        
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)
        # final_output = F.softmax(output, dim=1)
        return output

### CNN

In [94]:
class CNN(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim=EMBEDDING_DIM):
        super(CNN, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Convolutional layers
        self.conv1 = nn.Conv1d(embedding_dim, 256, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(256, 256, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(256, 256, kernel_size=3, stride=1, padding=1)

        # Max pooling layers
        # self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)

        # Fully connected layers
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, n_classes)

    def forward(self, x):
        print("1", x.shape)
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        print("2", x.shape)
        # Convolutional layers with ReLU activation and max pooling
        x = F.relu(self.conv1(x))
        # x = self.pool(x)
        print("3", x.shape)
        x = F.relu(self.conv2(x))
        # x = self.pool(x)
        print("4", x.shape)
        x = F.relu(self.conv3(x))
        # x = self.pool(x)
        print("5", x.shape)

        # Fully connected layers with ReLU activation
        x = x.view(-1, 128)
        x = F.relu(self.fc1(x))
        print("6", x.shape)
        x = self.fc2(x)
        print("7", x.shape)

        return x

In [95]:
class RNN_CNN(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        super(RNN_CNN, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # CNN layer
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3, padding=1)  # Adjust parameters as needed

        # LSTM layer
        self.lstm = nn.LSTM(64, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Linear layer
        self.linear = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, sentences):
        embeddings = self.embedding(sentences)

        # Pass embeddings through CNN layer
        conv_out = self.conv1d(embeddings.permute(0, 2, 1))  # Reshape for CNN
        conv_out = torch.relu(conv_out)
        conv_out = conv_out.permute(0, 2, 1)  # Reshape back for LSTM

        # Pass CNN output through LSTM layer
        lstm_out, _ = self.lstm(conv_out)

        # Final output layer
        output = self.linear(lstm_out)
        return output


### CRF

In [96]:
class LSTM_CRF(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        super(LSTM_CRF, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # BiLSTM layer
        self.bilstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # CRF layer
        self.crf = CRF(n_classes)

        # Linear layer
        self.linear = nn.Linear(hidden_size * 2, n_classes)
        # Create a CRF layer
        self.crf = CRF(n_classes, batch_first=True)

    def forward(self, sentences):
        # Input x is a sequence of indices
        embedded = self.embedding(sentences)

        # BiLSTM layer
        lstm_out, _ = self.bilstm(embedded)

        # Linear layer for classification
        linear_out = self.linear(lstm_out)
        if labels is not None:
            # Calculate the negative log-likelihood loss using the CRF layer
            loss = self.crf(output, labels)
            return -loss  # Return negative log-likelihood as we usually minimize it during training
        else:
            # If labels are not provided, return the raw output
            return output


In [97]:
class RNN_CRF(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=0.5):
        super(RNN_CRF, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Linear layer
        self.linear = nn.Linear(hidden_size * 2, n_classes)
        self.dropout = nn.Dropout(dropout)  # Apply dropout before the linear layer

        # CRF layer
        self.crf = CRF(n_classes)  # Place the CRF layer after the linear layer

    def forward(self, sentences):
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        dropout_out = self.dropout(lstm_out)  # Apply dropout
        output = self.linear(dropout_out)
        return output  # Return raw output for CRF loss calculation

    def predict(self, sentences):
        output = self.forward(sentences)
        predictions = self.crf.decode(output)
        return predictions


In [None]:
class RNN_CRF_MultiLayer(nn.Module):
    def __init__(self, vocab_size, word_vocab_size, n_classes, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=0.5):
        super(RNN_CRF_MultiLayer, self).__init__()

        # Word embedding layer
        self.word_embedding = nn.Embedding(word_vocab_size, embedding_dim)

        # Word LSTM layer
        self.word_lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Character embedding layer (already exists)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Character LSTM layer (already exists)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # Linear layer after concatenation
        self.linear = nn.Linear(hidden_size * 4, n_classes)  # Combined output from both LSTMs
        self.dropout = nn.Dropout(dropout)

        # CRF layer
        self.crf = CRF(n_classes)

    def forward(self, sentences, words):
        # Word layer processing
        word_embeddings = self.word_embedding(words)
        word_lstm_out, _ = self.word_lstm(word_embeddings)

        # Character layer processing
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)

        # Concatenate outputs
        combined_output = torch.cat([lstm_out, word_lstm_out], dim=2)

        # Linear transformation and CRF
        dropout_out = self.dropout(combined_output)
        output = self.linear(dropout_out)
        return output


### Train

In [98]:

def train(model, path,val_path, train_dataset, train_labels, val_dataset,val_labels,batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, learning_rate=LEARNING_RATE):
    """
    This function implements the training logic
    Inputs:
    - model: the model to be trained
    - train_dataset: the training set
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    # (1) create the dataloader of the training set (make the shuffle=True)
    tensor_train_dataset = TensorDataset(train_dataset, train_labels)
    train_dataloader = DataLoader(tensor_train_dataset, batch_size=batch_size, shuffle=True)
    tensor_val_dataset = TensorDataset(val_dataset, val_labels)
    val_dataloader = DataLoader(tensor_val_dataset, batch_size=batch_size, shuffle=False)

    # (2) make the criterion cross entropy loss
    criterion = torch.nn.CrossEntropyLoss()

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # (4) create a learning rate scheduler (optional but recommended)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # Adjust parameters as needed

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    
    best_accuracy = 0.0
    best_accuracy_val=0.0
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):
            # Zero your gradients
            optimizer.zero_grad()

            # Move the train input to the device
            train_label = train_label.to(device)

            # Move the train label to the device
            train_input = train_input.to(device)

            # Do the forward pass
            output = model(train_input).float()

            # Loss calculation
            batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))

            # Append the batch loss to the total_loss_train
            total_loss_train += batch_loss.item()
            
            # Calculate the batch accuracy (just add the number of correct predictions)
            # Compare predicted diacritic with true diacritic and count correct predictions
            correct_predictions = (output.argmax(dim=2) == train_label)

            # Calculate accuracy for the current batch
            acc = correct_predictions.sum().item()
            total_acc_train += acc

            # Do the backward pass
            batch_loss.backward()

            # Update the weights with your optimizer
            optimizer.step()     
        # Step the learning rate scheduler
        scheduler.step()
        # Calculate the epoch loss
        epoch_loss = total_loss_train / len(train_dataset)

        # Calculate the accuracy
        epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0]))

        print(f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
            | Train Accuracy: {epoch_acc}\n')
        
        if epoch_acc > best_accuracy:
            best_accuracy = epoch_acc
            torch.save(model.state_dict(), path)
            print(f'Saved the best model with accuracy: {best_accuracy} to {path}\n')
         # Validation
        model.eval()  # Set the model to evaluation mode
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in tqdm(val_dataloader):
                val_label = val_label.to(device)
                val_input = val_input.to(device)

                output = model(val_input)
                batch_loss = criterion(output.view(-1, output.shape[-1]), val_label.view(-1))
                total_loss_val += batch_loss.item()

                correct_predictions = (output.argmax(dim=2) == val_label)
                acc = correct_predictions.sum().item()
                total_acc_val += acc

        epoch_loss_val = total_loss_val / len(val_dataloader)
        epoch_acc_val = total_acc_val / (len(val_dataset) * len(val_dataset[0]))

        print(f'Epochs: {epoch_num + 1} | Validation Loss: {epoch_loss_val} | Validation Accuracy: {epoch_acc_val}')

        if epoch_acc_val > best_accuracy_val:
            best_accuracy_val = epoch_acc_val
            torch.save(model.state_dict(), val_path)
            print(f'Saved the best model with validation accuracy: {best_accuracy_val} to {val_path}')
        model.train()


In [99]:
corpus=  readFile(TRAIN_PATH)

x_train = []
y_train = []

for sentence in corpus[:1000]:
	# Clean each sentence in the corpus
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())

	x_train.append(char_list)
	y_train.append(diacritics_list)

X_train_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in x_train for word in sentence]
X_train_padded = pad_sequence(X_train_padded, batch_first=True)

y_train_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_train for word in sentence]
y_train_padded = pad_sequence(y_train_padded, batch_first=True)

In [100]:
valid_corpus = readFile(VAL_PATH)

X_val = []
y_val = []

for sentence in valid_corpus[:50]:
	# Clean each sentence in the corpus
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())

	X_val.append(char_list)
	y_val.append(diacritics_list)

X_val_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in X_val for word in sentence ]
X_val_padded = pad_sequence(X_val_padded, batch_first=True)

y_val_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_val for word in sentence ]
# print(y_val_padded)
y_val_padded = pad_sequence(y_val_padded, batch_first=True)
# print(y_val_padded)

In [101]:
# Create an index array
indices = list(range(len(X_val_padded)))

# Split the indices into validation and test sets
indices_val, indices_test = train_test_split(indices, test_size=0.5, random_state=42)

# Use the indices to get the corresponding data for validation and test sets
x_val = X_val_padded[indices_val]
y_val = y_val_padded[indices_val]

x_test = X_val_padded[indices_test]
y_test = y_val_padded[indices_test]

In [102]:
def run_RNN():
    model=RNN(VOCAB_SIZE, LABELS_SIZE)
    print(model)
    train(model, LSTM_PATH, X_train_padded, y_train_padded)
    
def run_CNN():
    model=CNN(VOCAB_SIZE, LABELS_SIZE)
    print(model)
    train(model, CNN_PATH, X_train_padded, y_train_padded)
    
def run_CNN_eslam():
    model=RNN_CNN(VOCAB_SIZE, LABELS_SIZE)
    print(model)
    train(model,CNN_PATH,CNN_val_PATH, X_train_padded, y_train_padded,X_val,y_val)
    
def run_CRF():
    model=LSTM_CRF(VOCAB_SIZE, LABELS_SIZE)
    print(model)
    train(model, CNN_PATH, X_train_padded, y_train_padded)
def run_CRF_eslam():
    model=RNN_CRF(VOCAB_SIZE, LABELS_SIZE)
    print(model)
    train(model,CRF_PATH,CRF_Val_PATH, X_train_padded, y_train_padded,x_val,y_val)

In [104]:
# run_RNN()
# run_CNN()
# run_CRF()
run_CRF_eslam()
# run_CNN_eslam()

RNN_CRF(
  (embedding): Embedding(37, 200)
  (lstm): LSTM(200, 512, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=15, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (crf): CRF()
)


  0%|          | 0/179 [00:00<?, ?it/s]

100%|██████████| 179/179 [01:03<00:00,  2.83it/s]


Epochs: 1 | Train Loss: 0.001466312140909259             | Train Accuracy: 0.8741685022575276

Saved the best model with accuracy: 0.8741685022575276 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  7.54it/s]


Epochs: 1 | Validation Loss: 0.23451324701309204 | Validation Accuracy: 0.9177075098814229
Saved the best model with validation accuracy: 0.9177075098814229 to ./models/crf_val.pth


100%|██████████| 179/179 [01:12<00:00,  2.48it/s]


Epochs: 2 | Train Loss: 0.0007669565973934302             | Train Accuracy: 0.9321884562099451

Saved the best model with accuracy: 0.9321884562099451 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  6.60it/s]


Epochs: 2 | Validation Loss: 0.18367249071598052 | Validation Accuracy: 0.9339920948616601
Saved the best model with validation accuracy: 0.9339920948616601 to ./models/crf_val.pth


100%|██████████| 179/179 [01:12<00:00,  2.48it/s]


Epochs: 3 | Train Loss: 0.000641517197369562             | Train Accuracy: 0.9425262381517178

Saved the best model with accuracy: 0.9425262381517178 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  7.48it/s]


Epochs: 3 | Validation Loss: 0.1627883493900299 | Validation Accuracy: 0.9410276679841897
Saved the best model with validation accuracy: 0.9410276679841897 to ./models/crf_val.pth


100%|██████████| 179/179 [01:10<00:00,  2.55it/s]


Epochs: 4 | Train Loss: 0.000573394979259291             | Train Accuracy: 0.9488473153860721

Saved the best model with accuracy: 0.9488473153860721 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  7.29it/s]


Epochs: 4 | Validation Loss: 0.1523817628622055 | Validation Accuracy: 0.9454545454545454
Saved the best model with validation accuracy: 0.9454545454545454 to ./models/crf_val.pth


100%|██████████| 179/179 [01:11<00:00,  2.52it/s]


Epochs: 5 | Train Loss: 0.0005251803545559228             | Train Accuracy: 0.9528719937008502

Saved the best model with accuracy: 0.9528719937008502 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  7.39it/s]


Epochs: 5 | Validation Loss: 0.14708330631256103 | Validation Accuracy: 0.945296442687747


100%|██████████| 179/179 [01:13<00:00,  2.44it/s]


Epochs: 6 | Train Loss: 0.0004683776966195765             | Train Accuracy: 0.958008990242298

Saved the best model with accuracy: 0.958008990242298 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  6.84it/s]


Epochs: 6 | Validation Loss: 0.14153141975402833 | Validation Accuracy: 0.9480632411067194
Saved the best model with validation accuracy: 0.9480632411067194 to ./models/crf_val.pth


100%|██████████| 179/179 [01:16<00:00,  2.35it/s]


Epochs: 7 | Train Loss: 0.0004543478700848293             | Train Accuracy: 0.9593445694750377

Saved the best model with accuracy: 0.9593445694750377 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  6.88it/s]


Epochs: 7 | Validation Loss: 0.14015962779521943 | Validation Accuracy: 0.9490909090909091
Saved the best model with validation accuracy: 0.9490909090909091 to ./models/crf_val.pth


100%|██████████| 179/179 [01:16<00:00,  2.34it/s]


Epochs: 8 | Train Loss: 0.00044747963866282025             | Train Accuracy: 0.959779131076138

Saved the best model with accuracy: 0.959779131076138 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  6.09it/s]


Epochs: 8 | Validation Loss: 0.13921998739242553 | Validation Accuracy: 0.9490909090909091


100%|██████████| 179/179 [01:13<00:00,  2.43it/s]


Epochs: 9 | Train Loss: 0.0004381850323431613             | Train Accuracy: 0.9608376274531302

Saved the best model with accuracy: 0.9608376274531302 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  7.27it/s]


Epochs: 9 | Validation Loss: 0.13860681354999543 | Validation Accuracy: 0.9490118577075098


100%|██████████| 179/179 [01:11<00:00,  2.51it/s]


Epochs: 10 | Train Loss: 0.0004343780059652911             | Train Accuracy: 0.9609791589837637

Saved the best model with accuracy: 0.9609791589837637 to ./models/crf.pth



100%|██████████| 5/5 [00:00<00:00,  7.55it/s]

Epochs: 10 | Validation Loss: 0.13853615820407866 | Validation Accuracy: 0.9496442687747035
Saved the best model with validation accuracy: 0.9496442687747035 to ./models/crf_val.pth





In [105]:
# Download via terminal commands
!wget "https://bakrianoo.sfo2.digitaloceanspaces.com/aravec/full_grams_cbow_100_twitter.zip"
!unzip "full_grams_cbow_100_twitter.zip"

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.
