### Imports

In [20]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

%run utils.ipynb

['ث', 'م']
[' ', 'َّ']


### Constants

In [21]:
EMBEDDING_DIM = 200
HIDDEN_SIZE = 512
NUM_LAYERS = 1
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 256

### Model building

In [22]:
class RNN(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS):
        """
        The constructor of our RNN model
        Inputs:
        - vacab_size: the number of unique characters
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (diacritics)
        """
        super(RNN, self).__init__()

        # (1) Create an embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # (3) Create a linear layer with number of neorons = n_classes
        self.linear = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, sentences):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        final_output = None
        
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)
        # final_output = F.softmax(output, dim=1)
        return output

### Train

In [23]:
def train(model, path, train_dataset, train_labels, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, learning_rate=LEARNING_RATE):
    """
    This function implements the training logic
    Inputs:
    - model: the model to be trained
    - train_dataset: the training set
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    # (1) create the dataloader of the training set (make the shuffle=True)
    tensor_train_dataset = TensorDataset(train_dataset, train_labels)
    train_dataloader = DataLoader(tensor_train_dataset, batch_size=batch_size, shuffle=True)

    # (2) make the criterion cross entropy loss
    criterion = torch.nn.CrossEntropyLoss()

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    
    best_accuracy = 0.0
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):
            # Zero your gradients
            optimizer.zero_grad()

            # Move the train input to the device
            train_label = train_label.to(device)

            # Move the train label to the device
            train_input = train_input.to(device)

            # Do the forward pass
            output = model(train_input).float()

            # Loss calculation
            batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))

            # Append the batch loss to the total_loss_train
            total_loss_train += batch_loss.item()
            
            # Calculate the batch accuracy (just add the number of correct predictions)
            # Compare predicted diacritic with true diacritic and count correct predictions
            correct_predictions = (output.argmax(dim=2) == train_label)

            # Calculate accuracy for the current batch
            acc = correct_predictions.sum().item()
            total_acc_train += acc

            # Do the backward pass
            batch_loss.backward()

            # Update the weights with your optimizer
            optimizer.step()     
        
        # Calculate the epoch loss
        epoch_loss = total_loss_train / len(train_dataset)

        # Calculate the accuracy
        epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0]))

        print(f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
            | Train Accuracy: {epoch_acc}\n')
        
        if epoch_acc > best_accuracy:
            best_accuracy = epoch_acc
            torch.save(model.state_dict(), path)
            print(f'Saved the best model with accuracy: {best_accuracy} to {path}\n')


In [27]:
corpus=  readFile(TRAIN_PATH)

x_train = []
y_train = []

for sentence in corpus[:1000]:
	# Clean each sentence in the corpus
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = separate_words_and_diacritics(sentence.strip())

	x_train.append(char_list)
	y_train.append(diacritics_list)

X_train_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in x_train for word in sentence]
X_train_padded = pad_sequence(X_train_padded, batch_first=True)

y_train_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_train for word in sentence]
y_train_padded = pad_sequence(y_train_padded, batch_first=True)

In [25]:
def run():
    model=RNN(len(unique_characters) + 1, len(unique_diacritics))
    print(model)
    train(model, RNN_PATH, X_train_padded, y_train_padded)

In [28]:
# run()

RNN(
  (embedding): Embedding(39, 200)
  (lstm): LSTM(200, 512, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=15, bias=True)
)


  0%|          | 0/179 [00:00<?, ?it/s]

100%|██████████| 179/179 [00:41<00:00,  4.36it/s]


Epochs: 1 | Train Loss: 0.001415177905275926             | Train Accuracy: 0.8779360317349573

Saved the best model with accuracy: 0.8779360317349573 to ./models/rnn.pth



100%|██████████| 179/179 [00:43<00:00,  4.07it/s]


Epochs: 2 | Train Loss: 0.0006746101369265467             | Train Accuracy: 0.9406962952626805

Saved the best model with accuracy: 0.9406962952626805 to ./models/rnn.pth



100%|██████████| 179/179 [00:42<00:00,  4.23it/s]


Epochs: 3 | Train Loss: 0.0005507510701510326             | Train Accuracy: 0.9514706322073935

Saved the best model with accuracy: 0.9514706322073935 to ./models/rnn.pth



100%|██████████| 179/179 [00:43<00:00,  4.16it/s]


Epochs: 4 | Train Loss: 0.0004805724885285704             | Train Accuracy: 0.9571817284787354

Saved the best model with accuracy: 0.9571817284787354 to ./models/rnn.pth



100%|██████████| 179/179 [00:42<00:00,  4.17it/s]


Epochs: 5 | Train Loss: 0.00043040468705587905             | Train Accuracy: 0.9614954500603005

Saved the best model with accuracy: 0.9614954500603005 to ./models/rnn.pth



100%|██████████| 179/179 [00:44<00:00,  3.99it/s]


Epochs: 6 | Train Loss: 0.0003945800209173494             | Train Accuracy: 0.9644636253999262

Saved the best model with accuracy: 0.9644636253999262 to ./models/rnn.pth



100%|██████████| 179/179 [00:44<00:00,  4.02it/s]


Epochs: 7 | Train Loss: 0.00036659487986196067             | Train Accuracy: 0.9667520507121428

Saved the best model with accuracy: 0.9667520507121428 to ./models/rnn.pth



100%|██████████| 179/179 [00:46<00:00,  3.82it/s]


Epochs: 8 | Train Loss: 0.0003424850109016643             | Train Accuracy: 0.9688391424385284

Saved the best model with accuracy: 0.9688391424385284 to ./models/rnn.pth



100%|██████████| 179/179 [00:41<00:00,  4.36it/s]


Epochs: 9 | Train Loss: 0.0003227396972829503             | Train Accuracy: 0.9704577847325353

Saved the best model with accuracy: 0.9704577847325353 to ./models/rnn.pth



100%|██████████| 179/179 [00:42<00:00,  4.23it/s]

Epochs: 10 | Train Loss: 0.000306658795669719             | Train Accuracy: 0.9718352254039131

Saved the best model with accuracy: 0.9718352254039131 to ./models/rnn.pth




