### Imports

In [15]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

%run train.ipynb

15
Char [['m', 's', '>', 'l', 'p'], ['w', 'm', 'n'], ['H', 'n', 'v'], ['w', 'h', 'w'], ['q', 'A', 'd', 'r'], ['E', 'l', 'Y'], ['A', 'l', '<', 'T', 'E', 'A', 'm'], ['>', 'w'], ['A', 'l', 'k', 's', 'w', 'p'], ['>', 'w'], ['A', 'l', 'E', 't', 'q'], ['v', 'm'], ['A', 'f', 't', 'q', 'r'], ['f', 'E', 'j', 'z'], ['E', 'n'], ['k', 'l'], ['*', 'l', 'k'], ['l', 'm'], ['y', 'j', 'z', 'h'], ['A', 'l', 'S', 'w', 'm'], ['>', 'S', 'l', 'A']]
Diac [['a', 'o', 'a', 'a', 'N'], ['a', 'a', 'o'], ['a', 'i', 'a'], ['a', 'u', 'a'], ['a', ' ', 'i', 'N'], ['a', 'a', ' '], [' ', ' ', 'i', 'o', 'a', ' ', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'a', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'i'], ['u', '~a'], [' ', 'o', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], ['a', 'o'], ['u', '~i'], ['a', 'i', 'a'], ['a', 'o'], ['u', 'o', 'i', 'i'], [' ', ' ', '~a', 'o', 'u'], ['a', 'o', 'F', ' ']]
38
15
[tensor([2, 1, 5, 5]), tensor([2, 1]), tensor([2, 2, 2]), tensor([0, 1, 2, 9, 5]), tensor([2, 2, 5]), tensor([0, 2, 1]), tensor([2, 

### Validation

In [16]:
def validate(model, val_dataset, val_labels, batch_size=512):
    """
    This function implements the validation logic
    Inputs:
    - model: the trained model
    - val_dataset: the validation set
    - batch_size: integer representing the number of examples per step
    """

    # (1) create the dataloader for the validation set (make shuffle=False)
    tensor_val_dataset = TensorDataset(val_dataset, val_labels)
    val_dataloader = DataLoader(tensor_val_dataset, batch_size=batch_size, shuffle=False)

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    # model.eval()  # Set the model to evaluation mode

    total_acc_val = 0

    with torch.no_grad():
        for val_input, val_label in tqdm(val_dataloader):

            # (2) move the validation input to the device
            val_label = val_label.to(device)

            # (3) move the validation label to the device
            val_input = val_input.to(device)

            # (4) do the forward pass
            output = model(val_input).float()

            # (7) calculate the batch accuracy
            correct_predictions = (output.argmax(dim=2) == val_label)
            acc = correct_predictions.sum().item()
            total_acc_val += acc

    # Calculate metrics for the entire validation set
    val_accuracy = total_acc_val / (len(val_dataset) * len(val_dataset[0]))

    print(f'Validation Accuracy: {val_accuracy} | DER: {1 - val_accuracy}\n')


### Prepare the data

In [17]:
valid_corpus = readFile(VAL_PATH)

X_val = []
y_val = []

for sentence in valid_corpus:
	# Clean each sentence in the corpus
	clean_sentence = run_buckwalter(sentence.strip())
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = extract_labels(clean_sentence)

	X_val.append(char_list)
	y_val.append(diacritics_list)

X_val_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in X_val for word in sentence ]
X_val_padded = pad_sequence(X_val_padded, batch_first=True)

y_val_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_val for word in sentence ]
print(y_val_padded)
y_val_padded = pad_sequence(y_val_padded, batch_first=True)
print(y_val_padded)

[tensor([2, 1, 5, 5]), tensor([2, 2, 0]), tensor([5, 1, 2, 5]), tensor([3, 2, 0, 2, 5, 5]), tensor([0, 1, 2, 1, 5]), tensor([0, 0, 9, 0, 3, 5]), tensor([2, 0, 0, 9, 2, 0, 5, 0, 2]), tensor([2, 1, 2]), tensor([2, 0, 3, 2, 3]), tensor([ 2,  2, 11,  3]), tensor([0, 1, 5, 1, 3]), tensor([2, 2, 0]), tensor([2, 2, 3, 3]), tensor([5, 0, 2]), tensor([2, 1, 3, 3]), tensor([2, 1]), tensor([2, 1, 3, 3]), tensor([5, 0, 2]), tensor([2, 2, 3, 3]), tensor([2, 2, 1, 2]), tensor([2, 0, 3, 2, 3]), tensor([ 2,  2, 11,  3,  3]), tensor([2, 2, 0]), tensor([0, 0, 9, 2, 3]), tensor([2, 0, 0, 9, 1, 3]), tensor([2, 3, 0, 7, 0]), tensor([2, 2, 1, 3, 0, 5, 5]), tensor([2, 9]), tensor([0, 1, 5, 1, 2]), tensor([0, 1]), tensor([2, 0, 2]), tensor([2, 5]), tensor([2, 2, 6]), tensor([3, 2, 1, 3]), tensor([2, 1, 8]), tensor([2, 2, 2, 9, 2]), tensor([2, 2, 1, 3]), tensor([2, 0]), tensor([5, 1, 2, 2, 5]), tensor([2, 1]), tensor([2, 0, 2]), tensor([2, 5]), tensor([2, 2, 2, 0, 3]), tensor([2, 1]), tensor([2, 1, 2, 0, 6]), 

### Load the model

In [18]:
# Load the saved RNN model for inference
loaded_rnn_model = RNN(len(unique_characters) + 1, len(unique_diacritics), embedding_dim=200, hidden_size=256, num_layers=3)
loaded_rnn_model.load_state_dict(torch.load('./models/lstm.pth'))
loaded_rnn_model.eval()

print(loaded_rnn_model)

RNN(
  (embedding): Embedding(39, 200)
  (lstm): LSTM(200, 256, num_layers=3, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=15, bias=True)
)


In [19]:
validate(loaded_rnn_model, X_val_padded, y_val_padded, batch_size=512)

100%|██████████| 208/208 [00:40<00:00,  5.19it/s]

Validation Accuracy: 0.7896421267718384 | DER: 0.21035787322816157




