### Imports

In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import pickle

%run utils.ipynb

15
Char [['m', 's', '>', 'l', 'p'], ['w', 'm', 'n'], ['H', 'n', 'v'], ['w', 'h', 'w'], ['q', 'A', 'd', 'r'], ['E', 'l', 'Y'], ['A', 'l', '<', 'T', 'E', 'A', 'm'], ['>', 'w'], ['A', 'l', 'k', 's', 'w', 'p'], ['>', 'w'], ['A', 'l', 'E', 't', 'q'], ['v', 'm'], ['A', 'f', 't', 'q', 'r'], ['f', 'E', 'j', 'z'], ['E', 'n'], ['k', 'l'], ['*', 'l', 'k'], ['l', 'm'], ['y', 'j', 'z', 'h'], ['A', 'l', 'S', 'w', 'm'], ['>', 'S', 'l', 'A']]
Diac [['a', 'o', 'a', 'a', 'N'], ['a', 'a', 'o'], ['a', 'i', 'a'], ['a', 'u', 'a'], ['a', ' ', 'i', 'N'], ['a', 'a', ' '], [' ', ' ', 'i', 'o', 'a', ' ', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'a', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'i'], ['u', '~a'], [' ', 'o', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], ['a', 'o'], ['u', '~i'], ['a', 'i', 'a'], ['a', 'o'], ['u', 'o', 'i', 'i'], [' ', ' ', '~a', 'o', 'u'], ['a', 'o', 'F', ' ']]
38
15


### Model building

In [2]:
class RNN(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim = 200, hidden_size = 256, num_layers=3):
        """
        The constructor of our RNN model
        Inputs:
        - vacab_size: the number of unique characters
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (diacritics)
        """
        super(RNN, self).__init__()

        # (1) Create an embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # (3) Create a linear layer with number of neorons = n_classes
        self.linear = nn.Linear(hidden_size*2, n_classes)

    def forward(self, sentences):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        final_output = None
        
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)
        final_output = F.softmax(output, dim=1)
        return final_output
    

class RNN_FastText(nn.Module):
    def __init__(self, vocab_size, n_classes, embedding_dim = 200, hidden_size = 256, num_layers=3):
        """
        The constructor of our RNN model
        Inputs:
        - vacab_size: the number of unique characters
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (diacritics)
        """
        super(RNN, self).__init__()

        # Load FastText model
        from gensim.models import FastText
        fasttext_model = FastText.load("./models/ft_model")

        # Extract word embeddings from FastText model
        words = fasttext_model.wv.index_to_key
        weights = [fasttext_model.wv.get_vector(word) for word in words]
        pre_trained_embeddings = torch.tensor(weights)

        # (1) Create the embedding layer and load pre-trained word embeddings
        self.embedding = nn.Embedding.from_pretrained(pre_trained_embeddings, freeze=True)

        # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

        # (3) Create a linear layer with number of neorons = n_classes
        self.linear = nn.Linear(hidden_size*2, n_classes)

    def forward(self, sentences):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        final_output = None
        
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        output = self.linear(lstm_out)
        final_output = F.softmax(output, dim=1)
        return final_output

In [3]:
save_path="./models/lstm.pth"

### Train

In [4]:
def train(model, train_dataset, train_labels, batch_size=512, epochs=20, learning_rate=0.001):
    """
    This function implements the training logic
    Inputs:
    - model: the model to be trained
    - train_dataset: the training set
    - batch_size: integer represents the number of examples per step
    - epochs: integer represents the total number of epochs (full training pass)
    - learning_rate: the learning rate to be used by the optimizer
    """

    # (1) create the dataloader of the training set (make the shuffle=True)
    tensor_train_dataset = TensorDataset(train_dataset, train_labels)
    train_dataloader = DataLoader(tensor_train_dataset, batch_size=batch_size, shuffle=True)

    # (2) make the criterion cross entropy loss
    criterion = torch.nn.CrossEntropyLoss()

    # (3) create the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # GPU configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    
    best_accuracy = 0.0
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):

            # (4) move the train input to the device
            train_label = train_label.to(device)

            # (5) move the train label to the device
            train_input = train_input.to(device)

            # (6) do the forward pass
            output = model(train_input).float()

            # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
            one_hot_train_label = F.one_hot(train_label, num_classes=15).float()
            batch_loss = criterion(output, one_hot_train_label)

            # (8) append the batch loss to the total_loss_train
            total_loss_train += batch_loss
            
            # (9) calculate the batch accuracy (just add the number of correct predictions)
            # Compare predicted diacritic with true diacritic and count correct predictions
            correct_predictions = (output.argmax(dim=2) == train_label)

            # Calculate accuracy for the current batch
            acc = correct_predictions.sum().item()

            total_acc_train += acc

            # (10) zero your gradients
            optimizer.zero_grad()

            # (11) do the backward pass
            batch_loss.backward()

            # (12) update the weights with your optimizer
            optimizer.step()     
        
        # epoch loss
        epoch_loss = total_loss_train / len(train_dataset)

        # (13) calculate the accuracy
        epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0]))
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
            | Train Accuracy: {epoch_acc}\n')
        if epoch_acc > best_accuracy:
            best_accuracy = epoch_acc
            torch.save(model.state_dict(), save_path)
            print(f'Saved the best model with accuracy: {best_accuracy} to {save_path}')


In [5]:
corpus=  readFile(TRAIN_PATH)
valid_corpus = readFile(VAL_PATH)

X_train = []
y_train = []

max_sequence_length = 0

for sentence in corpus[:100]:
	# Clean each sentence in the corpus
	clean_sentence = run_buckwalter(sentence.strip())
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = extract_labels(clean_sentence)

	X_train.append(char_list)
	y_train.append(diacritics_list)

	# Get the max sequence length and concatenate the embeddings of the words
	for word in char_list:
		max_sequence_length = max(max_sequence_length, len(word))

X_train_padded = [torch.tensor([char_to_index[char] for char in word]) for sentence in X_train for word in sentence ]
X_train_padded = pad_sequence(X_train_padded, batch_first=True)

y_train_padded = [torch.tensor([diacritic_to_index[char] for char in word]) for sentence in y_train for word in sentence ]
print(y_train_padded)
y_train_padded = pad_sequence(y_train_padded, batch_first=True)
print(y_train_padded)

[tensor([2, 1, 5, 5]), tensor([2, 1]), tensor([2, 2, 2]), tensor([0, 1, 2, 9, 5]), tensor([2, 2, 5]), tensor([0, 2, 1]), tensor([2, 0, 2]), tensor([ 0,  0,  9,  1,  2,  3, 11]), tensor([0, 1, 5]), tensor([2, 2, 2, 2]), tensor([2, 1, 5, 5]), tensor([3, 2, 1, 8]), tensor([2, 1, 2, 3, 0, 0]), tensor([2, 3, 1, 2, 0, 3]), tensor([2, 1, 3]), tensor([2, 3, 0, 8]), tensor([3, 0, 1, 3, 1, 2, 0, 3]), tensor([5, 5, 0, 2]), tensor([2, 0]), tensor([5, 3, 2]), tensor([5, 5, 0, 5, 5]), tensor([3, 1]), tensor([ 0,  0, 10,  0,  3]), tensor([2, 5, 0, 2, 7]), tensor([2, 3, 1, 2, 0, 3]), tensor([5, 1, 2, 8]), tensor([3, 2, 2, 8]), tensor([ 2,  2, 10]), tensor([5, 9, 0, 8]), tensor([0, 1, 5]), tensor([2, 2, 2, 2]), tensor([2, 1, 5]), tensor([0, 1, 3]), tensor([2, 0, 8]), tensor([2, 1]), tensor([3, 3, 1, 8]), tensor([2, 2, 2, 9, 5, 5]), tensor([5, 2]), tensor([2, 5, 1, 3]), tensor([ 0,  0, 11,  9,  0,  3]), tensor([2, 3, 1, 2, 0, 3]), tensor([0, 1, 5, 1, 2, 3]), tensor([3, 0]), tensor([2, 3, 0, 3]), tensor(

In [6]:


model=RNN(len(unique_characters) + 1, len(unique_diacritics), embedding_dim=200, hidden_size=256, num_layers=3)
print(model)

RNN(
  (embedding): Embedding(39, 200)
  (lstm): LSTM(200, 256, num_layers=3, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=15, bias=True)
)


In [7]:
train(model, X_train_padded, y_train_padded, batch_size=512, epochs=20, learning_rate=0.01)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 9/9 [00:06<00:00,  1.50it/s]


Epochs: 1 | Train Loss: 0.003420752240344882             | Train Accuracy: 0.516653866364849

Saved the best model with accuracy: 0.516653866364849 to ./models/lstm.pth


100%|██████████| 9/9 [00:06<00:00,  1.49it/s]


Epochs: 2 | Train Loss: 0.003307867096737027             | Train Accuracy: 0.7237560127733538

Saved the best model with accuracy: 0.7237560127733538 to ./models/lstm.pth


100%|██████████| 9/9 [00:06<00:00,  1.39it/s]


Epochs: 3 | Train Loss: 0.0032837274484336376             | Train Accuracy: 0.7575892315776709

Saved the best model with accuracy: 0.7575892315776709 to ./models/lstm.pth


100%|██████████| 9/9 [00:06<00:00,  1.30it/s]


Epochs: 4 | Train Loss: 0.0032768594101071358             | Train Accuracy: 0.7621973402320223

Saved the best model with accuracy: 0.7621973402320223 to ./models/lstm.pth


100%|██████████| 9/9 [00:07<00:00,  1.23it/s]


Epochs: 5 | Train Loss: 0.0032701913733035326             | Train Accuracy: 0.7691499252192894

Saved the best model with accuracy: 0.7691499252192894 to ./models/lstm.pth


100%|██████████| 9/9 [00:07<00:00,  1.17it/s]


Epochs: 6 | Train Loss: 0.0032661284785717726             | Train Accuracy: 0.7771332713529245

Saved the best model with accuracy: 0.7771332713529245 to ./models/lstm.pth


100%|██████████| 9/9 [00:08<00:00,  1.10it/s]


Epochs: 7 | Train Loss: 0.0032622257713228464             | Train Accuracy: 0.7846113424148107

Saved the best model with accuracy: 0.7846113424148107 to ./models/lstm.pth


100%|██████████| 9/9 [00:08<00:00,  1.10it/s]


Epochs: 8 | Train Loss: 0.003260076278820634             | Train Accuracy: 0.7847932414406403

Saved the best model with accuracy: 0.7847932414406403 to ./models/lstm.pth


100%|██████████| 9/9 [00:08<00:00,  1.05it/s]


Epochs: 9 | Train Loss: 0.0032576341181993484             | Train Accuracy: 0.7876632038481749

Saved the best model with accuracy: 0.7876632038481749 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.14s/it]


Epochs: 10 | Train Loss: 0.0032558543607592583             | Train Accuracy: 0.7881078459113141

Saved the best model with accuracy: 0.7881078459113141 to ./models/lstm.pth


100%|██████████| 9/9 [00:09<00:00,  1.06s/it]


Epochs: 11 | Train Loss: 0.0032544054556638002             | Train Accuracy: 0.7908767533044989

Saved the best model with accuracy: 0.7908767533044989 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.13s/it]


Epochs: 12 | Train Loss: 0.0032531784381717443             | Train Accuracy: 0.7910788633331985

Saved the best model with accuracy: 0.7910788633331985 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.12s/it]


Epochs: 13 | Train Loss: 0.0032515886705368757             | Train Accuracy: 0.7925946885484457

Saved the best model with accuracy: 0.7925946885484457 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.17s/it]


Epochs: 14 | Train Loss: 0.003250658279284835             | Train Accuracy: 0.7940296697522131

Saved the best model with accuracy: 0.7940296697522131 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.21s/it]


Epochs: 15 | Train Loss: 0.003249876433983445             | Train Accuracy: 0.7941307247665629

Saved the best model with accuracy: 0.7941307247665629 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.18s/it]


Epochs: 16 | Train Loss: 0.0032492466270923615             | Train Accuracy: 0.7930393306115849



100%|██████████| 9/9 [00:09<00:00,  1.11s/it]


Epochs: 17 | Train Loss: 0.0032454465981572866             | Train Accuracy: 0.7941307247665629



100%|██████████| 9/9 [00:09<00:00,  1.11s/it]


Epochs: 18 | Train Loss: 0.0032434891909360886             | Train Accuracy: 0.7951210639071911

Saved the best model with accuracy: 0.7951210639071911 to ./models/lstm.pth


100%|██████████| 9/9 [00:09<00:00,  1.07s/it]


Epochs: 19 | Train Loss: 0.003241664730012417             | Train Accuracy: 0.798415457374995

Saved the best model with accuracy: 0.798415457374995 to ./models/lstm.pth


100%|██████████| 9/9 [00:10<00:00,  1.16s/it]

Epochs: 20 | Train Loss: 0.003240669844672084             | Train Accuracy: 0.8019523828772384

Saved the best model with accuracy: 0.8019523828772384 to ./models/lstm.pth



