In this notebook, we train the RNN model that predicts the pronounciation (as phonemes) given some input word.

In [0]:
import json
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import numpy as np
import random

import torch
import torch.nn as nn
from torch import optim
from torch.nn.functional import relu

Use a CUDA device if available.

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preparing the Data

We mount our Google Drive to access the data files from within this notebook.

In [0]:
from google.colab import drive

mounted_folder_fp = '/content/drive'
drive.mount(mounted_folder_fp, force_remount=True)

We then load all the JSON files we need from the mounted drive.

In [0]:
with open('{}/My Drive/Colab Notebooks/hola/data/cmudict-char-ids.json'.format(mounted_folder_fp), 'r', encoding='ascii') as f:
    char_ids_dict = json.load(f)

with open('{}/My Drive/Colab Notebooks/hola/data/cmudict-phoneme-ids.json'.format(mounted_folder_fp), 'r', encoding='ascii') as f:
    phoneme_ids_dict = json.load(f)

with open('{}/My Drive/Colab Notebooks/hola/data/cmudict-processed.json'.format(mounted_folder_fp), 'r', encoding='ascii') as f:
    cmudict_pairs = json.load(f)

We find the length of the longest word/list of phonemes to define the length of our output PyTorch tensors later.

In [0]:
max_length = max([max(len(pair[0]), len(pair[1])) for pair in cmudict_pairs])

We split our pairs into training and testing sets.

In [0]:
train_proportion = 0.9

random.shuffle(cmudict_pairs)
cmudict_train = cmudict_pairs[:int(train_proportion * len(cmudict_pairs))]
cmudict_test = cmudict_pairs[int(train_proportion * len(cmudict_pairs)):]

# Designing the Model

For model architecture, we use a vanilla encoder-decoder RNN model with GRUs.

In [0]:
class Encoder(nn.Module):
    def __init__(self, num_inputs, hidden_size):
        super(Encoder, self).__init__()

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_inputs, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        output = torch.reshape(self.embedding(input), (1, 1, -1))
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [0]:
class Decoder(nn.Module):
    def __init__(self, num_outputs, hidden_size):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_outputs, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, num_outputs)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = torch.reshape(self.embedding(input), (1, 1, -1))
        output = relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Training the Model

The `SOS` and `EOS` tokens will be important later during training, so we give them their special variables now.

In [0]:
SOS_token = char_ids_dict['SOS']
EOS_token = char_ids_dict['EOS']

For training, we invoke teacher forcing at random in order to speed up training.

In [0]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optim, decoder_optim, loss_fn):
    loss = 0

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_optim.zero_grad()
    decoder_optim.zero_grad()

    encoder_hidden = encoder.init_hidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for i in range(input_length):
        encoder_input = input_tensor[i]
        encoder_output, encoder_hidden = encoder(encoder_input, encoder_hidden)
        encoder_outputs[i] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    if random.random() < teacher_forcing_ratio:  # if True, invoke teacher forcing
        for i in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_input = target_tensor[i]
            loss += loss_fn(decoder_output, target_tensor[i])
    else:
        for i in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            _, top_index = decoder_output.topk(1)
            decoder_input = top_index.squeeze().detach()
            loss += loss_fn(decoder_output, target_tensor[i])

            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optim.step()
    decoder_optim.step()

    return loss.item() / target_length

We want to repeat this training process over our entire training set, simultaneously plotting loss over time.

In [0]:
plt.switch_backend('agg')

def plot_losses(losses_over_time):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)  # puts ticks at regular intervals
    ax.yaxis.set_major_locator(loc)
    plt.plot(losses_over_time)

In [0]:
def train_epoch(encoder, decoder, plot_every=1000, learning_rate=0.01):
    losses_over_time = []
    curr_loss = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    loss_fn = nn.NLLLoss()

    for i in range(len(cmudict_train)):
        input_char_ids, target_phoneme_ids = cmudict_train[i]
        input_tensor = torch.tensor(input_char_ids, device=device).view(-1, 1)
        target_tensor = torch.tensor(target_phoneme_ids, device=device).view(-1, 1)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_fn)
        curr_loss += loss

        if i != 0 and i % plot_every == 0:
            avg_curr_loss = curr_loss / plot_every
            losses_over_time.append(avg_curr_loss)
            curr_loss = 0
            print('ITER {} ({:.2f}%) LOSS: {:.4f}'.format(i, i / len(cmudict_train) * 100, avg_curr_loss))

    plot_losses(losses_over_time)

# Evaluating the Model

It will be important to have a `phoneme_id` to `phoneme` dict, so we set that up now.

In [0]:
phoneme_ids_inv_dict = {phoneme_id: phoneme for phoneme, phoneme_id in phoneme_ids_dict.items()}

We use accuracy as the metric to evaluate our model.

In [0]:
def predict(input_tensor, encoder, decoder):
    decoded_phonemes = []
    with torch.no_grad():
        input_length = input_tensor.size(0)

        encoder_hidden = encoder.init_hidden()
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for i in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
            encoder_outputs[i] = encoder_output[0, 0]  # = or +=?

        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden

        for i in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            _, top_index = decoder_output.topk(1)
            decoder_input = top_index.squeeze().detach()
            if decoder_input.item() == EOS_token:
                decoded_phonemes.append(phoneme_ids_inv_dict[EOS_token])
                break
            else:
                decoded_phonemes.append(phoneme_ids_inv_dict[decoder_input.item()])

    return decoded_phonemes

In [0]:
def evaluate(encoder, decoder):
    correct = 0
    with torch.no_grad():
        for i in range(len(cmudict_test)):
            input_char_ids, target_phoneme_ids = cmudict_test[i]
            input_tensor = torch.tensor(input_char_ids, device=device).view(-1, 1)

            predicted_phonemes = predict(input_tensor, encoder, decoder)
            actual_phonemes = [phoneme_ids_inv_dict[phoneme_id] for phoneme_id in target_phoneme_ids]
            if predicted_phonemes == actual_phonemes:
                correct += 1

    print('ACCURACY: {:.4f}'.format(correct / len(cmudict_test)))

# Putting it all Together!

In [18]:
hidden_size = 64

encoder = Encoder(len(cmudict_train), hidden_size).to(device)
decoder = Decoder(len(cmudict_train), hidden_size).to(device)

train_epoch(encoder, decoder)

ITER 1000 (0.83%) LOSS: 3.6251
ITER 2000 (1.66%) LOSS: 3.0569
ITER 3000 (2.49%) LOSS: 2.9868
ITER 4000 (3.32%) LOSS: 2.9624
ITER 5000 (4.15%) LOSS: 2.8811
ITER 6000 (4.98%) LOSS: 2.8651
ITER 7000 (5.81%) LOSS: 2.8356
ITER 8000 (6.64%) LOSS: 2.7948
ITER 9000 (7.47%) LOSS: 2.6926
ITER 10000 (8.30%) LOSS: 2.7211
ITER 11000 (9.13%) LOSS: 2.6522
ITER 12000 (9.96%) LOSS: 2.6069
ITER 13000 (10.79%) LOSS: 2.5510
ITER 14000 (11.62%) LOSS: 2.4768
ITER 15000 (12.45%) LOSS: 2.4415
ITER 16000 (13.28%) LOSS: 2.3451
ITER 17000 (14.11%) LOSS: 2.3587
ITER 18000 (14.94%) LOSS: 2.2567
ITER 19000 (15.77%) LOSS: 2.2251
ITER 20000 (16.60%) LOSS: 2.1500
ITER 21000 (17.43%) LOSS: 2.1660
ITER 22000 (18.26%) LOSS: 2.0576
ITER 23000 (19.09%) LOSS: 2.0352
ITER 24000 (19.92%) LOSS: 1.9705
ITER 25000 (20.75%) LOSS: 1.9680
ITER 26000 (21.58%) LOSS: 1.8707
ITER 27000 (22.41%) LOSS: 1.8895
ITER 28000 (23.24%) LOSS: 1.7809
ITER 29000 (24.07%) LOSS: 1.7808
ITER 30000 (24.90%) LOSS: 1.7507
ITER 31000 (25.73%) LOSS: 1.706

In [36]:
evaluate(encoder, decoder)

ACCURACY: 0.213


In [0]:
torch.save(encoder, '{}/My Drive/Colab Notebooks/hola/data/encoder.pt'.format(mounted_folder_fp))
torch.save(decoder, '{}/My Drive/Colab Notebooks/hola/data/decoder.pt'.format(mounted_folder_fp))