# Project 3 - Sequence models

See the Project 3 text for more information about what is done in this project. Also see PDF supplementing this assignment.

(Note that we use code similar to/copied from the weekly tasks and earlier projects.)

Import relevant modules:

In [1]:
import torch
from torch import nn
import torchtext

import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import pickle

from os import listdir
import re

torch.manual_seed(123)
# We use torch.double to get the same results as PyTorch
torch.set_default_dtype(torch.double)

# run the training on CPU
device = torch.device('cpu')

# local path to data
path = '../../project3/data/'


### 2.1.1 Reading and tokenizing the data sets

We define a function for reading the txt files line by line:

In [2]:
def read_files(datapath='./'):
    """
    Return a list of strings, one for each line of text in each .txt files in 'datapath'
    """
    # Find all txt files in directory 
    files = listdir(datapath)
    files = [datapath + f for f in files if f.endswith(".txt")]
    
    # Stores each line of each book in a list
    lines = []
    for f_name in files:
        lines += open(f_name, encoding='utf8').readlines()
        
    return lines

We can then read the files from our local path using the eight books from the training, validation, and test data folders in the pre-made datasets:

In [3]:
lines_train = read_files(path + 'data_train/')
lines_val = read_files(path + 'data_val/')
lines_test = read_files(path + 'data_test/')

We define a function used to tokenize the sequences using torchtext:

In [4]:
# tokenizer will split a long text into a list of english words
tokenizer = torchtext.data.get_tokenizer('basic_english')

def tokenize(lines):
    """
    Tokenize the list of lines
    """
    
    list_text = []
    for line in lines:
        list_text += tokenizer(line)
        
    return list_text

And tokenize the data in the three datasets:

In [5]:
words_train = tokenize(lines_train)
words_val = tokenize(lines_val)
words_test = tokenize(lines_test)

print("Total number of words in the training dataset:   ", len(words_train))
print("Total number of words in the validation dataset: ", len(words_val))
print("Total number of words in the test dataset:       ", len(words_test))

Total number of words in the training dataset:    1368807
Total number of words in the validation dataset:  49526
Total number of words in the test dataset:        131750


### 2.1.2 Define vocabulary

We define functions used to build the vocabulary and to count the occurence frequency in a dataset of the words in the vocabulary. Note that we ignore names and digits when building the vocabulary.

In [6]:
# Match any word containing digit
no_digits = '\w*[0-9]+\w*'
# Match word containing an uppercase 
no_names = '\w*[A-Z]+\w*'
# Match any sequence containing more than one space
no_spaces = '\s+'

def yield_tokens(lines):
    """
    Yield tokens, ignoring names and digits to build vocabulary
    """
    for line in lines:
        line = re.sub(no_digits + "|" + no_names, ' ', line)
        line = re.sub(no_spaces, ' ', line)
        yield tokenizer(line)
        
        
def count_freqs(data, vocab):
    """
    Count occurrences of each word in vocabulary in the data
    """
    freqs = torch.zeros(len(vocab), dtype=torch.int)
    for w in data:
        freqs[vocab[w]] += 1
    return freqs

We can now build the vocabulary of the words used a minimum of 100 times in the training set:

In [7]:
# vocab contains the vocabulary found in the data, associating an index to each word
vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(lines_train), min_freq = 100, specials = ["<unk>"])
# Since we removed all words with an uppercase when building the vocabulary, we skipped the word "I"
vocab.append_token("i")

# Value of default index. This index will be returned when OOV (Out Of Vocabulary) token is queried.
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)

print("Total number of words in the training dataset:    ", len(words_train))
print("Number of distinct words in the training dataset: ", len(set(words_train)))
print("Size of defined vocabulary:                       ", vocab_size)

Total number of words in the training dataset:     1368807
Number of distinct words in the training dataset:  30374
Size of defined vocabulary:                        1050


We can also take a look at the frequency of the words in the vocabulary in the training set:

In [8]:
freqs = count_freqs(words_train, vocab)
print("occurences:\n", [(f.item(), w) for (f, w)  in zip(freqs, vocab.lookup_tokens(range(vocab_size)))])

occurences:
 [(251055, '<unk>'), (89904, ','), (71106, 'the'), (63121, '.'), (43426, 'and'), (33952, 'to'), (30061, 'of'), (23575, 'a'), (18657, 'in'), (20755, 'he'), (16814, 'that'), (15056, 'was'), (14400, 'his'), (13815, 'it'), (10997, 'with'), (10735, 'had'), (9430, 'her'), (9334, 'not'), (10562, 'you'), (9198, 'as'), (9152, 'at'), (8447, 'him'), (8457, 'is'), (8269, 'for'), (7824, 'on'), (7122, '!'), (6510, '?'), (8087, 'she'), (6477, 's'), (6223, 'be'), (5871, 'said'), (8103, 'but'), (6221, 'all'), (5689, 'have'), (5137, 'from'), (4752, 'which'), (4648, 'me'), (5138, 'so'), (4722, 'by'), (4476, 'were'), (4807, 'my'), (4989, 'this'), (4952, 'they'), (4399, 'one'), (4023, 'who'), (4656, 'what'), (3253, 'up'), (4196, 'there'), (3211, 'them'), (4125, 'we'), (3206, 'would'), (3258, 'an'), (3196, 'are'), (3091, 'been'), (3009, 'or'), (2941, 'out'), (2985, 'will'), (3912, 'when'), (3587, 'no'), (2656, 'could'), (2631, 'man'), (3321, 'if'), (2606, 'did'), (2580, 'their'), (2504, 'into'),

We lookup the indices of the special symbols in the vocabulary for use when creating datasets later:

In [9]:
specials = vocab.lookup_indices(['<unk>',',','.','!','?','(',')','-'])

### 2.1.3 n-gram language model architecture

We define a n-gram model with an embedding layer. We choose to use two words before and two words after the target as context words, four in total. We also have a training function for this model:

In [10]:
class NGramModel(nn.Module):
    def __init__(self, 
                 vocab_size,         # size of vocabulary
                 embedding_dim = 16, # small embedding dimension to reduce computations
                 context_size = 4):  # n=4, two words before and two after
        
        super().__init__()
        
        # Model architecture with embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(context_size * embedding_dim, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        embeds = torch.flatten(embeds, 1)
        y = torch.relu(self.fc1(embeds))
        y = self.fc2(y)
        y = nn.functional.log_softmax(y, dim=1)
        return y
    
def train(model,        # n-gram model to be trained
          n_epochs,     # number of epochs to train for
          train_loader, # dataloader
          optimizer,    # optimizer
          loss_fn):     # loss function
    """
    Training an n-gram model architecture and return trained model
    """
    
    n_batches = len(train_loader)
    
    losses = []
    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        
        for contexts, targets in train_loader:
            
            model.zero_grad()
            
            output = model(contexts)
            
            loss = loss_fn(output, targets.flatten())
            
            loss.backward()
            
            optimizer.step()
            
            epoch_loss += loss.item()
            
        losses.append(epoch_loss)
        
        if epoch == 1 or epoch % 5 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
                datetime.now().time(), epoch, epoch_loss/n_batches))
        
    return model

#### Making context/target datasets

We need to make context/target datasets for use in the n-gram model, and define a function for making these datasets:

In [11]:
def create_dataset(text,               # all words in the dataset
                    vocab,             # vocabulary
                    context_size = 4): # effectively two words before and two words after target
    """
    Create context/target dataset with integer features
    """
    # two words before and two words after
    context_size = context_size//2
    
    # number of words in text
    n_text = len(text)
    
    # Transform the text to a list of integers
    txt = [vocab[w] for w in text]

    contexts = []
    targets = []
    for i in range(context_size, n_text - context_size):
        if txt[i] in specials: # see above - removing ['<unk>',',','.','!','?','(',')','-'] from datasets
            continue
        # Get context words before and after target        
        contexts.append(torch.LongTensor([txt[i-context_size+j] for j in range(context_size*2+1) if j!=context_size]))
        targets.append(txt[i])
        
    contexts = torch.stack(contexts)
    targets = torch.LongTensor(targets)
    targets = torch.unsqueeze(targets, dim = 1)
    data = torch.utils.data.TensorDataset(contexts, targets)
        
    return data

We create the context/target datasets (training, validation, and test):

In [12]:
data_train = create_dataset(words_train, vocab)
data_val = create_dataset(words_val, vocab)
data_test = create_dataset(words_test, vocab)

print("Total number of entries in the training dataset:   ", len(data_train))
print("Total number of entries in the validation dataset: ", len(data_val))
print("Total number of entries in the test dataset:       ", len(data_test))

Total number of entries in the training dataset:    949667
Total number of entries in the validation dataset:  34301
Total number of entries in the test dataset:        86268


##### Save and load datasets
And save them using pandas in case we need them later:

In [12]:
def savetorch(onetensor, filename = '', sep = ",", header = True):
    """
    save a torch tensor as a csv file
    """
    df = pd.DataFrame(onetensor.numpy())
    df.to_csv(path + filename, sep = sep, header = header, index = False)

In [13]:
savetorch(data_train[:][1], 'traintarget.csv')
savetorch(data_train[:][0], 'traincontext.csv')
savetorch(data_val[:][1], 'valtarget.csv')
savetorch(data_val[:][0], 'valcontext.csv')
savetorch(data_test[:][1], 'testtarget.csv')
savetorch(data_test[:][0], 'testcontext.csv')

To load the data from the csv files later:

In [13]:
def readtotorch(fns): # filenames of the context, target data (list of strings)
    """
    read csv files of contexts and target into torch tensors and get dataset
    """
    context = pd.read_csv(path + fns[0])
    target = pd.read_csv(path + fns[1])
    contexts = torch.LongTensor(context.values)
    targets = torch.LongTensor(target.values)
    
    dataset = torch.utils.data.TensorDataset(contexts, targets)
    
    return dataset

In [14]:
data_train = readtotorch(['traincontext.csv', 'traintarget.csv'])
data_val = readtotorch(['valcontext.csv', 'valtarget.csv'])
data_test = readtotorch(['testcontext.csv', 'testtarget.csv'])

### 2.1.4 Best n-gram model

We define a function for computing accuracy to use for model selection and evaluation of our n-gram language model. We also make a model selection function where the best hyperparameters (variations of embedding size and learning rate) are chosen.

In [15]:
def accuracy(pred, target):
    """
    acc = (number of correct predictions)/(total number of predictions)
    """
    hits = torch.sum(pred == target)
    
    return hits / len(target)

def model_selection(lr_list,            # learning rates to be tested (list)
                    embedding_size_list,# embedding size variations to be tested (list)
                    data_train,         # training data (context/target)
                    data_val,           # validation data (context/target)
                    vocab_size,         # length of the vocabulary
                    n_epochs = 20,      # number of epochs to train for (int)
                    batch_size = 1024): # batch size 
    """
    checks validation accuracy of models with different hyperparameters (learning rate - lr, embedding size)
    saves models state dictionaries
    
    returns:
        best_model : the best hyperparameter combination
    """
    # initialize variables to hold accuracy and best hyperparameters
    best_acc = 0    
    best_model = {}
    count = 0
    
    # get train loader
    torch.manual_seed(123)
    train_loader = torch.utils.data.DataLoader(dataset = data_train,
                                                batch_size = batch_size,
                                                shuffle = True)
    # different hyperparameters
    for lr in lr_list:
        for embedding_size in embedding_size_list:

            # make model and optimizer
            model = NGramModel(vocab_size, embedding_size).to(device)
            optimizer = torch.optim.SGD(model.parameters(), lr)
            
            print('\n =========================================================\n  Current parameters:')
            print(f'lr: {lr}')  
            print(f'embedding size: {embedding_size}')
            
            # training the model
            model = train(model, n_epochs, train_loader, optimizer, loss_fn)
            
            # calculate validation accuracy
            target = data_val[:][1]
            pred = model(data_val[:][0])
            pred = torch.argmax(pred, dim=1) # most likely word is prediction
            acc = accuracy(pred, target.flatten())
            print('accuracy:    {:.5f}'.format(acc))
            
            # save current trained model parameters for use later
            torch.save(model.state_dict(), path + 'model_weights_' + str(count) + '.pth')
            
            # keep track of the best hyperparameters and validation accuracies
            if acc > best_acc:
                best_model['lr'] = lr
                best_model['embedding size'] = embedding_size
                best_model['accuracy'] = acc
                best_model['number'] = count
                best_acc = acc
            # update number of trained models
            count += 1
    
    return best_model


#### Model selection

Then we do model selection with different values for the learning rate and batch sizes (hyperparameters). A dictionary containing the hyperparameters giving the best validation accuracy is returned.

In [15]:
lr_list = [0.001, 0.005]
embedding_size_list = [16, 32]
n_epochs = 30
loss_fn = nn.NLLLoss() # negative log likelihood

print(f"Training on device {device}.")
best_model = model_selection(lr_list, embedding_size_list, data_train, data_val, vocab_size, n_epochs)

Training on device cpu.

  Current parameters:
lr: 0.001
embedding size: 16
15:06:13.913206  |  Epoch 1  |  Training loss 6.90381
15:09:21.441758  |  Epoch 5  |  Training loss 6.35261
15:13:14.864000  |  Epoch 10  |  Training loss 5.86961
15:17:08.945030  |  Epoch 15  |  Training loss 5.64995
15:21:00.211583  |  Epoch 20  |  Training loss 5.52412
15:24:50.606553  |  Epoch 25  |  Training loss 5.44262
15:28:40.957439  |  Epoch 30  |  Training loss 5.38301
accuracy:    0.11128

  Current parameters:
lr: 0.001
embedding size: 32
15:29:30.172449  |  Epoch 1  |  Training loss 6.90039
15:32:44.290292  |  Epoch 5  |  Training loss 6.24776
15:36:50.214439  |  Epoch 10  |  Training loss 5.78118
15:40:53.084177  |  Epoch 15  |  Training loss 5.57084
15:44:56.812510  |  Epoch 20  |  Training loss 5.44731
15:48:59.109819  |  Epoch 25  |  Training loss 5.36115
15:53:01.781760  |  Epoch 30  |  Training loss 5.29431
accuracy:    0.12355

  Current parameters:
lr: 0.005
embedding size: 16
15:53:48.571

##### Save and load best model hyperparameters
We save the dictionary containing which model is best:

In [16]:
with open(path + 'bestmodel_dict.pkl', 'wb') as f:
    pickle.dump(best_model, f)
f.close()

And to load the dictionary back from file:

In [16]:
with open(path + 'bestmodel_dict.pkl', 'rb') as file_to_read:
    best_model = pickle.load(file_to_read)

##### Load trained model
We then load the best models state dictionary to the n-gram language model architecture to get back the trained (best) model:

In [17]:
# architecture
embeddingmodel = NGramModel(vocab_size, best_model['embedding size']).to(device)

# fill with trained parameters
embeddingmodel.load_state_dict(torch.load(path + 'model_weights_' + str(best_model['number']) + '.pth'))

<All keys matched successfully>

#### Model evaluation
We can then calculate training accuracy and evaluate (test accuracy) the model:

In [18]:
def get_accuracy(dataset):
    """
    take context/target datasets and calculate accuracy
    """
    target = dataset[:][1]
    pred = embeddingmodel(dataset[:][0])
    pred = torch.argmax(pred, dim = 1)
    
    return accuracy(pred, target.flatten())

acc_train = get_accuracy(data_train)

# evaluate model (test accuracy)
acc_test = get_accuracy(data_test)
            
print('Training accuracy:      {:.5f}'.format(acc_train))
print('Validation accuracy:    {:.5f}'.format(best_model['accuracy']))
print('Test accuracy:          {:.5f}'.format(acc_test))

Training accuracy:      0.15676
Validation accuracy:    0.16568
Test accuracy:          0.15568


### 2.1.5 Cosine similarity

We define functions to compute the cosine similarity matrix and to get n similar words to a word in the vocabulary.

In [19]:
def similarity_matrix(E):
    """
    calculate cosing similarity matrix from trained word embedding
    """
    cos = nn.CosineSimilarity(dim=0)
    
    sim_matrix = torch.zeros(vocab_size, vocab_size)
    
    for i in range(vocab_size):
        for j in range(vocab_size):
            e_i = E[:][i].view((-1, 1))
            e_j = E[:][j].view((-1, 1))
            sim_matrix[i][j] = cos(e_i, e_j)
            
    return sim_matrix

def n_most_similar_words(n, words, sim_matrix):
    """
    Get the n most similar words to a list of words using the cosine
    similarity matrix
    """
    word_dict = {}
    
    for w in words:
        
        # index of word
        index = vocab.__getitem__(w)
        
        # orders from lowest to highest similarity
        similar_word_index = np.argsort(sim_matrix[:][index].detach().numpy())
        
        # get highest to lowest
        similar_word_index = np.flip(similar_word_index)[1:n+1]
        
        similar_words = vocab.lookup_tokens(similar_word_index)
        
        word_dict[w] = similar_words
        
    return word_dict

#### Cosine similarity matrix
We get the cosine similarity matrix from the trained embedding:

In [20]:
# Embedding matrix (weights of embedding)
E = embeddingmodel.embedding.weight

sim_matrix = similarity_matrix(E)

sim_matrix

tensor([[ 1.0000,  0.0896, -0.1920,  ...,  0.1734, -0.0356, -0.0967],
        [ 0.0896,  1.0000,  0.1909,  ...,  0.1808, -0.1294,  0.0371],
        [-0.1920,  0.1909,  1.0000,  ...,  0.2394,  0.1400,  0.0113],
        ...,
        [ 0.1734,  0.1808,  0.2394,  ...,  1.0000,  0.2037,  0.1586],
        [-0.0356, -0.1294,  0.1400,  ...,  0.2037,  1.0000,  0.0964],
        [-0.0967,  0.0371,  0.0113,  ...,  0.1586,  0.0964,  1.0000]],
       grad_fn=<CopySlices>)

#### Similar words
And use it to find the ten most similar words to a selection of words in our vocabulary according to the similarity matrix.

In [21]:
test_words = ['woman', 'have', 'be', 'yes', 'no', 'house', 'red', 'blue', 'two']
n = 10

word_dict = n_most_similar_words(n, test_words, sim_matrix)

In [22]:
for w in test_words:
    print('\nSimilar words to: ' + w)
    for word in word_dict[w]:
        print(word)


Similar words to: woman
as
during
plan
listening
understand
bring
move
street
fixed
same

Similar words to: have
regard
asked
last
left
act
continued
since
sure
several
drawing

Similar words to: be
honor
pleasant
faces
tears
bit
m
means
other
won
found

Similar words to: yes
important
m
common
land
grown
nature
away
will
its
subject

Similar words to: no
we
they
further
myself
than
from
thinking
who
least
am

Similar words to: house
just
simple
murder
talked
ever
front
probably
hat
fresh
passion

Similar words to: red
affair
stand
seen
subject
how
repeated
warm
whose
grown
sake

Similar words to: blue
wish
caught
yourself
law
lies
should
large
especially
express
say

Similar words to: two
lying
finished
really
quick
galloped
matter
heard
able
spent
command


### 2.1.6 To visualize embedding space

We save the vocabulary and the corresponding values in embedding space as tsv-files for uploading to https://projector.tensorflow.org/:

In [77]:
listofwords = vocab.lookup_tokens([i for i in range(vocab_size)])

In [79]:
#savetorch(vocab, 'vocab.tsv', sep="\t", header=False) # words in vocabulary
df = pd.DataFrame(listofwords)
df.to_csv(path+'vocab.tsv', sep='\t', header=False, index=False)
savetorch(E, 'embedding.tsv', sep='\t', header=False) # corresponding values in embedding space

See the pdf report for the visualization.

## 2.2 Conjugating be and have

We are to predict *be* and *have* conjugation given the context around the missing target.Possible conjugations are *be, am, are, is, was, were, been, being, have, has, had, having*, corresponding to 12 classes (integers 0 to 11).

#### Making context/target datasets
We make the context/target datasets where the targets are integers between 0 and 11. We use a context size of four, meaning two words before and two words after the target word (same as when training the embedding in Task 2.1).

In [23]:
def create_conjugate_dataset(text,              # all words in the dataset
                             vocab,             # vocabulary
                             conjugate_list,    # list of conjugations of be and have
                             context_size = 4): # effectively two words before and two words after target

    """
    Create context/target dataset with integer features, based on given target words
    """
    n_text = len(text)
    context_size = context_size//2
    
    # Transform the text to a list of integers.
    txt = [vocab[w] for w in text]

    contexts = []
    targets = []
    for i in range(context_size, n_text - context_size):
        if text[i] in(conjugate_list):
            if txt[i] in specials: # see above - removing ['<unk>',',','.','!','?','(',')','-'] from datasets
                continue
            # get context words before and after
            contexts.append(torch.LongTensor([txt[i-context_size+j] for j in range(context_size*2+1) if j!=context_size]))
            # map targets to integers 0-11
            targets.append(conjugate_list.index(text[i]))
        
    contexts = torch.stack(contexts)
    targets = torch.LongTensor(targets)
    targets = torch.unsqueeze(targets, dim=1)
    data = torch.utils.data.TensorDataset(contexts, targets)
        
    return data

We use the above function to create datasets containing the conjugations of *be* and *have*:

In [31]:
# conjugations of be and have
conjugate_list = ['be', 'am', 'are', 'is', 'was', 'were', 'been', 'being', 'have', 'has', 'had', 'having']

# new context/target datasets
data_train_2 = create_conjugate_dataset(words_train, vocab, conjugate_list)
data_val_2 = create_conjugate_dataset(words_val, vocab, conjugate_list)
data_test_2 = create_conjugate_dataset(words_test, vocab, conjugate_list)

print("Total number of entries in the training dataset:   ", len(data_train_2))
print("Total number of entries in the validation dataset: ", len(data_val_2))
print("Total number of entries in the test dataset:       ", len(data_test_2))

Total number of entries in the training dataset:    61961
Total number of entries in the validation dataset:  2590
Total number of entries in the test dataset:        4502


##### Save and load datasets
And save them using pandas in case we need them later:

In [32]:
savetorch(data_train_2[:][1], 'traintarget_conj.csv')
savetorch(data_train_2[:][0], 'traincontext_conj.csv')
savetorch(data_val_2[:][1], 'valtarget_conj.csv')
savetorch(data_val_2[:][0], 'valcontext_conj.csv')
savetorch(data_test_2[:][1], 'testtarget_conj.csv')
savetorch(data_test_2[:][0], 'testcontext_conj.csv')

To load the data from the csv files later:

In [24]:
data_train_2 = readtotorch(['traincontext_conj.csv', 'traintarget_conj.csv'])
data_val_2 = readtotorch(['valcontext_conj.csv', 'valtarget_conj.csv'])
data_test_2 = readtotorch(['testcontext_conj.csv', 'testtarget_conj.csv'])

### 2.2.1 MLP and RNN to predict conjugation

We are to look at two different architectures for predicting the conjugations:
- Multilayer perception (MLP)
- Recurrent neural network (RNN)

#### MLP

We define an MLP architecture based on the embedding:

In [25]:
class MLP(nn.Module):
    def __init__(self,
                 embedding,         # word embedding (frozen, not to be trained)
                 context_size = 4): # (two words before and two words after target)
        super().__init__()
        
        embedding_size = embedding.weight.shape[1]
        
        # frozen embedding layer
        self.embedding = embedding
        self.embedding.weight.requires_grad = False
        
        # three fuly connected layers
        self.fc1 = nn.Linear(embedding_size*context_size, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 12)
        
    def forward(self, x):
        embeds = self.embedding(x)
        embeds = torch.flatten(embeds, 1)
        out = torch.relu(self.fc1(embeds))
        out = torch.relu(self.fc2(out))
        out = self.fc3(out)
        
        return out

#### RNN

We define a RNN architecture based on the embedding:

In [26]:
class RNN(nn.Module):
    def __init__(self, 
                 embedding,        # word embedding (frozen, not to be trained)
                 hidden_size = 12, # size of hidden layer 
                 n_layers = 2):    # number of layers
        super().__init__()
        
        embedding_size = embedding.weight.shape[1]
        
        # frozen embedding layer
        self.embedding = embedding
        self.embedding.weight.requires_grad = False
        
        # RNN layer and one fully connected layer for output
        self.rnn = nn.RNN(embedding_size, hidden_size, n_layers, batch_first = True)
        self.fc1 = nn.Linear(hidden_size, 12)

    def forward(self, x):
        embeds = self.embedding(x)
        _, hidden = self.rnn(embeds)
        out = nn.ReLU()(hidden[-1])
        out = self.fc1(out)
        
        return out

We then define the function for training our models:

In [27]:
def train_conjugate(model,        # model architecture to train
                    n_epochs,     # number of epochs to train for
                    train_loader, # dataloader
                    optimizer,    # optimizer
                    loss_fn):     # loss function
                
    """
    Function for training the model
    """
    # get the number of batches
    n_batches = len(train_loader)
    
    losses = []
    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        for contexts, targets in train_loader:
            model.zero_grad()
            output = model(contexts)
            
            loss = loss_fn(output, targets.flatten())
            
            loss.backward()
            
            optimizer.step()
            
            epoch_loss += loss.item()
            
        losses.append(epoch_loss)
        if epoch == 1 or epoch % 5 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
                datetime.now().time(), epoch, epoch_loss/n_batches))
        
    return model

### 2.2.2 Best conjugation predictor

We want to do model selection to find the best model for predicting conjugatins of *be* and *have*. We define a function for model selection based on validation accuracy:

In [28]:
def conjugate_model_selection(lr_list,            # learning rates to be tested (list)
                              hidden_size_list,   # hidden size variations to be tested for RNN (list)
                              data_train,         # training data (context/target)
                              data_val,           # validation data (context/target)
                              embedding,          # word embedding (frozen, not to be trained)
                              n_epochs = 30,      # number of epochs to train for (int)
                              batch_size = 1024): # batch size 
    """
    checks validation accuracy of models with different architectures and hyperparameters
    saves models state dictionaries
    
    returns:
        best_model : the best hyperparameter combination
    """
    # initialize variables to hold accuracy and best hyperparameters
    best_acc = 0
    best_model = {}
    count = 0
    
    # get train loader
    torch.manual_seed(123)
    train_loader = torch.utils.data.DataLoader(dataset = data_train,
                                                batch_size = batch_size,
                                                shuffle = True)
    # different hyperparameters
    for lr in lr_list:
        
        # ---------------------------- variations on RNN architecture ----------------------------------
        for hidden_size in hidden_size_list:
           
            model = RNN(embedding, hidden_size)
            optimizer = torch.optim.SGD(model.parameters(), lr)

            print('\n =========================================================\n  Current parameters:')
            print('---- RNN model ----')
            print(f'lr:             {lr}')  
            print(f'hidden size:     {hidden_size}')
            
            # train model
            train_start = datetime.now()
            model = train_conjugate(model, n_epochs, train_loader, optimizer, loss_fn)
            
            # get validation accuracy
            target = data_val[:][1]
            pred = model(data_val[:][0])
            pred = torch.argmax(pred, dim=1)
            acc = accuracy(pred, target.flatten())
            print('accuracy:        {:.5f}'.format(acc))
            
            # get time it takes to train RNN
            train_stop = datetime.now()
            train_delta = train_stop - train_start
            sec = train_delta.seconds
            print(f'training time:   {sec} seconds')
            
            # save current trained model parameters for use later
            torch.save(model.state_dict(), path + 'model_weights_conjugate_' + str(count) + '.pth')

            if acc > best_acc:
                best_model['lr'] = lr
                best_model['hidden size'] = hidden_size
                best_model['accuracy'] = acc
                best_model['model'] = 'RNN'
                best_model['number'] = count
                best_model['seconds'] = sec
                best_acc = acc
            
            # update number of trained models
            count += 1
        
        # ------------------------------- MLP architecture -------------------------------------------
        # get model and optimizer
        model = MLP(embedding)
        optimizer = torch.optim.SGD(model.parameters(), lr)

        print('\n =========================================================\n  Current parameters:')
        print('---- MLP model ----')
        print(f'lr:             {lr}')  

        # train model
        train_start = datetime.now()
        model = train_conjugate(model, n_epochs, train_loader, optimizer, loss_fn)

        # get validation accuracy
        target = data_val[:][1]
        pred = model(data_val[:][0])
        pred = torch.argmax(pred, dim = 1)
        acc = accuracy(pred, target.flatten())
        print('accuracy:        {:.5f}'.format(acc))
        
        # get time it takes to train MLP
        train_stop = datetime.now()
        train_delta = train_stop - train_start
        sec = train_delta.seconds
        print(f'training time:   {sec} seconds')
            
        # save current trained model parameters for use later
        torch.save(model.state_dict(), path + 'model_weights_conjugate_' + str(count) + '.pth')

        if acc > best_acc:
            best_model['lr'] = lr
            best_model['hidden size'] = None
            best_model['accuracy'] = acc
            best_model['model'] = 'MLP'
            best_model['number'] = count
            best_model['seconds'] = sec
            best_acc = acc

        # update number of trained models
        count += 1
                   
    return best_model

#### Model selection

Then we do model selection with different hyperparameter values. We train two MLP architectures with different learning rates, and four RNN architectures where we vary both learning rate and number of hidden units. A dictionary containing the hyperparameters giving the best validation accuracy is returned.

In [29]:
wordembedding = embeddingmodel.embedding

# hyperparameters
lr_list = [0.001, 0.005]
hidden_size_list = [12, 24] # for RNN

n_epochs = 100
loss_fn = nn.CrossEntropyLoss() # Cross-entropy loss

print(f"Training on device {device}.")
best_model_2 = conjugate_model_selection(lr_list, hidden_size_list, data_train_2, data_val_2, wordembedding, n_epochs)

Training on device cpu.

  Current parameters:
---- RNN model ----
lr:             0.001
hidden size:     12
17:50:45.077769  |  Epoch 1  |  Training loss 2.51716
17:50:47.047290  |  Epoch 5  |  Training loss 2.47296
17:50:49.513164  |  Epoch 10  |  Training loss 2.43025
17:50:51.945862  |  Epoch 15  |  Training loss 2.39532
17:50:54.421655  |  Epoch 20  |  Training loss 2.36578
17:50:56.856322  |  Epoch 25  |  Training loss 2.34024
17:50:59.452014  |  Epoch 30  |  Training loss 2.31841
17:51:02.012333  |  Epoch 35  |  Training loss 2.29896
17:51:04.468135  |  Epoch 40  |  Training loss 2.28256
17:51:06.965399  |  Epoch 45  |  Training loss 2.26818
17:51:09.496510  |  Epoch 50  |  Training loss 2.25628
17:51:12.015567  |  Epoch 55  |  Training loss 2.24555
17:51:14.456674  |  Epoch 60  |  Training loss 2.23683
17:51:16.978050  |  Epoch 65  |  Training loss 2.22916
17:51:19.418716  |  Epoch 70  |  Training loss 2.22277
17:51:21.899170  |  Epoch 75  |  Training loss 2.21728
17:51:24.3813

##### Save and load best model hyperparameters
We save the dictionary containing which model is best:

In [30]:
with open(path + 'bestmodel_dict_conjugate.pkl', 'wb') as f:
    pickle.dump(best_model_2, f)
f.close()

And to load the dictionary back from file:

In [31]:
with open(path + 'bestmodel_dict_conjugate.pkl', 'rb') as file_to_read:
    best_model_2 = pickle.load(file_to_read)

##### Load trained model
We then load the best models state dictionary to correct model architecture to get back the trained (best) model:

In [32]:
wordembedding = embeddingmodel.embedding

# architecture
if best_model_2['model'] == 'RNN':
    conjugatemodel = model = RNN(wordembedding, best_model_2['hidden size'])
elif best_model_2['model'] == 'MLP':
    conjugatemodel = MLP(wordembedding)

# fill with trained parameters
conjugatemodel.load_state_dict(torch.load(path + 'model_weights_conjugate_' + str(best_model_2['number']) + '.pth'))

<All keys matched successfully>

#### Model evaluation
We can then calculate training accuracy and evaluate (test accuracy) the model:

In [51]:
def get_accuracy_2(dataset):
    """
    take context/target datasets and calculate accuracy
    """
    target = dataset[:][1]
    pred = conjugatemodel(dataset[:][0])
    pred = torch.argmax(pred, dim = 1)
    
    return accuracy(pred, target.flatten())

acc_train = get_accuracy_2(data_train_2)

# evaluate model (test accuracy)
acc_test = get_accuracy_2(data_test_2)
            
print('Training accuracy:      {:.5f}'.format(acc_train))
print('Validation accuracy:    {:.5f}'.format(best_model_2['accuracy']))
print('Test accuracy:          {:.5f}'.format(acc_test))

Training accuracy:      0.44031
Validation accuracy:    0.45019
Test accuracy:          0.24656


## 2.3 Text generation

We use our trained word embedding to define an RNN architecture to predict the next word in a sequence given a context before the target.

#### Making context/target datasets

For this task we need new context/target datasets containing context words before the target word (we choose three context words). We define a function for this:

In [34]:
def create_dataset_3(text,              # all words in dataset
                     vocab,             # vocabulary
                     context_size = 3): # number of context words before the targets
    """
    Create context/target dataset with integer features
    """
    
    # get dataset
    n_text = len(text)
    
    # transform the text as a list of integers
    txt = [vocab[w] for w in text]
    
    contexts = []
    targets = []
    for i in range(context_size, n_text - context_size):
        if txt[i] in specials: # see above - removing ['<unk>',',','.','!','?','(',')','-'] from datasets
            continue
        contexts.append(torch.LongTensor([txt[i - context_size + j] for j in range(context_size)]))
        targets.append(txt[i])
        
    contexts = torch.stack(contexts)
    targets = torch.LongTensor(targets)
    targets = torch.unsqueeze(targets, dim=1)
    data = torch.utils.data.TensorDataset(contexts, targets) 
    
    return data


We create the context/target datasets (training, validation, and test):

In [61]:
data_train_3 = create_dataset_3(words_train, vocab)
data_val_3 = create_dataset_3(words_val, vocab)
data_test_3 = create_dataset_3(words_test, vocab)

print("Total number of entries in the training dataset:   ", len(data_train_3))
print("Total number of entries in the validation dataset: ", len(data_val_3))
print("Total number of entries in the test dataset:       ", len(data_test_3))

Total number of entries in the training dataset:    949665
Total number of entries in the validation dataset:  34299
Total number of entries in the test dataset:        86266


##### Save and load datasets
And save them using pandas in case we need them later:

In [62]:
savetorch(data_train_3[:][1], 'traintarget_3.csv')
savetorch(data_train_3[:][0], 'traincontext_3.csv')
savetorch(data_val_3[:][1], 'valtarget_3.csv')
savetorch(data_val_3[:][0], 'valcontext_3.csv')
savetorch(data_test_3[:][1], 'testtarget_3.csv')
savetorch(data_test_3[:][0], 'testcontext_3.csv')

To load the data from the csv files later:

In [35]:
data_train_3 = readtotorch(['traincontext_3.csv', 'traintarget_3.csv'])
data_val_3 = readtotorch(['valcontext_3.csv', 'valtarget_3.csv'])
data_test_3 = readtotorch(['testcontext_3.csv', 'testtarget_3.csv'])

### 2.3.1 RNN for word prediction

We can now define an RNN architecture based on the word embedding that is to predict the following words given a context of previous words. The model can be trained using the train_conjugate() function.

In [36]:
class myRNN(nn.Module):
    def __init__(self, 
                 embedding,        # word embedding (frozen, not to be trained)
                 vocab_size,       # size of vocabulary (output)
                 hidden_size = 12, # size of hidden layer 
                 n_layers = 2):    # number of layers
        super().__init__()
        
        embedding_dim = embedding.weight.shape[1]
        
        # frozen embedding layer
        self.embedding = embedding
        self.embedding.weight.requires_grad = False
        
        # RNN layer and one fully connected layer for output
        self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first = True)
        self.fc1 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        _, hidden = self.rnn(embeds)
        out = nn.ReLU()(hidden[-1])
        out = self.fc1(out)
        
        return out

### 2.3.2 Best next word predictor

We want to do model selection to find the best model for predicting the next word based on previous words. We define a function for model selection based on validation accuracy:

In [41]:
def next_model_selection(lr_list,            # learning rates to be tested (list)
                         hidden_size_list,   # hidden size variations to be tested for RNN (list)
                         data_train,         # training data (context/target)
                         data_val,           # validation data (context/target)
                         embedding,          # word embedding (frozen, not to be trained)
                         vocab_size,         # length of the vocabulary
                         n_epochs = 30,      # number of epochs to train for (int)
                         batch_size = 1024): # batch size 
    """
    checks validation accuracy of models with different architectures and hyperparameters
    saves models state dictionaries
    
    returns:
        best_model : the best hyperparameter combination
    """
    # initialize variables to hold accuracy and best hyperparameters
    best_acc = 0
    best_model = {}
    count = 0
    
    # get train loader
    torch.manual_seed(123)
    train_loader = torch.utils.data.DataLoader(dataset = data_train,
                                                batch_size = batch_size,
                                                shuffle = True)
    # different hyperparameters
    for lr in lr_list:
        for hidden_size in hidden_size_list:
           
            model = myRNN(embedding, vocab_size, hidden_size)
            optimizer = torch.optim.SGD(model.parameters(), lr)

            print('\n =========================================================\n  Current parameters:')
            print(f'lr:             {lr}')  
            print(f'hidden size:     {hidden_size}')
            
            # train model
            train_start = datetime.now()
            model = train_conjugate(model, n_epochs, train_loader, optimizer, loss_fn)
            
            # get validation accuracy
            target = data_val[:][1]
            pred = model(data_val[:][0])
            pred = torch.argmax(pred, dim=1)
            acc = accuracy(pred, target.flatten())
            print('accuracy:        {:.5f}'.format(acc))
            
            # get time it takes to train RNN
            train_stop = datetime.now()
            train_delta = train_stop - train_start
            sec = train_delta.seconds
            print(f'training time:   {sec} seconds')
            
            # save current trained model parameters for use later
            torch.save(model.state_dict(), path + 'model_weights_next_' + str(count) + '.pth')

            if acc > best_acc:
                best_model['lr'] = lr
                best_model['hidden size'] = hidden_size
                best_model['accuracy'] = acc
                best_model['number'] = count
                best_model['seconds'] = sec
                best_acc = acc
            
            # update number of trained models
            count += 1
                   
    return best_model

#### Model selection

Then we do model selection with different hyperparameter values. We train two MLP architectures with different learning rates, and four RNN architectures where we vary both learning rate and number of hidden units. A dictionary containing the hyperparameters giving the best validation accuracy is returned.

In [42]:
wordembedding = embeddingmodel.embedding

# hyperparameters
lr_list = [0.001, 0.005]
hidden_size_list = [12, 24]

n_epochs = 50
loss_fn = nn.CrossEntropyLoss() # Cross-entropy loss

print(f"Training on device {device}.")
best_model_3 = next_model_selection(lr_list, hidden_size_list, data_train_3, data_val_3, wordembedding,
                                                                                        vocab_size, n_epochs)


Training on device cpu.

  Current parameters:
lr:             0.001
hidden size:     12
18:34:57.136644  |  Epoch 1  |  Training loss 6.98308
18:35:57.353689  |  Epoch 5  |  Training loss 6.85885
18:37:13.725189  |  Epoch 10  |  Training loss 6.66778
18:38:30.294013  |  Epoch 15  |  Training loss 6.40180
18:39:46.353042  |  Epoch 20  |  Training loss 6.17225
18:41:03.976043  |  Epoch 25  |  Training loss 6.01938
18:42:20.180093  |  Epoch 30  |  Training loss 5.91513
18:43:38.349696  |  Epoch 35  |  Training loss 5.83905
18:44:56.250277  |  Epoch 40  |  Training loss 5.77988
18:46:14.018901  |  Epoch 45  |  Training loss 5.73257
18:47:32.828554  |  Epoch 50  |  Training loss 5.69412
accuracy:        0.07359
training time:   771 seconds

  Current parameters:
lr:             0.001
hidden size:     24
18:47:53.965920  |  Epoch 1  |  Training loss 6.95448
18:49:17.662856  |  Epoch 5  |  Training loss 6.77736
18:51:01.586662  |  Epoch 10  |  Training loss 6.28719
18:52:44.528091  |  Epoch 

##### Save and load best model hyperparameters
We save the dictionary containing which model is best:

In [44]:
with open(path + 'bestmodel_dict_next.pkl', 'wb') as f:
    pickle.dump(best_model_3, f)
f.close()

And to load the dictionary back from file:

In [45]:
with open(path + 'bestmodel_dict_next.pkl', 'rb') as file_to_read:
    best_model_3 = pickle.load(file_to_read)

In [47]:
best_model_3

{'lr': 0.005,
 'hidden size': 24,
 'accuracy': tensor(0.1097),
 'number': 3,
 'seconds': 1039}

##### Load trained model
We then load the best models state dictionary to the myRNN architecture to get back the trained (best) model:

In [49]:
wordembedding = embeddingmodel.embedding

# architecture
nextmodel = myRNN(wordembedding, vocab_size, best_model_3['hidden size'])

# fill with trained parameters
nextmodel.load_state_dict(torch.load(path + 'model_weights_next_' + str(best_model_3['number']) + '.pth'))

<All keys matched successfully>

#### Model evaluation
We can then calculate training accuracy and evaluate (test accuracy) the model:

In [50]:
def get_accuracy_3(dataset):
    """
    take context/target datasets and calculate accuracy
    """
    target = dataset[:][1]
    pred = nextmodel(dataset[:][0])
    pred = torch.argmax(pred, dim = 1)
    
    return accuracy(pred, target.flatten())

acc_train = get_accuracy_3(data_train_3)

# evaluate model (test accuracy)
acc_test = get_accuracy_3(data_test_3)
            
print('Training accuracy:      {:.5f}'.format(acc_train))
print('Validation accuracy:    {:.5f}'.format(best_model_3['accuracy']))
print('Test accuracy:          {:.5f}'.format(acc_test))

Training accuracy:      0.10716
Validation accuracy:    0.10974
Test accuracy:          0.12495


### 2.3.3 Beam search

The beam search algorithm is implemented:

In [106]:
def beam_search(word_list, # sentence in list format
                model,     # the model used to predict next word
                n=5,       # number of additional words to predict
                B=3):      # branch factor
    
    inputs = torch.tensor(vocab.lookup_indices(text_input))
    sequences = [[inputs.tolist(), 1.0]]
    
    for _ in range(n):
        all_candidates = []
        for i in range(len(sequences)):
            
            seq, score = sequences[i]
            pred = model(torch.tensor(seq))
            
            for j in range(vocab_size):
                candidate = [seq + [j], score*pred[j].tolist()]
                all_candidates.append(candidate)
                
        ordered = sorted(all_candidates, key=lambda tup:tup[1], reverse=True)
        sequences = ordered[:B]
            
    return vocab.lookup_tokens(sequences[0][0])

### 2.3.4 Playing with the model

Now we will test our word predictor. We use sequences of arbitrary length and give it to the model. We let the model complete the next n words using the beam search algorithm.

In [140]:
text_input = ['he', 'was', 'a']
words = beam_search(text_input, nextmodel, n=5, B=3)
' '.join(words)

'he was a was of the was of'

In [142]:
text_input = ['there', 'are', 'two', 'minutes', 'to', 'the']
words = beam_search(text_input, nextmodel, n=5, B=3)
' '.join(words)

'there are two minutes to the he of the to of'

In [144]:
text_input = ['i', 'wish', 'i', 'had']
words = beam_search(text_input, nextmodel, n=5, B=3)
' '.join(words)

'i wish i had of the was of the'

In [146]:
text_input = ['you', 'never', 'were', 'my']
words = beam_search(text_input, nextmodel, n=5, B=4)
' '.join(words)

'you never were my to the the and of'

In [148]:
text_input = ['the', 'gentleman', 'read', 'my']
words = beam_search(text_input, nextmodel, n=2, B=3)
' '.join(words)

'the gentleman read my of the'

In [150]:
text_input = ['you', 'and', 'i', 'could', 'never', 'do']
words = beam_search(text_input, nextmodel, n=4, B=2)
' '.join(words)

'you and i could never do of the to of'

See pdf report for comments on these results.