In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import pprint


pp = pprint.PrettyPrinter()

In [None]:
# Toy training corpus: one raw, untokenized sentence per entry.
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]

In [None]:
# Preprocessing applied to every raw sentence before it is used.
def preprocess_sentence(sentence):
    """Lowercase ``sentence`` and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    return lowered.split()

# Tokenize every sentence of the corpus; the bare name displays the result.
train_sentences = list(map(preprocess_sentence, corpus))
train_sentences

In [None]:
# Lowercased tokens that count as a location in this toy data set.
locations = {'paris', 'australia', 'stanford', 'taiwan', 'turkey', 'ankara'}

# One 0/1 label per token, aligned with train_sentences (1 = the token is a location).
train_labels = [[int(word in locations) for word in sent] for sent in train_sentences]
train_labels


In [None]:
# Every distinct token that occurs in the training sentences.
vocabulary = {token for sentence in train_sentences for token in sentence}
vocabulary

In [None]:
# Special tokens: "<unk>" is the fallback for words missing from the vocabulary,
# "<pad>" is the token used to pad sentences.
vocabulary.update(("<unk>", "<pad>"))

In [None]:
def pad_window(sentence, window_size, pad_token="<pad>"):
    """Return ``sentence`` with ``window_size`` copies of ``pad_token`` on each end.

    ``sentence`` must be a list, because the result is built by list concatenation.
    """
    return [pad_token] * window_size + sentence + [pad_token] * window_size

# Quick demonstration: pad the first training sentence with two tokens per side.
window_size = 2
pad_window(train_sentences[0], window_size)

In [None]:
# Turn the vocabulary into a sorted list so that every token gets a stable integer index.
# With this corpus, sorting also puts "<pad>" at index 0 ('<' sorts before lowercase
# letters), which is convenient because some PyTorch functions use 0 as their default
# padding value, such as nn.utils.rnn.pad_sequence, which we will cover in a bit.
ix_into_words = sorted(vocabulary)
word_into_ix = dict(zip(ix_into_words, range(len(ix_into_words))))
word_into_ix

In [None]:
# We are ready to convert our training sentences into a sequence of indices corresponding to each token.

In [None]:
# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):
    """Map each token of ``sentence`` to its integer index in ``word_to_ix``.

    Args:
        sentence: iterable of string tokens.
        word_to_ix: dict from token to index; it must contain the "<unk>" key.

    Returns:
        A list with one index per token. Tokens missing from ``word_to_ix``
        are mapped to ``word_to_ix["<unk>"]``.

    Raises:
        KeyError: if ``sentence`` is non-empty and ``word_to_ix`` has no "<unk>" entry.
    """
    # Use the mapping that was passed in, not a notebook-level global, so the
    # function works with any vocabulary.
    return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]

# "kuwait" is not part of the corpus, so it should map to the "<unk>" index.
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_into_ix)
restored_example = [ix_into_words[ix] for ix in example_indices]

print(
    example_indices,
    f"Original sentence is: {example_sentence}",
    f"Going from words to indices: {example_indices}",
    f"Going from indices to words: {restored_example}",
    sep="\n",
)

In [None]:
# Index form of every training sentence. Despite the name, no window padding is applied here.
example_padded_indices = [
    convert_token_to_indices(sentence, word_into_ix)
    for sentence in train_sentences
]
example_padded_indices

In [None]:
# Embedding table with one row of size embedding_dim per vocabulary entry.
embedding_dim = 5
embeds = nn.Embedding(num_embeddings=len(vocabulary), embedding_dim=embedding_dim)
list(embeds.parameters())

In [None]:
# Look up the embedding of the word "paris": the layer is called with the
# word's vocabulary index wrapped in an integer tensor.
index = word_into_ix.get('paris')
index_tensor = torch.tensor(index, dtype=torch.int64)
paris_embed = embeds(index_tensor)
paris_embed


In [None]:
from torch.utils.data import DataLoader
from functools import partial
def custom_collate_fn(batch, window_size, word_into_ix):
    """Collate (tokens, labels) pairs into padded index tensors.

    Args:
        batch: sequence of (tokens, labels) pairs; tokens is a list of strings
            and labels is a list of ints for the same sentence.
        window_size: number of "<pad>" tokens added to each end of every sentence.
        word_into_ix: dict from token to index; must contain "<pad>" and "<unk>".

    Returns:
        The tuple (x_padded, y_padded, lengths), in that order:
          x_padded -- LongTensor of token indices, one row per sentence, shorter
                      rows filled up with the "<pad>" index;
          y_padded -- LongTensor of labels, shorter rows filled up with 0;
          lengths  -- LongTensor with the number of labels of each example.
    """
    # Separate the examples from their labels.
    sentences, labels = zip(*batch)

    # Window padding: window_size "<pad>" tokens before and after each sentence.
    edge = ["<pad>"] * window_size
    windowed = [edge + tokens + edge for tokens in sentences]

    # Words -> indices, falling back to the "<unk>" index for unseen words.
    indexed = [
        [word_into_ix.get(token, word_into_ix['<unk>']) for token in tokens]
        for tokens in windowed
    ]

    # Batch padding: pad_sequence expects tensors and brings every row to the
    # length of the longest one so the batch forms a single matrix.
    pad_token_ix = word_into_ix["<pad>"]
    x_tensors = [torch.LongTensor(row) for row in indexed]
    x_padded = nn.utils.rnn.pad_sequence(x_tensors, batch_first=True, padding_value=pad_token_ix)

    # Record how many labels each example has before the labels are padded.
    lengths = torch.LongTensor([len(label) for label in labels])

    y_tensors = [torch.LongTensor(label) for label in labels]
    y_padded = nn.utils.rnn.pad_sequence(y_tensors, batch_first=True, padding_value=0)

    # The order of the returned values matters: consumers unpack them positionally.
    return x_padded, y_padded, lengths


In [None]:
# Settings handed to the DataLoader
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
# Bind the extra arguments so the DataLoader can call collate_fn(batch) with one argument.
collate_fn = partial(custom_collate_fn, window_size=window_size, word_into_ix=word_into_ix)

# Build the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Iterate over the data once and show every batch
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(
        f'Iteration {counter}',
        'Batched Input:',
        batched_x,
        'Batched Labels:',
        batched_y,
        'Batched Lengths:',
        batched_lengths,
        '',
        sep='\n',
    )
    counter += 1

Now our model needs to create the windows for each word and make a prediction as to whether the center
word is a LOCATION or not.

Given that our window_size is N, we want our model to make one prediction for every window of 2N+1 tokens. That is, with N = 2 and a padded input of 9 tokens, it should return 5 predictions.

In [None]:
# Show the last batch produced by the loader
print('Original tensor: ')
print(batched_x)
print('')

# Create the windows: slide a window of 2 * window_size + 1 tokens along
# dimension 1, moving one token at a time
chunk = batched_x.unfold(1, 2 * window_size + 1, 1)
print('Windows: ')
print(chunk)

Now we get to our actual model. We have prepared our data, so we are ready to build it.
We are going to put all of the pieces together here.

In [None]:
class WordWindowClassifier(nn.Module):
    """Window-based binary classifier over the tokens of a sentence.

    For every full window of 2 * window_size + 1 consecutive tokens, the model
    embeds the tokens, concatenates the embeddings, and feeds them through a
    tanh hidden layer followed by a single sigmoid output unit.

    Args:
        hyperparameters: dict providing the keys "window_size", "embed_dim",
            "hidden_dim" and "freeze_embeddings".
        vocab_size: number of rows of the embedding table.
        pad_ix: index of the padding token, passed to nn.Embedding as
            ``padding_idx``.
    """

    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super().__init__()

        # Instance variables
        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]

        # Embedding layer: maps a tensor of token indices to their embeddings,
        # adding a trailing dimension of size embed_dim.
        # If freeze_embeddings is True the embedding weights are made
        # non-trainable, which is useful when only the other parameters
        # should change during training.
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
        if self.freeze_embeddings:
            self.embeds.weight.requires_grad = False

        # Hidden layer: consumes the concatenated embeddings of one full window.
        # The window width comes from the hyperparameters, not from a global.
        full_window_size = 2 * self.window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()
        )

        # Output layer: a single score per window.
        self.output_layer = nn.Linear(self.hidden_dim, 1)

        # Sigmoid squashes each score into a probability-like value.
        self.probabilities = nn.Sigmoid()

    def forward(self, inputs):
        """Score every full window of ``inputs``.

        Shape legend:
            B  := batch size
            L  := window-padded sentence length
            L~ := L - 2 * self.window_size (number of full windows)
            S  := 2 * self.window_size + 1 (tokens per window)
            D  := self.embed_dim
            H  := self.hidden_dim

        Args:
            inputs: (B, L) LongTensor of token indices.

        Returns:
            (B, L~) FloatTensor holding one sigmoid output per window.
        """
        # Unpacking into two names also checks that the input is 2-dimensional.
        B, _ = inputs.size()

        # Windows: (B, L) -> (B, L~, S), one window per step along dimension 1.
        token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
        _, adjusted_length, _ = token_windows.size()

        # Sanity check on the window tensor's shape.
        assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

        # Embedding: (B, L~, S) LongTensor -> (B, L~, S, D) FloatTensor.
        embedded_windows = self.embeds(token_windows)

        # Concatenate the embeddings of each window: (B, L~, S, D) -> (B, L~, S*D).
        # The -1 lets view() infer the last dimension from the remaining elements.
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        # Layer 1: (B, L~, S*D) -> (B, L~, H).
        layer_1 = self.hidden_layer(embedded_windows)

        # Layer 2: (B, L~, H) -> (B, L~, 1).
        output = self.output_layer(layer_1)

        # Sigmoid: (B, L~, 1) raw scores -> (B, L~, 1) values between 0 and 1,
        # then drop the trailing singleton dimension: (B, L~).
        output = self.probabilities(output)
        output = output.view(B, -1)

        return output


#### Training
Now we are ready to put everything together at last.
We start by preparing our data and initializing our model.
Then we can initialize our optimizer and define our loss function.
This time, instead of using a predefined loss function, we will define our own.

In [None]:
# Data preparation
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
# Bind the extra arguments so the DataLoader can call collate_fn(batch) with one argument.
collate_fn = partial(custom_collate_fn, window_size=window_size, word_into_ix=word_into_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Now initialize our model
# It's useful to put all the model hyperparameters into a dictionary.
# batch_size and window_size reuse the values given to the DataLoader and the
# collate function, so the model cannot disagree with the data pipeline.
model_hyperparmeters = {
    "batch_size": batch_size,
    "window_size": window_size,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False
}

vocab_size = len(word_into_ix)
# Pass the real "<pad>" index explicitly instead of relying on the default of 0.
model = WordWindowClassifier(model_hyperparmeters, vocab_size, pad_ix=word_into_ix["<pad>"])
