In [356]:
import torch
import torch.nn as nn
import torch.optim as optim
import pprint



pp = pprint.PrettyPrinter()

In [357]:
# Our raw data, which consists of sentences
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

In [358]:
### this is the preprocessing steps that we will take for our data
def preprocess_sentence(sentence):
    return sentence.lower().split()

train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [359]:
locations = set(['paris', 'australia', 'stanford', 'taiwan', 'turkey', 'ankara'])

train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels


[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [360]:
vocabulary = set(w for s in train_sentences for w in s)
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [361]:
vocabulary.add("<unk>")
vocabulary.add("<pad>")

In [362]:
def pad_window(sentence, window_size, pad_token="<pad>"):
    window = [pad_token] * window_size
    return window + sentence + window

window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [363]:
# We are just converting our vocabularly to a list to be able to index into it
# Sorting is not necessary, we sort to show an ordered word_to_ind dictionary
# That being said, we will see that having the index for the padding token
# be 0 is convenient as some PyTorch functions use it as a default value
# such as nn.utils.rnn.pad_sequence, which we will cover in a bit
ix_to_words = sorted(list(vocabulary))
word_to_ix = {word: ind for ind, word in enumerate(ix_to_words)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [364]:
# We are ready to convert our training sentences into a sequence of indices corresponding to each token.

In [365]:
# # Given a sentence of tokens, return the corresponding indices
# # def convert_token_to_indices(sentences, word_to_ix):
# #     indices = []
# #     for token in sentences:
# #         if token in word_to_ix:
# #             index = word_to_ix[token]
# #         else:
# #             index = word_to_ix["<unk>"]
# #         indices.append(index)
# #     return indices
# def convert_token_to_indices(sentence, word_to_ix):
#     return [word_to_ix.get(token, word_to_ix["<unk>"])for token in sentence]
#
# example_sentence = ["we", "always", "come", "to", "kuwait"]
# example_indices = convert_token_to_indices(example_sentence, word_to_ix)
# restored_example = [ix_to_words[ind] for ind in example_indices]
#
# print(example_indices)
# print(f"Original sentence is: {example_sentence}")
# print(f"Going from words to indices: {example_indices}")
# print(f"Going from indices to words: {restored_example}")

In [366]:
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [367]:
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)
list(embeds.parameters())

[Parameter containing:
 tensor([[-2.0596, -0.0735, -1.5079, -0.2774,  0.8163],
         [-2.2444, -0.5621,  0.5508, -1.7929, -0.7057],
         [-0.0600,  0.0720, -0.2341,  0.8129, -0.3203],
         [ 0.8071,  0.3906,  1.8616,  0.4977,  0.4330],
         [ 0.6045,  0.4990, -1.0775, -0.2709,  0.4924],
         [ 0.5458, -1.1854, -0.3158,  0.3069,  1.2659],
         [ 1.5832,  0.5701,  1.8572,  2.6360,  1.9865],
         [-0.5471, -1.7537,  0.7713, -0.1700, -1.8573],
         [ 0.7872, -0.9673,  0.0487,  0.9399,  0.2434],
         [-0.3049, -0.3712, -0.2111,  0.6995,  0.8096],
         [-1.0291, -0.0579, -0.1719,  0.8258, -1.4774],
         [ 1.1457,  1.4173,  1.1532,  1.6549,  1.0306],
         [-2.1529, -0.2197, -0.3605,  0.2895, -0.3643],
         [-0.3731,  0.0208, -1.6271,  1.1005,  1.4999],
         [ 0.2322,  0.0332,  0.8207,  0.0075, -0.0876],
         [-1.2228, -1.1438,  0.1233, -0.9908,  0.2386],
         [ 0.2188,  1.1597, -0.3406,  0.2690,  0.9704],
         [ 0.1175, -0.273

In [368]:
#we want to get the lookup tensor for the word Paris
index = word_to_ix.get('paris')
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed


tensor([-1.2228, -1.1438,  0.1233, -0.9908,  0.2386],
       grad_fn=<EmbeddingBackward0>)

In [369]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
    # we are going to break out our batch into examples 'x' and labels 'y'
    # and then turn them into tensors because nn.utiles.rnn.pad_sequence
    # expects tensors as inputs
    x, y =zip(*batch)

    # we have already designed a function for padding but we're just gonna bring it here as well to have everything in one place
    def pad_window(sentence, window_size, pad_token="<pad>"):
        window = [pad_token] * window_size
        return window + sentence + window

    # now we pad our examples
    x = [pad_window(s, window_size=window_size) for s in x]

    # now we need to turn words in our training examples to indices.
    def convert_token_to_indices(sentence, word_to_ix):
        return [word_to_ix.get(token, word_to_ix['<unk>']) for token in sentence]

    # now we convert the examples themselves into indices
    x = [convert_token_to_indices(s, word_to_ix) for s in x]

    # now we pad all of our examples so they are all the same length
    # because our matrix operations are impossible if they are not the same length
    pad_token_ix = word_to_ix["<pad>"]

    # pad sequence expects tensors so we make x into a tensor
    x = [torch.LongTensor(x_i) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # We will also pad the labels. Before we do that, we record the number
    # of labels so that we know how many words existed in each example
    lengths = [len(label) for label in y]
    lengths = torch.LongTensor(lengths)

    # y = [ [0]*window_size + list(lbl) + [0]*window_size for lbl in y ]
    y = [torch.LongTensor(y_i) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)


    #we are now ready to return our variables. The order we return our variables
    # here will match the oder we read them in our training loop
    return x_padded, y_padded, lengths
    #this order is super important


In [370]:
# Parameters to be passed to the DataLoader
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(batched_x)
  print("Batched Labels:")
  print(batched_y)
  print("Batched Lengths:")
  print(batched_lengths)
  print("")
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1],
        [0, 0, 0, 1, 0, 0]])
Batched Lengths:
tensor([6, 4])

Iteration 1
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0],
        [ 0,  0,  9,  7,  8, 18,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0]])
Batched Lengths:
tensor([5, 4])

Iteration 2
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5])



Now our model needs to create the windows for each word, make a prediction as to whether the center
word is a LOCATION or not

Given that our window_size is N we want out model to make a prediction on every 2N+1 tokens, that is if we have an input with 9 tokens, it should return 5 predictions

In [371]:
# Print original tensor
print(f'Original tensor: ')
print(batched_x)
print('')

# Create the 2 * 2 + 2 chunks
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f'Windows: ')
print(chunk)

Original tensor: 
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0]])

Windows: 
tensor([[[ 0,  0, 19, 16, 12],
         [ 0, 19, 16, 12,  8],
         [19, 16, 12,  8,  4],
         [16, 12,  8,  4,  0],
         [12,  8,  4,  0,  0]]])


Now we get to our actual model. We have prepared our data and we are ready to build our model.
Now we are going to put it all together here.

In [372]:
class WordWindowClassifier(nn.Module):

    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super(WordWindowClassifier, self).__init__()

        """Instance variables"""
        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]


        """Embedding Layer
        Takes in a tensor containing embedding indices, and returns the corresponding
        embeddings. The output is of dim (number_of_indices * embedding_dim)

        If freeze_embeddings is True, set the embedding layer parameters to be non-trainable.
        This is useful if we only want the parameters other than the embeddings to change.

        """
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False

        """
        Hidden Layer
        """
        full_window_size = 2 * window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()
        )
        """
        Output layer
        """
        self.output_layer = nn.Linear(self.hidden_dim, 1)
        """
        Probabilities
        """
        self.probabilities = nn.Sigmoid()

    def forward(self, inputs):
        """
        Let B:= Window-padded sentence length
            L:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_layer
        """
        B, L = inputs.size()
        """
        Reshaping
        Takes in a (B, L) LongTensor
        Outputs a (B, l~, S) LongTensor
        """
        # First, fet our word windows for each word in out input
        token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
        _, adjusted_length, _ = token_windows.size()
        # We use underlines here for the two values that we do not care about here
        # They are irrelevant at this step

        # Now we do a sanity check on our token windows to make sure everything is working properly
        assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S)
        Outputs a (B, L~, S, D) FloatTensor
        """
        embedded_windows = self.embeds(token_windows)

        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axis.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into (B, L~, H) FloatTensor.
        """
        layer_1 = self.hidden_layer(embedded_windows)

        """
        Layer 2.
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into (B, L~, 1) Float Tensor.
        """
        output = self.output_layer(layer_1)

        """
        Softmax.
        Takes in a (B, L~, 1) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 1) FloatTensor of (log-)normalized c
        """
        output = self.probabilities(output)
        output = output.view(B, -1)

        return output


#### Training
Now we are ready to put everything together at last.
Start with preparing our data and intializing our model.
Then we can intialize our optimizer and define out loss function.
And this time instead of using a predefined loss function, we will define our own.

In [373]:
# Data preparation
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Now initialize our model
#It's useful to put all the model hyperparameters into a dictionary
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# We need to define our optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#Define a loss function, that computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):
    #Calculate the loss for the whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # Rescale the loss. Remember that we have used lengths to store the
    # number of the words in each training example
    loss = loss / batch_lengths.sum().float()

    return loss

Now that want to make our training data into batches and then feed them to our model
we need to iterate over the batches too

In [374]:
# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):
    #Keep  track of the total loss for the batch
    total_loss = 0
    for batch_inputs, batch_labels, batch_lengths in  loader:
        #clear gradients
        optimizer.zero_grad()
        #Run a forward pass
        outputs = model.forward(batch_inputs)
        #Compute the batch loss
        loss = loss_function(outputs, batch_labels, batched_lengths)
        #calculate gradients
        loss.backward()
        #update out parameters
        optimizer.step()
        total_loss += loss.item()

    return total_loss


# Function containing out main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):
    #Iterate through each epoch and call our train_epoch function
    for epoch in range(num_epochs):
        epoch_loss = train_epoch(loss_function, optimizer, model, loader)
        if epoch % 100 == 0: print(epoch_loss)

# LET THE TRAINING BEGIN!

In [375]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.41429315507411957
0.3132731541991234
0.22233904898166656
0.1418863646686077
0.1116438414901495
0.07947961986064911
0.06458008289337158
0.04541182518005371
0.0365419602021575
0.03033832600340247


### Prediction
now we can see how well it can actually predict.
We can start by creating our test data.

In [378]:
# Create test sentences
test_corpus = ["She come from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0 ,0 ,1]]

#Create a test loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn )

In [379]:
for test_instance, labels, _ in test_loader:
    outputs = model.forward(test_instance)
    print(labels)
    print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.0382, 0.0328, 0.0283, 0.9614]], grad_fn=<ViewBackward0>)
