In [1]:
from operator import length_hint

import torch
import torch.nn as nn
import torch.optim as optim

import pprint

# Shared pretty-printer instance that cells can use to print nested structures readably
pp = pprint.PrettyPrinter()

In [2]:
# Raw training data: one plain-text sentence per entry
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]

In [3]:
# Preprocessing step applied to every raw sentence
def preprocess_sentence(sentence):
    """Lowercase a raw sentence and split it into tokens on whitespace."""
    lowered = sentence.lower()
    return lowered.split()

# Tokenize every sentence of the corpus
train_sentences = list(map(preprocess_sentence, corpus))
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [4]:
# Words tagged as locations get label 1; every other word gets label 0
locations = {'paris', 'australia', 'stanford', 'taiwan', 'turkey', 'ankara'}

train_labels = [
    [int(word in locations) for word in sent]
    for sent in train_sentences
]
train_labels


[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [5]:
# Every distinct word that occurs in the training sentences
vocabulary = {word for sentence in train_sentences for word in sentence}
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [6]:
# Special tokens: "<unk>" stands in for unseen words, "<pad>" is used for padding
vocabulary.update(["<unk>", "<pad>"])

In [7]:
def pad_window(sentence, window_size, pad_token="<pad>"):
    """Return `sentence` with `window_size` copies of `pad_token` on each side.

    `sentence` is a list of tokens; it is not modified, a new list is returned.
    """
    padding = [pad_token for _ in range(window_size)]
    return padding + sentence + padding

# Number of padding tokens added on each side of a sentence
window_size = 2
pad_window(train_sentences[0], window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [8]:
# Turn the vocabulary into a sorted list so that every word gets a fixed integer index.
# Any fixed ordering would work; sorting simply gives a readable word_into_ix dictionary
# and happens to place "<pad>" at index 0. That is convenient because some PyTorch
# functions use 0 as their default padding value, for example
# nn.utils.rnn.pad_sequence, which we will cover in a bit.
ix_into_words = sorted(vocabulary)
word_into_ix = dict(zip(ix_into_words, range(len(ix_into_words))))
word_into_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [9]:
# We are ready to convert our training sentences into a sequence of indices corresponding to each token.

In [10]:
# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):
    """Map each token of a sentence to its vocabulary index.

    Args:
        sentence: iterable of string tokens.
        word_to_ix: dict mapping words to integer indices. It must contain
            the key "<unk>", whose index is used for tokens it does not know.

    Returns:
        A list with one index per token, in the same order as `sentence`.
    """
    # Look words up in the mapping passed by the caller, not in the
    # notebook-level `word_into_ix`, so the function honours its argument.
    return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]

# "kuwait" is not in the vocabulary, so it demonstrates the "<unk>" fallback
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_into_ix)
# Map the indices back to words to show the round trip
restored_example = list(map(ix_into_words.__getitem__, example_indices))

print(example_indices)
print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

[22, 2, 6, 20, 1]
Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [11]:
# Convert every training sentence to vocabulary indices.
# NOTE: despite the variable name, no padding is applied at this point.
example_padded_indices = [
    convert_token_to_indices(sentence, word_into_ix)
    for sentence in train_sentences
]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [12]:
# Embedding table with one row of size embedding_dim per vocabulary entry
# (the vocabulary includes the "<unk>" and "<pad>" tokens)
embedding_dim = 5
embeds = nn.Embedding(num_embeddings=len(vocabulary), embedding_dim=embedding_dim)
[*embeds.parameters()]

[Parameter containing:
 tensor([[ 1.4759e+00, -2.4065e-01, -9.6251e-01,  3.2371e-01, -1.9047e+00],
         [ 5.2418e-02,  1.1083e+00,  5.7652e-01,  2.0985e-02,  4.8053e-01],
         [-5.1035e-01, -1.4210e+00, -1.3285e+00, -1.2246e+00, -5.6465e-01],
         [ 1.2930e-02, -1.2087e+00,  5.3275e-01,  1.6090e+00,  7.2514e-01],
         [ 1.0790e-01, -2.0799e+00, -2.0972e-01,  4.7126e-01,  3.9881e-03],
         [-8.5796e-01, -1.6784e+00, -4.0366e-01, -1.8828e+00, -5.2009e-01],
         [-5.6866e-01,  4.9350e-02, -3.3253e-01, -5.4979e-01, -5.7710e-01],
         [-6.5453e-01, -1.0045e+00,  6.7280e-01, -9.1633e-01, -1.3520e+00],
         [-8.0023e-01,  1.1475e+00, -4.0933e-01, -1.1514e+00, -6.6068e-01],
         [-4.7910e-01, -1.5182e+00,  4.0799e-02,  1.5528e+00, -5.3339e-01],
         [ 1.6279e-01, -6.7179e-01, -6.7538e-01, -1.0129e+00, -4.1664e-01],
         [-1.9126e+00,  1.6517e-02,  6.9508e-01, -4.1973e-01,  8.5616e-02],
         [ 1.2379e+00, -1.3403e+00,  1.1769e+00,  5.7889e-01, -9.

In [13]:
# Look up the embedding vector for the word "paris".
# Index the dictionary directly instead of using .get(): a word missing from the
# vocabulary then raises a KeyError here rather than handing None to torch.tensor.
index = word_into_ix['paris']
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed


tensor([-1.4449, -0.7197,  0.0103,  0.4384, -0.6187],
       grad_fn=<EmbeddingBackward0>)

In [14]:
from torch.utils.data import DataLoader
from functools import partial
def custom_collate_fn(batch, window_size, word_into_ix):
    """Collate a batch of (tokens, labels) pairs into padded tensors.

    Args:
        batch: sequence of (sentence, labels) pairs, where sentence is a list
            of word strings and labels is a list of integer labels.
        window_size: number of "<pad>" tokens added on each side of a sentence.
        word_into_ix: dict mapping words to indices; must contain "<pad>" and
            "<unk>".

    Returns:
        A tuple (x_padded, y_padded, lengths) of LongTensors:
        x_padded holds the window-padded sentences as indices, right-padded
        with the "<pad>" index to the longest sentence of the batch;
        y_padded holds the labels right-padded with 0; lengths holds the
        original number of labels of each example. The training loop must
        unpack the values in this same order.
    """
    sentences, labels = zip(*batch)

    # Surround every sentence with window_size "<pad>" tokens per side
    edge = ["<pad>"] * window_size
    windowed = [edge + tokens + edge for tokens in sentences]

    # Words -> indices (unknown words fall back to "<unk>"), wrapped in
    # tensors because nn.utils.rnn.pad_sequence expects tensors as inputs
    indexed = [
        torch.LongTensor(
            [word_into_ix.get(token, word_into_ix['<unk>']) for token in tokens]
        )
        for tokens in windowed
    ]

    # Right-pad to a common length so the batch forms a single matrix
    x_padded = nn.utils.rnn.pad_sequence(
        indexed, batch_first=True, padding_value=word_into_ix["<pad>"]
    )

    # Record the true number of labels per example before padding them, so
    # that padded label positions can be told apart from real ones later
    lengths = torch.LongTensor([len(tags) for tags in labels])
    y_padded = nn.utils.rnn.pad_sequence(
        [torch.LongTensor(tags) for tags in labels],
        batch_first=True,
        padding_value=0,
    )

    return x_padded, y_padded, lengths
    #this order is super important


In [15]:
# Pair every sentence with its labels; these pairs are the dataset
data = list(zip(train_sentences, train_labels))

# Parameters to be passed to the DataLoader
batch_size = 2
shuffle = True
window_size = 2
# Bind the extra arguments so the DataLoader can call collate_fn(batch)
collate_fn = partial(custom_collate_fn, window_size=window_size, word_into_ix=word_into_ix)

# Now we instantiate the DataLoader
loader = DataLoader(
    data,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate_fn,
)

# Go through one pass over the data and print every batch
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(f'Iteration {counter}')
    print('Batched Input:', batched_x, sep='\n')
    print('Batched Labels:', batched_y, sep='\n')
    print('Batched Lengths:', batched_lengths, sep='\n')
    print('')
    counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0,  0],
        [ 0,  0, 19, 16, 12,  8,  4,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([4, 5])

Iteration 1
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([5, 6])

Iteration 2
Batched Input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1]])
Batched Lengths:
tensor([4])



Now our model needs to create a window around each word and predict whether the center
word is a LOCATION or not.

Given that our window_size is N, we want our model to make one prediction for every window of 2N+1 tokens. For example, with N = 2, a padded input of 9 tokens contains 9 - (2N+1) + 1 = 5 windows, so the model should return 5 predictions.

In [19]:
# Print the original tensor (the last batch produced by the DataLoader loop)
print('Original tensor: ', batched_x, '', sep='\n')

# Create the sliding windows of 2 * window_size + 1 tokens each,
# moving one position at a time along dimension 1
chunk = batched_x.unfold(1, 2 * window_size + 1, 1)
print('Windows: ', chunk, sep='\n')

Original tensor: 
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])

Windows: 
tensor([[[ 0,  0, 10, 13, 11],
         [ 0, 10, 13, 11, 17],
         [10, 13, 11, 17,  0],
         [13, 11, 17,  0,  0]]])
