In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import pprint
pp = pprint.PrettyPrinter()

In [2]:
# Raw training data: a small list of plain-text English sentences.
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]

In [3]:
# Preprocessing applied to every sentence in the data set.
def preprocess_sentence(sentence):
    """Lowercase a raw sentence and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    return lowered.split()

# Tokenize every sentence of the corpus; the bare name below displays the result.
train_sentences = list(map(preprocess_sentence, corpus))
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [4]:
# Tokens that should be labelled as locations.
locations = {"paris", "australia", "stanford", "taiwan", "turkey", "ankara"}

# One label per token: 1 when the token is a known location, 0 otherwise.
train_labels = [
    [int(token in locations) for token in sentence]
    for sentence in train_sentences
]
train_labels


[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [5]:
# Every distinct token that occurs anywhere in the training sentences.
vocabulary = {token for sentence in train_sentences for token in sentence}
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [6]:
# Add the special tokens: "<unk>" is the fallback for out-of-vocabulary words
# and "<pad>" is the token used to pad sentence windows.
vocabulary.update(("<unk>", "<pad>"))

In [7]:
def pad_window(sentence, window_size, pad_token="<pad>"):
    """Return a new list with `window_size` pad tokens on each side of `sentence`.

    Args:
        sentence: list of tokens to pad.
        window_size: number of pad tokens to place before and after the sentence.
        pad_token: token used for padding.

    Returns:
        A new list: padding, then the sentence tokens, then padding.
    """
    padding = [pad_token for _ in range(window_size)]
    return padding + sentence + padding

# Number of pad tokens added on each side of a sentence.
window_size = 2
# Show the padded form of the first training sentence.
pad_window(train_sentences[0], window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [8]:
# Turn the vocabulary into a list so that tokens can be looked up by position.
# Sorting is not required; it only makes the word_into_ix dictionary easier to
# read. A side effect of sorting is that "<pad>" ends up at index 0, which is
# convenient because some PyTorch functions use 0 as their default padding
# value (for example nn.utils.rnn.pad_sequence, covered later).
ix_into_words = sorted(vocabulary)
word_into_ix = dict(zip(ix_into_words, range(len(ix_into_words))))
word_into_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [9]:
###### Great! We are ready to convert our training sentences into a sequence of indices corresponding to each token.

In [10]:
def convert_token_to_indices(sentence, word_to_ix):
    """Map each token of a sentence to its index in the given vocabulary mapping.

    Args:
        sentence: iterable of string tokens.
        word_to_ix: dict mapping tokens to integer indices. It must contain
            the key "<unk>", whose index is used for unknown tokens.

    Returns:
        A list with one index per token, in the same order as `sentence`.

    Raises:
        KeyError: if `sentence` is non-empty and `word_to_ix` has no "<unk>" entry.
    """
    # Use the mapping that was passed in, not a module-level global, so the
    # function works with any vocabulary the caller supplies.
    return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]

# Round-trip demo: tokens -> indices -> tokens. "kuwait" is not in the vocabulary.
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_into_ix)
restored_example = [ix_into_words[position] for position in example_indices]

print(example_indices)
print(
    f"Original sentence is: {example_sentence}",
    f"Going from words to indices: {example_indices}",
    f"Going from indices to words: {restored_example}",
    sep="\n",
)

[22, 2, 6, 20, 1]
Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In the example above, `kuwait` shows up as `<unk>`, because it is not included in our vocabulary. Let's convert our `train_sentences` to `example_padded_indices`.

In [11]:
# Convert every training sentence to its list of vocabulary indices.
# Note: despite the variable name, no padding is applied to the sentences here.
example_padded_indices = [
    convert_token_to_indices(sentence, word_into_ix)
    for sentence in train_sentences
]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [12]:
# Embedding table with one row per vocabulary entry and `embedding_dim` columns.
embedding_dim = 5
embeds = nn.Embedding(num_embeddings=len(vocabulary), embedding_dim=embedding_dim)
# Display the layer's parameters (the embedding matrix).
list(embeds.parameters())

[Parameter containing:
 tensor([[-0.5566, -0.2973,  0.7979, -1.0992,  1.1357],
         [-0.0195, -0.5383, -0.0191,  0.4970, -0.1460],
         [-0.5175, -1.0026, -0.2947,  0.3439, -0.3770],
         [-0.0227, -0.4044,  1.0167,  0.0094, -0.3849],
         [ 0.4756,  1.9361, -0.2580, -0.1633,  0.4199],
         [-0.4498, -0.3365, -1.1495,  0.8817,  1.8471],
         [ 1.1960,  1.3883,  0.2877, -0.6307, -0.2359],
         [-1.3974, -1.1944,  0.5969, -0.1510, -1.9802],
         [ 0.6131, -0.7753, -0.6028,  1.3068, -0.3897],
         [ 0.8399,  0.5327,  0.1397, -1.9980,  0.3431],
         [ 0.6340, -1.7666, -0.1620, -0.7948, -0.9057],
         [ 0.0042, -0.2346,  0.7788,  0.2809,  2.0080],
         [-1.2792, -0.8290, -0.5560, -1.6716, -0.5987],
         [-0.0859,  0.4920, -0.4824, -1.5534, -0.1874],
         [ 0.5157,  0.9199,  0.5778,  0.3613, -0.1230],
         [ 0.8765, -0.6908, -0.0958,  0.2061,  0.5311],
         [-0.8379,  1.0306,  0.4352, -0.0924,  0.2690],
         [ 0.6127, -0.779

In [13]:
# Look up the embedding vector for the word "paris": find its vocabulary
# index, wrap it in a long tensor, and pass it through the embedding layer.
index = word_into_ix.get("paris")
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed


tensor([ 0.8765, -0.6908, -0.0958,  0.2061,  0.5311],
       grad_fn=<EmbeddingBackward0>)