In [65]:
import torch
import torch.nn as nn

In [2]:
# Raw data: a handful of short English sentences, one string per sentence
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]

In [3]:
# Preprocessing used to generate the training examples. It is deliberately
# simple: lowercase the text, then tokenize it by splitting on whitespace.
def preprocess_sentence(sentence):
    """Return the lowercased, whitespace-separated tokens of ``sentence``.

    Args:
        sentence: The raw sentence as a string.

    Returns:
        A list of lowercase tokens; an empty list for an empty or
        whitespace-only string.
    """
    lowered = sentence.lower()
    return lowered.split()

# Create our training set by running every raw sentence through the
# preprocessing function, so tokenization is defined in exactly one place
train_sentences = [preprocess_sentence(sentence) for sentence in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [5]:
# The location words that occur in the corpus (lowercased, like the tokens)
locations = {"australia", "ankara", "paris", "stanford", "taiwan", "turkey"}

# Token-level training labels: 1 marks a location word, 0 marks any other word
train_labels = [
    [int(token in locations) for token in sentence]
    for sentence in train_sentences
]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [6]:
# Collect every distinct token that appears anywhere in the training sentences
vocabulary = {token for sentence in train_sentences for token in sentence}
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [60]:
# Scratch check of how a set handles repeated tokens: 'come' is appended a
# second time to the first training sentence, yet the set holds it only once.
ss = train_sentences[0] + ['come']
a = set(ss)
print(a)
ss



{'come', 'always', 'paris', 'to', 'we'}


['we', 'always', 'come', 'to', 'paris', 'come']

In [7]:
# Add the unknown token to our vocabulary. convert_token_to_indices falls back
# to the index of "<unk>" for any token that is missing from word_to_ix.
vocabulary.add("<unk>")

In [9]:
# Add the <pad> token to our vocabulary so that the padding symbol inserted by
# pad_window (whose default pad_token is "<pad>") is a vocabulary entry too
vocabulary.add("<pad>")

# Pads a tokenized sentence on both sides.
# It is introduced here as an example and is used later in the tutorial.
def pad_window(sentence, window_size, pad_token="<pad>"):
    """Return ``sentence`` with ``window_size`` pad tokens on each side.

    Args:
        sentence: List of tokens; it is not modified.
        window_size: Number of pad tokens to add before and after the sentence.
        pad_token: The token used for padding.

    Returns:
        A new list: padding, then the sentence tokens, then padding.
    """
    padding = [pad_token for _ in range(window_size)]
    return padding + sentence + padding

# Padding demo: the first training sentence with two pad tokens on each side
window_size = 2
pad_window(train_sentences[0], window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [10]:
# Turn the vocabulary into a list so that every word gets a fixed integer index.
# The special tokens are ordered first explicitly ("<pad>" before "<unk>") and
# the remaining words follow alphabetically. Plain sorting only puts "<pad>"
# first as long as no word starts with a character that sorts before "<",
# such as a digit. Having the padding token at index 0 is convenient because
# some PyTorch functions use 0 as a default value, such as
# nn.utils.rnn.pad_sequence, which we will cover in a bit.
ix_to_word = sorted(vocabulary, key=lambda word: (word not in ("<pad>", "<unk>"), word))

# Creating a dictionary to find the index of a given word
word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [62]:
# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):
    """Map each token in ``sentence`` to its index in ``word_to_ix``.

    Args:
        sentence: Iterable of tokens.
        word_to_ix: Dictionary from token to integer index. The "<unk>" entry
            is only looked up when a token is missing from the dictionary.

    Returns:
        A list with one index per token, in the same order as the tokens.
    """
    indices = []
    for token in sentence:
        # Tokens that are in the vocabulary keep their own index; any other
        # token is looked up under the unknown token instead.
        key = token if token in word_to_ix else "<unk>"
        indices.append(word_to_ix[key])
    return indices

# More compact version of the same function
def _convert_token_to_indices(sentence, word_to_ix):
    return [word_to_ind.get(token, word_to_ix["<unk>"]) for token in sentence]

# Round-trip demo: "kuwait" is not in the vocabulary, so it is mapped to the
# index of <unk> and comes back as <unk>
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[position] for position in example_indices]

print(
    f"Original sentence is: {example_sentence}",
    f"Going from words to indices: {example_indices}",
    f"Going from indices to words: {restored_example}",
    sep="\n",
)

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [63]:
# Convert every training sentence to indices. No padding is applied at this
# step, so each inner list has as many entries as its sentence has tokens.
example_padded_indices = [
    convert_token_to_indices(sentence, word_to_ix)
    for sentence in train_sentences
]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [66]:
# Embedding table with one row of size embedding_dim per vocabulary entry
embedding_dim = 5
embeds = nn.Embedding(num_embeddings=len(vocabulary), embedding_dim=embedding_dim)

# Show the parameters held by the embedding table
list(embeds.parameters())

[Parameter containing:
 tensor([[ 1.5640,  0.9931, -0.5065, -0.6059,  0.2677],
         [ 0.9392, -0.4744,  1.7372,  0.9878, -0.5409],
         [ 0.6240, -0.2289,  0.7069,  1.1147, -0.1733],
         [-0.5652, -1.2163, -1.2439, -1.1777,  0.1270],
         [-0.7263,  0.0732, -0.3329, -0.0813, -0.0881],
         [-0.2984,  0.9377,  1.9442,  0.5603, -1.7099],
         [ 0.9644,  1.4539,  0.9083,  0.7319, -0.7047],
         [ 0.3939,  0.6944, -0.6944,  0.6625,  0.4587],
         [ 0.2147,  0.6061,  0.1211, -0.1540,  1.4776],
         [ 0.6206, -0.4433, -1.4120, -0.5253,  1.2812],
         [ 0.6761,  0.7323, -1.5500,  0.0033, -0.3617],
         [-1.1364, -1.0965,  0.9340, -0.8377,  0.9525],
         [ 0.3147, -0.1762, -1.2367,  1.8767, -1.5138],
         [-0.2269,  1.9277,  0.2408, -0.5290,  0.2557],
         [-0.3283,  0.0112, -0.5466,  0.6947, -0.3921],
         [-0.6507,  0.8004, -1.6272,  1.8601,  0.2682],
         [-0.0971,  0.5737,  1.6904, -1.7544,  0.1287],
         [ 1.2230, -0.913

In [67]:
# Look up the embedding of a single word ("paris") through its vocabulary index
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.int64)  # torch.long is an alias of torch.int64
paris_embed = embeds(index_tensor)
paris_embed

tensor([-0.6507,  0.8004, -1.6272,  1.8601,  0.2682],
       grad_fn=<EmbeddingBackward>)

In [68]:
# A tensor holding several indices fetches all of their embeddings in one call
index_paris, index_ankara = word_to_ix["paris"], word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.int64)  # torch.long is an alias of torch.int64
embeddings = embeds(indices_tensor)
embeddings

tensor([[-0.6507,  0.8004, -1.6272,  1.8601,  0.2682],
        [-0.5652, -1.2163, -1.2439, -1.1777,  0.1270]],
       grad_fn=<EmbeddingBackward>)