In [3]:
import torch
import torch.nn as nn

import pprint
pp = pprint.PrettyPrinter()

# Tensors

## From Python list

In [4]:
# Initialize a tensor from a Python List
data = [
    [0, 1],
    [2, 3],
    [4, 5]
]
x_python = torch.tensor(data)

# Print the tensor
x_python

tensor([[0, 1],
        [2, 3],
        [4, 5]])

In [6]:
x_float = torch.tensor(data, dtype=torch.float)
x_float

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

In [8]:
x_bool = torch.tensor(data, dtype=torch.bool)
x_bool

tensor([[False,  True],
        [ True,  True],
        [ True,  True]])

In [10]:
x_python.float()

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

## From Numpy

In [12]:
import numpy as np

ndarray = np.array(data)
x_numpy = torch.from_numpy(ndarray)

x_numpy

tensor([[0, 1],
        [2, 3],
        [4, 5]])

## From a Tensor

In [15]:
# Base tensor
x = torch.tensor([[1., 2.], [3., 4.]])
x

tensor([[1., 2.],
        [3., 4.]])

In [17]:
x_zeros = torch.zeros_like(x)
x_zeros

tensor([[0., 0.],
        [0., 0.]])

In [19]:
x_ones = torch.ones_like(x)
x_ones

tensor([[1., 1.],
        [1., 1.]])

In [20]:
x_rand = torch.rand_like(x)
x_rand

tensor([[0.8962, 0.6999],
        [0.1680, 0.3985]])

In [21]:
x_randn = torch.randn_like(x)
x_randn

tensor([[-1.0109,  0.3549],
        [-1.6442, -0.0284]])

## By specifying a Shape

In [22]:
shape = (4, 2, 2)
x_zeros = torch.zeros(shape)
x_zeros

tensor([[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]])

In [23]:
x = torch.arange(10)
x

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## Tensor properties

### Shape 

In [24]:
x = torch.ones(3, 2)
x.dtype

torch.float32

In [27]:
x = torch.Tensor([[1, 2], [3, 4], [5, 6]])
x

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

In [28]:
x.shape

torch.Size([3, 2])

In [29]:
x.shape[0]

3

In [30]:
x.size(0)

3

In [31]:
x_view = x.view(3, 2)
x_view

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

In [33]:
x_view = x.view(-1, 3)
x_view

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [34]:
x_reshaped = torch.reshape(x, (2, 3))
x_reshaped

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [35]:
x = torch.arange(10).reshape(5, 2)
x

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

In [36]:
x = x.unsqueeze(1)
x.shape

torch.Size([5, 1, 2])

In [38]:
x = x.squeeze()
x.shape

torch.Size([5, 2])

In [40]:
x.numel()

10

### Device

In [41]:
x = torch.Tensor([[1, 2], [3, 4]])
x

tensor([[1., 2.],
        [3., 4.]])

In [42]:
x.device

device(type='cpu')

In [43]:
if torch.cuda.is_available():
    x.to('cuda')

## Tensor Indexing 

In [44]:
# Initialize an example tensor
x = torch.Tensor([
                  [[1, 2], [3, 4]],
                  [[5, 6], [7, 8]], 
                  [[9, 10], [11, 12]] 
                 ])
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [45]:
x.shape

torch.Size([3, 2, 2])

In [46]:
x[0]

tensor([[1., 2.],
        [3., 4.]])

In [47]:
x[:, 0, 0]

tensor([1., 5., 9.])

In [50]:
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [49]:
i = torch.tensor([0, 0, 1, 1])
x[i]

tensor([[[1., 2.],
         [3., 4.]],

        [[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]],

        [[5., 6.],
         [7., 8.]]])

In [51]:
i = torch.tensor([1, 2])
j = torch.tensor([0])
x[i, j]

tensor([[ 5.,  6.],
        [ 9., 10.]])

In [52]:
x[0, 0, 0]

tensor(1.)

In [53]:
x[0, 0, 0].item()

1.0

## Operations 

In [55]:
x = torch.ones(3, 2, 2)
x

tensor([[[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]]])

In [56]:
x + 2

tensor([[[3., 3.],
         [3., 3.]],

        [[3., 3.],
         [3., 3.]],

        [[3., 3.],
         [3., 3.]]])

In [57]:
x * 2

tensor([[[2., 2.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]]])

In [59]:
a = torch.ones((4, 3)) * 6
a

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [60]:
b = torch.ones(3) * 2
b

tensor([2., 2., 2.])

In [61]:
a / b

tensor([[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]])

In [62]:
a @ b

tensor([36., 36., 36., 36.])

In [65]:
pp.pprint(a.shape)
pp.pprint(a.T.shape)

torch.Size([4, 3])
torch.Size([3, 4])


In [67]:
# Create an example tensor
m = torch.tensor(
    [
     [1., 1.],
     [2., 2.],
     [3., 3.],
     [4., 4.]
    ]
)

pp.pprint("Mean: {}".format(m.mean()))
pp.pprint("Mean in the 0th dimension: {}".format(m.mean(0)))
pp.pprint("Mean in the 1th dimension: {}".format(m.mean(1)))

'Mean: 2.5'
'Mean in the 0th dimension: tensor([2.5000, 2.5000])'
'Mean in the 1th dimension: tensor([1., 2., 3., 4.])'


In [68]:
# Concatenate in dimension 0 and 1
a_cat0 = torch.cat([a, a, a], dim=0)
a_cat1 = torch.cat([a, a, a], dim=1)

print("Initial shape: {}".format(a.shape))
print("Shape after concatenation in dimension 0: {}".format(a_cat0.shape))
print("Shape after concatenation in dimension 1: {}".format(a_cat1.shape))

Initial shape: torch.Size([4, 3])
Shape after concatenation in dimension 0: torch.Size([12, 3])
Shape after concatenation in dimension 1: torch.Size([4, 9])


In [69]:
a

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [70]:
a.add(a)
a

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [71]:
a.add_(a)
a

tensor([[12., 12., 12.],
        [12., 12., 12.],
        [12., 12., 12.],
        [12., 12., 12.]])

## Autograd 

In [77]:
x = torch.tensor([2.], requires_grad=True)
pp.pprint(x.grad)

None


In [78]:
y = x * x * 3
y.backward()
pp.pprint(x.grad)

tensor([12.])


In [79]:
z = x * x *3
z.backward()
pp.pprint(x.grad)

tensor([24.])


# Neural Network Module 

In [80]:
import torch.nn as nn

## Linear layer

In [84]:
input = torch.ones(2, 3, 4)

linear = nn.Linear(4, 2)
linear_output = linear(input)
linear_output

tensor([[[-0.1600, -0.2957],
         [-0.1600, -0.2957],
         [-0.1600, -0.2957]],

        [[-0.1600, -0.2957],
         [-0.1600, -0.2957],
         [-0.1600, -0.2957]]], grad_fn=<AddBackward0>)

## Activation Function Layer 

In [86]:
linear_output

tensor([[[-0.1600, -0.2957],
         [-0.1600, -0.2957],
         [-0.1600, -0.2957]],

        [[-0.1600, -0.2957],
         [-0.1600, -0.2957],
         [-0.1600, -0.2957]]], grad_fn=<AddBackward0>)

In [88]:
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.4601, 0.4266],
         [0.4601, 0.4266],
         [0.4601, 0.4266]],

        [[0.4601, 0.4266],
         [0.4601, 0.4266],
         [0.4601, 0.4266]]], grad_fn=<SigmoidBackward>)

## Putting the Layers Together 

In [89]:
block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid())

input = torch.ones(2, 3, 4)
output = block(input)
output

tensor([[[0.4533, 0.7074],
         [0.4533, 0.7074],
         [0.4533, 0.7074]],

        [[0.4533, 0.7074],
         [0.4533, 0.7074],
         [0.4533, 0.7074]]], grad_fn=<SigmoidBackward>)

### Custom Modulues 

In [90]:
class MultilayerPerceptron(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(MultilayerPerceptron, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.model(x)
        return output

In [91]:
input = torch.randn(2, 5)
input

tensor([[-0.7531,  2.1106, -0.7007, -0.0859, -0.1345],
        [-1.8899,  0.3836,  0.9575, -0.7254, -1.1139]])

In [92]:
model = MultilayerPerceptron(5, 3)
model(input)

tensor([[0.4950, 0.3568, 0.5792, 0.5409, 0.3772],
        [0.5608, 0.3628, 0.6255, 0.5288, 0.3067]], grad_fn=<SigmoidBackward>)

In [93]:
list(model.named_parameters())

[('model.0.weight', Parameter containing:
  tensor([[-0.2167,  0.3756,  0.4341, -0.3957, -0.4406],
          [ 0.2018,  0.1818,  0.3157, -0.4461,  0.2801],
          [-0.3928, -0.2104,  0.3142,  0.3246,  0.1554]], requires_grad=True)),
 ('model.0.bias', Parameter containing:
  tensor([ 0.1612, -0.0521, -0.0284], requires_grad=True)),
 ('model.2.weight', Parameter containing:
  tensor([[ 0.4369,  0.0720, -0.3297],
          [-0.1795,  0.0237,  0.3918],
          [-0.0059,  0.4957,  0.3796],
          [-0.0787, -0.2435,  0.0574],
          [-0.1284, -0.1452, -0.3530]], requires_grad=True)),
 ('model.2.bias', Parameter containing:
  tensor([-0.4160, -0.4267,  0.3247,  0.2353, -0.3850], requires_grad=True))]

## Optimization 

In [95]:
import torch.optim as optim

In [96]:
y = torch.ones(10, 5)

x = y + torch.randn_like(y)
x

tensor([[ 0.7779,  0.9647,  1.9476,  1.9706,  1.7899],
        [-0.1213,  3.3108,  0.6291,  1.3773,  2.8998],
        [ 2.4536, -0.8032,  0.8095,  1.4692, -0.0266],
        [ 1.3653,  1.2021,  0.0934,  1.0497,  2.4420],
        [-0.0276,  1.9113,  1.2256,  0.6083,  1.2179],
        [ 2.2881,  2.1811,  0.7544,  1.6644,  0.9204],
        [ 2.6070,  0.8955,  2.0630,  2.1397,  0.5987],
        [ 2.7572,  1.0345,  0.0089, -0.3678,  1.8597],
        [ 1.3556,  1.3756,  0.4785,  0.9865,  0.2422],
        [ 1.9422,  1.7130,  2.2025,  1.6128,  0.9251]])

In [97]:
model = MultilayerPerceptron(5, 3)
adam = optim.Adam(model.parameters(), lr=1e-1)
loss_function = nn.BCELoss()

y_pred = model(x)
loss_function(y_pred, y).item()

0.8227662444114685

In [98]:
n_epoch = 10

for epoch in range(n_epoch):
    adam.zero_grad()
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    print(f"Epoch {epoch}: training loss: {loss}")
    loss.backward()
    adam.step()

Epoch 0: training loss: 0.8227662444114685
Epoch 1: training loss: 0.676514744758606
Epoch 2: training loss: 0.5837696194648743
Epoch 3: training loss: 0.4635762870311737
Epoch 4: training loss: 0.32721611857414246
Epoch 5: training loss: 0.1965080052614212
Epoch 6: training loss: 0.09940735995769501
Epoch 7: training loss: 0.04349245876073837
Epoch 8: training loss: 0.017437821254134178
Epoch 9: training loss: 0.006805609446018934


In [99]:
y_pred = model(x)
y_pred

tensor([[0.9988, 0.9996, 0.9999, 1.0000, 0.9983],
        [0.9987, 0.9994, 0.9998, 1.0000, 0.9984],
        [0.9880, 0.9950, 0.9978, 0.9993, 0.9857],
        [0.9968, 0.9984, 0.9993, 0.9999, 0.9963],
        [0.9912, 0.9956, 0.9974, 0.9995, 0.9904],
        [0.9988, 0.9995, 0.9999, 1.0000, 0.9984],
        [0.9994, 0.9998, 1.0000, 1.0000, 0.9991],
        [0.9959, 0.9977, 0.9983, 0.9999, 0.9958],
        [0.9869, 0.9937, 0.9963, 0.9990, 0.9857],
        [0.9994, 0.9998, 1.0000, 1.0000, 0.9992]], grad_fn=<SigmoidBackward>)

In [100]:
x2 = y + torch.randn_like(y)
y_pred = model(x2)
y_pred

tensor([[0.9993, 0.9998, 1.0000, 1.0000, 0.9990],
        [0.9924, 0.9969, 0.9987, 0.9997, 0.9908],
        [0.9844, 0.9937, 0.9973, 0.9988, 0.9813],
        [0.9966, 0.9984, 0.9992, 0.9999, 0.9961],
        [0.9928, 0.9950, 0.9944, 0.9995, 0.9935],
        [0.9909, 0.9959, 0.9980, 0.9995, 0.9896],
        [0.9970, 0.9985, 0.9993, 0.9999, 0.9965],
        [0.9985, 0.9994, 0.9998, 1.0000, 0.9981],
        [0.9948, 0.9977, 0.9990, 0.9998, 0.9938],
        [0.9930, 0.9972, 0.9990, 0.9997, 0.9913]], grad_fn=<SigmoidBackward>)

# Demo: Word Window Classification 

## Data

In [101]:
# Our raw data, which consists of sentences
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

### Preprocessing 

In [102]:
def preprocess_sentence(sentence):
    return sentence.lower().split()

train_sentences = [sent.lower().split() for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [104]:
locations = set(["australia", "ankara", "paris", "stanford", "taiwan", "turkey"])

train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

### Converting the words to embeddings 

In [105]:
vocabulary = set(w for s in train_sentences for w in s)
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [106]:
vocabulary.add("<unk>")

In [107]:
vocabulary.add("<pad>")

def pad_window(sentence, window_size, pad_token="<pad>"):
    window = [pad_token] * window_size
    return window + sentence + window

window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [108]:
ix_to_word = sorted(list(vocabulary))

word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [109]:
def convert_token_to_indices(sentence, word_to_ix):
    indices = []
    for token in sentence:
        if token in word_to_ix:
            index = word_to_ix[token]
        else:
            index = word_to_ix["<unk>"]
        indices.append(index)
    return indices

In [111]:
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [112]:
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

list(embeds.parameters())

[Parameter containing:
 tensor([[-1.0438, -0.7594, -1.2429, -0.9675, -0.5421],
         [ 0.5904, -0.3822, -0.3377, -0.5467, -1.6959],
         [-0.1684,  0.0953,  0.4167, -0.9343, -0.7105],
         [-0.7384,  0.8329, -1.3471, -0.7814, -0.5336],
         [ 1.4964, -1.2145, -0.6583,  0.2651, -0.5711],
         [-0.8416,  0.3527, -0.1740,  0.2533,  0.3015],
         [ 1.3283,  0.0590,  0.0707,  0.7774, -0.5143],
         [ 2.1485, -0.3947,  0.2431, -0.2072, -0.2714],
         [ 0.9240, -1.2958, -0.4004,  2.0959, -0.6149],
         [-0.1205,  1.7748,  0.2206, -0.9654, -0.8555],
         [-0.8745, -0.3593,  0.5483, -1.1653,  0.2301],
         [-1.3185,  0.2990,  1.3506, -0.8036, -0.1402],
         [ 0.9432,  0.6465, -0.6482,  1.5425,  0.6648],
         [-2.3618,  1.1250,  0.0108,  0.4851,  0.1148],
         [-0.4075, -0.2102, -0.1306,  0.0360, -0.7633],
         [ 0.1564, -0.3607,  1.1361,  0.8833, -1.4348],
         [ 0.6241,  0.0524,  0.7946, -0.9099,  0.5626],
         [-1.7290,  0.226

In [113]:
# Get the embedding for the word Paris
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed

tensor([ 0.1564, -0.3607,  1.1361,  0.8833, -1.4348],
       grad_fn=<EmbeddingBackward>)

In [116]:
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings

tensor([[ 0.1564, -0.3607,  1.1361,  0.8833, -1.4348],
        [-0.7384,  0.8329, -1.3471, -0.7814, -0.5336]],
       grad_fn=<EmbeddingBackward>)

### Batching Sentences 

In [131]:
from torch.utils.data import DataLoader
from functools import partial

def _custom_collate_fn(batch, window_size, word_to_ix):
  # Prepare the datapoints
  x, y = zip(*batch)  
  x = [pad_window(s, window_size=window_size) for s in x]
  x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

  # Pad x so that all the examples in the batch have the same size
  pad_token_ix = word_to_ix["<pad>"]
  x = [torch.LongTensor(x_i) for x_i in x]
  x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

  # Pad y and record the length
  lengths = [len(label) for label in y]
  lenghts = torch.LongTensor(lengths)
  y = [torch.LongTensor(y_i) for y_i in y]
  y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

  return x_padded, y_padded, lenghts

In [132]:
# Parameters to be passed to the DataLoader
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(batched_x)
  print("Batched Labels:")
  print(batched_y)
  print("Batched Lengths:")
  print(batched_lengths)
  print("")
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([4, 6])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0],
        [ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5, 5])

Iteration 2
Batched Input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1]])
Batched Lengths:
tensor([4])



In [133]:
print(f"Original Tensor: ")
print(batched_x)
print("")

chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])

Windows: 
tensor([[[ 0,  0, 10, 13, 11],
         [ 0, 10, 13, 11, 17],
         [10, 13, 11, 17,  0],
         [13, 11, 17,  0,  0]]])


### Model

In [157]:
class WordWindowClassifier(nn.Module):

  def __init__(self, hyperparameters, vocab_size, pad_ix=0):
    super(WordWindowClassifier, self).__init__()
    
    """ Instance variables """
    self.window_size = hyperparameters["window_size"]
    self.embed_dim = hyperparameters["embed_dim"]
    self.hidden_dim = hyperparameters["hidden_dim"]
    self.freeze_embeddings = hyperparameters["freeze_embeddings"]

    """ Embedding Layer 
    Takes in a tensor containing embedding indices, and returns the 
    corresponding embeddings. The output is of dim 
    (number_of_indices * embedding_dim).

    If freeze_embeddings is True, set the embedding layer parameters to be
    non-trainable. This is useful if we only want the parameters other than the
    embeddings parameters to change. 

    """
    self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
    if self.freeze_embeddings:
      self.embed_layer.weight.requires_grad = False

    """ Hidden Layer
    """
    full_window_size = 2 * window_size + 1
    self.hidden_layer = nn.Sequential(
      nn.Linear(full_window_size * self.embed_dim, self.hidden_dim), 
      nn.Tanh()
    )

    """ Output Layer
    """
    self.output_layer = nn.Linear(self.hidden_dim, 1)

    """ Probabilities 
    """
    self.probabilities = nn.Sigmoid()

  def forward(self, inputs):
    """
    Let B:= batch_size
        L:= window-padded sentence length
        D:= self.embed_dim
        S:= self.window_size
        H:= self.hidden_dim
        
    inputs: a (B, L) tensor of token indices
    """
    B, L = inputs.size()

    """
    Reshaping.
    Takes in a (B, L) LongTensor
    Outputs a (B, L~, S) LongTensor
    """
    # Fist, get our word windows for each word in our input.
    token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
    _, adjusted_length, _ = token_windows.size()

    # Good idea to do internal tensor-size sanity checks, at the least in comments!
    assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

    """
    Embedding.
    Takes in a torch.LongTensor of size (B, L~, S) 
    Outputs a (B, L~, S, D) FloatTensor.
    """
    embedded_windows = self.embeds(token_windows)

    """
    Reshaping.
    Takes in a (B, L~, S, D) FloatTensor.
    Resizes it into a (B, L~, S*D) FloatTensor.
    -1 argument "infers" what the last dimension should be based on leftover axes.
    """
    embedded_windows = embedded_windows.view(B, adjusted_length, -1)

    """
    Layer 1.
    Takes in a (B, L~, S*D) FloatTensor.
    Resizes it into a (B, L~, H) FloatTensor
    """
    layer_1 = self.hidden_layer(embedded_windows)

    """
    Layer 2
    Takes in a (B, L~, H) FloatTensor.
    Resizes it into a (B, L~, 1) FloatTensor.
    """
    output = self.output_layer(layer_1)

    """
    Softmax.
    Takes in a (B, L~, 1) FloatTensor of unnormalized class scores.
    Outputs a (B, L~, 1) FloatTensor of (log-)normalized class scores.
    """
    output = self.probabilities(output)
    output = output.view(B, -1)

    return output

### Training 

In [158]:
# Prepare the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Initialize a model
# It is useful to put all the model hyperparameters in a dictionary
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# Define an optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define a loss function, which computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):   
    # Calculate the loss for the whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # Rescale the loss. Remember that we have used lengths to store the 
    # number of words in each training example
    loss = loss / batch_lengths.sum().float()

    return loss

In [159]:
# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):
  
  # Keep track of the total loss for the batch
  total_loss = 0
  for batch_inputs, batch_labels, batch_lengths in loader:
    # Clear the gradients
    optimizer.zero_grad()
    # Run a forward pass
    outputs = model.forward(batch_inputs)
    # Compute the batch loss
    loss = loss_function(outputs, batch_labels, batch_lengths)
    # Calculate the gradients
    loss.backward()
    # Update the parameteres
    optimizer.step()
    total_loss += loss.item()

  return total_loss


# Function containing our main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):

  # Iterate through each epoch and call our train_epoch function
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    if epoch % 100 == 0: print(epoch_loss)

In [160]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.3036842495203018
0.25503042340278625
0.1798364520072937
0.1650073491036892
0.13519935123622417
0.11038812808692455
0.08957967069000006
0.07133886963129044
0.05756521504372358
0.05000739265233278


### Prediction 

In [162]:
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]

test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [163]:
for test_instance, labels, _ in test_loader:
    outputs = model.forward(test_instance)
    print(labels)
    print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.1545, 0.1098, 0.0938, 0.7376]], grad_fn=<ViewBackward>)
