In [1]:
import torch
import numpy as np

In [2]:
torch.__version__

'0.4.0'

# PyTorch basics

## Tensors

### Creating tensors

In [126]:
# tensor from values
a = torch.tensor([[1, 2, 4]], dtype=torch.float32)     # create tensor from values
a

tensor([[ 1.,  2.,  4.]])

In [127]:
# tensor from numpy: draw a random 5x5 array, cast it to float32,
# then let torch.tensor() copy the values into a new tensor
b_npy = np.random.random((5, 5))
b_npy = b_npy.astype(np.float32)
print(b_npy)
b = torch.tensor(b_npy)
print(b)

[[0.3716929  0.10671763 0.67461187 0.69461477 0.50948256]
 [0.5664982  0.9036897  0.95842284 0.02542921 0.20762315]
 [0.2559032  0.82377505 0.99166316 0.55204743 0.4266114 ]
 [0.73056453 0.00121057 0.15028913 0.9993542  0.13688307]
 [0.22894198 0.43128413 0.1248633  0.17739904 0.76088506]]
tensor([[ 0.3717,  0.1067,  0.6746,  0.6946,  0.5095],
        [ 0.5665,  0.9037,  0.9584,  0.0254,  0.2076],
        [ 0.2559,  0.8238,  0.9917,  0.5520,  0.4266],
        [ 0.7306,  0.0012,  0.1503,  0.9994,  0.1369],
        [ 0.2289,  0.4313,  0.1249,  0.1774,  0.7609]])


In [128]:
# random tensors
c = torch.rand(5, 5)
c

tensor([[ 0.4761,  0.4938,  0.3167,  0.3423,  0.3792],
        [ 0.1385,  0.3678,  0.1521,  0.0115,  0.3064],
        [ 0.7492,  0.6215,  0.5232,  0.3266,  0.9820],
        [ 0.6884,  0.2273,  0.7682,  0.7967,  0.2997],
        [ 0.7273,  0.1180,  0.1513,  0.3336,  0.4257]])

In [130]:
# zeros/ones tensors
d = torch.zeros(3, 3)
d

tensor([[ 0.,  0.,  0.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.]])

In [131]:
e = torch.ones_like(d)
e

tensor([[ 1.,  1.,  1.],
        [ 1.,  1.,  1.],
        [ 1.,  1.,  1.]])

### Some properties of tensors

In [132]:
# whether it requires to compute gradient
c.requires_grad

False

In [133]:
# where the tensor is
c.device

device(type='cpu')

In [134]:
# what type the tensor has
c.dtype

torch.float32

In [135]:
# shape of the tensor
c.size()

torch.Size([5, 5])

### Some operations on tensors
see the docs: https://pytorch.org/docs/stable/tensors.html
<br>
also: https://pytorch.org/docs/stable/torch.html#math-operations

In [142]:
# two random 5x5 operands; only `b` tracks gradients
a = torch.rand(5, 5, dtype=torch.float32)
# take the shape from `a` (defined right above) rather than from a `b` left over
# from an earlier cell, so this cell does not depend on hidden kernel state
b = torch.rand_like(a, dtype=torch.float32, requires_grad=True)
# the sum involves a tensor that requires grad, so `c` requires grad as well
c = a + b
c

tensor([[ 1.6487,  1.0225,  0.3342,  0.6757,  1.3462],
        [ 1.4846,  1.1068,  1.1529,  0.4243,  1.1896],
        [ 1.6959,  1.0366,  1.4236,  0.7783,  0.9389],
        [ 1.6985,  0.9650,  0.4490,  0.6380,  0.9967],
        [ 1.6503,  0.5376,  1.5316,  0.7680,  0.1247]])

In [143]:
d = c ** 2
d

tensor([[ 2.7181,  1.0454,  0.1117,  0.4566,  1.8122],
        [ 2.2041,  1.2250,  1.3292,  0.1800,  1.4151],
        [ 2.8759,  1.0745,  2.0267,  0.6058,  0.8815],
        [ 2.8848,  0.9313,  0.2016,  0.4070,  0.9935],
        [ 2.7233,  0.2890,  2.3459,  0.5899,  0.0156]])

In [144]:
e = d.sum(1)
e

tensor([ 6.1441,  6.3534,  7.4645,  5.4183,  5.9636])

In [145]:
d.requires_grad

True

## Parameters
Parameters are a special kind of tensor that will be used by the optimizers in torch.optim

Parameters have '.requires_grad' = True by default

In [146]:
n = torch.nn.Parameter(torch.randn(5, 5))

In [147]:
print(n.requires_grad)

True


## Modules

Modules are model building blocks in PyTorch (parameterized function).

A module is created by subclassing torch.nn.Module.

A module must register its parameters and submodules by attaching them directly onto self:
(if you don't do this, parameters will not be found and thus will not be trained)

**Rule #1 of Modules**: attach parameters and submodules onto self --> will get registered
<br>
**Rule #2 of Modules**: write a forward() method

In [148]:
# let's create a custom module
class TestModule(torch.nn.Module):
    ''' Demo module showing how parameters and submodules get registered.

    Whatever Parameter or Module is attached onto self in __init__ becomes
    visible through .parameters() / .named_parameters(). A submodule of None
    is kept as a plain attribute and contributes no parameters.
    No forward() is defined, so instances are meant to be inspected, not called.
    '''
    def __init__(self, indim, outdim, submodule=None):
        # nn.Module's own bookkeeping has to be set up before anything is attached
        super(TestModule, self).__init__()
        # a bias-free linear transformation layer --> registered as a submodule
        linear_layer = torch.nn.Linear(indim, outdim, bias=False)
        self.linear = linear_layer
        # a raw (indim, outdim) matrix wrapped in Parameter --> registered as a parameter
        init_values = torch.randn(indim, outdim)
        self.W = torch.nn.Parameter(init_values)
        # optional nested module; its parameters get registered through it
        self.submodule = submodule
        

In [149]:
test = TestModule(5, 5)
test2 = TestModule(5, 5, test)

In [150]:
# printing the test shows the structure of the module
print(test)   
print(test2)

TestModule(
  (linear): Linear(in_features=5, out_features=5, bias=False)
)
TestModule(
  (linear): Linear(in_features=5, out_features=5, bias=False)
  (submodule): TestModule(
    (linear): Linear(in_features=5, out_features=5, bias=False)
  )
)


In [151]:
# we can also list the parameters
list(test.parameters())

[Parameter containing:
 tensor([[ 0.0874,  1.5442,  0.3271,  0.4075,  0.3752],
         [-0.4018, -0.4171,  1.5578,  0.8214, -0.8434],
         [-1.2709,  0.5951, -1.2376, -0.9346,  0.9792],
         [ 0.8213, -0.7507, -1.2882,  0.9481,  0.1909],
         [-1.1109, -0.5505, -0.1167,  0.2421, -0.4846]]), Parameter containing:
 tensor([[-0.2806, -0.2322,  0.3848,  0.0547,  0.1888],
         [-0.3850, -0.3899, -0.3927, -0.0425, -0.2552],
         [-0.3589,  0.2408, -0.4185, -0.0705,  0.2114],
         [-0.2745, -0.1168,  0.1714,  0.0202, -0.0160],
         [ 0.3096, -0.1182,  0.2436, -0.0324, -0.1903]])]

In [154]:
# or parameters and their names/paths
list(test.named_parameters())

[('W', Parameter containing:
  tensor([[ 0.0874,  1.5442,  0.3271,  0.4075,  0.3752],
          [-0.4018, -0.4171,  1.5578,  0.8214, -0.8434],
          [-1.2709,  0.5951, -1.2376, -0.9346,  0.9792],
          [ 0.8213, -0.7507, -1.2882,  0.9481,  0.1909],
          [-1.1109, -0.5505, -0.1167,  0.2421, -0.4846]])),
 ('linear.weight', Parameter containing:
  tensor([[-0.2806, -0.2322,  0.3848,  0.0547,  0.1888],
          [-0.3850, -0.3899, -0.3927, -0.0425, -0.2552],
          [-0.3589,  0.2408, -0.4185, -0.0705,  0.2114],
          [-0.2745, -0.1168,  0.1714,  0.0202, -0.0160],
          [ 0.3096, -0.1182,  0.2436, -0.0324, -0.1903]]))]

In [156]:
list(test2.named_parameters())

[('W', Parameter containing:
  tensor([[-2.6009,  0.0731,  1.2403, -0.2455,  0.5811],
          [ 0.5209, -0.0635, -0.5618,  0.2443,  1.4677],
          [-0.3999,  1.3165,  0.1894,  0.9646,  0.0491],
          [-0.5074,  1.8130,  0.2246, -0.8867, -0.7882],
          [ 1.1245, -0.9600, -0.0222, -0.5023,  0.5953]])),
 ('linear.weight', Parameter containing:
  tensor([[ 0.4421,  0.2815, -0.2408, -0.0181, -0.4305],
          [ 0.4376,  0.0466, -0.0821, -0.1138, -0.3844],
          [-0.1578, -0.4110,  0.3896,  0.0003, -0.2445],
          [ 0.0517,  0.0818,  0.2739,  0.3353, -0.0179],
          [-0.1306,  0.4023, -0.4120,  0.3429,  0.2351]])),
 ('submodule.W', Parameter containing:
  tensor([[ 0.0874,  1.5442,  0.3271,  0.4075,  0.3752],
          [-0.4018, -0.4171,  1.5578,  0.8214, -0.8434],
          [-1.2709,  0.5951, -1.2376, -0.9346,  0.9792],
          [ 0.8213, -0.7507, -1.2882,  0.9481,  0.1909],
          [-1.1109, -0.5505, -0.1167,  0.2421, -0.4846]])),
 ('submodule.linear.weight'

**The .forward() method of Modules**
<br>
This is called when you use the module.
<br>
Must implement logic here.

In [157]:
class Forward(torch.nn.Module):
    ''' Single feedforward layer: a linear map followed by a nonlinearity.

    Arguments:
        indim:          size of each input vector
        outdim:         size of each output vector
        bias:           whether the linear map has a learnable bias (default: no bias)
        nonlinearity:   callable applied to the output of the linear map (default: sigmoid)
    '''
    def __init__(self, indim, outdim, bias=False, nonlinearity=torch.nn.functional.sigmoid):
        super(Forward, self).__init__()
        # pass the caller's choice through; hard-coding bias=False here would
        # silently ignore the constructor argument
        self.linear = torch.nn.Linear(indim, outdim, bias=bias)
        self.nonlinearity = nonlinearity

    def forward(self, x):
        ''' x must be of shape (batch_size, indim);
            the linear map produces (batch_size, outdim), which is then passed through the nonlinearity '''
        a = self.linear(x)
        b = self.nonlinearity(a)
        return b

In [158]:
x = torch.rand(2, 5)     # (two examples in a batch, 5 values in each example's vector)
l = Forward(5, 3)        # (five input dimensions, three output dimensions)
y = l(x)
y

tensor([[ 0.5415,  0.5421,  0.5795],
        [ 0.6030,  0.5480,  0.6227]])

In [159]:
# note that while we don't need to compute gradients of x,
# we do need to compute gradients of y
# in order to get gradients of the parameters in the submodules
print(x.requires_grad)
print(y.requires_grad)

False
True


### Backpropagation
PyTorch does backpropagation for you.
To compute gradients, call .backward() on some Tensor.

In [160]:
# let's try a dummy loss (just a scalar based on our computations)
dummy_loss = y.sum()
print(dummy_loss.dtype)
print(dummy_loss.size())    # --> it's a scalar !
print(dummy_loss)
print(dummy_loss.item())

torch.float32
torch.Size([])
tensor(3.4368)
3.43684601784


In [161]:
# let's check gradients of some parameters in our module
print(l.linear.weight.grad)

None


In [162]:
dummy_loss.backward()

In [163]:
# let's check again
print(l.linear.weight.grad)

tensor([[ 0.3143,  0.3057,  0.4176,  0.2235,  0.1711],
        [ 0.3217,  0.3110,  0.4243,  0.2277,  0.1762],
        [ 0.3085,  0.3000,  0.4099,  0.2194,  0.1680]])


**Now we have a gradient stored on the parameter Tensor!**

PyTorch optimizers use .grad for updates on the parameters

# PyTorch NN
PyTorch provides many different layers commonly used in neural networks.
<br>
We will only see a couple of them.
<br>
All such layers are in the torch.nn package: https://pytorch.org/docs/stable/nn.html

## Embedding layer

In [164]:
# vocabulary of 1000 word ids, each mapped to a 50-dim vector; id 0 is the padding id
number_of_words_in_voc, embedding_dimension = 1000, 50
embedder = torch.nn.Embedding(num_embeddings=number_of_words_in_voc,
                              embedding_dim=embedding_dimension,
                              padding_idx=0)

In [168]:
word_ids = torch.randint(0, number_of_words_in_voc, (3, 4), dtype=torch.int64)

In [170]:
word_ids[1, 1] = word_ids[0, 0]
print(word_ids)
print(word_ids.size())

tensor([[ 247,  137,  657,  784],
        [ 616,  247,  439,   48],
        [ 246,  662,   73,  158]])
torch.Size([3, 4])


In [171]:
word_embeddings = embedder(word_ids)

In [172]:
print(word_embeddings.size())

torch.Size([3, 4, 50])


In [173]:
print((word_embeddings[0, 0] - word_embeddings[1, 1]).norm())

tensor(0.)


In [174]:
dummy_loss = word_embeddings.sum()
dummy_loss.backward()

In [175]:
torch.nonzero(embedder.weight.grad)[:, 0].unique()
# gradient is non-zero only for the words we used

tensor([ 784,  662,  657,  616,  439,  247,  246,  158,  137,   73,
          48])

## Feedforward layer: see before

## LSTM layer

In [190]:
lstm = torch.nn.LSTM(50, 6, bidirectional=False, batch_first=True)

In [191]:
x = torch.randn(3, 4, 50)       # three examples in a batch, 4 elements in sequence, each element is a 50-dim vector
y = lstm(x)                     # y is a pair: (outputs for every timestep, (final hidden state, final cell state))

In [192]:
print(y[0].size())              # outputs for every timestep: (batch, seq_len, hidden)
y_T = y[1][0].squeeze(0)        # final hidden state; squeeze(0) drops the leading size-1 axis (one layer, one direction)
c_T = y[1][1].squeeze(0)        # final cell state, same treatment
print(c_T.size())
print(y_T.size())

torch.Size([3, 4, 6])
torch.Size([3, 6])
torch.Size([3, 6])


**Combine: embedding layer followed by the LSTM**

In [240]:
x = torch.randint(0, number_of_words_in_voc, (3, 4), dtype=torch.int64)

In [241]:
embedder.zero_grad()
lstm.zero_grad()

In [242]:

embeddings = embedder(x)
print(embeddings.size())
_, (final_state, _) = lstm(embeddings)
print(final_state.size())

torch.Size([3, 4, 50])
torch.Size([1, 3, 6])


In [243]:
dummy_loss = final_state.sum()
dummy_loss.backward()

In [244]:
torch.nonzero(embedder.weight.grad)[:, 0].unique()
# gradient is non-zero only for the words we used

tensor([ 905,  758,  703,  658,  580,  524,  427,  415,  335,  301,
         209,   65])

In [245]:
print(x)

tensor([[ 415,  209,  427,  301],
        [ 335,  524,  703,  580],
        [ 658,  905,   65,  758]])


### Bidirectional LSTM

In [3]:
# seed BEFORE building the layer: the LSTM's initial weights are drawn from the
# global RNG, so the gradients printed in the following cells are only
# reproducible if the generator is in a known state at this point
torch.random.manual_seed(1234)
bilstm = torch.nn.LSTM(3, 4, bidirectional=True, batch_first=True)

In [5]:
torch.random.manual_seed(1234)
x = torch.randn(2, 6, 3, requires_grad=True)    # 2 examples, 6 timesteps, 3 features each
ys, (y_n, c_n) = bilstm(x)
# y_n comes back as (directions, batch, hidden): put the batch axis first,
# then merge the two directions into a single vector per example
y_n = y_n.transpose(0, 1).contiguous()
y_n = y_n.view(2, -1)
print(ys.size())
print(y_n.size())

torch.Size([2, 6, 8])
torch.Size([2, 8])


In [6]:
# 1. backprop on forward portion of y_n
x.grad = None                               # start from a clean gradient
ys, (y_n, c_n) = bilstm(x)
y_n = y_n.transpose(0, 1).contiguous()      # (directions, batch, hidden) -> (batch, directions, hidden)
y_n = y_n.view(2, -1)                       # one merged vector per example
l = y_n[0, :4].sum()                        # example 0, first half = forward direction
l.backward()
print(x.grad[:, :, 0])                      # gradient of input feature 0 per (example, timestep)

tensor(1.00000e-02 *
       [[-0.0548, -0.1437, -0.0931, -0.3193,  0.0014, -8.1011],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])


In [7]:
# 2. backprop on backward portion of y_n
x.grad = None                               # start from a clean gradient
ys, (y_n, c_n) = bilstm(x)
y_n = y_n.transpose(0, 1).contiguous()      # (directions, batch, hidden) -> (batch, directions, hidden)
y_n = y_n.view(2, -1)                       # one merged vector per example
l = y_n[0, 4:].sum()                        # example 0, second half = backward direction
l.backward()
print(x.grad[:, :, 0])                      # gradient of input feature 0 per (example, timestep)

tensor([[-0.1401, -0.0894, -0.0242, -0.0064, -0.0081, -0.0049],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])


In [8]:
# 3. backprop on forward portion of ys, t=N       (should be same as #1)
x.grad = None                   # start from a clean gradient
ys, _ = bilstm(x)               # only the per-timestep outputs are used in this experiment
l = ys[0, -1, :4].sum()         # example 0, last timestep, first 4 features = forward direction
l.backward()
print(x.grad[:, :, 0])          # gradient of input feature 0 per (example, timestep)

tensor(1.00000e-02 *
       [[-0.0548, -0.1437, -0.0931, -0.3193,  0.0014, -8.1011],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])


In [9]:
# 4. backprop on backward portion of ys, t=0      (should be same as #2)
x.grad = None                   # start from a clean gradient
ys, _ = bilstm(x)               # only the per-timestep outputs are used in this experiment
l = ys[0, 0, 4:].sum()          # example 0, first timestep, last 4 features = backward direction
l.backward()
print(x.grad[:, :, 0])          # gradient of input feature 0 per (example, timestep)

tensor([[-0.1401, -0.0894, -0.0242, -0.0064, -0.0081, -0.0049],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])


In [10]:
# 5. backprop on backward portion of ys, t=N
x.grad = None                   # start from a clean gradient
ys, _ = bilstm(x)               # only the per-timestep outputs are used in this experiment
l = ys[0, -1, 4:].sum()         # example 0, last timestep, last 4 features = backward direction
l.backward()
print(x.grad[:, :, 0])          # gradient of input feature 0 per (example, timestep)

tensor(1.00000e-02 *
       [[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -6.5817],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])


In [11]:
# 6. backprop on forward portion of ys, t=0
x.grad = None                   # start from a clean gradient
ys, _ = bilstm(x)               # only the per-timestep outputs are used in this experiment
l = ys[0, 0, :4].sum()          # example 0, first timestep, first 4 features = forward direction
l.backward()
print(x.grad[:, :, 0])          # gradient of input feature 0 per (example, timestep)

tensor(1.00000e-02 *
       [[-6.8864,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])


### PyTorch RNNs and variable length sequences
All torch.nn modules take batches of inputs. 
<br>
We have variable length inputs so we pad the sequences (with zeros).
<br>
This is a problem because the LSTM would also run its computation over the padded elements.
<br>
But there is a solution: PyTorch provides methods for packing and unpacking sequences.
<br>
We provide more convenient wrappers in lib.py.

In [45]:
from lib import seq_pack, seq_unpack

In [46]:
x = torch.tensor([[2, 3, 1, 0],
                  [4, 4, 0, 0],
                  [1, 2, 3, 4],
                  [3, 0, 0, 0],
                  [4, 4, 3, 0]], dtype=torch.float32)
x

tensor([[ 2.,  3.,  1.,  0.],
        [ 4.,  4.,  0.,  0.],
        [ 1.,  2.,  3.,  4.],
        [ 3.,  0.,  0.,  0.],
        [ 4.,  4.,  3.,  0.]])

In [47]:
packed_x, unpack_order = seq_pack(x, x!= 0)

In [48]:
unpack_order

tensor([ 1,  3,  0,  4,  2])

In [49]:
x_unpacked, _= seq_unpack(packed_x, unpack_order)

In [50]:
x_unpacked

tensor([[ 2.,  3.,  1.,  0.],
        [ 4.,  4.,  0.,  0.],
        [ 1.,  2.,  3.,  4.],
        [ 3.,  0.,  0.,  0.],
        [ 4.,  4.,  3.,  0.]])

Usage with LSTMs:

In [107]:
# bidirectional LSTM over 4-dim inputs with 3 hidden units per direction
bilstm = torch.nn.LSTM(input_size=4, hidden_size=3, bidirectional=True, batch_first=True)
# one embedding row per id in 0..max(x), each of size 4
embedder = torch.nn.Embedding(int(x.max().item()) + 1, 4)
# cut the embeddings off from the embedder and make them a fresh leaf that
# tracks gradients, so that emb.grad can be inspected directly later on
emb = embedder(x.long()).detach().requires_grad_()
# nonzero where x holds a real element, zero where it holds padding
emb_mask = x != 0
emb_mask

tensor([[ 1,  1,  1,  0],
        [ 1,  1,  0,  0],
        [ 1,  1,  1,  1],
        [ 1,  0,  0,  0],
        [ 1,  1,  1,  0]], dtype=torch.uint8)

In [108]:
packed_emb, unpack_order = seq_pack(emb, emb_mask)
ys, (y_n, c_n) = bilstm(packed_emb)

In [109]:
print(y_n.size())
print(unpack_order)

torch.Size([2, 5, 3])
tensor([ 1,  3,  0,  4,  2])


In [110]:
ys, _ = seq_unpack(ys, unpack_order)
ys.size()

torch.Size([5, 4, 6])

In [111]:
# NOTE(review): seq_pack presumably reorders the batch before packing, with
# unpack_order mapping back to the original order; here that order is applied
# along the batch axis (dim 1) of y_n -- confirm against lib.py
y_n = y_n.index_select(1, unpack_order)
y_n.size()

torch.Size([2, 5, 3])

In [112]:
# (directions, batch, hidden) -> (batch, directions * hidden): one merged row per example
y_n = y_n.transpose(1, 0).contiguous().view(5, -1)
y_n.size()

torch.Size([5, 6])

In [113]:
# the loss is built from example 1 only, so emb.grad should be nonzero
# only at that example's non-padded positions
dummy_loss = y_n[1].sum()
dummy_loss.backward()

In [114]:
print(emb.grad)

tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0061,  0.3043,  0.2963,  0.1018],
         [-0.0372,  0.2815,  0.1496,  0.0503],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]])
