## Importing PyTorch

In [1]:
import torch

In [2]:
print("PyTorch version {}".format(torch.__version__))
print("GPU-enabled installation? {}".format(torch.cuda.is_available()))

PyTorch version 1.2.0
GPU-enabled installation? False


In [3]:
import numpy as np

## Tensors

#### Calling the constructor

In [4]:
# Calling the constructor with sizes allocates a 2x3 float tensor without
# initializing it: the printed values are arbitrary (see the output below)
t = torch.FloatTensor(2, 3)
print(t)
print(t.size())

tensor([[ 0.0000e+00, -2.0000e+00,  0.0000e+00],
        [-2.0000e+00,  7.3787e+22,  2.4176e-12]])
torch.Size([2, 3])


In [5]:
t.zero_()

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [6]:
torch.FloatTensor([[1, 2, 3], [4, 5, 6]])

tensor([[1., 2., 3.],
        [4., 5., 6.]])

#### Calling a method in the torch module

In [7]:
tl = torch.tensor([1, 2, 3])
t = torch.tensor([1., 2., 3.])
print("A 64-bit integer tensor: {}, {}".format(tl, tl.type()))
print("A 32-bit float tensor: {}, {}".format(t, t.type()))

A 64-bit integer tensor: tensor([1, 2, 3]), torch.LongTensor
A 32-bit float tensor: tensor([1., 2., 3.]), torch.FloatTensor


In [8]:
t = torch.zeros(2, 3)
print(t)

tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [10]:
t_zeros = torch.zeros_like(t)        # zeros_like returns a new tensor
t_ones = torch.ones(2, 3)            # creates a tensor with 1s
t_fives = torch.empty(2, 3).fill_(5) # creates a non-initialized tensor and fills it with 5
t_random = torch.rand(2, 3)          # creates a uniform random tensor
t_normal = torch.randn(2, 3)         # creates a normal random tensor

print(t_zeros)
print(t_ones)
print(t_fives)
print(t_random)
print(t_normal)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[5., 5., 5.],
        [5., 5., 5.]])
tensor([[0.1837, 0.1376, 0.7396],
        [0.1249, 0.3281, 0.1789]])
tensor([[-0.5177, -0.7123, -0.4389],
        [ 0.1121, -1.3473, -1.0835]])


In [11]:
# torch.clone returns a new tensor object; the copy stays attached to
# the computational graph (see below)
t1 = torch.clone(t)
# sanity check: `t1` is a different Python object than `t`
assert id(t) != id(t1), 'Functional methods create a new copy of the tensor'

# To create a new _independent_ copy, we do need to detach
# from the graph
t1 = torch.clone(t).detach()

#### Using the PyTorch–NumPy bridge

In [12]:
# Create a new array in NumPy; float literals give NumPy's default
# floating-point datatype (np.float64, see the printed dtype below)
a = np.array([1., 2., 3.])

# Convert the array to a torch tensor (the float64 dtype is preserved)
t = torch.tensor(a)

print("NumPy array: {}, type: {}".format(a, a.dtype))
print("Torch tensor: {}, type: {}".format(t, t.dtype))

NumPy array: [1. 2. 3.], type: float64
Torch tensor: tensor([1., 2., 3.], dtype=torch.float64), type: torch.float64


In [13]:
t.numpy()

array([1., 2., 3.])

#### Indexing

In [14]:
t = torch.randn(2, 3)
t[ : , 0]

tensor([0.6231, 0.5008])

In [16]:
t = torch.randn(5, 6)
print(t)
i = torch.tensor([1, 3])
j = torch.tensor([4, 5])
print(t[i])                          # selects rows 1 and 3
print(t[i, j])                       # selects (1, 4) and (3, 5)

tensor([[-0.0279, -0.3199, -0.6748,  0.1461,  0.9728,  1.0245],
        [-1.3091, -0.6430,  0.0109, -1.2471,  0.6354, -0.1463],
        [-1.7598, -2.4160,  0.5159, -1.2517,  0.1154,  0.8764],
        [ 0.9015,  1.5617, -0.6592,  1.2284, -0.3083, -0.1000],
        [-0.2604, -0.6674, -1.4130, -1.4552, -0.1316,  0.2240]])
tensor([[-1.3091, -0.6430,  0.0109, -1.2471,  0.6354, -0.1463],
        [ 0.9015,  1.5617, -0.6592,  1.2284, -0.3083, -0.1000]])
tensor([ 0.6354, -0.1000])


#### Type conversion

In [17]:
# NOTE(review): `t` holds negative, non-integer floats here, so the final
# .byte() cast is lossy (fractions are dropped and negative entries show up
# as values such as 255 in the output below); it only illustrates the API
t = t.float()   # converts to 32-bit float
print(t)
t = t.double()  # converts to 64-bit float
print(t)
t = t.byte()    # converts to unsigned 8-bit integer
print(t)

tensor([[-0.0279, -0.3199, -0.6748,  0.1461,  0.9728,  1.0245],
        [-1.3091, -0.6430,  0.0109, -1.2471,  0.6354, -0.1463],
        [-1.7598, -2.4160,  0.5159, -1.2517,  0.1154,  0.8764],
        [ 0.9015,  1.5617, -0.6592,  1.2284, -0.3083, -0.1000],
        [-0.2604, -0.6674, -1.4130, -1.4552, -0.1316,  0.2240]])
tensor([[-0.0279, -0.3199, -0.6748,  0.1461,  0.9728,  1.0245],
        [-1.3091, -0.6430,  0.0109, -1.2471,  0.6354, -0.1463],
        [-1.7598, -2.4160,  0.5159, -1.2517,  0.1154,  0.8764],
        [ 0.9015,  1.5617, -0.6592,  1.2284, -0.3083, -0.1000],
        [-0.2604, -0.6674, -1.4130, -1.4552, -0.1316,  0.2240]],
       dtype=torch.float64)
tensor([[  0,   0,   0,   0,   0,   1],
        [255,   0,   0, 255,   0,   0],
        [255, 254,   0, 255,   0,   0],
        [  0,   1,   0,   1,   0,   0],
        [  0,   0, 255, 255,   0,   0]], dtype=torch.uint8)


#### Operations on tensors

In [18]:
# Scalars: creates a tensor that holds a single number
# (zero-th order tensor, i.e. just a number)
s = torch.tensor(42)
print(s)

tensor(42)


In [19]:
s.item()

42

In [28]:
# Row vector
x = torch.randn(1,3)
print("Row vector\n{}\nwith size {}".format(x, x.size()), '\n')

# Column vector
v = torch.randn(3,1)
print("Column vector\n{}\nwith size {}".format(v, v.size()), '\n')

# Matrix
A = torch.randn(3, 3)
print("Matrix\n{}\nwith size {}".format(A, A.size()))

Row vector
tensor([[-1.5524,  0.8494,  0.5094]])
with size torch.Size([1, 3]) 

Column vector
tensor([[-0.9034],
        [ 0.8244],
        [ 0.1981]])
with size torch.Size([3, 1]) 

Matrix
tensor([[-0.7321, -0.8348, -0.5578],
        [-0.4026, -0.7836, -0.3000],
        [-2.0847,  1.2997,  0.5293]])
with size torch.Size([3, 3])


In [29]:
u = torch.matmul(A, v)
print(u, '\n')
b = torch.randn(3,1)
y = u + b              # we can also do torch.add(u, b)
print(y)

tensor([[-0.1373],
        [-0.3417],
        [ 3.0597]]) 

tensor([[-1.5258],
        [ 0.1881],
        [ 4.1542]])


In [30]:
s = torch.matmul(x, torch.matmul(A, v))
print(s.item())

1.481540560722351


In [34]:
# common tensor methods (they also have the counterpart in the torch package, e.g. as torch.sum(t))
t = torch.randn(2,3)
t

tensor([[-0.7497,  0.5296,  1.1363],
        [ 0.2596,  0.2451, -1.7299]])

In [35]:
t.sum(dim=0)

tensor([-0.4901,  0.7747, -0.5935])

In [36]:
t.t()                   # transpose

tensor([[-0.7497,  0.2596],
        [ 0.5296,  0.2451],
        [ 1.1363, -1.7299]])

In [37]:
t.numel()               # number of elements in tensor

6

In [38]:
t.nonzero()             # indices of non-zero elements

tensor([[0, 0],
        [0, 1],
        [0, 2],
        [1, 0],
        [1, 1],
        [1, 2]])

In [39]:
t.view(-1, 2)           # reorganizes the tensor to these dimensions

tensor([[-0.7497,  0.5296],
        [ 1.1363,  0.2596],
        [ 0.2451, -1.7299]])

In [40]:
t.squeeze()             # removes size 1 dimensions

tensor([[-0.7497,  0.5296,  1.1363],
        [ 0.2596,  0.2451, -1.7299]])

In [41]:
t.unsqueeze(0)          # inserts a dimension

tensor([[[-0.7497,  0.5296,  1.1363],
         [ 0.2596,  0.2451, -1.7299]]])

In [42]:
# operations in the package
torch.arange(0, 10)     # tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [43]:
torch.eye(3, 3)         # creates a 3x3 matrix with 1s in the diagonal

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [47]:
t = torch.arange(0, 3)
t

tensor([0, 1, 2])

In [48]:
torch.cat((t, t))       # tensor([0, 1, 2, 0, 1, 2])

tensor([0, 1, 2, 0, 1, 2])

In [50]:
torch.stack((t, t))     # tensor([[0, 1, 2],
                        #         [0, 1, 2]])

tensor([[0, 1, 2],
        [0, 1, 2]])

## GPU computation

In [None]:
t_gpu = torch.cuda.FloatTensor(3, 3)   # creation of a GPU tensor
t_gpu.zero_()                          # initialization to zero

In [52]:
# Try to create a random 3x3 tensor directly on the first GPU and fall
# back to None when no usable CUDA device is present.
try:
    t_gpu = torch.randn(3, 3, device="cuda:0")
except (AssertionError, RuntimeError):
    # Only the errors expected from a missing/unusable CUDA backend are
    # caught (presumably AssertionError on CPU-only builds, RuntimeError
    # otherwise); a bare `except:` would also hide KeyboardInterrupt and
    # genuine programming errors.
    print("Torch not compiled with CUDA enabled")
    t_gpu = None

t_gpu

Torch not compiled with CUDA enabled


In [53]:
# we could also state explicitly the device to be the 
# CPU with torch.randn(3,3,device="cpu")
t = torch.randn(3, 3)   
t

tensor([[-0.1597, -1.2119, -0.7052],
        [-0.0348,  1.0689, -0.5973],
        [-0.4529, -0.4263,  0.0266]])

In [None]:
# NOTE: this cell needs a CUDA device (it was left unexecuted here)
t_gpu = t.to("cuda:0")  # copies the tensor from CPU to GPU
# note that if we now call t_gpu.to("cuda:0") it will
# return the same tensor without doing anything else
# as this tensor already resides on the GPU
print(t_gpu)
print(t_gpu.device)

In [55]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [56]:
# moves t to the device (this code will **not** fail if the
# local machine does not have access to a GPU, because `device`
# was chosen above with a CPU fallback)
t.to(device)

tensor([[-0.1597, -1.2119, -0.7052],
        [-0.0348,  1.0689, -0.5973],
        [-0.4529, -0.4263,  0.0266]])

## Neural network foundations

**Static graphs:** the deep learning framework converts the computational graph into a static representation that cannot be modified. This allows the library developers to do very aggressive optimizations on this static graph ahead of computation time, pruning some areas and transforming others so that the final product is highly optimized and fast. The drawback is that some models can be really hard to implement with this approach. For example, TensorFlow uses static graphs. Having static graphs is part of the reason why TensorFlow has excellent support for sequence processing, which makes it very popular in NLP.

**Dynamic graphs:** the framework does not create a graph ahead of computation, but records the operations that are performed, which can be quite different for different inputs. When it is time to compute the gradients, it unrolls the graph and performs the computations. A major benefit of this approach is that implementing complex models can be easier in this paradigm. This flexibility comes at the expense of the major drawback of this approach: speed. Dynamic graphs cannot leverage the same level of ahead-of-time optimization as static graphs, which makes them slower. PyTorch uses dynamic graphs as the underlying paradigm for gradient computation.

### Module

In [57]:
import torch.nn as nn

In [58]:
class MyCustomModule(nn.Module):
    """Feed-forward network with one hidden layer: affine -> ReLU -> affine."""

    def __init__(self, n_inputs, n_hidden, n_output_classes):
        # initialize the nn.Module machinery before registering sub-modules
        super().__init__()
        # input -> hidden affine map
        self.W = nn.Linear(n_inputs, n_hidden)
        # element-wise non-linearity, registered as a layer of its own
        self.f = nn.ReLU()
        # hidden -> output affine map
        self.U = nn.Linear(n_hidden, n_output_classes)

    def forward(self, x):
        """Return the output scores for the mini-batch `x`."""
        hidden = self.W(x)
        activated = self.f(hidden)
        return self.U(activated)

In [59]:
# set the network's architectural parameters
n_inputs = 3
n_hidden= 4
n_output_classes = 2

# instantiate the model
model = MyCustomModule(n_inputs, n_hidden, n_output_classes)

# create a simple input tensor 
# size is [1,3]: a mini-batch of one example, 
# this example having dimension 3
x = torch.FloatTensor([[0.3, 0.8, -0.4]]) 

# compute the model output by **applying** the input to the module
y = model(x)

# inspect the output
print(y)

tensor([[ 0.1543, -0.1493]], grad_fn=<AddmmBackward>)


### Sequential

In [60]:
class MyCustomModule(nn.Module):
    """Same one-hidden-layer network, expressed as a single nn.Sequential."""

    def __init__(self, n_inputs, n_hidden, n_output_classes):
        super().__init__()
        # the layers are applied in the order in which they are listed
        layers = [
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_output_classes),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the sequential stack and return the result."""
        return self.network(x)

In [61]:
class MyCustomModule(nn.Module):
    """Deeper feed-forward classifier with dropout and a softmax output.

    Args:
        n_inputs: size of each input example.
        n_hidden: width of the first hidden layer (the second one is
            twice as wide).
        n_output_classes: number of output classes.
        p_keep: probability of *keeping* a unit in the dropout layer;
            defaults to 0.7, the value that used to be hard-coded.
            nn.Dropout receives ``1 - p_keep``.
    """

    def __init__(self, n_inputs, n_hidden, n_output_classes, p_keep=0.7):
        super(MyCustomModule, self).__init__()
        self.p_keep = p_keep
        # NOTE(review): dropout is applied to the final class scores and
        # the module already ends in a softmax; confirm this is intended
        # before pairing the model with a loss such as nn.CrossEntropyLoss.
        self.network = nn.Sequential(
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 2*n_hidden),
            nn.ReLU(),
            nn.Linear(2*n_hidden, n_output_classes),
            # dropout argument is probability of dropping
            nn.Dropout(1 - self.p_keep),
            # applies softmax in the data dimension
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        """Return the softmax output of the network for the mini-batch `x`."""
        y = self.network(x)
        return y

In [62]:
import torch.nn.functional as F

y = F.relu(torch.FloatTensor([[-5, -1, 0, 5]]))

y

tensor([[0., 0., 0., 5.]])

In [63]:
# the true label (in this case, class index 1, i.e. the second
# output class) from our dataset wrapped
# as a tensor of minibatch size of 1
y_gold = torch.tensor([1])

# our simple classification criterion for this simple example
criterion = nn.CrossEntropyLoss()

# forward pass of our model (remember: call the module itself
# instead of invoking forward directly)
y = model(x)

# apply the criterion to get the loss of the prediction y
# with respect to the real y (y_gold)
loss = criterion(y, y_gold)


# the loss contains a gradient function that we can use to compute
# the gradient dL/dw (gradient with respect to the parameters
# for a given fixed input)
print(loss)

tensor(0.8564, grad_fn=<NllLossBackward>)


### Optimization

First-order dynamics: 
- Search direction only: optim.SGD
- Adaptive: optim.RMSprop, optim.Adagrad, optim.Adadelta

Second-order dynamics:
- Search direction only: Momentum, optim.SGD(momentum=0.9); Nesterov, optim.SGD(momentum=0.9, nesterov=True)
- Adaptive: optim.Adam, optim.Adamax (Adam with L∞)

### Training a simple model

In [65]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import numpy as np
import matplotlib.pyplot as plt
import math

In [66]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [67]:
M = 1200  # total number of sampled points

# sample from the x axis M points, scaled to the range [0, 2*pi)
x = np.random.rand(M) * 2*math.pi

# add noise
# NOTE(review): np.random.rand presumably draws from [0, 1), which makes
# this noise non-negative rather than zero-mean; confirm that is intended
eta = np.random.rand(M) * 0.01

# compute the function
y = np.sin(x) + eta

# plot
_ = plt.scatter(x,y)

In [68]:
# use the NumPy-PyTorch bridge
x_train = torch.tensor(x[0:1000]).float().view(-1, 1).to(device)
y_train = torch.tensor(y[0:1000]).float().view(-1, 1).to(device)

x_test = torch.tensor(x[1000:]).float().view(-1, 1).to(device)
y_test = torch.tensor(y[1000:]).float().view(-1, 1).to(device)

In [69]:
class SineDataset(data.Dataset):
    """Dataset of paired (x, y) samples, indexed along the first axis."""

    def __init__(self, x, y):
        super().__init__()
        # inputs and targets must contain the same number of samples
        n_inputs, n_targets = x.shape[0], y.shape[0]
        assert n_inputs == n_targets
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, read from the first axis of the targets."""
        return self.y.shape[0]

    def __getitem__(self, index):
        """Return the (input, target) pair stored at `index`."""
        sample_x = self.x[index]
        sample_y = self.y[index]
        return sample_x, sample_y

# wrap the train / test tensors in datasets
sine_dataset = SineDataset(x_train, y_train)
sine_dataset_test = SineDataset(x_test, y_test)

# mini-batch loaders of 32 samples: only the training loader shuffles
sine_loader = data.DataLoader(sine_dataset, shuffle=True, batch_size=32)
sine_loader_test = data.DataLoader(sine_dataset_test, batch_size=32)

In [70]:
class SineModel(nn.Module):
    """Small fully connected regressor: 1 -> 5 -> 5 -> 5 -> 1 with ReLUs in between."""

    def __init__(self):
        super().__init__()
        # three hidden layers of width 5, each followed by a ReLU,
        # and a final linear read-out to a single value
        layers = [
            nn.Linear(1, 5), nn.ReLU(),
            nn.Linear(5, 5), nn.ReLU(),
            nn.Linear(5, 5), nn.ReLU(),
            nn.Linear(5, 1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network's prediction for the mini-batch `x`."""
        prediction = self.network(x)
        return prediction

In [71]:
# declare the model
model = SineModel().to(device)

# define the criterion
criterion = nn.MSELoss()

# select the optimizer and pass to it the parameters of the model it will optimize
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

epochs = 1000
plot_every = 20  # overlay the current predictions once every `plot_every` epochs

# training loop
for epoch in range(epochs):
    for i, (x_i, y_i) in enumerate(sine_loader):

        y_hat_i = model(x_i)            # forward pass

        loss = criterion(y_hat_i, y_i)  # compute the loss

        optimizer.zero_grad()           # cleans the gradients
        loss.backward()                 # computes the gradients
        optimizer.step()                # update the parameters

    # `epoch % plot_every` alone is truthy on every epoch that is NOT a
    # multiple of plot_every, which plotted 19 out of 20 epochs; compare
    # against zero to plot the last mini-batch only on every 20th epoch.
    if epoch % plot_every == 0:
        # detach() drops the autograd history before the NumPy conversion
        plt.scatter(x_i.detach().cpu().numpy(), y_hat_i.detach().cpu().numpy())

In [72]:
# testing: add up the loss of every mini-batch of the test loader
model.eval()                  # switch the model to evaluation mode
total_loss = 0.
with torch.no_grad():         # gradients are not needed while testing
    for k, (x_k, y_k) in enumerate(sine_loader_test):
        y_hat_k = model(x_k)
        loss_test = criterion(y_hat_k, y_k)
        total_loss += loss_test.item()

print(total_loss)

0.0038324657070916146


## Reproducibility

In [73]:
def enforce_reproducibility(seed=42):
    """Seed the random number generators used here and make cuDNN deterministic.

    Seeds Python's `random` module, NumPy's global generator and PyTorch,
    and configures cuDNN for deterministic behaviour. Call it before any
    random operation whose result should be repeatable.

    Args:
        seed: integer seed shared by all generators (default 42).
    """
    # local import so the function is self-contained
    import random

    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    #
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    np.random.seed(seed)
    # Python's own generator was previously left unseeded
    random.seed(seed)

enforce_reproducibility()