# Deep Learning With PyTorch

Using:
* [Deep Learning With PyTorch - Full Course by Patrick Loeber](https://www.youtube.com/watch?v=c36lUUr864M&t=5s)

### Imports

In [2]:
import torch
import torch.nn as nn
import numpy as np

In [5]:
import timeit

In [12]:
# For NN Visualization
import torch
from torchviz import make_dot

# Chapter

## Tensors

### Basic tensor operations

In [11]:
# An empty tensor (1d-tensor) equals a scalar value.
examples = [
    torch.empty(1),
    torch.empty(4),
    torch.empty(2, 2),
    torch.ones(1, 2, 3, dtype=torch.float16),
    torch.zeros(3, 5),
]
for x in examples:
    print(x)

tensor([1.3853e-36])
tensor([1.3844e-36, 0.0000e+00, 1.3843e-36, 0.0000e+00])
tensor([[-1.0444e-24,  4.5560e-41],
        [ 1.3852e-36,  0.0000e+00]])
tensor([[[1., 1., 1.],
         [1., 1., 1.]]], dtype=torch.float16)
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


In [24]:
samples = [(torch.rand(2,2), torch.rand(2,2)) for _ in range(1000)]

In [25]:
%timeit for x, y in samples: x + y

1.26 ms ± 17.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [26]:
%timeit for x, y in samples: torch.add(x, y)

1.31 ms ± 7.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Functions with a tailind underscore can be used to do inplace operations. E.g. `x.add_(y)` as `x += y` equivalent.

In [27]:
%timeit for x, y in samples: x.add_(y)

823 µs ± 205 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Summary of basic tensor functions:
- `torch.rand()`
- `torch.add()`
- `torch.sub()`
- `torch.mul()`

### Resizing tensors

In [29]:
x = torch.rand(4,4)
print(x)

tensor([[0.5717, 0.7741, 0.0849, 0.7694],
        [0.7536, 0.3507, 0.3607, 0.7189],
        [0.0287, 0.7415, 0.7122, 0.3119],
        [0.7741, 0.5915, 0.1425, 0.4543]])


In [35]:
# Concatenate all rows to one 1D-tensor.
y0 = x.view(16)
print(y0)
# This is the same as using `flatten()`
y1 = x.flatten()
print(y1)

tensor([0.5717, 0.7741, 0.0849, 0.7694, 0.7536, 0.3507, 0.3607, 0.7189, 0.0287,
        0.7415, 0.7122, 0.3119, 0.7741, 0.5915, 0.1425, 0.4543])
tensor([0.5717, 0.7741, 0.0849, 0.7694, 0.7536, 0.3507, 0.3607, 0.7189, 0.0287,
        0.7415, 0.7122, 0.3119, 0.7741, 0.5915, 0.1425, 0.4543])


In [37]:
# One can use the dummy value '-1' to let pytorch determine the second dimension.
y = x.view(-1, 8)
print(y)
print(y.size())

tensor([[0.5717, 0.7741, 0.0849, 0.7694, 0.7536, 0.3507, 0.3607, 0.7189],
        [0.0287, 0.7415, 0.7122, 0.3119, 0.7741, 0.5915, 0.1425, 0.4543]])
torch.Size([2, 8])


### Tensor conversions

Note that if the tenor is stored on the CPU, this will be a deep copy! Thus it is a reference and not a true copy.

In [44]:
a = torch.ones(5)
print(a)
print(type(a))
b = a.numpy()
print(b)
print(type(b))

print()
a.add_(1)
print(a)
print(b)

tensor([1., 1., 1., 1., 1.])
<class 'torch.Tensor'>
[1. 1. 1. 1. 1.]
<class 'numpy.ndarray'>

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [42]:
a = np.ones(5)
print(a)
print(type(a))
b = torch.from_numpy(a)
print(b)
print(type(b))

print()
a += 1
print(a)
print(b)

[1. 1. 1. 1. 1.]
<class 'numpy.ndarray'>
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
<class 'torch.Tensor'>

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


### Using a GPU

In [49]:
if torch.cuda.is_available():
    print(True)
    # Initialize the GPU as cuda device.
    device = torch.device("cuda")
    
    # Instantiate a variable on the GPU.
    x = torch.ones(5, device=device)
    
    # Instantiate a variable on the CPU and move it to be stored on the GPU.
    # Note that numpy can only handle CPU tensors!
    y = torch.ones(5)
    y = y.to(device)
    y.add_(x)
    y.to("cpu")
else:
    print(False)

False


## Gradients with Autograd

If needed later, one can specify that a gradient will need to be computed for a specific variable:

In [64]:
x = torch.randn(5, requires_grad=True)
x

tensor([ 0.4658, -0.9957, -0.1948, -1.4954,  0.1331], requires_grad=True)

In [65]:
# Note that Pytorch tracs the required operations for the Backpropagation.
y = x+2
y

tensor([2.4658, 1.0043, 1.8052, 0.5046, 2.1331], grad_fn=<AddBackward0>)

In [66]:
z = y*y*2
z

tensor([12.1606,  2.0173,  6.5178,  0.5092,  9.1001], grad_fn=<MulBackward0>)

In [67]:
z_scalar = z.mean()
z_scalar

tensor(6.0610, grad_fn=<MeanBackward0>)

In [68]:
print(x.grad)
# Trigger the backward pass by calling
z_scalar.backward() # dz/dx
# Now the gradient has been computed and can be used.
print(x.grad)

None
tensor([1.9727, 0.8034, 1.4442, 0.4036, 1.7065])


In [85]:
x = torch.randn(3, requires_grad=True)
z = x.add(2)
z.mul_(3)

# Now the output is not a scalar value but a vector. To trigger the backwards pass, another vector is needed.
v = torch.tensor([.1, 1., .001], dtype=torch.float32)
assert x.size() == v.size(), "The vector must have the same dimension as the input dimension!"

z.backward(v) # dz/dx
print(x.grad)

tensor([0.3000, 3.0000, 0.0030])


There are several options to turn of the `requires_grad`-flag:
* `x.requires_grad_(False)`
* `x.detach()`
* `with torch.no_grad()`

In [90]:
x = torch.randn(3, requires_grad=True)
print(x)
x.requires_grad_(False)
print(x)

tensor([0.3268, 0.9561, 1.7758], requires_grad=True)
tensor([0.3268, 0.9561, 1.7758])


In [89]:
x = torch.randn(3, requires_grad=True)
print(x)
y = x.detach()
print(x)
print(y)

tensor([-2.3683,  0.9145,  0.1720], requires_grad=True)
tensor([-2.3683,  0.9145,  0.1720], requires_grad=True)
tensor([-2.3683,  0.9145,  0.1720])


In [96]:
x = torch.randn(3, requires_grad=True)
print(x)
with torch.no_grad():
    y = x.add(2)
    print(f"torch.no_grad - x\t{x}")
    print(f"torch.no_grad - y\t{y}")

y = x.add(2)
print(f"x\t\t\t{x}")
print(f"y\t\t\t{y}")

tensor([ 2.0285,  0.6048, -2.3660], requires_grad=True)
torch.no_grad - x	tensor([ 2.0285,  0.6048, -2.3660], requires_grad=True)
torch.no_grad - y	tensor([ 4.0285,  2.6048, -0.3660])
x			tensor([ 2.0285,  0.6048, -2.3660], requires_grad=True)
y			tensor([ 4.0285,  2.6048, -0.3660], grad_fn=<AddBackward0>)


## Tiny NN example

### Gradients by hand

In [22]:
# f = w * x

# f = 2 * x
X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([5, 6, 7, 8], dtype=np.float32)

w = .0

# Model prediction.
def forward(x):
    return w * x

# Loss function: MSE
def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

# Gradient.
# MSE = 1/N * (w*x - y)**2
# dJ/dw = 1/N 2x (x*w - y)
def gradient(x, y, y_predicted):
    return np.dot(2*x, y_predicted - y).mean()

print(f"Prediction before training: f(5) = {forward(5):.3f}")

# Training
learning_rate = 0.01
nr_epochs = 20

for epoch in range(nr_epochs):
    # Prediction from the forward pass.
    y_pred = forward(X)
    
    # Loss computation.
    l = loss(Y, y_pred)
    
    # Gradient computation.
    dw = gradient(X, Y, y_pred)
    
    # Update the weights.
    w -= learning_rate * dw
    
    if epoch % (nr_epochs//10) == 0:
        print(f"Epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}")

        
print(f"Prediction after training: f(5) = {forward(5):.3f}")

Prediction before training: f(5) = 0.000
Epoch 1: w = 1.400, loss = 43.50000000
Epoch 3: w = 2.184, loss = 3.71199965
Epoch 5: w = 2.309, loss = 2.69342732
Epoch 7: w = 2.330, loss = 2.66735172
Epoch 9: w = 2.333, loss = 2.66668415
Epoch 11: w = 2.333, loss = 2.66666722
Epoch 13: w = 2.333, loss = 2.66666675
Epoch 15: w = 2.333, loss = 2.66666675
Epoch 17: w = 2.333, loss = 2.66666651
Epoch 19: w = 2.333, loss = 2.66666651
Prediction after training: f(5) = 11.667


### Gradients by Autograd

In [24]:
# f = w * x

# f = 2 * x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([5, 6, 7, 8], dtype=torch.float32)

w = torch.tensor(.0, dtype=torch.float32, requires_grad=True)

# Model prediction.
def forward(x):
    return w * x

# Loss function: MSE
def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

print(f"Prediction before training: f(5) = {forward(5):.3f}")

# Training
learning_rate = 0.01
nr_epochs = 100

for epoch in range(nr_epochs):
    # Prediction from the forward pass.
    y_pred = forward(X)
    
    # Loss computation.
    l = loss(Y, y_pred)
    
    # Gradient computation. Backward pass.
    l.backward() # dl/dw
    
    # Update the weights.
    with torch.no_grad():
        w -= learning_rate * w.grad
        
    # Reset the gradients.
    w.grad.zero_()
    
    if epoch % (nr_epochs // 10) == 0:
        print(f"Epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}")

        
print(f"Prediction after training: f(5) = {forward(5):.3f}")

Prediction before training: f(5) = 0.000
Epoch 1: w = 0.350, loss = 43.50000000
Epoch 11: w = 1.943, loss = 4.24934816
Epoch 21: w = 2.256, loss = 2.72801042
Epoch 31: w = 2.318, loss = 2.66904426
Epoch 41: w = 2.330, loss = 2.66675878
Epoch 51: w = 2.333, loss = 2.66667008
Epoch 61: w = 2.333, loss = 2.66666675
Epoch 71: w = 2.333, loss = 2.66666675
Epoch 81: w = 2.333, loss = 2.66666651
Epoch 91: w = 2.333, loss = 2.66666651
Prediction after training: f(5) = 11.667


### Using Torch.NN

In [27]:
# 1) Design the model (input, output size, forward pass)
# 2) Construct the loss and optimizer
# 3) Training loop:
# - forward pass: compute prediction
# - backward pass: gradients
# - update weights

In [31]:
# y = w * x   ==>   w = 2
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[5], [6], [7], [8]], dtype=torch.float32)
n_samples, n_features = X.shape


input_size = n_features
output_size = n_features

model = nn.Linear(input_size, output_size)

print(f"Prediction before training: f(5) = {model(torch.tensor(5)):.3f}")

# Training
learning_rate = 0.01
nr_epochs = 100

loss = nn.MSELoss()
optimizer = torch.optim.SGD([w], lr=learning_rate)

for epoch in range(nr_epochs):
    # Prediction from the forward pass.
    y_pred = forward(X)
    
    # Loss computation.
    l = loss(Y, y_pred)
    
    # Gradient computation. Backward pass.
    l.backward() # dl/dw
    
    # Update the weights.
    optimizer.step()
        
    # Reset the gradients.
    optimizer.grad_zero_()
    
    if epoch % (nr_epochs // 10) == 0:
        print(f"Epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}")

        
print(f"Prediction after training: f(5) = {forward(5):.3f}")

RuntimeError: both arguments to matmul need to be at least 1D, but they are 0D and 2D

# NEXT CHAPTER ...

In [3]:
model_graph = draw_graph(resnet18(), input_size=(1,3,224,224), expand_nested=True)
model_graph.visual_graph

In [None]:
# CONTINUE WITH: https://youtu.be/c36lUUr864M?si=Ed3ezOjp2TDZD6Ev&t=4899
