<a href="https://colab.research.google.com/github/4deepprk/4deepprk.github.io/blob/master/DL_107_PytorchIntro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Outline
* PyTorch
* What are tensors
* Initialising, slicing, reshaping tensors
* Numpy and PyTorch interfacing
* GPU support for PyTorch + Enabling GPUs on Google Colab
* Speed comparisons, Numpy -- PyTorch -- PyTorch on GPU
* Autodiff concepts and application
* Writing a basic learning loop using autograd
* Exercises

In [0]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Initialise tensors

In [2]:
x = torch.ones(3, 2)
print(x)
x = torch.zeros(3, 2)
print(x)
x = torch.rand(3, 2)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[0.2786, 0.3343],
        [0.3028, 0.1829],
        [0.0640, 0.3349]])


In [3]:
x = torch.empty(3, 2)
print(x)
y = torch.zeros_like(x)
print(y)

tensor([[2.4205e-35, 0.0000e+00],
        [3.3631e-44, 0.0000e+00],
        [       nan, 0.0000e+00]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [4]:
x = torch.linspace(0, 1, steps=5)
print(x)

tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])


In [5]:
x = torch.tensor([[1, 2], 
                 [3, 4], 
                 [5, 6]])
print(x)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


## Slicing tensors

In [6]:
print(x.size())
print(x[:, 1]) 
print(x[0, :]) 

torch.Size([3, 2])
tensor([2, 4, 6])
tensor([1, 2])


In [7]:
y = x[1, 1]
print(y)
print(y.item())

tensor(4)
4


## Reshaping tensors

In [8]:
print(x)
y = x.view(2, 3)
print(y)

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [9]:
y = x.view(6,-1) 
print(y)

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])


## Simple Tensor Operations

In [10]:
x = torch.ones([3, 2])
y = torch.ones([3, 2])
z = x + y
print(z)
z = x - y
print(z)
z = x * y
print(z)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [11]:
z = y.add(x)
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [12]:
z = y.add_(x)
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])


## Numpy <> PyTorch (Bridge or interface of numpy and pytorch)

In [13]:
x_np = x.numpy() # converting torch tensor to numpy nd array
print(type(x), type(x_np))
print(x_np)

<class 'torch.Tensor'> <class 'numpy.ndarray'>
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [14]:
a = np.random.randn(5)
print(a)
a_pt = torch.from_numpy(a) # converting numpy nd array to torch tensor
print(type(a), type(a_pt))
print(a_pt)

[-0.59056676 -0.21885342 -2.13425143  0.78068824  1.25396419]
<class 'numpy.ndarray'> <class 'torch.Tensor'>
tensor([-0.5906, -0.2189, -2.1343,  0.7807,  1.2540], dtype=torch.float64)


In [15]:
 np.add(a, 1, out=a) # pointwise addition of 1 to all elements of a
''' Both a and a_pt are updated. This means they are not a copy but refers the
same numerical store and operates at the corresponding data structure

i.e., memory_1 <- numpy operation (data structure)
and   memory_1 <- pytorch operation (data structure)'''

print(a)
print(a_pt)

[ 0.40943324  0.78114658 -1.13425143  1.78068824  2.25396419]
tensor([ 0.4094,  0.7811, -1.1343,  1.7807,  2.2540], dtype=torch.float64)


In [16]:
%%time
for i in range(100):
  a = np.random.randn(100,100)
  b = np.random.randn(100,100)
  c = np.matmul(a, b)

CPU times: user 151 ms, sys: 100 ms, total: 251 ms
Wall time: 133 ms


In [17]:
%%time
for i in range(100):
  a = torch.randn([100, 100])
  b = torch.randn([100, 100])
  c = torch.matmul(a, b)

CPU times: user 55.8 ms, sys: 77.9 ms, total: 134 ms
Wall time: 107 ms


In [18]:
%%time
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  c = a + b

CPU times: user 1min 27s, sys: 736 ms, total: 1min 27s
Wall time: 1min 27s


In [19]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000])
  b = torch.randn([10000, 10000])
  c = a + b

CPU times: user 25.6 s, sys: 7.94 ms, total: 25.6 s
Wall time: 25.6 s


## CUDA support

In [20]:
# to identify whether there are any GPU available in the system
# # CUDA supported devices
# CUDA - language extension by NVIDIA to support programming GPU's directly
print(torch.cuda.device_count())

1


In [21]:
print(torch.cuda.device(0)) # reference to object within torch
print(torch.cuda.get_device_name(0)) # kind of device

<torch.cuda.device object at 0x7fea7f2bf2b0>
Tesla P100-PCIE-16GB


In [0]:
# GPU device "Tesla P100-PCIE-16GB" at location 0
cuda0 = torch.device('cuda:0') # reference variable of the GPU device

In [23]:
a = torch.ones(3, 2, device=cuda0) # tensor created on GPU
b = torch.ones(3, 2, device=cuda0) # tensor created on GPU
c = a + b
print(c)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], device='cuda:0')


In [24]:
print(a)
print(b)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], device='cuda:0')
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], device='cuda:0')


In [25]:
''' NumPy operation on CPU '''
%%time
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  np.add(b, a)

CPU times: user 1min 28s, sys: 345 ms, total: 1min 28s
Wall time: 1min 28s


In [26]:
''' PyTorch operation on CPU '''
%%time
for i in range(10):
  a_cpu = torch.randn([10000, 10000])
  b_cpu = torch.randn([10000, 10000])
  b_cpu.add_(a_cpu)

CPU times: user 25.6 s, sys: 16.9 ms, total: 25.6 s
Wall time: 25.6 s


In [27]:
''' PyTorch operation on GPU '''
%%time
for i in range(10):
  a = torch.randn([10000, 10000], device=cuda0)
  b = torch.randn([10000, 10000], device=cuda0)
  b.add_(a)

CPU times: user 1.82 ms, sys: 1.99 ms, total: 3.81 ms
Wall time: 13.3 ms


In [28]:
%%time
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  np.matmul(b, a)

CPU times: user 20min 32s, sys: 5.78 s, total: 20min 38s
Wall time: 11min 15s


In [29]:
%%time
for i in range(10):
  a_cpu = torch.randn([10000, 10000])
  b_cpu = torch.randn([10000, 10000])
  torch.matmul(a_cpu, b_cpu)

CPU times: user 5min 1s, sys: 236 ms, total: 5min 1s
Wall time: 5min 1s


In [30]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000], device=cuda0)
  b = torch.randn([10000, 10000], device=cuda0)
  torch.matmul(a, b)

CPU times: user 4.14 ms, sys: 2.99 ms, total: 7.13 ms
Wall time: 7.56 ms


## Autodiff (Tensor: 1. On data structure level it is about storing the multidimensional matrices, 2. On structural level it relates different tensors with each other)

In [38]:
# requires_grad = True tells pytorch that a particular is going to be 
# differentiable based on some function
# i.e., if x is a tensor and set as differentiable then y is a tensor or some function
# which differentiates with respect to x (this is because different tensors relate with
# each other. Say in the below case we can see that y = x + 5 is a simple linear relation).
x = torch.ones([3, 2], requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [39]:
# The power of Tensor can be seen just by the ability of modeling the relations
# between different high dimensional matrices (tensors) is what makes tensors to
# stand apart. 

y = x + 5
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [40]:
z = y*y + 1
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [41]:
t = torch.sum(z)
print(t)

tensor(222., grad_fn=<SumBackward0>)


In [0]:
t.backward() # backward propogation starting from t

In [43]:
print(x.grad) # differentiation of t with respect to x (.grad is just call for derivative operation)

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


$t = \sum_i z_i, z_i = y_i^2 + 1, y_i = x_i + 5$

$\frac{\partial t}{\partial x_i} = \frac{\partial z_i}{\partial x_i} = \frac{\partial z_i}{\partial y_i} \frac{\partial y_i}{\partial x_i} = 2y_i \times 1$


At x = 1, y = 6, $\frac{\partial t}{\partial x_i} = 12$

In [44]:
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
print(r)
s = torch.sum(r)
s.backward()
print(x.grad)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


In [45]:
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
a = torch.ones([3, 2])
r.backward(a)
print(x.grad)

tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


$\frac{\partial{s}}{\partial{x}} = \frac{\partial{s}}{\partial{r}} \cdot \frac{\partial{r}}{\partial{x}}$

For the above code $a$ represents $\frac{\partial{s}}{\partial{r}}$ and then $x.grad$ gives directly $\frac{\partial{s}}{\partial{x}}$



## Autodiff example that looks like what we have been doing

In [58]:
# True function
x = torch.randn([20, 1], requires_grad=True) # 20 data items or rows which are the input feature 
# We know the real values of w as +3 and b as -2
y = 3*x - 2 # ground truth model which is the corresponding true output of x
print(x)
print(y)

tensor([[-0.7942],
        [-2.8174],
        [ 1.3101],
        [-0.9173],
        [ 0.3928],
        [-0.3163],
        [-0.4522],
        [-0.5039],
        [ 1.5010],
        [-1.0879],
        [-0.2494],
        [-0.0527],
        [-0.7158],
        [ 0.5733],
        [-1.7799],
        [-0.4007],
        [-1.4680],
        [ 1.2438],
        [ 1.1957],
        [-1.2975]], requires_grad=True)
tensor([[ -4.3825],
        [-10.4523],
        [  1.9302],
        [ -4.7519],
        [ -0.8215],
        [ -2.9490],
        [ -3.3567],
        [ -3.5117],
        [  2.5031],
        [ -5.2636],
        [ -2.7483],
        [ -2.1580],
        [ -4.1475],
        [ -0.2801],
        [ -7.3396],
        [ -3.2021],
        [ -6.4040],
        [  1.7314],
        [  1.5871],
        [ -5.8926]], grad_fn=<SubBackward0>)


In [59]:
# Hypothesis function (Forward pass)
# Initially we estimate the values of w and b as 1 and 1
w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

y_hat = w*x + b # hypothesis function for prediction

loss = torch.sum((y_hat - y)**2) # square error loss

print(y_hat)
print(loss)

tensor([[ 0.2058],
        [-1.8174],
        [ 2.3101],
        [ 0.0827],
        [ 1.3928],
        [ 0.6837],
        [ 0.5478],
        [ 0.4961],
        [ 2.5010],
        [-0.0879],
        [ 0.7506],
        [ 0.9473],
        [ 0.2842],
        [ 1.5733],
        [-0.7799],
        [ 0.5993],
        [-0.4680],
        [ 2.2438],
        [ 2.1957],
        [-0.2975]], grad_fn=<AddBackward0>)
tensor(364.9410, grad_fn=<SumBackward0>)


In [60]:
print(loss)

tensor(364.9410, grad_fn=<SumBackward0>)


In [0]:
# Backward pass
loss.backward()

In [57]:
# w.grad is the derivative of loss w.r.t to w
# b.grad is the derivative of loss w.r.t to b
print(w.grad, b.grad)

tensor([-114.4924]) tensor([123.6175])


## Do it in a loop

In [0]:
# learning algorithm to find out the right values of w and b

learning_rate = 0.01

w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

print(w.item(), b.item())

for i in range(10): # think this as epochs
  
  x = torch.randn([20, 1])
  y = 3*x - 2
  
  y_hat = w*x + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  with torch.no_grad(): # to prevent further computation graph (forward pass) build up (no more backpropogation)
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_()
    b.grad.zero_()

  print(w.item(), b.item())
  

1.0 1.0
1.694516658782959 -0.32816600799560547
2.5244972705841064 -0.9011859893798828
2.6990771293640137 -1.3381099700927734
2.7810328006744385 -1.5905817747116089
2.821857213973999 -1.7378290891647339
2.943121910095215 -1.868725061416626
2.9525837898254395 -1.9191371202468872
2.9741718769073486 -1.9551563262939453
2.9911296367645264 -1.972025752067566
2.994936943054199 -1.9838125705718994


## Do it for a large problem

In [64]:
%%time
# PyTorch operation on CPU
learning_rate = 0.001
N = 1000000
epochs = 200

w = torch.rand([N], requires_grad=True)
b = torch.ones([1], requires_grad=True)

#print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
  x = torch.randn([N])
  y = torch.dot(3*torch.ones([N]), x) - 2
  
  y_hat = torch.dot(w, x) + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_()
    b.grad.zero_()

#  print(torch.mean(w).item(), b.item())
print(x)  

tensor([ 1.8770,  0.9379, -0.7311,  ..., -0.2719,  0.3658, -0.7566])
CPU times: user 3.26 s, sys: 48.9 ms, total: 3.31 s
Wall time: 3.32 s


In [78]:
%%time
# PyTorch operation on GPU
learning_rate = 0.001
N = 444444444
epochs = 200

w = torch.rand([N], requires_grad=True, device=cuda0)
b = torch.ones([1], requires_grad=True, device=cuda0)

# print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
  x = torch.randn([N], device=cuda0)
  y = torch.dot(3*torch.ones([N], device=cuda0), x) - 2
  
  y_hat = torch.dot(w, x) + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_()
    b.grad.zero_()

  #print(torch.mean(w).item(), b.item())

CPU times: user 7.88 s, sys: 5.38 s, total: 13.3 s
Wall time: 13.3 s


In [0]:
# Speed comparison: NumPy < PyTorch < PyTorch on GPU