## Starting Pytorch

#### Tensor Basics

1. Tensors
    - Everything in PyTorch is based on Tensor operations. A Tensor is a multi dimensional matrix containing elements of a single data type:

In [4]:
import torch

#torch.empty(size): uninitiallized

x = torch.empty(1) #scalar

print("empty(1):", x)

torch.empty(3) #vector

print("empty(3):",x)

x = torch.empty(2, 3)# matrix

print("empty(2,3):",x)

x = torch.empty(2, 2, 3) # tensor, 3 dimensions

#x torch.empty(2,2,2,3) tensor, 4 dimensions

print("empty(2, 2, 3):",x)

#torch.rand(size): random numbers [0, 1]

x = torch.rand(5, 3)

print("rand(5,3):", x)

#torch.zeros(size), fill with 0

# torch.ones(size), fill with 1

x = torch.zeros(5, 3)

print("zeros(5,3):", x)

empty(1): tensor([0.])
empty(3): tensor([0.])
empty(2,3): tensor([[0., 0., 0.],
        [0., 0., 0.]])
empty(2, 2, 3): tensor([[[0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.]]])
rand(5,3): tensor([[0.6801, 0.2567, 0.8993],
        [0.3126, 0.4414, 0.5780],
        [0.3521, 0.3500, 0.0722],
        [0.8922, 0.7260, 0.8374],
        [0.1239, 0.2635, 0.8099]])
zeros(5,3): tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [5]:
print("size", x.size()) #x.size(0)
print("shape", x.shape)   # x.shape[0]

size torch.Size([5, 3])
shape torch.Size([5, 3])


In [7]:
#Check dtypes

print(x.dtype)

#Specify types, float32 default
x = torch.zeros(5, 3, dtype=torch.float16)
print(x)

#check type
print(x.dtype)



torch.float32
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], dtype=torch.float16)
torch.float16


*Construct Tensor from Data or array*

In [10]:
x = torch.tensor([5.5,3])
print(x, x.dtype)

tensor([5.5000, 3.0000]) torch.float32


*Intro to require_grad argument*

In [11]:
#require_grad argument
# This will tell pytorch that it will need to calculate the gradients for this tensor
# later in your optimization steps
#i.e. this is a variable in your model that you want to optimize

x = torch.tensor([5.5,3], requires_grad=True)
print(x)

tensor([5.5000, 3.0000], requires_grad=True)


*Now if i perform any operation on my require_grad variable pytorch will keep track of it*

In [12]:
# Operations
x = torch.ones(2,2)
y = torch.rand(2,2)

# elementwise addition
z = x + y
# or
#torch.add(x, y)

# in place additon like it will modify the variable 
# y.add_(x)

print(x)
print(y)
print(z)

tensor([[1., 1.],
        [1., 1.]])
tensor([[0.6410, 0.9498],
        [0.7711, 0.8171]])
tensor([[1.6410, 1.9498],
        [1.7711, 1.8171]])


In [13]:
# Substraction
z = x - y
z = torch.sub(x,y)

# multiplication
z = x * y
z = torch.mul(x, y)

# Division
z = x / y
z = torch.div(x, y)



*Slicing is also done with tensors like as normal*

In [16]:
x = torch.rand(5, 3)

print(x)
print("x[:, 0]", x[:, 0]) #all rows, column 0

# get the actual value if only 1 element in tensor
print(x[0,0].item())

tensor([[0.4969, 0.4550, 0.2381],
        [0.2046, 0.0186, 0.2181],
        [0.5149, 0.6615, 0.0525],
        [0.7870, 0.0234, 0.7761],
        [0.0280, 0.0648, 0.4552]])
x[:, 0] tensor([0.4969, 0.2046, 0.5149, 0.7870, 0.0280])
0.49685782194137573


*Reshape the tensor using view*

In [18]:
# reshape with torch.view()
x = torch.randn(4,4)
y = x.view(16)
z = x.view(-1, 8)  # the size -1 means it will automatically determine the necessary size
print(x.size(), y.size(), z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


**Numpy**

In [19]:
# Converting tensor into numpy array and vice versa
a = torch.ones(5)
print(a)

#torch to numpy with .numpy()
b = a.numpy()
print(b)
print(type(b))

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]
<class 'numpy.ndarray'>


In [20]:
# Careful if the Tensor is in CPU(not the GPU)
# both objects will share the same memory location, so changing one
# will also change the other
a.add_(1)
print(a)
print(b)

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [21]:
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
c = torch.tensor(a)
print(a)
print(b)
print(c)

#again be careful when modifying
a += 1
print(a)
print(b)
print(c)

[1. 1. 1. 1. 1.]
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)


#### GPU Support

*By default all tensors are created on the CPU. But we can also move them to the GPU(if its available), or create them directly on the GPU.*

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x = torch.rand(2,2).to(device)   # move tensors to GPU device
#x = x.to('cpu')
# x = x.to('cuda')

x = torch.rand(2,2, device = device) # or directly create them on GPU


#### AutoGrad

*This package provides automatic differentiation for all operations on Tensors.*

In [23]:
x = torch.randn(3, requires_grad=True)
y = x + 2

# y was create a result of an operation so it has grad_fn attribute.
# grad_fn: reference a function that has created the Tensor
print(x)
print(y)
print(y.grad_fn)

tensor([-0.0588,  1.7982,  1.0857], requires_grad=True)
tensor([1.9412, 3.7982, 3.0857], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x0000019D8775E2C0>


In [24]:
# Do more operations on y
z = y*y*3
print(z)
z = z.mean()
print(z)

tensor([11.3047, 43.2798, 28.5641], grad_fn=<MulBackward0>)
tensor(27.7162, grad_fn=<MeanBackward0>)


In [25]:
#Lets compute Gradient with backpropogation
# when we finish our computation we can call .backward() and have all the gradients computed
# the gradient for this tensor will be acculmulated into .grad attribute
# It is the partial derivative of the function w.r.t. the tensor

z.backward()
print(x.grad) # dz/dx

# !! Careful .backward() accumulated the gradient in grad attribute for this tensor
# we need to be careful during optimization to clear it, before reusing by optimizer.zero_grad()

tensor([3.8824, 7.5965, 6.1713])


#### Stop a Tensor from tracking history

For example during the training loop when we want to update our weights or after training when evaluation. these operations should not be part of gradient computation. to prevent this we can use:

    - x.requires_grad_(False)
    - x.detach()
    - wrap in with torch.no_grad(:)

In [26]:
# .requires_grad_(...) changes the existing flag in-place.
a = torch.randn(2,2)
b = (a * a).sum()
print(a.requires_grad)
print(b.grad_fn)

a.requires_grad_(True)
b = (a * a).sum()
print(a.requires_grad)
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x0000019D87B39B40>


In [27]:
# .detach(): get a new Tensor with the same content but no gradient computation:
a = torch.randn(2,2, requires_grad=True)
b = a.detach()
print(a.requires_grad)
print(b.requires_grad)

True
False


In [29]:
# wrap in 'with torch.no_grad():'
a = torch.randn(2,2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    b = a**2
    print(b.requires_grad)

True
False


#### Gradient Descent Autograd

Linear Regression example:

*f(x)* = *w* * x + b

    f(x) = 2 * x

In [35]:
# linear Regression
# f = w * x + b 
# here  : f = 2 * x 

X = torch.tensor([1,2,3,4,5,6,7,8], dtype=torch.float32)
y = torch.tensor([2,4,6,8,10,12,14,16], dtype=torch.float32)

w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model output
def forward(X):
    return w * X

# loss = mse
def loss(y, y_pred):
    return ((y-y_pred)**2).mean()
X_test = 5.0
y_test = 10.0

print(f'Prediction before Training: f({X_test}) = {forward(X_test).item():.3f}')

Prediction before Training: f(5.0) = 0.000


*Training the Model*

In [36]:
learning_rate = 0.01
epochs = 100

for epoch in range(epochs):
    
    #predict
    y_pred = forward(X)
    
    # loss
    l = loss(y, y_pred)
    
    # calculate gradient  = backward pass
    l.backward()
    
    #update weights
    # w.data = w.data - learning_rate * w.grad
    with torch.no_grad():
        w -= learning_rate * w.grad
        
    # zero the gradients after updating(as grad accumulates)
    w.grad.zero_()
    
    if (epoch+1) % 10 == 0:
        print(f"epoch {epoch+1}: w = {w.item():.3f}, loss = {l.item():.3f}")
        
print(f"Predictions after training: f({X_test}) = {forward(X_test).item():.3f}")



epoch 10: w = 1.998, loss = 0.000
epoch 20: w = 2.000, loss = 0.000
epoch 30: w = 2.000, loss = 0.000
epoch 40: w = 2.000, loss = 0.000
epoch 50: w = 2.000, loss = 0.000
epoch 60: w = 2.000, loss = 0.000
epoch 70: w = 2.000, loss = 0.000
epoch 80: w = 2.000, loss = 0.000
epoch 90: w = 2.000, loss = 0.000
epoch 100: w = 2.000, loss = 0.000
Predictions after training: f(5.0) = 10.000


#### Model, Loss and Optimizer

A typical PyTorch Pipeline looks like this:

1. Design model(input, output, forward pass with different layers)
2. Construct loss and optimizer
3. Training loops:
    - Forward = compute prediction and loss
    - Backward = compute gradients
    - Update weights

In [44]:
import torch
import torch.nn as nn

# Linear regression
# f = w * x
# here : f = 2 * x

# 0) Training samples, watch the shape:
X = torch.tensor([[1],[2],[3], [4], [5], [6], [7], [8]], dtype=torch.float32)
y = torch.tensor([[2], [4], [6], [8], [10], [12],[14], [16]], dtype=torch.float32)

n_samples, n_features = X.shape
print(f'n_samples = {n_samples}, n_features = {n_features}')

# 0) create a test sample
X_test = torch.tensor([5], dtype=torch.float32)



n_samples = 8, n_features = 1


In [46]:
# 1) Design Model, the model has to implement the forward pass:

# Here we could simply use a built-in model from PyTorch
# model = nn.Linear(input_size, output_size)

class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        #define different layers
        self.lin = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        return self.lin(x)
    
input_size, output_size = n_features, n_features

model = LinearRegression(input_size, output_size)

print(f"Prediction Before training: f({X_test.item()}) = {model(X_test).item():.3f}")


# 2) Define loss and optimizer
learning_rate = 0.01
n_epochs = 100

loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

# 3) Training Loop
for epoch in range(n_epochs):
    
    # predict
    y_predicted = model(X)
    
    # loss
    l=loss(y, y_predicted)
    
    # calculate gradient = backward pass 
    l.backward()
    
    #optimizer
    optimizer.step()
    
    # Zero the gradient after updating
    optimizer.zero_grad()
    
    if(epoch+1) % 10 == 0:
        w, b = model.parameters() # unpack parameters
        print(f"epoch {epoch+1}: w = {w[0][0].item()}, loss = {l.item()}")
        
print(f"Prediction after training: f({X_test.item()}) = {model(X_test).item():.3f}")

Prediction Before training: f(5.0) = -4.712
epoch 10: w = 1.9698002338409424, loss = 0.005676873028278351
epoch 20: w = 1.9725481271743774, loss = 0.004944734275341034
epoch 30: w = 1.9736254215240479, loss = 0.004564512521028519
epoch 40: w = 1.97465980052948, loss = 0.004213559906929731
epoch 50: w = 1.9756534099578857, loss = 0.003889568615704775
epoch 60: w = 1.976608157157898, loss = 0.003590524662286043
epoch 70: w = 1.9775254726409912, loss = 0.0033144361805170774
epoch 80: w = 1.9784067869186401, loss = 0.0030595939606428146
epoch 90: w = 1.9792535305023193, loss = 0.00282433838583529
epoch 100: w = 1.9800670146942139, loss = 0.0026071728207170963
Prediction after training: f(5.0) = 10.012
