PyTorch practice 

PyTorch 101: basic knowledge

In [None]:
import torch
import numpy as np

# Report the PyTorch build this notebook runs against.
print(f'Current PyTorch version: {torch.__version__}')

# --- Tensors built from existing data ---------------------------------------
# From a nested Python list.
data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
x_data = torch.tensor(data)

# From a NumPy array holding the same values.
np_array = np.array(data)
x_np = torch.from_numpy(np_array)

# From another tensor: same shape as x_data; the random one requests a float dtype.
x_ones = torch.ones_like(x_data)
x_rand = torch.rand_like(x_data, dtype=torch.float)

# --- Tensors built from a shape ---------------------------------------------
shape = (2, 3)
# The factories are invoked in the order rand, ones, zeros.
rand_tensor, ones_tensor, zeros_tensor = (
    factory(shape) for factory in (torch.rand, torch.ones, torch.zeros)
)
print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

# --- Tensor attributes -------------------------------------------------------
tensor = torch.rand(3, 4)
print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")


CUDA Tensors

In [None]:
import torch
# Query CUDA availability and how many GPUs are visible.
print(f'GPU is available: {torch.cuda.is_available()}')
print(f'count GPU devices: {torch.cuda.device_count()}')

# Tensor.to(<dtype>): cast a float tensor to int32.
tensor = torch.randn(3, 3)
print(tensor)
tensor = tensor.to(dtype=torch.int32)
print(tensor)

# Tensor.to(<other tensor>): convert using another tensor as the reference.
tensor = torch.rand(4, 3, dtype=torch.double)
print(tensor)
ten_2 = torch.ones(2, 2).to(tensor)
print(ten_2)

In [None]:
# Create a tensor without specifying a device and report what get_device() says.
# NOTE(review): for a CPU tensor get_device() presumably reports -1 on recent
# PyTorch releases; `tensor.device` may be the more readable choice -- confirm.
tensor = torch.rand((4,3,7,7))
print(f'Tensor is on {tensor.get_device()} device')

In [128]:
# Four ways of placing a tensor on a device.
# Variants 1 and 2 name a CUDA device explicitly, so they are only executed
# when CUDA is available; variants 3 and 4 are portable and always run.
if torch.cuda.is_available():
    # 1) Device given as a string with an explicit index.
    tensor = torch.rand((4,3,7,7))
    tensor = tensor.to('cuda:0')
    print(f'Tensor is on {tensor.get_device()} device')

    # 2) Device given as a torch.device object.
    tensor = torch.rand((4,3,7,7))
    device = torch.device('cuda')
    tensor = tensor.to(device)
    print(f'Tensor is on {tensor.get_device()} device')

# 3) Portable idiom: use CUDA when present, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tensor = torch.rand((4,3,7,7))
tensor = tensor.to(device)
print(f'Tensor is on {tensor.get_device()} device')

# 4) Re-use the device of an existing tensor.
ten_2 = torch.rand((4,3,7,7)).to(tensor.device)
# Report the tensor that was just moved (the original printed `tensor` again).
print(f'Tensor is on {ten_2.get_device()} device')

Tensor is on 0 device
Tensor is on 0 device
Tensor is on 0 device
Tensor is on 0 device


In [129]:
# Reference tensor: double precision, placed on `device`, tracked by autograd.
tensor = torch.rand(3, 2, dtype=torch.double, device=device, requires_grad=True)
# .to(tensor) converts using `tensor` as the reference; per the recorded output
# the dtype and device follow the reference while requires_grad stays False.
ten_2 = torch.zeros((2, 4)).to(tensor)
print(f'Tensor is on {ten_2.get_device()} device, data type: {ten_2.dtype}, grad: {ten_2.requires_grad}')

Tensor is on 0 device, data type: torch.float64, grad: False


In [132]:
# Print CUDA-related information.
# device_of(tensor) is formatted with its default repr; the recorded output shows
# an object address rather than a device name.
print(f'GPU device: {torch.cuda.device_of(tensor)}')
#print(f'GPU SM_ARCH: {torch.cuda.get_arch_list()}')
# Called without arguments; the recorded output shows the tuple (7, 5).
print(f'GPU capability: {torch.cuda.get_device_capability()}')
# Properties of the device with index 0.
print(f'GPU properties: {torch.cuda.get_device_properties(0)}')
#print(f'GPU is initialized: {torch.cuda.is_initialized()}')

GPU device: <torch.cuda.device_of object at 0x7fd9a17a1080>
GPU capability: (7, 5)
GPU properties: _CudaDeviceProperties(name='NVIDIA GeForce RTX 2080 Ti', major=7, minor=5, total_memory=11019MB, multi_processor_count=68)


CUDA Module

In [142]:
import torch
import torch.nn.functional as F
class Arch(torch.nn.Module):
  """Small two-layer network: Linear(3 -> 7), tanh, Linear(7 -> 1), tanh."""

  def __init__(self):
    super().__init__()
    self.layer1 = torch.nn.Linear(3, 7)
    self.layer2 = torch.nn.Linear(7, 1)

  def forward(self, x):
    """Apply both linear layers to ``x``, each followed by tanh.

    Args:
      x: tensor whose last dimension has size 3 (the input size of layer1).

    Returns:
      Tensor whose last dimension has size 1.
    """
    # The layers are attributes and must be applied to the input; the original
    # passed the bare (undefined) names layer1/layer2 to tanh.
    x = F.tanh(self.layer1(x))
    x = F.tanh(self.layer2(x))
    return x


model = Arch()
# Move the model to the GPU when one is present; fall back to the CPU so the
# cell also runs on machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')
optim = torch.optim.Adagrad(model.parameters())
#print(optim.state_dict())

# Quoted note kept from the original cell (presumably about asking a module
# for "its" device):
#   "That's not possible. Modules can hold parameters of different types on
#    different devices, and so it's not always possible to unambiguously
#    determine the device."
# The device is therefore read from the parameters instead.
print(f'Model {next(model.parameters()).device}')
for n, p in model.named_parameters():
  print(n, p.device)

{'state': {140572672483048: {'step': 0, 'sum': tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], device='cuda:0')}, 140572672483120: {'step': 0, 'sum': tensor([0., 0., 0., 0., 0., 0., 0.], device='cuda:0')}, 140572672483192: {'step': 0, 'sum': tensor([[0., 0., 0., 0., 0., 0., 0.]], device='cuda:0')}, 140572672483264: {'step': 0, 'sum': tensor([0.], device='cuda:0')}}, 'param_groups': [{'lr': 0.01, 'lr_decay': 0, 'weight_decay': 0, 'initial_accumulator_value': 0, 'params': [140572672483048, 140572672483120, 140572672483192, 140572672483264]}]}
Model cuda:3
layer1.weight cuda:3
layer1.bias cuda:3
layer2.weight cuda:3
layer2.bias cuda:3


In [143]:
class LayerModule_1(torch.nn.Module):
  """Two-layer feature block: Linear(4 -> 8), tanh, Linear(8 -> 2), tanh."""

  def __init__(self):
    super().__init__()
    self.layer1 = torch.nn.Linear(4, 8)
    self.layer2 = torch.nn.Linear(8, 2)

  def forward(self, x):
    """Apply both linear layers to ``x``, each followed by tanh.

    Args:
      x: tensor whose last dimension has size 4 (the input size of layer1).

    Returns:
      Tensor whose last dimension has size 2.
    """
    # Call the layers on the input; the original referenced the undefined
    # names layer1/layer2 and ignored `x`.
    x = F.tanh(self.layer1(x))
    x = F.tanh(self.layer2(x))
    return x

class LayerModule_2(torch.nn.Module):
  """Wraps a default-configured torch.nn.Transformer as a single-input module."""

  def __init__(self):
    super().__init__()
    self.op = torch.nn.Transformer()

  def forward(self, x):
    """Run ``x`` through the transformer, using it as source and target.

    Args:
      x: input tensor; its layout must match what ``self.op`` expects.

    Returns:
      The transformer output.
    """
    # nn.Transformer.forward requires both a source and a target sequence;
    # calling it with a single argument raises TypeError. With only one input
    # available, the same tensor is used for both.
    # NOTE(review): confirm that src == tgt is the intended use here.
    x = self.op(x, x)
    return x

In [None]:
class NewArch(torch.nn.Module):
    """Composite module chaining two sub-modules: ``features`` then ``data``."""

    def __init__(self):
        super().__init__()
        # Assigned as attributes so they are registered as sub-modules.
        self.features = LayerModule_1()
        self.data = LayerModule_2()

    def forward(self, x):
        """Feed ``x`` through ``features`` and pass the result to ``data``."""
        return self.data(self.features(x))
        
# Instantiate the composite model; no data is passed through it in this cell.
model = NewArch()        
# Optional inspection snippets (disabled): list parameter names, show the
# modules attribute, and enumerate the named sub-modules.
# for n, p in model.named_parameters():
#   print(f'Layers names: {n}')
# print(f'modules: {model.modules}')
# for idx, m in enumerate(model.named_modules()):
#         print(idx, '->', m) 

In [None]:
# Register an additional sub-module on the existing model under an explicit name.
model.add_module('Module_LSTM', torch.nn.LSTM(input_size=10, hidden_size=2))

# List every parameter name, then dump the full state dict.
for layer_name, _param in model.named_parameters():
    print(f'Layers names: {layer_name}')
print(f'Model state_dict {model.state_dict()}')

Autograd

In [151]:
# Two plain random tensors, created without requires_grad.
t1 = torch.rand(2, 3)
t2 = torch.rand(3, 4)

# Inspect the autograd-related attributes of both tensors.
print(f'Requires grad: {t1.requires_grad}, {t2.requires_grad}')
print(f'Tensor contents: {t1}, {t2}')
print(f'Leaf tensor: {t1.is_leaf}, {t2.is_leaf}')
print(f'Gradients: {t1.grad}, {t2.grad}')
print(f'Grad function: {t1.grad_fn}, {t2.grad_fn}')

Requires grad: False, False
Tensor contents: tensor([[0.9908, 0.1574, 0.4543],
        [0.1250, 0.8412, 0.0419]]), tensor([[0.3362, 0.0197, 0.5008, 0.5163],
        [0.1634, 0.7573, 0.7735, 0.7368],
        [0.0629, 0.7425, 0.5592, 0.2885]])
Leaf tensor: True, True
Gradients: None, None
Grad function: None, None


In [152]:
# Matrix product of t1 and t2; the prints below show its autograd attributes.
t3 = t1.mm(t2)

print(f'Requires grad: {t3.requires_grad}')
print(f'Tensor contents: {t3}')
print(f'Leaf tensor: {t3.is_leaf}')
print(f'Gradients: {t3.grad}')
print(f'Grad function: {t3.grad_fn}')

Requires grad: False
Tensor contents: tensor([[0.3874, 0.4759, 0.8719, 0.7586],
        [0.1821, 0.6706, 0.7366, 0.6964]])
Leaf tensor: True
Gradients: None
Grad function: None


In [155]:
# t4 is created with requires_grad=True; t5 and t6 are computed from it.
t4 = torch.rand((4,3), requires_grad=True)
t5 = torch.mm(t3,t4)
t6 = torch.mm(t5,t2)
print(f'Requires grad: {t4.requires_grad}, {t5.requires_grad}, {t6.requires_grad}')
print(f'Tensor contents: {t4}, {t5}, {t6}')
# Report is_leaf for all three tensors; the original printed t4.requires_grad
# in the first slot, which only matched by coincidence.
print(f'Leaf tensor: {t4.is_leaf}, {t5.is_leaf}, {t6.is_leaf}')
# Left disabled as in the original cell -- presumably because .grad is not
# populated for the computed tensors t5/t6.
#print(f'Gradients: {t4.grad}, {t5.grad}, {t6.grad}')
print(f'Grad function: {t4.grad_fn}, {t5.grad_fn}, {t6.grad_fn}')

Requires grad: True, True, True
Tensor contents: tensor([[0.8513, 0.9243, 0.8080],
        [0.2901, 0.6435, 0.0570],
        [0.8803, 0.2969, 0.8275],
        [0.4659, 0.3326, 0.8795]], requires_grad=True), tensor([[1.5888, 1.1755, 1.7289],
        [1.3225, 1.0502, 1.4075]], grad_fn=<MmBackward>), tensor([[0.8350, 2.2051, 2.6716, 2.1852],
        [0.7048, 1.8663, 2.2616, 1.8626]], grad_fn=<MmBackward>)
Leaf tensor: True, False, False
Grad function: None, <MmBackward object at 0x7fd9a17d3278>, <MmBackward object at 0x7fd9a17d3278>


In [158]:
# To optimize the weights of a neural network we need the derivatives of the
# loss with respect to the parameters. Calling loss.backward() computes them,
# and they are then read from the .grad attribute of each parameter.
input = torch.rand(7)
label = torch.rand(2)
weights = torch.rand(7, 2, requires_grad=True)
biases = torch.rand(2, requires_grad=True)

# Single linear layer followed by an L1 loss against the label.
output = torch.matmul(input, weights) + biases
loss = torch.nn.functional.l1_loss(output, label)
print(f'Gradient function for output = {output.grad_fn}')
print(f'Gradient function for loss = {loss.grad_fn}')

#loss.backward(retain_graph=True)
loss.backward()
print(f'Wghts gradients: {weights.grad}')
print(f'Bias gradients: {biases.grad}')

# For each tensor, report whether it is a leaf that also has requires_grad set;
# the original note states that .grad is only obtainable for such tensors.
print(f'[input, label, wght, bias, output]')
print([t.is_leaf and t.requires_grad for t in (input, label, weights, biases, output)])

Gradient function for output = <AddBackward0 object at 0x7fd9a062a908>
Gradient function for loss = <L1LossBackward object at 0x7fd9a177e198>
Wghts gradients: tensor([[0.1747, 0.1747],
        [0.0310, 0.0310],
        [0.2693, 0.2693],
        [0.3915, 0.3915],
        [0.0824, 0.0824],
        [0.3638, 0.3638],
        [0.2762, 0.2762]])
Bias gradients: tensor([0.5000, 0.5000])
[input, label, wght, bias, output]
[False, False, True, True, False]


In [76]:
# By default, tensors with requires_grad=True track their computational history
# and support gradient computation. When only forward computations are needed
# (e.g. applying a trained model to input data), tracking can be switched off.

# Option 1: run the computation inside a torch.no_grad() block.
with torch.no_grad():
    output = input @ weights + biases
print(f'Requires_grad: {output.requires_grad}')

# Option 2: compute with tracking, then take a detached copy of the result.
output = input @ weights + biases
out = output.detach()
print(f'Requires_grad: {output.requires_grad}, {out.requires_grad}')

Requires_grad: False
Requires_grad: True, False


In [87]:
# Three independent leaf tensors that all require gradients.
a, b, c = (torch.rand(3, requires_grad=True) for _ in range(3))

# enable_grad() nested inside no_grad(): `d` is computed with tracking off,
# `out` with tracking switched back on.
with torch.no_grad():
  d = a + b
  with torch.enable_grad():
    out = c * 0.1 + d
#print(list(map(lambda x: x.requires_grad, [a,b,c,d,out])))
out.sum().backward()
print([t.requires_grad for t in (a, b, c, d, out)])


@torch.enable_grad()
def foo(x):
    """Scale ``x`` by 0.1 with gradient tracking enabled by the decorator."""
    return x * 0.1


# Only the body of foo runs with tracking on; the addition happens under no_grad.
with torch.no_grad():
    d = a + b
    out = foo(c) + d
#out.sum().backward()
print([t.requires_grad for t in (a, b, c, d, out)])
'''There are reasons you might want to disable gradient tracking:
To mark some parameters in your neural network as frozen parameters. 
This is a very common scenario for finetuning a pretrained network
To speed up computations when you are only doing forward pass,
because computations on tensors that do not track gradients would be more efficient'''

[True, True, True, False, True]
[True, True, True, False, False]


In [122]:
import torch 

# Input and four weight matrices; all are leaves tracked by autograd.
a = torch.randn((3,3), requires_grad = True)

w1 = torch.randn((3,3), requires_grad = True)
w2 = torch.randn((3,3), requires_grad = True)
w3 = torch.randn((3,3), requires_grad = True)
w4 = torch.randn((3,3), requires_grad = True)

# Element-wise computational graph: two branches from `a`, merged into `d`.
b = w1*a 
c = w2*a

d = w3*b + w4*c 

L = (10 -d)

# L is a 3x3 tensor, and backward() without arguments only works on scalars
# (it raises "grad can be implicitly created only for scalar outputs").
# Reduce to a scalar first, as done with out.sum().backward() earlier on.
L.sum().backward()

In [None]:
import math
import torch
dtype = torch.float
device = torch.device('cpu')
#device = torch.device("cuda:0") 

# Running example: fit y = sin(x) with a third-order polynomial
#     a + b*x + c*x**2 + d*x**3
# The four coefficients are trained with plain gradient descent by minimizing
# the squared Euclidean distance between the prediction and the true output.
x = torch.linspace(-math.pi, math.pi, 3000, device=device, dtype=dtype)
y = torch.sin(x)
params_num = 4
# params[k] is the coefficient of x**k.
params = [torch.randn((), device=device, dtype=dtype) for _ in range(params_num)]

from functools import reduce
learning_rate = 1e-6
for t in range(2000):
    # Forward pass: sum of coefficient * x**power over all coefficients.
    # (The lambda arguments are named so they do not shadow the data x / y.)
    y_pred = reduce(lambda acc, term: acc + term,
                    [p * x ** power for power, p in enumerate(params)])

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop by hand: d(loss)/d(y_pred), then chain rule per coefficient,
    # i.e. grads[k] = sum(grad_y_pred * x**k).
    grad_y_pred = 2.0 * (y_pred - y)
    grads = [(grad_y_pred * x ** power).sum() for power in range(params_num)]

    # Update weights using gradient descent (in place on each 0-dim tensor).
    for p, grad in zip(params, grads):
        p -= learning_rate * grad

# A plain loop rather than a list comprehension, so the cell does not echo a
# list of None values as its output.
for p in params:
    print(f'Result: {p.item()}')

In [None]:
import math
import torch
dtype = torch.float
device = torch.device('cpu')
#device = torch.device("cuda:0") # Uncomment this to run on GPU

# Same polynomial fit of y = sin(x), now using autograd, a loss module and an
# optimizer instead of hand-written gradients.
x = torch.linspace(-math.pi, math.pi, 3000, device=device, dtype=dtype)
y = torch.sin(x)
params_num = 4
# params[k] is the coefficient of x**k; requires_grad lets autograd track them.
params = [torch.randn((), device=device, dtype=dtype, requires_grad=True) for _ in range(params_num)]

from functools import reduce
learning_rate = 1e-4
# `reduction` is the keyword that selects summation. The original passed the
# string to the deprecated boolean `reduce` argument, which silently produced
# a mean-reduced loss instead.
criterion = torch.nn.MSELoss(reduction='sum')
optim = torch.optim.Adam(params, lr=learning_rate)

for t in range(3000):
    optim.zero_grad()
    # Forward pass: sum of coefficient * x**power over all coefficients.
    y_pred = reduce(lambda acc, term: acc + term,
                    [p * x ** power for power, p in enumerate(params)])

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Backprop to compute gradients of parameters with respect to loss
    loss.backward()
    # Update weights using the optimizer
    optim.step()

# A plain loop rather than a list comprehension, so the cell does not echo a
# list of None values as its output.
for p in params:
    print(f'Result: {p.item()}')

In [None]:
import torch
import math

'''
In PyTorch we can easily define our own autograd operator by defining a subclass of torch.autograd.
Function and implementing the forward and backward functions. We can then use our new autograd operator 
by constructing an instance and calling it like a function, passing Tensors containing input data.
Our function is the third-degree Legendre polynomial P3(x) = 0.5 * (5 * x ** 3 - 3 * x)
'''
class CustomPolynomial(torch.autograd.Function):
    """
    Custom autograd operator evaluating 0.5 * (5 * x ** 3 - 3 * x).

    Both passes are static methods operating on Tensors; the operator is
    invoked through ``CustomPolynomial.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Evaluate the polynomial on ``input``.

        ``ctx`` is the context object; the input is stashed on it with
        ``save_for_backward`` because the derivative depends on it.
        """
        ctx.save_for_backward(input)
        cubic_term = 5 * input ** 3
        linear_term = 3 * input
        return 0.5 * (cubic_term - linear_term)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given the gradient of the loss with respect to the output, return the
        gradient of the loss with respect to the input.
        """
        (input,) = ctx.saved_tensors
        # Derivative of the forward expression, up to the 1.5 factor applied below.
        slope = 5 * input ** 2 - 1
        return grad_output * 1.5 * slope


class Polynomial3(torch.nn.Module):
    """Model y = a + b * P3(c + d * x), where P3 is the CustomPolynomial operator."""

    def __init__(self):
        """
        Create the four scalar weights a, b, c, d as module parameters.

        Each weight is a 0-dim tensor drawn from a standard normal and scaled
        by 0.01. The module-level ``device`` and ``dtype`` are read when the
        model is instantiated. Wrapping in ``torch.nn.Parameter`` registers the
        tensor with the module and makes it require gradients, so no explicit
        ``requires_grad`` is needed on the raw tensor.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn((), device=device, dtype=dtype) * 0.01)
        self.b = torch.nn.Parameter(torch.randn((), device=device, dtype=dtype) * 0.01)
        self.c = torch.nn.Parameter(torch.randn((), device=device, dtype=dtype) * 0.01)
        self.d = torch.nn.Parameter(torch.randn((), device=device, dtype=dtype) * 0.01)
        # P3 via the custom autograd operation: a Function is used through its
        # ``apply`` method, aliased here as ``self.operation``.
        self.operation = CustomPolynomial.apply

    def forward(self, x):
        """
        Compute a + b * P3(c + d * x) for the input tensor ``x`` and return
        the resulting tensor.
        """
        return self.a + self.b * self.operation(self.c + self.d * x)

    def string(self):
        """
        Return a human-readable form of the fitted model.

        The text mirrors what ``forward`` computes; the original described a
        plain cubic a + b x + c x^2 + d x^3, which this model is not.
        """
        return f'y = {self.a.item()} + {self.b.item()} * P3({self.c.item()} + {self.d.item()} x)'


dtype = torch.float
# Prefer the GPU, but fall back to the CPU so the cell runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create Tensors to hold input and outputs.
# They are created without requires_grad, so no gradients are computed with
# respect to them during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

learning_rate = 1e-4
model = Polynomial3().to(device)
criterion = torch.nn.MSELoss(reduction='sum')
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Multiply the learning rate by gamma once the milestone is reached.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optim, milestones=[25000], gamma=0.1)

for t in range(30000):
    # Zero gradients in optimizer
    optim.zero_grad()
    # Forward pass: compute predicted y with the model
    y_pred = model(x)
    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 1000 == 999:
        print(t, loss.item())
    # Use autograd to compute the backward pass.
    loss.backward()
    # Update the weights first, then advance the schedule: PyTorch expects
    # optimizer.step() to be called before lr_scheduler.step().
    optim.step()
    scheduler.step()

print(model.string())