In [1]:
''' 
1. use torch.tensor() not Tensor() -> alias for FloatTensor()
2. example : w = torch.empty((100,200), dtype=torch.float64, device="cuda")
3. casting : x2.to(torch.float64)
4. view() is preferred over reshape() : Use view() instead to ensure the tensor is not copied.
5. torch.squeeze(), torch.unsqueeze()
6. use of dim : torch.mean(x,1) will compute the mean for each row
7. graph is recreated from scratch; after each .backward() call, autograd starts populating a new graph. This is exactly what allows you to use control flow statements in your model; you can change the shape, size and operations at every iteration if needed.

'''

' \n1. use torch.tensor() not Tensor() -> alias for FloatTensor()\n2. example : w = torch.empty((100,200), dtype=torch.float64, device="cuda")\n3. castinf : x2.to(torch.float64)\n4. view() is preferred over reshape() : Use view() instead to ensure the tensor is not copied.\n5. torch.squeeze(), torch.unsqueeze()\n6. use of dim : torch.mean(x,1) will compute the mean for each row\n'

In [2]:
import torch

In [3]:
# Draw a 2x2 integer tensor with values in [0, 10); the *_like factories below reuse its shape and dtype.
w = torch.randint(0,10,(2,2))
# empty_like returns uninitialized memory, so the values it displays are arbitrary.
torch.empty_like(w) , torch.ones_like(w) , torch.zeros_like(w) , w , torch.full(w.size(),10)

(tensor([[2823626428896,             0],
         [            0,             0]]),
 tensor([[1, 1],
         [1, 1]]),
 tensor([[0, 0],
         [0, 0]]),
 tensor([[4, 3],
         [1, 6]]),
 tensor([[10, 10],
         [10, 10]]))

In [4]:
torch.save(w,'w.pkl')

In [5]:
# Read back the tensor written by torch.save. torch.load relies on unpickling,
# so it should only be pointed at files from a trusted source.
x2 = torch.load('w.pkl')
print(x2)

tensor([[4, 3],
        [1, 6]])


In [6]:
# Inspect the tensor's metadata; .grad is None because no backward pass has populated it.
x2.dtype , x2.device , x2.shape , x2.ndim , x2.grad , x2.layout

(torch.int64, device(type='cpu'), torch.Size([2, 2]), 2, None, torch.strided)

In [7]:
# .to() returns the converted tensor; the result is only displayed, x2 is not reassigned.
x2.to(torch.float64)

tensor([[4., 3.],
        [1., 6.]], dtype=torch.float64)

In [8]:
# Random values with the shape/dtype of the float64 copy of x2.
# NOTE(review): the cast is presumably needed because rand_like expects a floating-point dtype - confirm.
torch.rand_like(x2.to(torch.float64))

tensor([[0.1105, 0.3492],
        [0.4588, 0.4326]], dtype=torch.float64)

In [9]:
# Column 1, boolean-mask selection (elements < 3), transpose, and a (4, 1) view of the same data.
x2[:,1] , x2[x2<3] , x2.t() , x2.view(4,1)

(tensor([3, 6]),
 tensor([1]),
 tensor([[4, 1],
         [3, 6]]),
 tensor([[4],
         [3],
         [1],
         [6]]))

In [10]:
# torch.stack joins the two (1, 4) views along a NEW leading dimension,
# giving a tensor of shape (2, 1, 4).
y = torch.stack((x2.view(1,4), x2.view(1,4)))
print(y)
# Keep the shape as the cell's last expression. Writing `print(y) , y.shape`
# builds a tuple whose first element is print()'s return value (None).
y.shape

tensor([[[4, 3, 1, 6]],

        [[4, 3, 1, 6]]])


(None, torch.Size([2, 1, 4]))

In [11]:
# With no dim given, cat joins along dim 0: two (2, 2) tensors become one (4, 2) tensor.
torch.cat((x2,x2)),torch.cat((x2,x2)).shape

(tensor([[4, 3],
         [1, 6],
         [4, 3],
         [1, 6]]),
 torch.Size([4, 2]))

In [12]:
# dim=0 appends rows (result 4x2); dim=1 appends columns (result 2x4).
torch.cat((x2,x2),dim=0) , torch.cat((x2,x2),dim=1)

(tensor([[4, 3],
         [1, 6],
         [4, 3],
         [1, 6]]),
 tensor([[4, 3, 4, 3],
         [1, 6, 1, 6]]))

In [13]:
#backward

In [14]:
# Float tensor created with requires_grad=True so autograd tracks operations on it.
x = torch.tensor([[1,2,3],[4,5,6]],
         dtype=torch.float, requires_grad=True)

In [15]:
# f is the sum of squares of x: a scalar, so backward() can be called without a `gradient` argument.
f = x.pow(2).sum()

In [16]:
f

tensor(91., grad_fn=<SumBackward0>)

In [17]:
# df/dx = 2*x. backward() accumulates into x.grad, so re-running this cell
# without zeroing x.grad would add to the previous result.
f.backward()
x.grad

tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]])

In [18]:
# Two tracked inputs and an element-wise, non-scalar result Q = 3a^3 - b^2.
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)
Q = 3*a**3 - b**2

In [19]:
# Q has more than one element, so backward() needs an explicit `gradient`
# tensor with Q's shape; ones weight every element of Q equally.
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

In [20]:
# Compare autograd's result with the analytic derivatives: dQ/da = 9a^2 and dQ/db = -2b.
print(9*a**2 == a.grad,a.grad)
print(-2*b == b.grad,b.grad)

tensor([True, True]) tensor([36., 81.])
tensor([True, True]) tensor([-12.,  -8.])


In [21]:
Q.backward?

[31mSignature:[39m
Q.backward(
    gradient=[38;5;28;01mNone[39;00m,
    retain_graph=[38;5;28;01mNone[39;00m,
    create_graph=[38;5;28;01mFalse[39;00m,
    inputs=[38;5;28;01mNone[39;00m,
)
[31mDocstring:[39m
Computes the gradient of current tensor wrt graph leaves.

The graph is differentiated using the chain rule. If the tensor is
non-scalar (i.e. its data has more than one element) and requires
gradient, the function additionally requires specifying a ``gradient``.
It should be a tensor of matching type and shape, that represents
the gradient of the differentiated function w.r.t. ``self``.

This function accumulates gradients in the leaves - you might need to zero
``.grad`` attributes or set them to ``None`` before calling it.
See :ref:`Default gradient layouts<default-grad-layouts>`
for details on the memory layout of accumulated gradients.

.. note::

    If you run any forward ops, create ``gradient``, and/or call ``backward``
    in a user-specified CUDA stream con

In [22]:
#Freezing Layers 

In [24]:
from torch import nn, optim
from torchvision.models import resnet18, ResNet18_Weights
# Build ResNet-18 with its default pretrained weights (the checkpoint is downloaded if it is not cached yet).
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Freeze all the parameters in the network
# (requires_grad=False excludes them from gradient computation)
for param in model.parameters():
    param.requires_grad = False

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\Suraj/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100%|█████████████████████████████████████████████████████████████████████████████| 44.7M/44.7M [00:12<00:00, 3.82MB/s]


In [28]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [37]:
# autograd
# https://docs.pytorch.org/docs/stable/notes/autograd.html
# https://docs.pytorch.org/docs/stable/notes/extending.html
'''
The autograd graph is a DAG whose leaves are the input tensors and whose roots are the outputs.

'''

In [31]:
# Only `a` is tracked by autograd; the product still records a grad_fn because one operand requires grad.
a = torch.tensor(2.0,requires_grad=True)
b = torch.tensor(3.0)
c = a*b 

In [32]:
a,b,c

(tensor(2., requires_grad=True),
 tensor(3.),
 tensor(6., grad_fn=<MulBackward0>))

In [35]:
c.grad_fn??

[31mCall signature:[39m c.grad_fn(*args, **kwargs)
[31mType:[39m           MulBackward0
[31mString form:[39m    <MulBackward0 object at 0x0000022349024CD0>
[31mDocstring:[39m      <no docstring>

In [42]:
# The backward node of pow keeps the input it needs for the gradient;
# it is exposed through the underscore-prefixed attribute `_saved_self`.
x = torch.randn(5, requires_grad=True)
y = x.pow(2)
# Prints True: the saved tensor holds the same values as x.
print(x.equal(y.grad_fn._saved_self))

True


In [43]:
x,y,y.grad_fn._saved_self

(tensor([ 0.5059,  0.7238, -0.8231, -0.2720,  0.6299], requires_grad=True),
 tensor([0.2560, 0.5238, 0.6776, 0.0740, 0.3968], grad_fn=<PowBackward0>),
 tensor([ 0.5059,  0.7238, -0.8231, -0.2720,  0.6299], requires_grad=True))

In [44]:
'''
autograd example 
'''

'\nautograd example \n'

In [45]:
import torch
from typing import List, NamedTuple, Callable, Dict, Optional

_name: int = 0
def fresh_name() -> str:
    """Return the next unused auto-generated variable name.

    Names come from the module-level counter ``_name`` and look like
    ``v0``, ``v1``, ``v2``, ...; every call advances the counter by one.
    """
    global _name
    index = _name
    _name = index + 1
    return 'v' + str(index)


In [46]:
class Variable:
    """A tensor paired with a printable name, used by the toy autograd tape.

    The arithmetic methods forward to the module-level ``operator_*``
    functions, which perform the computation and record it.
    """

    def __init__(self, value : torch.Tensor, name: str=None):
        self.value = value
        # a falsy name (None or '') is replaced by a generated one
        self.name = name if name else fresh_name()

    @staticmethod
    def constant(value: torch.Tensor, name: str=None):
        """Build a Variable, print ``<name> = <value>`` and return it."""
        created = Variable(value, name)
        print(f'{created.name} = {value}')
        return created

    def __repr__(self):
        # displayed exactly like the wrapped value
        return repr(self.value)

    def __add__(self, rhs: 'Variable') -> 'Variable':
        return operator_add(self, rhs)

    def __mul__(self, rhs: 'Variable') -> 'Variable':
        return operator_mul(self, rhs)

    def expand(self, sizes: List[int]) -> 'Variable':
        return operator_expand(self, sizes)

    def sum(self, name: Optional[str]=None) -> 'Variable':
        return operator_sum(self, name)


In [47]:
class TapeEntry(NamedTuple):
    """One recorded operation: the names it read, the names it produced,
    and the closure that pushes gradients from its outputs to its inputs."""
    # names of the inputs to the original computation
    inputs : List[str]
    # names of the outputs of the original computation
    outputs: List[str]
    # apply chain rule: takes the gradients w.r.t. the outputs (one list
    # argument) and returns the gradients w.r.t. the inputs
    propagate: 'Callable[[List[Variable]], List[Variable]]'

In [48]:
gradient_tape : List[TapeEntry] = []

def reset_tape():
    """Discard every recorded tape entry and restart automatic naming at v0."""
    global _name
    # restart the name counter as well so generated names stay short
    _name = 0
    gradient_tape.clear()


In [49]:
def operator_mul(self : Variable, rhs: Variable) -> Variable:
    """Multiply two Variables, print the step and record it on the tape.

    When ``rhs`` is the plain float ``1.0`` the left operand is returned
    unchanged and nothing is printed or recorded.
    """
    # peephole optimization: x * 1.0 is just x
    if isinstance(rhs, float) and rhs == 1.0:
        return self

    # forward computation
    product = Variable(self.value * rhs.value)
    print(f'{product.name} = {self.name} * {rhs.name}')

    # backward rule, closing over both operands
    def propagate(dL_doutputs: List[Variable]):
        (upstream,) = dL_doutputs
        # for r = self * rhs: dr/dself = rhs and dr/drhs = self;
        # the chain rule multiplies each by the incoming gradient
        return [upstream * rhs, upstream * self]

    # record what was computed so grad() can replay it in reverse
    gradient_tape.append(
        TapeEntry(
            inputs=[self.name, rhs.name],
            outputs=[product.name],
            propagate=propagate,
        )
    )
    return product

In [50]:
def grad(L, desired_results: List[Variable]) -> List[Variable]:
    """Walk the tape backwards from ``L`` and collect gradients.

    Returns a list aligned with ``desired_results``; a position holds None
    when that variable never received a gradient during the walk.
    """
    # dL/dX for every X reached so far, keyed by variable name.
    # The walk is seeded with dL/dL, which is 1.
    dL_d : Dict[str, Variable] = {L.name: Variable(torch.ones(()))}
    print(f'd{L.name} ------------------------')

    def gather_grad(entries: List[str]):
        # names without a recorded gradient map to None
        return [dL_d.get(entry) for entry in entries]

    # replay the recorded operations from last to first
    for record in reversed(gradient_tape):
        upstream = gather_grad(record.outputs)
        if not any(g is not None for g in upstream):
            # no output of this operation has a gradient, so there is
            # nothing to push through it
            continue

        # chain rule specific to the recorded operation
        contributions = record.propagate(upstream)

        # Accumulate the gradient produced for each input. A variable used
        # several times gets one contribution per use, and the multivariate
        # chain rule says those contributions are summed.
        for source_name, contribution in zip(record.inputs, contributions):
            if source_name in dL_d:
                dL_d[source_name] += contribution
            else:
                dL_d[source_name] = contribution

    # show which intermediate variable holds each gradient
    for name, value in dL_d.items():
        print(f'd{L.name}_d{name} = {value.name}')
    print(f'------------------------')

    return gather_grad(desired.name for desired in desired_results)


In [51]:
def operator_add(self : Variable, rhs: Variable) -> Variable:
    """Add two Variables, print the step and record it on the tape."""
    total = Variable(self.value + rhs.value)
    print(f'{total.name} = {self.name} + {rhs.name}')

    def propagate(dL_doutputs: List[Variable]):
        (upstream,) = dL_doutputs
        # both partial derivatives of self + rhs are 1.0, so unlike mul
        # this closure does not need to capture the operands
        unit = 1.0
        return [upstream * unit, upstream * unit]

    gradient_tape.append(
        TapeEntry(
            inputs=[self.name, rhs.name],
            outputs=[total.name],
            propagate=propagate,
        )
    )
    return total

# sum is used to turn our matrices into a single scalar to get a loss.
# expand is the backward of sum, so it is added to make sure our Variable
# is closed under differentiation. Both have rules similar to mul above.

def operator_sum(self: Variable, name: Optional[str]) -> 'Variable':
    """Sum all elements of ``self`` into a scalar Variable and record the step.

    ``name`` is forwarded to the result Variable. The backward rule expands
    the incoming gradient back to the shape of the summed input.
    """
    r = Variable(torch.sum(self.value), name=name)
    print(f'{r.name} = {self.name}.sum()')
    def propagate(dL_doutputs: List[Variable]):
        dL_dr, = dL_doutputs
        # Pass the shape as ONE list argument: Variable.expand takes a single
        # `sizes` parameter, so unpacking the shape with * raised TypeError
        # for every input that was not 1-D.
        size = list(self.value.size())
        return [dL_dr.expand(size)]
    gradient_tape.append(TapeEntry(inputs=[self.name], outputs=[r.name], propagate=propagate))
    return r


def operator_expand(self: Variable, sizes: List[int]) -> 'Variable':
    """Expand a scalar Variable to ``sizes`` and record the step.

    The backward rule sums the incoming gradient back down to a scalar.
    """
    assert self.value.dim() == 0  # only works for scalars
    expanded = Variable(self.value.expand(sizes))
    print(f'{expanded.name} = {self.name}.expand({sizes})')

    def propagate(dL_doutputs: List[Variable]):
        (upstream,) = dL_doutputs
        return [upstream.sum()]

    gradient_tape.append(
        TapeEntry(
            inputs=[self.name],
            outputs=[expanded.name],
            propagate=propagate,
        )
    )
    return expanded

In [52]:
# Two random length-4 tensors used as inputs below; no seed is set in the visible cells, so values differ per run.
a_global, b_global = torch.rand(4), torch.rand(4)

def simple(a, b):
    """Return (a + b) * b using whatever ``+`` and ``*`` the operands define."""
    return (a + b) * b

reset_tape() # reset any compute from other cells
# Wrap the raw tensors as named Variables (constant() also prints each one).
a = Variable.constant(a_global, name='a')
b = Variable.constant(b_global, name='b')
# Forward pass: each + and * on Variables prints the step and records it on gradient_tape.
loss = simple(a, b)
# Backward pass: for loss = (a + b) * b the expected gradients are
# d/da = b and d/db = a + 2*b.
da, db = grad(loss, [a, b])
print("da", da)
print("db", db)

a = tensor([0.7680, 0.5106, 0.1505, 0.9638])
b = tensor([0.2539, 0.0201, 0.4764, 0.6176])
v0 = a + b
v1 = v0 * b
dv1 ------------------------
v3 = v2 * b
v4 = v2 * v0
v5 = v4 + v3
dv1_dv1 = v2
dv1_dv0 = v3
dv1_db = v5
dv1_da = v3
------------------------
da tensor([0.2539, 0.0201, 0.4764, 0.6176])
db tensor([1.2759, 0.5508, 1.1032, 2.1990])
