## Chapter 6: Beginner Guide
- Tied layer: gradient will add up along different chain
- Custom initialization: `apply` method
- I/O
  - save tensor: `torch.save(x:Uinon[List[tensor], Dict], name:str)` and load
  - save model: the same, just input dict of the net (`net.state_dict()`) then `net.load_state_dict(torch.load(name))`
- GPU
  - operation between tensors must in the same GPU
  - print or transform to numpy will copy to memory, and even worse wait the python **GIL** (`Global Interpreter Lock`, make sure at the same time only one thread can execute the python bytecode)

In [1]:
import torch
from torch import nn

In [25]:
class TiedModel(nn.Module):
    def __init__(self):
        super().__init__()
        linear_layer = nn.LazyLinear(32)
        # self.net = nn.Sequential(linear_layer, nn.ReLU(),
        #                          linear_layer, nn.ReLU(),
        #                          nn.LazyLinear(1))
        self.net = nn.Sequential(nn.LazyLinear(32), nn.ReLU(),
                                 linear_layer, nn.ReLU(),
                                 linear_layer, nn.ReLU(),)
    def forward(self, x):
        return self.net(x)

In [42]:
tied_model = TiedModel()
print(tied_model.net[2].weight)
data = torch.tensor([1, 2.0]).reshape(1, -1)
data.requires_grad_(True)
print(data.shape)

output = tied_model(data)
print(output.shape)
output.sum().backward()
print(tied_model.net)

print(tied_model.net[0].weight.grad.shape)
print(tied_model.net[1].state_dict().__len__()) # no weight
print(tied_model.net[2].weight.grad.shape)
print(tied_model.net[2].weight.grad) # 2 and 4 grad is the same
print(tied_model.net[4].weight.grad) # 2 and 4 grad is the same

<UninitializedParameter>
torch.Size([1, 2])
torch.Size([1, 32])
Sequential(
  (0): Linear(in_features=2, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=32, bias=True)
  (5): ReLU()
)
torch.Size([32, 2])
0
torch.Size([32, 32])
tensor([[ 0.9961,  0.0979,  0.0000,  ...,  0.1630,  0.0000,  0.5559],
        [-0.2359,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.6810,  0.0979,  0.0000,  ...,  0.1630,  0.0000,  0.5559],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.9429,  0.0979,  0.0000,  ...,  0.1630,  0.0000,  0.5559]])
tensor([[ 0.9961,  0.0979,  0.0000,  ...,  0.1630,  0.0000,  0.5559],
        [-0.2359,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
       

In [38]:
def init_normal(module):
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        nn.init.zeros_(module.bias)

In [41]:
print(tied_model.net[2].weight.data[0])
tied_model.net[2].apply(init_normal)
print(tied_model.net[2].weight.data[0])

tensor([ 0.0079, -0.0104, -0.0116,  0.0134, -0.0147, -0.0072, -0.0090, -0.0035,
         0.0137, -0.0168,  0.0012, -0.0099, -0.0054,  0.0072, -0.0045, -0.0135,
         0.0071,  0.0014,  0.0197, -0.0017,  0.0031,  0.0025, -0.0042, -0.0049,
        -0.0184, -0.0064, -0.0088, -0.0035, -0.0220,  0.0105, -0.0146,  0.0110])
tensor([ 1.5377e-02, -1.7294e-02,  5.9649e-04, -2.4361e-03, -6.4072e-03,
         5.3947e-03,  3.9172e-03,  5.6747e-03, -2.8256e-04,  1.3490e-02,
         4.2003e-03,  8.5548e-03,  6.0128e-03, -1.4819e-03, -2.2291e-03,
         1.3369e-02, -1.3220e-02,  1.2654e-03, -4.5651e-03,  1.8961e-02,
        -1.1517e-02, -8.9003e-03,  5.3294e-03, -4.2507e-03,  2.2758e-05,
        -9.9561e-03,  2.2119e-03,  3.3493e-03,  5.2531e-05, -1.1092e-02,
        -2.8326e-04,  1.1497e-02])


In [70]:
"""
Try to understand tied module
"""
a = torch.tensor([[1, 1.0], [2, 2.0]])
a.requires_grad_(True)
x = torch.ones((2, 2))
y = a @ x + 1
y.retain_grad()
z = a @ y

# z.backward(torch.ones_like(a))
z.sum().backward()

print(y.grad)
print(a.grad)
print(2 * a.T @ x.T + torch.ones((2, 2)) + y.T)
print(a)
print(x)
print(y)
print(z)

tensor([[3., 3.],
        [3., 3.]])
tensor([[12., 16.],
        [12., 16.]])
tensor([[10., 12.],
        [10., 12.]], grad_fn=<AddBackward0>)
tensor([[1., 1.],
        [2., 2.]], requires_grad=True)
tensor([[1., 1.],
        [1., 1.]])
tensor([[3., 3.],
        [5., 5.]], grad_fn=<AddBackward0>)
tensor([[ 8.,  8.],
        [16., 16.]], grad_fn=<MmBackward0>)


In [65]:
print(torch.ones((2, 2)))

tensor([[1., 1.],
        [1., 1.]])


In [83]:
new_y = torch.tensor([[3., 3.],
        [5., 5.]])
new_y.requires_grad_(True)
new_a = torch.tensor([[1, 1.0], [2, 2.0]])
new_z = new_a @ new_y
new_z.backward(torch.ones_like(new_z))
print(new_y.grad)
print(torch.ones_like(new_z) @ new_a)
print(new_a)

tensor([[3., 3.],
        [3., 3.]])
tensor([[3., 3.],
        [3., 3.]])
tensor([[1., 1.],
        [2., 2.]])


In [79]:
second_y = torch.tensor([[3., 3.],
        [5., 5.]])
second_y.requires_grad_(True)
second_z = second_y.sum()
second_z.backward()
print(second_y.grad)

tensor([[1., 1.],
        [1., 1.]])


In [3]:
!nvidia-smi

Thu Mar 27 02:06:11 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.67                 Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060      WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   42C    P8              52W / 115W |   1286MiB /  8188MiB |      8%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [28]:
import time
t1 = time.time()
a_cpu = torch.randn((500, 500))
b_cpu = torch.randn((500, 500))
for i in range(10000):
    a_cpu @= b_cpu
t2 = time.time()
(t2 - t1)

4.6940789222717285

In [29]:
t1 = time.time()
a_gpu = torch.randn((500, 500), device="cuda:0")
b_gpu = torch.randn((500, 500), device="cuda:0")
for i in range(10000):
    a_gpu @= b_gpu
t2 = time.time()
(t2 - t1)

0.669034481048584