In [1]:
import torch
print(torch.__version__)

torch.backends.mps.is_available()

import torch
import time

device_cpu = torch.device("cpu")
device_mps = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model_cpu = torch.nn.Linear(1000, 1000).to(device_cpu)
model_mps = torch.nn.Linear(1000, 1000).to(device_mps)

inputs_cpu = torch.randn(1000, 1000).to(device_cpu)
inputs_mps = torch.randn(1000, 1000).to(device_mps)

# Test CPU Time
start_time = time.time()
for _ in range(500):
    outputs_cpu = model_cpu(inputs_cpu)
cpu_time = time.time() - start_time
print(f"CPU Time: {cpu_time:.4f} seconds")

# Test MPS Time
start_time = time.time()
for _ in range(500):
    outputs_mps = model_mps(inputs_mps)
torch.mps.synchronize()  # Make sure MPS compeletes the computation
mps_time = time.time() - start_time
print(f"MPS Time: {mps_time:.4f} seconds")


2.2.2
CPU Time: 5.5037 seconds
MPS Time: 0.7264 seconds


# torch.autograd.grad

This function takes in a tensor $\mathbf{y} \in \mathbb{R}^{B \times D'}$, where $B$ is the batch size and $D'$ is the output dimension, and an input tensor $\mathbf{x} \in \mathbb{R}^{B \times D}$, where $D$ is the input dimension. By default, it also takes a gradient output tensor $\mathbf{v} \in \mathbb{R}^{B \times D'}$. 

This function computes the **Jacobian-Vector Product (JVP)**, **not** the full Jacobian matrix $J \in \mathbb{R}^{B \times D' \times D}$. The resulting product $J^T \mathbf{v}$ has the shape $\mathbb{R}^{B \times D}$. Here, $J^T$ refers to swapping the second and third dimensions of the Jacobian, i.e., $J^T \in \mathbb{R}^{B \times D \times D'}$.

From the **Einstein summation** perspective, this operation can be thought of as:

$$
\left[J^T \mathbf{v}\right]_{b, d} = \sum_{d'=1}^{D'} \frac{\partial y_b^{d'}}{\partial x_b^d} \cdot v_b^{d'}
$$

Or, in PyTorch's **`einsum`** notation:

$$
\texttt{torch.einsum(b d d', b d' -> b d, J, v)}
$$

Example Scenario:

Now, suppose we have a function $f: \mathbb{R}^2 \rightarrow \mathbb{R}^3$, with a batch size $B=2$. If you want to compute $\frac{\partial y_2}{\partial x_2}$ for **batch 1**, you can set:

$$
\mathbf{v} = \begin{bmatrix} 0 & 1 & 0 \\ 0 & 0 & 0 \end{bmatrix}
$$

This setting ensures that only the second output dimension of the first batch contributes to the gradient computation. The result of `torch.autograd.grad` will then give you:

$$
\frac{\partial y_2^{(1)}}{\partial \mathbf{x}^{(1)}}
$$

In [9]:
import torch
# A simple illustration for torch.autograd

##########
#Look at this to see what torhc.grad.autograd returns
x = torch.tensor(2.0, requires_grad=True)
z = torch.tensor(3.0, requires_grad=True)
# compute dy/dx and dy/dz
y = x ** 2 + z ** 3
print(torch.autograd.grad(y, [x, z])) #It returns a tuple

##########
# Now let's take batch into consideration
x = torch.tensor([[2.0],[3.0]], requires_grad=True) #[B,D]
# compute dy/dx and dy/dz
y = x ** 2 #[B,D']
print(torch.autograd.grad(y, x, grad_outputs=torch.tensor([[1],[0]]))) #It returns a tuple
#Setting gradeints this way we enable only focusing on gradient we care about (no cross batches)

##########
# What if x is a vector?
x = torch.tensor([[2.0, 3.0],[3.0, 4.0]], requires_grad=True) #[B,D]
w = torch.tensor([[.5, .5],[.25, .25],[.5, .5]]) #[3,2]
# compute dy/dx and dy/dz
y = torch.matmul(x, w.T) #[B,D'] [2,3] in this case, then J^T would be [B,D,D'] = [2,2,3]
grad_outputs = torch.tensor([[1, 0, 0], [0, 0, 0]])  # is the vector we mentioned above [B,D'] = [2,3]
#Only calculate gradient for sample 1 and y1, and this is 
# [[1*dy1/dx1 + 0*dy2/dx1 + 0*dy3/dx1], 
#  [0*dy1/dx2 + 0*dy2/dx2 + 0*dy3/dx2]]  #WRONG UNDERSTANDING?
print("x = \n",x)
print("w = \n",w)
print("y = x w^T\n",y)

print(torch.autograd.grad(y, x, grad_outputs=grad_outputs)[0]) #[B,D]

(tensor(4.), tensor(27.))
(tensor([[4.],
        [0.]]),)
x = 
 tensor([[2., 3.],
        [3., 4.]], requires_grad=True)
w = 
 tensor([[0.5000, 0.5000],
        [0.2500, 0.2500],
        [0.5000, 0.5000]])
y = x w^T
 tensor([[2.5000, 1.2500, 2.5000],
        [3.5000, 1.7500, 3.5000]], grad_fn=<MmBackward0>)
tensor([[0.5000, 0.5000],
        [0.0000, 0.0000]])


## In the project

In [3]:
import numpy as np
import os, sys
import torch

sys.path.append('/Users/zhanglige/Desktop/JP-Lab/Code/Velocity_Flow_Matching/')
import dnnlib
from training.networks import ToyMLP

#set up device first
device_name = 'mps' #can swap this to cuda:0, etc pending on resources
device = torch.device(device_name)
print(device)

#create ToyMLP instance, 
#adjust it to train mode, tracking grads and pass it to device
mlp = ToyMLP(dim=784, time_varying=True, n_hidden=3, w=64)
mlp.train().to(device)

#first, set up inputs for net 
#am choosing small batch size to make computation faster 
batch_size = 3
flat_data_dim = 784 
imgs = torch.randn(batch_size, flat_data_dim).type(torch.float32).to(device)
ts = torch.rand(batch_size, device=device) 

#ok now calc Jacobian of net_out w.r.t imgs input 
#set requires_grad to True for net inputs... 
ts.requires_grad=True
imgs.requires_grad=True


#small method to compute desired Jacobian, for a batch 
def batch_jacobian(model, imgs, ts):
    """Computes the Jacobian of a batch of outputs w.r.t a batch of inputs."""

    batch_size, input_size = imgs.shape #[B,D]
    output_size = model(imgs, ts).shape[1] #[D']

    jacobian = torch.zeros(batch_size, output_size, input_size) #[B,D',D]

    #note that we loop over batch AND dimensions here! 
    for i in range(batch_size): #for b \in [b_1,...,b_B]
        for j in range(output_size): #For y \in (y_1,y_2,...,y_{D'})
            grad_outputs = torch.zeros_like(model(imgs, ts)) #[B,D']
            grad_outputs[i, j] = 1.0 #Weight for gradient, but you may consider as a filter
            jacobian[i, j] = torch.autograd.grad(
                model(imgs, ts), imgs, grad_outputs=grad_outputs, retain_graph=True
            )[0][i]
        #in one loop, you fill dy_j/dx_1,dy_j/dx_2,...,dy_j/dx_D
    #in one loop, you fill a jacobian matrix for one batch

    return jacobian

ag_jac = batch_jacobian(mlp, imgs, ts)
#check that shape matches - this should already be collapsed across extra batch dim
print("Jacobian Size [B,D',D]:",ag_jac.shape) #[B,D',D] #This is the Jacobian (which we fill one row at a time through batches)

#ok, now compute jvp (product the output vector)
nabla_imgs_ag = ag_jac.transpose(2,1) #transpose to get grad  #[B,D,D']
imgs_jvp_ag = torch.einsum('bij, bjk -> bik', mlp(imgs, ts).unsqueeze(1), nabla_imgs_ag.to(device)).squeeze(1) #bs, dim 
#[B,1,D'] [B,D',D] -> [B,1,D], after squeeze it would be [B,D]
print("JVP Result [B,D]:",imgs_jvp_ag.shape)

mps
Jacobian Size [B,D',D]: torch.Size([3, 784, 784])
JVP Result [B,D]: torch.Size([3, 784])


# torch.functional.jacobian

# Some Thoughts

In [4]:
import numpy as np
import os, sys
import torch

sys.path.append('/Users/zhanglige/Desktop/JP-Lab/Code/Velocity_Flow_Matching/')
import dnnlib
from training.networks import ToyMLP

device = torch.device('mps') 

mlp = ToyMLP(dim=784, time_varying=True, n_hidden=3, w=64).train().to(device)

batch_size = 3
imgs = torch.randn(batch_size, 784, device=device, requires_grad=True)
ts = torch.rand(batch_size, device=device, requires_grad=True)

y = mlp(imgs, ts)

# Direct calculation of JVP
v = y.clone().detach()  #Check below
imgs_jvp = torch.autograd.grad(
    outputs=y,
    inputs=imgs,
    grad_outputs=v, # Directly use the model output as weight here
    retain_graph=True
)[0]

print("JVP Result:", imgs_jvp.shape)

JVP Result: torch.Size([3, 784])


In [5]:
#np.allclose(imgs_jvp.detach().cpu().numpy(), imgs_jvp_ag.detach().cpu().numpy())
np.allclose(imgs_jvp.detach().cpu().numpy(), imgs_jvp_ag.detach().cpu().numpy())

False