In [38]:
import torch
from torch.autograd import Variable
import numpy as np

Suppose $A \in \mathbb{R}^{m \times n}, x \in \mathbb{R}^{n}$.

Let $f(A, x)$ be a simple matrix-vector product $Ax$. 

And suppose $h$ is some function of $f(A, x)$. It can be multidimensional, but without loss of generality we consider one-dimensional case, e.g. $h(\cdot) = \| \cdot \|^2$. Vector and matrix functions can be differenciated in the same way.

Chain-rule states that

$$
\dfrac{\partial}{\partial x_{i}} g(f(A, x))
    =
\sum\limits_{k}
    \underbrace{\dfrac{\partial g(Ax)}{\partial (Ax)_k}}_\text{grad_output[k]}
\cdot
    \underbrace{\dfrac{\partial (Ax)_k}{\partial x_i}}_{\nabla_x (Ax)[k, i]}
$$

In PyTorch we just have to implement these computations.

Gradients w.r.t. A are computed in the same way:
$$
\dfrac{\partial}{\partial a_{ij}} g(f(A, x))
    =
\sum\limits_{k}
    \underbrace{\dfrac{\partial g(Ax)}{\partial (Ax)_k}}_\text{$g_k :=$ grad_output[k]}
\cdot
    \underbrace{\dfrac{\partial (Ax)_k}{\partial a_{i, j}}}_{f_{i,j,k} := \nabla_A (Ax)[i, j, k]}
$$

It is easy to understand that $f_{i, j, k} = x_j \cdot [k = i]$, where $[\cdot]$ is an indicator function. That's why
$$
\sum\limits_{k} g_k f_{i, j, k} = g_i^\top x_j.
$$

PyTorch code looks like:

In [39]:
class My_MatVec(torch.autograd.Function):
    @staticmethod
    def forward(ctx, A, x):
        '''
        A: (m, n)
        x: (n, )
        '''
        ctx.save_for_backward(A, x)
        return A.mv(x)
    @staticmethod
    def backward(ctx, grad_output):
        '''
        grad_output: (n, )
        '''
        A, x = ctx.saved_variables
        grad_x = A.t().mv(grad_output)
        grad_A = grad_output.unsqueeze(1) * x.unsqueeze(0)
        return grad_A, grad_x
    
matvec = My_MatVec.apply

In [40]:
m, n = 3, 5
np_A = np.random.rand(m, n)
np_x = np.random.rand(n)

In [41]:
A = Variable(torch.from_numpy(np_A), requires_grad=True)
x = Variable(torch.from_numpy(np_x), requires_grad=True)
res = torch.norm(matvec(A, x)).pow(2)
res.backward()
A.grad, x.grad

(Variable containing:
  1.9169  1.4665  0.5486  0.4641  1.5120
  2.4609  1.8826  0.7043  0.5957  1.9411
  2.5767  1.9713  0.7374  0.6238  2.0324
 [torch.DoubleTensor of size 3x5], Variable containing:
  8.0665
  3.8004
  3.4847
  5.7956
  4.4506
 [torch.DoubleTensor of size 5])

Let's check it:

In [42]:
from torch.autograd import gradcheck

A = Variable(torch.randn(5, 7).double(), requires_grad=True)
x = Variable(torch.randn(7).double(), requires_grad=True)
input = (A, x)
test = gradcheck(matvec, input, eps=1e-10, atol=1e-4)
print(test)

True
