In [1]:
import torch
import torch.nn.functional as F
from hessian import hessian
from hessian_eigenthings.power_iter import Operator, deflated_power_iteration
from hessian_eigenthings.lanczos import lanczos
from lanczos_generalized import lanczos_generalized
from GAN_hvp_operator import GANHVPOperator, compute_hessian_eigenthings
import numpy as np
from time import time
from imageio import imwrite
from build_montages import build_montages
import matplotlib.pylab as plt
from os.path import join

In [None]:
from GAN_utils import upconvGAN
# Generator built by the project-local `upconvGAN` with the "fc6" setting.
# Its parameters are frozen: only gradients w.r.t. the latent code are needed below.
G = upconvGAN("fc6")
G.requires_grad_(False).cuda() # this notation is incorrect in older pytorch
#%
import torchvision as tv
# VGG = tv.models.vgg16(pretrained=True)
# nn.Module instances start in training mode, and AlexNet's classifier contains
# Dropout layers. Without .eval() the fc8 output used later in this notebook is
# stochastic, so any gradient/Hessian computed through it changes from call to call.
alexnet = tv.models.alexnet(pretrained=True).cuda().eval()
# Freeze the weights parameter by parameter (works on older PyTorch as well).
for param in alexnet.parameters():
    param.requires_grad_(False)

The following cells show that computing the Hessian by differentiating twice with autograd does not work here: the local second-order derivative comes out as 0.

In [None]:
# Probe: second derivative of a weighted (linear) readout of AlexNet activations
# with respect to the generator's latent code.
# NOTE(review): `FeatLinModel` is not imported in the visible part of this notebook;
# it has to be in the kernel namespace already -- confirm where it comes from.
feat = torch.randn(4096).cuda().requires_grad_(True)
weight = torch.randn(192, 31, 31).cuda()
objective = FeatLinModel(alexnet, layername='features_4', type="weight", weight=weight)
act = objective(G.visualize(feat))

# First-order gradient; create_graph=True keeps it differentiable for the second pass.
gradient = torch.autograd.grad(outputs=act, inputs=feat, retain_graph=True, create_graph=True)

# Contract the second derivative with a constant probe vector (10 in every coordinate).
probe = 10 * torch.ones(4096).cuda()
torch.autograd.grad(outputs=gradient[0], inputs=feat, grad_outputs=probe, retain_graph=True, only_inputs=True)

In [None]:
# Full Hessian of a single AlexNet output unit with respect to a random latent code.
feat = torch.from_numpy(np.random.randn(4096)).float().cuda()
feat.requires_grad_(True)
img = G.visualize(feat)
fc8 = alexnet.forward(img)
# Objective: negated output of unit 1 for the first (only) sample.
act = -fc8[0, 1]
H = hessian(act, feat, create_graph=False)

In [None]:
# Simpler objective that bypasses AlexNet entirely: the negated mean of the
# generator output. (The fc8-based objective and its Hessian are in the cell above.)
feat = torch.from_numpy(np.random.randn(4096)).float().cuda().requires_grad_(True)
img = G.visualize(feat)
act = img.mean().neg()

In [None]:
# Differentiate the current `act` twice with respect to `feat`.
# Pass 1: gradient, kept in the graph so it can be differentiated again.
gradient = torch.autograd.grad(outputs=act, inputs=feat, retain_graph=True, create_graph=True)
# Pass 2: contract the second derivative with a constant probe vector (10 everywhere).
probe = 10 * torch.ones(4096).cuda()
torch.autograd.grad(outputs=gradient[0], inputs=feat, grad_outputs=probe, retain_graph=True, only_inputs=True)

In [None]:
# Full Hessian of the current objective `act` with respect to the latent code `feat`,
# computed with the imported `hessian` helper (no graph is kept for the result).
H = hessian(act, feat, create_graph=False)
# it will be all zero


In [None]:
# Sanity check on a function with a known, non-zero Hessian:
# y = x^T A x, so grad = (A + A^T) x and Hessian = A + A^T = 2A (A is symmetric).
x = torch.tensor([1.0, 2])
x.requires_grad_(True)
A = torch.tensor([[2.0, 3], [3, 1]])
y = x.view(1, -1) @ A @ x.view(-1, 1)
# create_graph=True keeps the gradient differentiable for the second pass.
x_grad = torch.autograd.grad(y, x, retain_graph=True, create_graph=True)
# The gradient has two elements, so autograd.grad cannot create an implicit
# grad_outputs for it (that is only allowed for single-element outputs).
# Differentiate each gradient component separately to build the Hessian row by row.
H_quad = torch.stack([torch.autograd.grad(g_i, x, retain_graph=True)[0] for g_i in x_grad[0]])
H_quad

In [None]:
# Same double-backward probe, but with a *squared* objective, so the objective is
# no longer linear in the network activations.
feat = torch.from_numpy(np.random.randn(4096)).float().cuda()
feat.requires_grad_(True)
img = G.visualize(feat)
resz_img = F.interpolate(img, size=(224, 224), mode='bilinear', align_corners=True)
# Activations of the first 10 `features` modules at spatial position (6, 6),
# averaged over channels and then squared.
conv_act = alexnet.features[:10](resz_img)
obj = conv_act[0, :, 6, 6].mean().pow(2)  # alternative objective: resz_img.std()
ftgrad = torch.autograd.grad(outputs=obj, inputs=feat, retain_graph=True, create_graph=True, only_inputs=True)
# Contract the second derivative with a random probe vector.
probe = torch.randn(4096).cuda()
torch.autograd.grad(outputs=1 * ftgrad[0], inputs=feat, grad_outputs=probe, retain_graph=True, only_inputs=True)

So here is the conclusion:
* If the objective function is linear in the activations of a ReLU network, then its 2nd-order derivative with respect to the input is analytically 0 (almost everywhere). We need finite differencing in this case.
* As the perceptual loss takes a squared difference when comparing
feature tensors, the loss depends quadratically on the features, so its derivative is not independent of the image.

There is literature proving that the family of functions a ReLU network can represent is exactly the piecewise linear functions. So it is not surprising that the 2nd derivative is 0 almost everywhere. Using finite differencing is, in effect, computing the Hessian of a smoothed version of this function.

![](https://upload.wikimedia.org/wikipedia/commons/6/6d/Piecewise_linear_function2D.svg)

So next, I show the feasibility of using central (symmetric) finite differencing of the gradient to compute an approximate **Hessian vector product**. The formula follows from the Taylor expansion of the gradient $g(x) = \nabla f(x)$:

$$Hv \approx \frac{g(x+\epsilon v) - g(x-\epsilon v)}{2\epsilon}$$

In [None]:
# Central finite-difference estimate of the Hessian-vector product
#     Hv ~= (g(x + eps*v) - g(x - eps*v)) / (2*eps)
# for a range of step sizes, followed by the correlation between the estimates.
feat = torch.tensor(np.random.randn(4096)).float().cuda()
feat.requires_grad_(False)
vect = torch.tensor(np.random.randn(4096)).float().cuda()
vect = vect / vect.norm()  # unit-norm direction, so eps is the perturbation norm
vect.requires_grad_(False)

# Step sizes, largest first; row/column i of `corrmat` corresponds to eps_list[i].
eps_list = [100, 50, 25, 10, 5, 1, 5E-1, 1E-1, 1E-2, 1E-3, 1E-4, 1E-5, 1E-6]
# Loop-invariant pieces hoisted out of the sweep.
signs = torch.tensor([1, -1.0]).view(-1, 1).cuda()  # row 0 -> x + eps*v, row 1 -> x - eps*v
feat_extractor = alexnet.features[:10]

hvp_col = []
for eps in eps_list:
    perturb_vecs = feat.detach() + eps * signs * vect.detach()
    perturb_vecs.requires_grad_(True)
    img = G.visualize(perturb_vecs)
    resz_img = F.interpolate(img, (224, 224), mode='bilinear', align_corners=True)
    # Per-sample objective = channel mean at position (6, 6); the two samples are
    # SUMMED. Averaging over the batch as well would scale each sample's gradient
    # by 1/2 and make `hvp` half of the true finite-difference estimate.
    obj = feat_extractor(resz_img)[:, :, 6, 6].mean(dim=1).sum()   # alternative objective: resz_img.std()
    ftgrad_both = torch.autograd.grad(obj, perturb_vecs, retain_graph=False, create_graph=False, only_inputs=True)
    hvp = (ftgrad_both[0][0, :] - ftgrad_both[0][1, :]) / (2 * eps)
    hvp_col.append(hvp)
    # print(hvp)

# One row per step size.
hvp_arr = torch.stack(hvp_col, dim=0)
# NOTE(review): if an estimate is constant (e.g. all zeros because a tiny eps rounds
# away in float32), np.corrcoef yields NaN for that row/column -- check the output.
corrmat = np.corrcoef(hvp_arr.cpu().numpy())

In [None]:
# Heat map of the pairwise correlation between HVP estimates at different step sizes.
# The labels must list every step size used to build `corrmat`, in the same order.
# The previous version listed 12 of the 13 (100 was missing), which shifted every
# label by one and cropped the last row/column.
eps_labels = [100, 50, 25, 10, 5, 1, 5E-1, 1E-1, 1E-2, 1E-3, 1E-4, 1E-5, 1E-6]
n_eps = corrmat.shape[0]
assert n_eps == len(eps_labels), "tick labels must match the number of step sizes in corrmat"

plt.matshow(corrmat, cmap=plt.cm.jet)
plt.yticks(range(n_eps), labels=eps_labels)
plt.xticks(range(n_eps), labels=eps_labels)
# Explicit limits keep the first and last cells fully visible (y axis runs downward).
plt.ylim(top=-0.5, bottom=n_eps - 0.5)
plt.xlim(left=-0.5, right=n_eps - 0.5)
# NOTE(review): the label/title text ("Base Vector Norm 300", "500 Trials",
# "forward differencing") does not obviously match the cell that builds `corrmat`
# (single random direction, central difference) -- confirm before publishing.
plt.xlabel("Perturb Vector Norm (Base Vector Norm 300)")
plt.suptitle("Correlation of HVP result (500 Trials)\nusing different EPS in forward differencing\n")
plt.colorbar()