In [None]:
import torch
import polyak


# Decision variable: a random 2-vector in double precision that autograd tracks.
x = torch.randn(2, requires_grad=True, dtype=torch.double)


def loss_function():
    """Return the l1 norm of ``x`` (sum of absolute values) as a 0-dim tensor.

    The reduction is done with the tensor method rather than Python's builtin
    ``sum``, which would iterate over the tensor one element at a time.
    """
    return torch.abs(x).sum()

def closure():
    """Evaluate the loss and populate gradients for one optimizer step.

    Calls ``optimizer.zero_grad()``, evaluates ``loss_function()``,
    backpropagates through it, and returns the loss tensor.
    """
    optimizer.zero_grad()
    current_loss = loss_function()
    current_loss.backward()
    return current_loss

# Hand the single tensor x to the Polyak optimizer as its parameter list.
param = [x]
optimizer = polyak.Polyak(param)

# Run a fixed number of Polyak steps, reporting the value returned by each
# step together with the current iterate.
num_steps = 100
for iteration in range(num_steps):
    loss = optimizer.step(closure)
    print("Iteration: {}, Loss: {}".format(iteration, loss))
    print("x: {}".format(x.data))


# Illustration of the SuperPolyak optimizer class
This example runs with the default options, i.e., bundle size 1.

In [None]:
import torch
import SuperPolyak

# Fresh random starting point (2-vector, double precision) tracked by autograd.
x = torch.randn(2, requires_grad=True, dtype=torch.double)


def loss_function():
    """Return the l1 norm of ``x`` (sum of absolute values) as a 0-dim tensor.

    Reduces with ``Tensor.sum`` instead of Python's builtin ``sum``, which
    would loop over the tensor element by element.
    """
    return torch.abs(x).sum()

def closure():
    """Evaluate the loss and populate gradients for one optimizer step.

    Calls ``optimizer.zero_grad()``, evaluates ``loss_function()``,
    backpropagates through it, and returns the loss tensor.
    """
    optimizer.zero_grad()
    current_loss = loss_function()
    current_loss.backward()
    return current_loss

param = [x]
optimizer = SuperPolyak.SuperPolyak(param)
# A short loop that applies the SuperPolyak method to the loss function.
# SuperPolyak's step() yields a (loss, bundle index) pair, so it is unpacked
# here; binding the whole pair to `loss` would print the tuple instead of the
# loss value.
for i in range(100):
    loss, bundle_index = optimizer.step(closure)
    print("Iteration: {}, Loss: {}".format(i, loss))
    print("x: {}".format(x.data))

In [None]:
# Quick working test of superpolyak with numpy functions
import util
import numpy as np
import torch

def f(y):
    """Return the l1 norm of ``y``: the sum of the absolute values of its entries."""
    magnitudes = np.abs(y)
    return np.sum(magnitudes)

def gradf(y):
    """Return the elementwise sign of ``y``, a subgradient of the l1 norm at ``y``.

    Entries equal to zero map to zero.
    """
    subgradient = np.sign(y)
    return subgradient

d = 5  # problem dimension
max_elts = d  # cap on the number of bundle elements
y0 = np.random.randn(d)

# Repeatedly call build_bundle from the current point until the objective is
# numerically zero. The second return value gets its own name so that it does
# not overwrite the dimension `d` (presumably it is the bundle exit index, as
# in the torch variant -- TODO confirm against util.build_bundle).
# The iteration cap prevents an endless loop if the objective stalls above the
# tolerance.
max_iters = 100
current_iter = 0
while f(y0) > 1e-20 and current_iter < max_iters:
    y, bundle_idx = util.build_bundle(f=f, gradf=gradf, y0=y0, tau=1., eta_est=1.5, min_f=0., max_elts=max_elts)
    y0 = y
    print("f(y)", f(y0))
    current_iter += 1

# A second example of the SuperPolyak optimizer class
This example runs with bundle size `d`.

In [None]:
# A pytorch compatible version of the above code.
import util
import numpy as np
import torch
import SuperPolyak

# Problem dimension; max_elts is set equal to it.
d = 5
max_elts = d
# Random starting point in double precision, tracked by autograd.
x = torch.randn(d, requires_grad=True, dtype=torch.double)


def f():
    """Return the l1 norm of ``x`` as a 0-dim tensor."""
    return x.abs().sum()

# SuperPolyak optimizer over the single parameter x.
optimizer = SuperPolyak.SuperPolyak([x], eta_est=1.5, max_elts=max_elts)


def closure():
    """Closure that lets the optimizer re-evaluate the loss and call backward.

    Calls ``optimizer.zero_grad()``, evaluates ``f()``, backpropagates, and
    returns the loss tensor.
    """
    optimizer.zero_grad()
    objective = f()
    objective.backward()
    return objective

# Iterate until the objective is numerically zero or 100 steps have been taken.
# The objective is evaluated once per iterate and the value is reused for both
# the report and the stopping test, instead of calling closure() twice at the
# same point. closure() still runs after every step, so the gradients held by
# x on entry to the next step are the same as before.
current_iter = 0
current_loss = closure().item()
while current_loss > 1e-20 and current_iter < 100:
    # The loss returned by step() is not used; the report below re-evaluates
    # the objective at the updated point.
    _, bundle_index = optimizer.step(closure)
    current_loss = closure().item()
    print("f(y)", current_loss)
    print("Bundle index", bundle_index)
    current_iter += 1

In [None]:
# A version of the previous code that supports parameter groups.
import numpy as np
import torch
import torch.nn.functional as F

# Problem dimension; max_elts is set equal to it.
d = 5
max_elts = d
# Random starting point in double precision, tracked by autograd.
x = torch.randn(d, requires_grad=True, dtype=torch.double)


def f():
    """Return the l1 norm of ``x`` as a 0-dim tensor."""
    return x.abs().sum()


def closure():
    """Closure function to allow us to call backward without an optimizer object.

    Ensures ``x.grad`` is a zero tensor before backpropagating (creating it on
    the first call, zeroing it in place afterwards), then evaluates ``f()``,
    calls ``backward()`` and returns the loss tensor.
    """
    if x.grad is not None:
        x.grad.data.zero_()
    else:
        x.grad = torch.zeros_like(x)
    objective = f()
    objective.backward()
    return objective

# This cell calls into `util`, so import it here rather than relying on an
# earlier cell having already done so.
import util

# Iterate until the objective is numerically zero or 100 bundle steps have run.
# The objective is evaluated once per iterate and reused for both the report
# and the stopping test. closure() still runs after every bundle step, so the
# gradient stored on x when the next step starts is unchanged.
current_iter = 0
current_loss = closure().item()
while current_loss > 1e-20 and current_iter < 100:
    # The first return value is discarded; the iterate is read back through
    # closure(), so the helper presumably updates x in place (TODO confirm
    # against util.build_bundle_torch_param_groups).
    _, bundle_idx = util.build_bundle_torch_param_groups(
        closure, params=[x], tau=np.inf, eta_est=1.5, min_f=0., max_elts=max_elts
    )
    current_loss = closure().item()
    print("f(y)", current_loss)
    print("Bundle index", bundle_idx)
    current_iter += 1

In [1]:
# Fitting a small neural network with pytorch.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import util

input_size = 100


class Net(nn.Module):
    """Two-layer fully connected network in double precision.

    Maps ``input_size`` features to 2000 hidden units with a ReLU, then to a
    single output.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 2000, dtype=torch.double)
        self.fc2 = nn.Linear(2000, 1, dtype=torch.double)

    def forward(self, x):
        """Apply ``fc1``, a ReLU, then ``fc2`` to ``x`` and return the result."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# First network instance: its outputs on random inputs become the targets.
net = Net()
# Let d be the number of trainable parameters in net.
trainable_params = [p for p in net.parameters() if p.requires_grad]
d = sum(p.numel() for p in trainable_params)
print("number of parameters", d)
max_elts = 200
print("bundle size", max_elts)

# Fake training data: random inputs, with targets produced by the network above.
x = torch.randn(1000, input_size, dtype=torch.double)
# Detached copy, so the targets carry no autograd history.
y = net(x).detach().clone()
# Reset the parameters by building a freshly initialised network.
net = Net()

# a loss function
def loss_function():
    """Return the l1 training loss ``sum |net(x) - y|`` as a 0-dim tensor.

    The reduction uses ``Tensor.sum`` rather than Python's builtin ``sum``.
    The builtin iterates over the rows of the residual one by one, adding an
    autograd node per row, and returns a one-element tensor instead of a scalar.
    """
    return torch.abs(net(x) - y).sum()

def closure():
    """Closure that lets the bundle routine re-evaluate the loss and call backward.

    Calls ``net.zero_grad()``, evaluates ``loss_function()``, backpropagates,
    and returns the loss tensor.
    """
    net.zero_grad()
    objective = loss_function()
    objective.backward()
    return objective

max_oracle_calls = 10000
params = list(net.parameters())
linsys_solver = util.BundleLinearSystemSolver.LSMR

# Each closure() call is a full forward/backward pass over the network, so the
# loss is evaluated once per iterate and reused for the report, the recorded
# gap and the stopping test. closure() still runs after every bundle step, so
# the gradients stored on the parameters when the next step starts are the same
# as if the loss had been re-evaluated in the loop condition.
current_loss = closure().item()
gap = [current_loss]
cumulative_oracle_calls = [0]
while current_loss > 1e-10 and cumulative_oracle_calls[-1] < max_oracle_calls:
    _, bundle_idx = util.build_bundle_torch_param_groups(
        closure,
        params=params,
        tau=np.inf,
        eta_est=.1,
        min_f=0.,
        max_elts=max_elts,
        linsys_solver=linsys_solver,
    )
    current_loss = closure().item()
    print("Iteration: ", cumulative_oracle_calls[-1], ", Loss: ", current_loss, ", Bundle_exit_step ", bundle_idx)
    # bundle_idx is added to the running total as this step's oracle-call count.
    cumulative_oracle_calls.append(bundle_idx + cumulative_oracle_calls[-1])
    gap.append(current_loss)

# A matplotlib semilog-y plot of the loss gap.
# The x-axis is the cumulative number of oracle calls.
# The y-axis is the loss.
import matplotlib.pyplot as plt
# Semilog-y plot of the recorded loss against cumulative oracle calls, drawn
# through an explicit Axes object.
fig, ax = plt.subplots()
ax.semilogy(cumulative_oracle_calls, gap)
# x-axis: cumulative oracle calls; y-axis: the $f(x) - f^*$ gap.
ax.set_xlabel("Cumulative oracle calls")
ax.set_ylabel("$f(x) - f^*$")
# The title reports the numerical value of the parameter count d.
ax.set_title("2 layer neural network with {} parameters".format(d))
plt.show()


number of parameters 204001
bundle size 200


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
# Semilog-y plot of the recorded loss against cumulative oracle calls, drawn
# through an explicit Axes object.
fig, ax = plt.subplots()
ax.semilogy(cumulative_oracle_calls, gap)
# x-axis: cumulative oracle calls; y-axis: the $f(x) - f^*$ gap.
ax.set_xlabel("Cumulative oracle calls")
ax.set_ylabel("$f(x) - f^*$")
# The title reports the numerical value of the parameter count d.
ax.set_title("2 layer neural network with {} parameters".format(d))
plt.show()