<h3>This is <strong>Hyperwrap</strong>: a library for efficient hyperparameter optimization</h3>

Cloning the OHO library

In [2]:
!git clone https://github.com/jiwoongim/OHO.git

Cloning into 'OHO'...
remote: Enumerating objects: 114, done.[K
remote: Counting objects: 100% (114/114), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 114 (delta 55), reused 73 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (114/114), 511.40 KiB | 14.21 MiB/s, done.
Resolving deltas: 100% (55/55), done.


In [3]:
%cd OHO
!pip install -e .
%cd ..
%mkdir save_dir
%cd OHO

/content/OHO
Obtaining file:///content/OHO
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: metaopt
  Running setup.py develop for metaopt
Successfully installed metaopt-1.0
/content
/content/OHO


Imports

In [None]:
# %cd ../..
try: 
    from metaopt.mnist.main import load_mnist
except:
    %cd metaopt/mnist
    from metaopt.mnist.main import load_mnist
from metaopt.util import *
from metaopt.util_ml import *
from metaopt.mnist.mlp import *

import os, sys, math, argparse, time, pickle
import torch, torch.nn as nn, torch.optim as optim, numpy as np
from torch.optim.optimizer import Optimizer
from copy import copy
from itertools import tee
from operator import mul

[Errno 2] No such file or directory: 'metaopt/mnist'
/content/OHO/metaopt/mnist


Some argument settings for using the OHO library
Download the MNIST database

In [7]:
parser = argparse.ArgumentParser(description='Example parser')
parser.add_argument('--valid_size', type=int, default=10000)
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--is_cuda', type=int)
parser.add_argument('--batch_size_vl', type=int)
args = parser.parse_args(['--valid_size', '10000', '--batch_size', '100', '--is_cuda', '1', '--batch_size_vl', '100'])
dataset = load_mnist(args)

100%|██████████| 9.91M/9.91M [00:00<00:00, 17.9MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 479kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.46MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 7.47MB/s]


Model definitions

In [None]:
class MLP(nn.Module):

    def __init__(self, n_layers, layer_sizes, lr_init, is_cuda=0):
        super(MLP, self).__init__()


        self.layer_sizes = layer_sizes
        self.n_layers = n_layers
        self.n_params = 0
        for i in range(1, self.n_layers):
            attr = 'layer_{}'.format(i)
            layer = nn.Linear(layer_sizes[i - 1], layer_sizes[i])
            if is_cuda: layer = layer.cuda()
            setattr(self, attr, layer)

            param_size = (layer_sizes[i - 1] + 1) * layer_sizes[i]
            self.n_params += param_size

        self.param_sizes = [p.numel() for p in self.parameters()]
        self.param_shapes = [tuple(p.shape) for p in self.parameters()]
        self.param_cumsum = np.cumsum([0] + self.param_sizes)

        self.reset_jacob(is_cuda)
        self.eta  = lr_init
        # self.lambda_l2 = lambda_l2
        self.name = 'MLP'
        self.grad_norm = 0
        self.grad_norm_vl = 0
        self.grad_angle = 0
        self.param_norm = 0
        self.dFdlr_norm = 0
        self.dFdl2_nrom = 0

    def reset_jacob(self, is_cuda=1):
        self.dFdlr = torch.zeros(self.n_params)
        self.dFdlr_norm = 0
        if is_cuda:
            self.dFdlr = self.dFdlr.cuda()

    def forward(self, x, logsoftmaxF=1):

        x = x.view(-1, self.layer_sizes[0])
        for i_layer in range(1, self.n_layers):
            attr = 'layer_{}'.format(i_layer)
            layer = getattr(self, attr)
            x = layer(x)
            if i_layer < self.n_layers - 1:
                x = torch.tanh(x)
        if logsoftmaxF:
            return F.log_softmax(x, dim=1)
        else:
            return F.softmax(x, dim=1)

    def update_dFdlr(self, Hv, param, grad, is_cuda=0, opt_type='sgd', noise=None, N=50000):
        self.Hlr = self.eta*Hv
        self.Hlr_norm = norm(self.Hlr)
        self.dFdlr_norm = norm(self.dFdlr)
        if opt_type == 'sgld':
            if noise is None: noise = torch.randn(size=param.shape)
            self.dFdlr.data = self.dFdlr.data + 0.5 * torch.sqrt(2*noise / self.eta / N)
        - self.Hl2  - 2*self.eta*param


    def update_eta(self, mlr, val_grad):

        val_grad = flatten_array(val_grad)
        delta = val_grad.dot(self.dFdlr).data.cpu().numpy()
        self.eta -= mlr * delta
        self.eta = np.maximum(0.0, self.eta)

class AMLP(MLP):

    def __init__(self, n_layers, layer_sizes, lr_init, is_cuda=0):
        super(MLP, self).__init__()


        self.layer_sizes = layer_sizes
        self.n_layers = n_layers
        self.n_params = 0
        for i in range(1, self.n_layers):
            attr = 'layer_{}'.format(i)
            layer = nn.Linear(layer_sizes[i - 1], layer_sizes[i])
            if is_cuda: layer = layer.cuda()
            setattr(self, attr, layer)

            param_size = (layer_sizes[i - 1] + 1) * layer_sizes[i]
            self.n_params += param_size

        self.param_sizes = [p.numel() for p in self.parameters()]
        self.param_shapes = [tuple(p.shape) for p in self.parameters()]
        self.param_cumsum = np.cumsum([0] + self.param_sizes)

        self.reset_jacob(is_cuda)
        self.eta  = np.ones(len(self.param_sizes)) * lr_init

        self.name = 'MLP'

    def _get_adaptive_hyper(self, l2_params, is_cuda=0):

        layerwise_eta, layerwise_l2, layerwise_eta_np, layerwise_l2_np = [], [], [], []
        for i, shape in enumerate(self.param_shapes):
            layerwise_eta.append(self.eta[i] * torch.ones(shape).flatten())
            layerwise_l2.append(l2_params[i] * torch.ones(shape).flatten())

        layerwise_l2 = torch.cat(layerwise_l2)
        layerwise_eta = torch.cat(layerwise_eta)

        if is_cuda:
            layerwise_l2 = layerwise_l2.cuda()
            layerwise_eta = layerwise_eta.cuda()
        return layerwise_eta, layerwise_l2

    def update_dFdlr(self, Hv, param, grad, lambda_l2_params, is_cuda=0, opt_type='sgd', noise=None, N=50000):

        layerwise_eta, layerwise_l2 = self._get_adaptive_hyper(lambda_l2_params, is_cuda)

        self.Hlr = layerwise_eta *Hv
        self.Hlr_norm = norm(self.Hlr)
        self.dFdlr_norm = norm(self.dFdlr)
        self.dFdlr.data = self.dFdlr.data * (1-2*layerwise_l2*layerwise_eta) \
                                - self.Hlr - grad - 2*layerwise_l2*param
        if opt_type == 'sgld':
            if noise is None: noise = torch.randn(size=param.shape)
            self.dFdlr.data = self.dFdlr.data +  0.5 * torch.sqrt(2 * noise  / N / layerwise_eta)


    def update_eta(self, mlr, val_grad):

        dFdlr_ = unflatten_array(self.dFdlr, self.param_cumsum, self.param_shapes)
        for i, (dFdlr_l, val_grad_l) in enumerate(zip(dFdlr_, val_grad)):
            dFdlr_l = flatten_array(dFdlr_l)
            val_grad_l = flatten_array(val_grad_l)
            delta = (val_grad_l.dot(dFdlr_l)).data.cpu().numpy()
            self.eta[i] -= mlr * delta
            self.eta[i] = np.maximum(0, self.eta[i])

# model as a function of parameters and an input, for evograd perturbed models ouputs
def model_patched(model, parameters, x):
    x = x.view(-1, model.layer_sizes[0])
    param_idx = 0

    for i in range(1, model.n_layers):
        w_shape = (model.layer_sizes[i], model.layer_sizes[i-1])
        b_shape = (model.layer_sizes[i],)

        W = parameters[param_idx].view(w_shape)
        b = parameters[param_idx + 1].view(b_shape)
        param_idx += 2

        x = F.linear(x, W, b)
        if i < model.n_layers - 1:
            x = torch.tanh(x)

    return F.log_softmax(x, dim=1)

Optimizers

In [None]:
class SGD_Multi_LR(Optimizer):

    def __init__(self, params, lr=0.005):

        params, params_copy = tee(params)
        LR = []
        for p in params:
            LR.append(lr*np.ones(p.shape))

        defaults = dict(lr=LR)
        super(SGD_Multi_LR, self).__init__(params_copy, defaults)

    def __setstate__(self, state):
        super(SGD_Multi_LR, self).__setstate__(state)


    def step(self):
        """Performs a single optimization step."""

        for group in self.param_groups:
            for param, lr in zip(group['params'], group['lr']):
                if param.grad is None:
                    continue

                d_p = param.grad.data
                lr = torch.from_numpy(np.asarray([lr]))

                if d_p.is_cuda:
                    lr = lr.cuda()
                p_change = -lr[0] * (d_p)
                param.data.add_(p_change)


In [None]:
def loss(pred, target, model):
    return F.nll_loss(pred, target)

Training functions

In [None]:
TRAIN=0
VALID=1
TEST =2


def train(model_type, mlr, num_epoch, reset_freq, update_freq, checkpoint_freq, opt_type, dataset, model, optimizer, fdir, saveF=0, is_cuda=1):
    counter = 0
    lr_list, l2_list = [], []
    dFdlr_list, Wn_list, gang_list = [], [], []
    tr_epoch, tr_loss_list, tr_acc_list = [], [], []
    vl_epoch, vl_loss_list, vl_acc_list = [], [], []
    te_epoch, te_loss_list, te_acc_list = [], [], []
    tr_corr_mean_list, tr_corr_std_list = [], []
    optimizer = update_optimizer_hyperparams(model, optimizer)
    lambda_l2_params = nn.ParameterList([
        nn.Parameter(torch.tensor(0.0000, dtype=torch.float32))
        for _ in range(len(model.param_sizes))
    ])
    evo_grad_opt = torch.optim.SGD(lambda_l2_params.parameters(), lr=mlr)

    start_time0 = time.time()
    for epoch in range(num_epoch+1):
        if epoch % 10 == 0:
            te_losses, te_accs = [], []
            for batch_idx, (data, target) in enumerate(dataset[TEST]):
                data, target = to_torch_variable(data, target, is_cuda, floatTensorF=1)

                _, loss, accuracy, _, _, _ = feval(data, target, model, optimizer, lambda_l2_params, mode='eval', is_cuda=is_cuda)
                te_losses.append(loss)
                te_accs.append(accuracy)
            te_epoch.append(epoch)
            te_loss_list.append(np.mean(te_losses))
            te_acc_list.append(np.mean(te_accs))

            print('Valid Epoch: %d, Loss %f Acc %f' %
                (epoch, np.mean(te_losses), np.mean(te_accs)))


        grad_list = []
        start_time = time.time()
        for batch_idx, (data, target) in enumerate(dataset[TRAIN]):

            data, target = to_torch_variable(data, target, is_cuda)
            opt_type = opt_type
            model, loss, accuracy, output, noise, grad_vec = feval(data, target, model, optimizer, lambda_l2_params, \
                                is_cuda=is_cuda, mode='meta-train', opt_type=opt_type)
            tr_epoch.append(counter)
            tr_loss_list.append(loss)
            tr_acc_list.append(accuracy)
            grad_list.append(grad_vec)

            if reset_freq > 0 and counter % reset_freq == 0:
                model.reset_jacob()

            if counter % update_freq == 0 and mlr != 0.0:
                data_vl, target_vl = next(dataset[VALID])
                data_vl, target_vl = to_torch_variable(data_vl, target_vl, is_cuda)
                model, loss_vl, optimizer = meta_update(data_vl, target_vl, data, target, model, optimizer, evo_grad_opt, lambda_l2_params, noise)
                vl_epoch.append(counter)
                vl_loss_list.append(loss_vl.item())

            counter += 1
        corr_mean, corr_std = compute_correlation(grad_list, normF=1)
        tr_corr_mean_list.append(corr_mean)
        tr_corr_std_list.append(corr_std)
        grad_list = np.asarray(grad_list)

        end_time = time.time()
        if epoch == 0: print('Single epoch timing %f' % ((end_time-start_time) / 60))


        if epoch % checkpoint_freq == 0:
            os.makedirs(fdir+ '/checkpoint/', exist_ok=True)
            save(model, fdir+ '/checkpoint/epoch%d' % epoch)


        fprint = 'Train Epoch: %d, Tr Loss %f Vl loss %f Acc %f Eta %s, L2 %s, |dFdlr| %.2f |G| %.4f |G_vl| %.4f Gang %.3f |W| %.2f, Grad Corr %f %f'
        print(fprint % (epoch, np.mean(tr_loss_list[-100:]), \
                        np.mean(vl_loss_list[-100:]), \
                        np.mean(tr_acc_list[-100:]), \
                        str(model.eta), str([p for p in lambda_l2_params]), \
                        model.dFdlr_norm,\
                        model.grad_norm,  model.grad_norm_vl, \
                        model.grad_angle, model.param_norm, corr_mean, corr_std))

        Wn_list.append(model.param_norm)
        dFdlr_list.append(model.dFdlr_norm)
        if model_type == 'amlp':
            lr_list.append(model.eta.copy())
            l2_list.append([p.data.cpu().numpy() for p in lambda_l2_params])
        else:
            lr_list.append(model.eta)
            l2_list.append(lambda_l2_params)
        gang_list.append(model.grad_angle)

    Wn_list = np.asarray(Wn_list)
    l2_list = np.asarray(l2_list)
    lr_list = np.asarray(lr_list)
    dFdlr_list = np.asarray(dFdlr_list)
    tr_epoch = np.asarray(tr_epoch)
    vl_epoch = np.asarray(vl_epoch)
    te_epoch = np.asarray(te_epoch)
    tr_acc_list = np.asarray(tr_acc_list)
    te_acc_list = np.asarray(te_acc_list)
    tr_loss_list = np.asarray(tr_loss_list)
    vl_loss_list = np.asarray(vl_loss_list)
    te_loss_list = np.asarray(te_loss_list)
    gang_list = np.asarray(gang_list)
    tr_corr_mean_list = np.asarray(tr_corr_mean_list)
    tr_corr_std_list = np.asarray(tr_corr_std_list)

    return Wn_list, l2_list, lr_list, dFdlr_list, gang_list, \
                tr_epoch, vl_epoch, te_epoch, tr_acc_list, te_acc_list, \
                tr_loss_list, vl_loss_list, te_loss_list, tr_corr_mean_list, tr_corr_std_list


In [None]:
def criterion(pred, target, model, lambda_l2_params):
    loss = F.nll_loss(pred, target)
    l2_penalty = sum((p**2).sum() * (lmbda / 2) for p, lmbda in zip(model.parameters(), lambda_l2_params))
    return loss + l2_penalty

def feval(data, target, model, optimizer, lambda_l2_params, mode='eval', is_cuda=0, opt_type='sgd', N=50000):

    if mode == 'eval':
        model.eval()
        with torch.no_grad():
            output = model(data)
    else:
        model.train()
        optimizer.zero_grad()
        output = model(data)

    loss = criterion(output, target, model, lambda_l2_params)
    pred = output.argmax(dim=1, keepdim=True).flatten()
    accuracy = pred.eq(target).float().mean()

    grad_vec = []
    noise = None
    if 'train' in mode:
        loss.backward()

        for i,param in enumerate(model.parameters()):
            if opt_type == 'sgld':
                noise = torch.randn(size=param.shape)
                if type(model.eta) == type(np.array([])):
                    eps = np.sqrt(model.eta[i]*2/ N) * noise  if model.eta[i] > 0 else 0 * noise
                else:
                    eps = np.sqrt(model.eta*2/ N) * noise  if model.eta > 0 else 0 * noise
                eps = to_torch_variable(eps, is_cuda=is_cuda)
                param.grad.data = param.grad.data + eps.data
            grad_vec.append(param.grad.data.cpu().numpy().flatten())

        if 'SGD_Quotient_LR' in str(optimizer):
            optimizer.mlp_step()
        else:
            optimizer.step()
        grad_vec = np.hstack(grad_vec)
        grad_vec = grad_vec / norm_np(grad_vec)

    elif 'grad' in mode:
        loss.backward()

    return model, loss.item(), accuracy.item(), output, noise, grad_vec

In [None]:
def update_optimizer_hyperparams(model, optimizer):
    optimizer.param_groups[0]['lr'] = np.copy(model.eta)
    return optimizer

In [None]:
# evograd parameters
n_model_candidates = 2
sigma = 0.001
temperature = 0.05

def meta_update(data_vl, target_vl, data_tr, target_tr, model, optimizer, evo_grad_opt, lambda_l2_params, noise=None, is_cuda=1):
    param_shapes = model.param_shapes
    dFdlr = unflatten_array(model.dFdlr, model.param_cumsum, param_shapes)
    Hv_lr  = compute_HessianVectorProd(model, dFdlr, data_tr, target_tr, is_cuda=is_cuda)

    model, loss_valid, grad_valid = get_grad_valid(model, data_vl, target_vl, is_cuda)

    grad = flatten_array(get_grads(model.parameters(), is_cuda)).data
    param = flatten_array(model.parameters())#.data.cpu().numpy()
    model.grad_norm = norm(grad)
    model.param_norm = norm(param)
    grad_vl = flatten_array(grad_valid)
    model.grad_angle = torch.dot(grad / model.grad_norm, grad_vl / model.grad_norm_vl).item()

    model.update_dFdlr(Hv_lr, param, grad, lambda_l2_params, is_cuda, noise=noise)
    model.update_eta(mlr, val_grad=grad_valid)
    param = flatten_array_w_0bias(model.parameters()).data

    model_parameters = [i.detach() for i in model.parameters()]
    theta_list = [[j + sigma*torch.sign(torch.randn_like(j)) for j in model_parameters] for i in range(n_model_candidates)]
    pred_list = [model_patched(model, theta, data_vl) for theta in theta_list]
    loss_list = [criterion(pred, target_vl, model, lambda_l2_params) for pred in pred_list]
    weights = torch.softmax(-torch.stack(loss_list) / temperature, 0)
    theta_updated = [sum(map(mul, theta, weights)) for theta in zip(*theta_list)]
    preds_meta = model_patched(model, theta_updated, data_vl)
    loss_l2 = loss(preds_meta, target_vl, model)
    evo_grad_opt.zero_grad()
    grads = torch.autograd.grad(loss_l2, lambda_l2_params.parameters(), retain_graph=True)
    for p, g in zip(lambda_l2_params.parameters(), grads):
        p.grad = g
    evo_grad_opt.step()
    with torch.no_grad():
      for p in lambda_l2_params:
          p.clamp_(min=0.0, max=0.0001)
    
    optimizer = update_optimizer_hyperparams(model, optimizer)

    return model, loss_valid, optimizer

In [17]:
def get_grad_valid(model, data, target, is_cuda):

    val_model = deepcopy(model)
    val_model.train()

    output = val_model(data)
    loss = F.nll_loss(output, target)
    loss.backward()
    grads = get_grads(val_model.parameters(), is_cuda)
    model.grad_norm_vl = norm(flatten_array(grads))

    return model, loss, grads

Pipeline

In [None]:
model_type = "amlp"
mlr = 0.00001
num_epoch = 10
reset_freq = 0
update_freq = 1
checkpoint_freq = 10
opt_type = 'sgd'
lr = 0.1
lambda_l2 = 0.000
is_cuda = 1
num_layers = 5
hdims = [784] + [128]*3 + [10]
fdir = '../save_dir'

model = AMLP(num_layers, hdims, lr, is_cuda=is_cuda)
optimizer = SGD_Multi_LR(model.parameters(), lr=lr)
Wn_list, l2_list, lr_list, dFdlr_list, gang_list, tr_epoch, vl_epoch, te_epoch,\
                            tr_acc_list, te_acc_list, tr_loss_list, vl_loss_list, te_loss_list,\
                            tr_corr_mean_list, tr_corr_std_list \
                            = train(model_type, mlr, num_epoch, reset_freq, update_freq, checkpoint_freq, opt_type, dataset, model, optimizer, fdir, is_cuda=is_cuda)

print('Final test loss %f' % te_loss_list[-1])

Valid Epoch: 0, Loss 2.305242 Acc 0.095600
Single epoch timing 0.418067
Train Epoch: 0, Tr Loss 0.318316 Vl loss 0.328819 Acc 0.907600 Eta [0.10474716 0.10005984 0.10188739 0.10004982 0.10257516 0.10005449
 0.10425777 0.10009306], L2 [Parameter containing:
tensor(1.4864e-09, requires_grad=True), Parameter containing:
tensor(3.0186e-12, requires_grad=True), Parameter containing:
tensor(1.4676e-09, requires_grad=True), Parameter containing:
tensor(1.0841e-11, requires_grad=True), Parameter containing:
tensor(1.5180e-09, requires_grad=True), Parameter containing:
tensor(1.0180e-11, requires_grad=True), Parameter containing:
tensor(5.6399e-10, requires_grad=True), Parameter containing:
tensor(4.2836e-12, requires_grad=True)], |dFdlr| 22.73 |G| 1.0606 |G_vl| 0.6352 Gang -0.117 |W| 13.85, Grad Corr 0.028311 0.196882
Train Epoch: 1, Tr Loss 0.261353 Vl loss 0.253437 Acc 0.922000 Eta [0.10647327 0.10008544 0.10236875 0.100089   0.10295855 0.10009383
 0.10463858 0.10012676], L2 [Parameter conta

In [22]:
!python -u main.py --is_cuda 1 --mlr 0.00001 --lr 0.1 --lambda_l2 0.0000 --opt_type sgd --update_freq 1 --save 1  --model_type amlp --num_epoch 10 --batch_size_vl 100 --save_dir '../../save_dir'

Model Type: amlp Opt Type: sgd Update Freq 1 Reset Freq 0
../../save_dir/exp/mnist/mlr0.000010_lr0.100000_l20.000000/amlp_10epoch_100vlbz_sgd_1updatefreq_0resetfreq_fold0/
Valid Epoch: 0, Loss 2.305165 Acc 0.097500
Single epoch timing 0.451251
Train Epoch: 0, Tr Loss 0.318154 Vl loss 0.348746 Acc 0.905900 Eta [0.10505738 0.10005581 0.1018957  0.10003892 0.10257875 0.10003885
 0.10419294 0.10005693], L2 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.72420451e-08 1.58655837e-07 1.65552425e-08], |dFdlr| 22.78 |dFdl2| 1179.96 |G| 0.9697 |G_vl| 0.9494 Gang -0.160 |W| 13.86, Grad Corr 0.030127 0.193183
Train Epoch: 1, Tr Loss 0.258982 Vl loss 0.262102 Acc 0.924300 Eta [0.10675105 0.10011777 0.10244388 0.10008138 0.10310605 0.1000797
 0.10470598 0.10010963], L2 [0.00000000e+00 9.21795023e-08 0.00000000e+00 2.85593455e-07
 0.00000000e+00 1.65916662e-07 1.38395976e-07 8.37083736e-08], |dFdlr| 32.04 |dFdl2| 2386.21 |G| 0.7515 |G_vl| 0.9013 Gang 0.163 |W| 14.60, Gr