In [1]:
# Fetch the OHO repository; the next cell installs it in editable mode.
!git clone https://github.com/jiwoongim/OHO.git

Cloning into 'OHO'...
remote: Enumerating objects: 114, done.[K
remote: Counting objects: 100% (114/114), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 114 (delta 55), reused 73 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (114/114), 511.40 KiB | 1.99 MiB/s, done.
Resolving deltas: 100% (55/55), done.


In [2]:
# Install the cloned repo in editable mode from its root directory.
%cd OHO
!pip install -e .
# Step back out to create `save_dir` next to the clone, then return to the repo root
# so that later relative paths resolve from there.
%cd ..
%mkdir save_dir
%cd OHO

/content/OHO
Obtaining file:///content/OHO
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: metaopt
  Running setup.py develop for metaopt
Successfully installed metaopt-1.0
/content
/content/OHO


In [None]:
# Import `load_cifar10` from the repo's CIFAR script. If the first attempt fails, change
# into metaopt/cifar and retry the same import from there.
# NOTE(review): the bare `except:` retries on *any* exception, not only import errors —
# consider narrowing it once the actual failure mode has been confirmed.
try:
    from metaopt.cifar.main import load_cifar10
except:
    %cd metaopt/cifar
    from metaopt.cifar.main import load_cifar10
from metaopt.util import *
from metaopt.util_ml import *
from metaopt.cifar.resnet18 import *

import os, sys, math, argparse, time, pickle
import torch, torch.nn as nn, torch.optim as optim, numpy as np
from torch.optim.optimizer import Optimizer
from copy import copy
from itertools import tee
from operator import mul
import torch.backends.cudnn as cudnn

/content/OHO/metaopt/cifar


In [5]:
# Build the argument namespace handed to `load_cifar10`. The values are supplied as an
# explicit list, so nothing is read from the kernel's own command line.
parser = argparse.ArgumentParser(description='Example parser')
parser.add_argument('--valid_size', default=10000, type=int)
parser.add_argument('--batch_size', default=100, type=int)
parser.add_argument('--is_cuda', type=int)
parser.add_argument('--batch_size_vl', type=int)
args = parser.parse_args([
    '--valid_size', '10000',
    '--batch_size', '100',
    '--is_cuda', '1',
    '--batch_size_vl', '1000',
])
dataset = load_cifar10(args)

100%|██████████| 170M/170M [00:04<00:00, 42.3MB/s]


In [None]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by batch norm.

    The skip path is the identity unless the block changes resolution (``stride != 1``)
    or channel count, in which case it is a 1x1 convolution followed by batch norm.

    Sub-modules are registered in the order conv1, bn1, conv2, bn2, shortcut; code that
    walks ``parameters()`` positionally depends on that order.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main branch: only the first convolution may down-sample.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Skip branch: an empty Sequential acts as the identity.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += self.shortcut(x)
        return F.relu(hidden)


class ResNet(nn.Module):
    """ResNet backbone that also carries the state used for online learning-rate adaptation.

    Besides its layers, the module stores a scalar learning rate ``eta``, an L2
    coefficient ``lambda_l2`` and ``dFdlr``: a flat vector with one entry per model
    parameter that ``update_dFdlr`` accumulates and ``update_eta`` consumes.

    Args:
        block: class used to build each residual block; it is called as
            ``block(in_planes, planes, stride)`` and must expose ``expansion``.
        num_blocks: four ints, the number of blocks in each of the four stages.
        num_classes: output size of the final linear layer.
        lr_init: initial value of ``eta``.
        is_cuda: when truthy, ``dFdlr`` is moved to the GPU.
        lambda_l2: L2 coefficient read by ``update_dFdlr``. Added with a default so
            that existing call sites keep working.
    """

    def __init__(self, block, num_blocks, \
                num_classes=10, lr_init=0.00001, is_cuda=1, lambda_l2=0.0):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

        # Bookkeeping used to flatten / unflatten per-parameter tensors.
        self.param_shapes = [tuple(p.shape) for p in self.parameters()]
        self.param_sizes = [p.numel() for p in self.parameters()]
        self.param_cumsum = np.cumsum([0] + self.param_sizes)
        self.n_params = sum(self.param_sizes)

        self.reset_jacob(is_cuda)
        self.eta  = lr_init
        # update_dFdlr() reads self.lambda_l2; previously it was never set, so the first
        # call raised AttributeError.
        self.lambda_l2 = lambda_l2
        self.name = 'REZ'
        self.grad_norm = 0
        self.grad_norm_vl = 0
        self.grad_angle = 0
        self.param_norm = 0

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            # The next block's input width is this block's output width.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return per-class log-probabilities (``log_softmax`` over dim 1)."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return F.log_softmax(out, dim=1)

    def reset_jacob(self, is_cuda):
        """Reset ``dFdlr`` to a zero vector of length ``n_params`` and its norm to 0."""
        self.dFdlr = torch.zeros(self.n_params)
        self.dFdlr_norm = 0
        if is_cuda:
            self.dFdlr = self.dFdlr.cuda()

    def update_dFdlr(self, Hv, param, grad, is_cuda=0, opt_type='sgd', noise=None, N=50000):
        """Advance the ``dFdlr`` recursion by one step.

        Computes ``dFdlr <- dFdlr * (1 - 2*lambda_l2*eta) - eta*Hv - grad - 2*lambda_l2*param``
        and caches ``Hlr = eta*Hv`` together with the norms of ``Hlr`` and of the
        previous ``dFdlr``.

        Args:
            Hv: flat tensor multiplied by ``eta``; presumably a Hessian-vector product
                supplied by the caller — confirm at the call site.
            param: flat tensor of the current parameters.
            grad: flat tensor of the current gradient.
            is_cuda: accepted for interface compatibility; not used here.
            opt_type: ``'sgld'`` adds an extra noise-dependent term.
            noise: optional noise tensor for the SGLD term; sampled if ``None``.
            N: constant dividing the SGLD term.
        """

        self.Hlr = self.eta*Hv
        self.Hlr_norm = norm(self.Hlr)
        self.dFdlr_norm = norm(self.dFdlr)
        self.dFdlr.data = self.dFdlr.data * (1-2*self.lambda_l2*self.eta) \
                                - self.Hlr - grad - 2*self.lambda_l2*param
        if opt_type == 'sgld':
            # Sample on the parameters' device so the addition below cannot hit a
            # CPU/GPU mismatch.
            if noise is None: noise = torch.randn(size=param.shape, device=param.device)
            # NOTE(review): sqrt of a signed noise tensor yields NaN for negative
            # entries; formula kept as-is, confirm against the intended derivation.
            self.dFdlr.data = self.dFdlr.data + 0.5 * torch.sqrt(2*noise / self.eta / N)


    def update_eta(self, mlr, val_grad):
        """Take one step on ``eta``: ``eta <- max(0, eta - mlr * <val_grad, dFdlr>)``.

        Args:
            mlr: step size applied to the learning-rate update.
            val_grad: gradients that are flattened and dotted with ``dFdlr``.
        """

        val_grad = flatten_array(val_grad)
        delta = (val_grad.dot(self.dFdlr)).data.cpu().numpy()
        self.eta -= mlr * delta
        # The learning rate is never allowed to go negative.
        self.eta = np.maximum(0.0, self.eta)


class AResNet(ResNet):
    """ResNet variant that keeps one learning rate per parameter tensor.

    ``eta`` is a NumPy array with one entry per element of ``param_shapes`` rather than
    a single scalar, and the L2 coefficients are supplied per tensor by the caller.
    """

    def __init__(self, block, num_blocks, \
                num_classes=10, lr_init=0.00001, is_cuda=1):
        # Starts the initialiser chain *after* ResNet, so ResNet.__init__ is skipped and
        # every attribute is set up again below.
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

        # Flatten / unflatten bookkeeping, in `parameters()` order.
        all_params = list(self.parameters())
        self.param_shapes = [tuple(p.shape) for p in all_params]
        self.param_sizes = [p.numel() for p in all_params]
        self.param_cumsum = np.cumsum([0] + self.param_sizes)
        self.n_params = sum(self.param_sizes)

        self.reset_jacob(is_cuda)
        self.name = 'AREZ'
        self.grad_norm = 0
        self.grad_norm_vl = 0
        self.grad_angle = 0
        self.param_norm = 0

        # One learning rate per parameter tensor, all starting at lr_init.
        self.eta  = np.ones(len(self.param_sizes)) * lr_init


    def _get_adaptive_hyper(self, lambda_l2, is_cuda=0):
        """Expand the per-tensor hyperparameters to flat per-element vectors.

        Args:
            lambda_l2: indexable with one L2 coefficient per parameter tensor.
            is_cuda: when truthy, both results are moved to the GPU.

        Returns:
            ``(layerwise_eta, layerwise_l2)``, each the concatenation of one constant
            chunk per parameter tensor.
        """
        eta_chunks = [self.eta[i] * torch.ones(shape).flatten()
                      for i, shape in enumerate(self.param_shapes)]
        l2_chunks = [lambda_l2[i] * torch.ones(shape).flatten()
                     for i, shape in enumerate(self.param_shapes)]

        layerwise_l2 = torch.cat(l2_chunks)
        layerwise_eta = torch.cat(eta_chunks)

        if not is_cuda:
            return layerwise_eta, layerwise_l2
        return layerwise_eta.cuda(), layerwise_l2.cuda()


    def update_dFdlr(self, Hv, param, grad, lambda_l2, is_cuda=0, opt_type='sgd', noise=None, N=50000):
        """Advance the ``dFdlr`` recursion using element-wise learning rates and L2 terms.

        Same recursion as the scalar version, with ``eta`` and ``lambda_l2`` replaced by
        the flat vectors returned from ``_get_adaptive_hyper``.
        """
        eta_vec, l2_vec = self._get_adaptive_hyper(lambda_l2, is_cuda)

        self.Hlr = eta_vec *Hv
        self.Hlr_norm = norm(self.Hlr)
        self.dFdlr_norm = norm(self.dFdlr)

        decay = 1-2*l2_vec*eta_vec
        self.dFdlr.data = self.dFdlr.data * decay \
                                - self.Hlr - grad - 2*l2_vec*param
        if opt_type == 'sgld':
            if noise is None:
                noise = torch.randn(size=param.shape)
            sgld_term = 0.5 * torch.sqrt(2 * noise  / N / eta_vec)
            self.dFdlr.data = self.dFdlr.data +  sgld_term


    def update_eta(self, mlr, val_grad):
        """Update each per-tensor learning rate from its own slice of ``dFdlr``.

        For tensor ``i``: ``eta[i] <- max(0, eta[i] - mlr * <val_grad[i], dFdlr[i]>)``.
        """
        per_tensor_dFdlr = unflatten_array(self.dFdlr, self.param_cumsum, self.param_shapes)
        for i, (jac_l, vgrad_l) in enumerate(zip(per_tensor_dFdlr, val_grad)):
            jac_flat = flatten_array(jac_l)
            vgrad_flat = flatten_array(vgrad_l)
            delta = (vgrad_flat.dot(jac_flat)).data.cpu().numpy()
            self.eta[i] -= mlr * delta
            self.eta[i] = np.maximum(0, self.eta[i])



def AResNet18(lr_init, num_classes=10, is_cuda=1):
    """Build an ``AResNet`` with ``BasicBlock`` and two blocks in each of the four stages.

    Args:
        lr_init: initial value for every per-tensor learning rate.
        num_classes: output size of the final linear layer (default 10, as before).
        is_cuda: forwarded to ``AResNet``; pass 0 to build the model on a machine
            without a GPU (the default of 1 keeps the previous behaviour).
    """
    return AResNet(BasicBlock, [2, 2, 2, 2],
                   num_classes=num_classes, lr_init=lr_init, is_cuda=is_cuda)


def model_patched(model, param_tensors, x):
    """Run the network's forward pass with externally supplied weights.

    The learnable tensors are read from ``param_tensors`` one after another: stem conv
    weight and BN weight/bias, then for every block conv1, bn1, conv2, bn2 and, if the
    block has a non-empty shortcut, the shortcut conv and BN, and finally the linear
    weight and bias. Batch-norm running statistics are taken (detached) from ``model``
    and applied with ``training=False``.

    Args:
        model: network providing the structure (layer1..layer4, strides, shortcuts) and
            the batch-norm running statistics.
        param_tensors: indexable sequence of tensors in the order described above.
        x: input batch.

    Returns:
        Log-probabilities, ``log_softmax`` over dim 1.
    """
    cursor = 0

    def take():
        # Hand out the next tensor from param_tensors.
        nonlocal cursor
        tensor = param_tensors[cursor]
        cursor += 1
        return tensor

    def frozen_bn(inp, bn_module, gamma, beta):
        # Batch norm in inference mode, using the module's stored statistics.
        return F.batch_norm(inp,
                            bn_module.running_mean.detach(),
                            bn_module.running_var.detach(),
                            gamma, beta, training=False)

    # Stem: conv -> bn -> relu.
    stem_w, stem_gamma, stem_beta = take(), take(), take()
    x = F.conv2d(x, stem_w, bias=None, stride=1, padding=1)
    x = F.relu(frozen_bn(x, model.bn1, stem_gamma, stem_beta))

    for stage in (model.layer1, model.layer2, model.layer3, model.layer4):
        for block in stage:
            conv1_w, bn1_gamma, bn1_beta = take(), take(), take()
            conv2_w, bn2_gamma, bn2_beta = take(), take(), take()

            has_projection = (isinstance(block.shortcut, torch.nn.Sequential)
                              and len(block.shortcut) > 0)
            if has_projection:
                sc_w, sc_gamma, sc_beta = take(), take(), take()

            stride = block.conv1.stride[0]

            # Main branch.
            out = F.conv2d(x, conv1_w, bias=None, stride=stride, padding=1)
            out = F.relu(frozen_bn(out, block.bn1, bn1_gamma, bn1_beta))
            out = F.conv2d(out, conv2_w, bias=None, stride=1, padding=1)
            out = frozen_bn(out, block.bn2, bn2_gamma, bn2_beta)

            # Skip branch: identity, or 1x1 conv + bn when the block has a projection.
            if has_projection:
                identity = F.conv2d(x, sc_w, bias=None, stride=stride)
                identity = frozen_bn(identity, block.shortcut[1], sc_gamma, sc_beta)
            else:
                identity = x

            out += identity
            x = F.relu(out)

    x = F.avg_pool2d(x, 4)
    x = x.view(x.size(0), -1)

    fc_w, fc_b = take(), take()
    logits = F.linear(x, fc_w, fc_b)
    return F.log_softmax(logits, dim=1)

In [None]:
class SGD_Multi_LR(Optimizer):
    """Plain SGD in which every parameter tensor has its own learning rate.

    ``param_groups[i]['lr']`` is a sequence aligned with ``param_groups[i]['params']``;
    ``step`` pairs them up positionally. The constructor fills it with one array of
    ``lr`` per parameter (same shape as the parameter); callers may overwrite it later
    with one scalar per parameter.

    Args:
        params: iterable of parameters (consumed twice, hence the ``tee``).
        lr: initial learning rate used for every entry.
    """

    def __init__(self, params, lr=0.005):

        # `params` may be a one-shot generator: one copy is used to size the
        # learning-rate list, the other is handed to the base class.
        params, params_copy = tee(params)
        LR = [lr*np.ones(p.shape) for p in params]

        defaults = dict(lr=LR)
        super(SGD_Multi_LR, self).__init__(params_copy, defaults)

    def __setstate__(self, state):
        super(SGD_Multi_LR, self).__setstate__(state)


    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns the loss,
                as in the standard ``Optimizer.step`` signature.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for param, lr in zip(group['params'], group['lr']):
                if param.grad is None:
                    continue

                d_p = param.grad.data
                # Move the learning rate to the gradient's own device; `.cuda()` would
                # always pick the current CUDA device, which is wrong for parameters
                # living on another GPU.
                lr = torch.from_numpy(np.asarray([lr])).to(d_p.device)

                p_change = -lr[0] * (d_p)
                param.data.add_(p_change)

        return loss


In [None]:
def loss(pred, target, model):
    """Negative log-likelihood of ``target`` under the log-probabilities ``pred``.

    ``model`` is accepted but not used.
    """
    nll = F.nll_loss(pred, target)
    return nll

In [None]:
# Positions of the three splits inside the `dataset` sequence.
TRAIN=0
VALID=1
TEST =2

def train(dataset, model, optimizer, mlr, opt_type, reset_freq, update_freq, model_type, gamma, step_size, num_epoch, saveF=0, is_cuda=1):
    """Run the training loop with optional online hyperparameter updates.

    Each epoch first evaluates on ``dataset[TEST]``, then iterates over
    ``dataset[TRAIN]``. When ``opt_type`` contains ``'step'`` a learning-rate scheduler
    drives the learning rate; otherwise ``meta_update`` is called on a batch drawn with
    ``next(dataset[VALID])`` to adapt the learning rates and L2 coefficients.

    Args:
        dataset: sequence indexed by TRAIN / VALID / TEST; the VALID entry must support
            ``next()``.
        model: network, optionally wrapped in ``DataParallel``.
        optimizer: optimizer whose ``param_groups[0]['lr']`` mirrors ``model.eta``.
        mlr: meta learning rate; 0.0 disables the meta update.
        opt_type: optimizer / schedule selector (e.g. ``'sgd'``, ``'sgd_step'``).
        reset_freq: if > 0, ``reset_jacob`` is called every ``reset_freq`` iterations.
        update_freq: meta updates run in epochs where ``epoch % update_freq == 0``.
        model_type: ``'arez18'`` / ``'qrez18'`` store a copy of the ``eta`` array.
        gamma, step_size: scheduler settings for the ``'step'`` variants.
        num_epoch: the loop runs ``num_epoch + 1`` epochs.
        saveF: unused.
        is_cuda: forwarded to the helpers that move data to the GPU.

    Returns:
        Tuple ``(Wn_list, l2_list, lr_list, dFdlr_list, dFdl2_list, gang_list,
        tr_epoch, vl_epoch, te_epoch, tr_acc_list, te_acc_list, tr_loss_list,
        vl_loss_list, te_loss_list, tr_corr_mean_list, tr_corr_std_list)``.
        ``l2_list`` holds one row of L2 coefficients per epoch; ``dFdl2_list`` is
        never filled and is returned as an empty list.
    """

    def _unwrap(net):
        # DataParallel keeps the real network under `.module`; plain models are
        # returned unchanged so the loop also works without the wrapper.
        return net.module if 'DataParallel' in str(type(net)) else net

    start_time0 = time.time()
    uses_scheduler = 'step' in opt_type
    if uses_scheduler:
        lrsch_type = opt_type.split('_')[-1]
        if 'sgd_expstep' == opt_type:
            scheduler = lr_scheduler_init(optimizer, lrsch_type, gamma=gamma)
        elif 'sgd_step' == opt_type:
            scheduler = lr_scheduler_init(optimizer, lrsch_type, step_size=step_size)
        else:
            scheduler = lr_scheduler_init(optimizer, lrsch_type, N=num_epoch+1)

    counter = 0
    # l2_list was returned but never defined, so the function raised NameError after
    # the last epoch; it is now filled once per epoch below.
    lr_list, l2_list = [], []
    dFdlr_list, dFdl2_list, Wn_list, gang_list = [], [], [], []
    tr_epoch, tr_loss_list, tr_acc_list = [], [], []
    vl_epoch, vl_loss_list, vl_acc_list = [], [], []
    te_epoch, te_loss_list, te_acc_list = [], [], []
    tr_corr_mean_list, tr_corr_std_list = [], []
    optimizer = update_optimizer_hyperparams(model, optimizer)

    # One learnable L2 coefficient per parameter tensor, all starting at zero.
    lambda_l2_params = nn.ParameterList([
      nn.Parameter(torch.tensor(0.0000, dtype=torch.float32))
      for _ in range(len(_unwrap(model).param_sizes))
    ])
    evo_grad_opt = torch.optim.SGD(lambda_l2_params.parameters(), lr=mlr)

    for epoch in range(num_epoch+1):
        # Evaluation pass. It runs on dataset[TEST] although the log line below says
        # 'Valid'; the message is kept unchanged.
        te_losses, te_accs = [], []
        for batch_idx, (data, target) in enumerate(dataset[TEST]):
            data, target = to_torch_variable(data, target, is_cuda)
            _, te_loss, te_acc, _, _, _ = feval(data, target, model, optimizer, lambda_l2_params, mode='eval', is_cuda=is_cuda)
            te_losses.append(te_loss)
            te_accs.append(te_acc)
        te_epoch.append(epoch)
        te_loss_list.append(np.mean(te_losses))
        te_acc_list.append(np.mean(te_accs))

        print('Valid Epoch: %d, Loss %f Acc %f' %
            (epoch, np.mean(te_losses), np.mean(te_accs)))

        if uses_scheduler:
            scheduler.step()
            _unwrap(model).eta = optimizer.param_groups[0]['lr']

        grad_list = []
        start_time = time.time()
        for batch_idx, (data, target) in enumerate(dataset[TRAIN]):
            if batch_idx % 100 == 0:
                print('Train Epoch: %d, Batch %d out of %d' % (epoch, batch_idx, len(dataset[TRAIN])))
            data, target = to_torch_variable(data, target, is_cuda)
            step_mode = 'train' if uses_scheduler else 'meta-train'
            model, loss, accuracy, output, noise, grad_vec = feval(data, target, model, optimizer, lambda_l2_params, \
                            is_cuda=is_cuda, mode=step_mode, opt_type=opt_type)
            tr_epoch.append(counter)
            tr_loss_list.append(loss)
            tr_acc_list.append(accuracy)
            # Keep every fifth gradient for the correlation statistics.
            if batch_idx % 5 == 0: grad_list.append(grad_vec)

            if reset_freq > 0 and counter % reset_freq == 0:
                _unwrap(model).reset_jacob(is_cuda)

            if epoch % update_freq == 0 and not uses_scheduler and mlr != 0.0:
                data_vl, target_vl = next(dataset[VALID])
                data_vl, target_vl = to_torch_variable(data_vl, target_vl, is_cuda)

                model, loss_vl, optimizer = meta_update(data_vl, target_vl, data, target, model, optimizer, evo_grad_opt, lambda_l2_params, mlr, noise, is_cuda=is_cuda)
                vl_epoch.append(counter)
                vl_loss_list.append(loss_vl.item())
            counter += 1

        corr_mean, corr_std = compute_correlation(grad_list, normF=1)
        tr_corr_mean_list.append(corr_mean)
        tr_corr_std_list.append(corr_std)

        end_time = time.time()
        if epoch == 0: print('Single epoch timing %f' % ((end_time-start_time) / 60))

        model_ = _unwrap(model)
        # No validation losses exist when the meta update is disabled; report NaN
        # directly instead of letting nanmean warn about an empty slice.
        recent_vl_loss = np.nanmean(vl_loss_list[-100:]) if vl_loss_list else float('nan')
        fprint = 'Train Epoch: %d, Tr Loss %f Vl loss %f Acc %f Eta %s, L2 %s, |dFdlr| %.2f |G| %.4f |G_vl| %.4f Gang %.3f |W| %.2f, Grad Corr %f %f'
        print(fprint % (epoch, np.mean(tr_loss_list[-100:]), \
                        recent_vl_loss, \
                        np.nanmean(tr_acc_list[-100:]), \
                        str(model_.eta), str([p for p in lambda_l2_params]), \
                        model_.dFdlr_norm,\
                        model_.grad_norm,  model_.grad_norm_vl, \
                        model_.grad_angle, model_.param_norm, corr_mean, corr_std))

        Wn_list.append(model_.param_norm)
        dFdlr_list.append(model_.dFdlr_norm)
        # Snapshot of the current L2 coefficients, one value per parameter tensor.
        l2_list.append(np.array([p.item() for p in lambda_l2_params]))
        if model_type == 'arez18' or model_type == 'qrez18':
            # eta is an array that is updated in place, so store a copy.
            lr_list.append(model_.eta.copy())
        else:
            lr_list.append(model_.eta)
        gang_list.append(model_.grad_angle)

    Wn_list = np.asarray(Wn_list)
    l2_list = np.asarray(l2_list)
    lr_list = np.asarray(lr_list)
    dFdlr_list = np.asarray(dFdlr_list)
    tr_epoch = np.asarray(tr_epoch)
    vl_epoch = np.asarray(vl_epoch)
    te_epoch = np.asarray(te_epoch)
    tr_acc_list = np.asarray(tr_acc_list)
    te_acc_list = np.asarray(te_acc_list)
    tr_loss_list = np.asarray(tr_loss_list)
    vl_loss_list = np.asarray(vl_loss_list)
    te_loss_list = np.asarray(te_loss_list)
    gang_list = np.asarray(gang_list)
    tr_corr_mean_list = np.asarray(tr_corr_mean_list)
    tr_corr_std_list = np.asarray(tr_corr_std_list)

    end_time0 = time.time()
    print('Total training timing %f' % ((end_time0-start_time0) / 3600))

    return Wn_list, l2_list, lr_list, dFdlr_list, dFdl2_list, gang_list, \
                tr_epoch, vl_epoch, te_epoch, tr_acc_list, te_acc_list, \
                tr_loss_list, vl_loss_list, te_loss_list, tr_corr_mean_list, tr_corr_std_list

In [None]:
def criterion(pred, target, model, lambda_l2_params):
    """NLL loss plus a per-tensor weighted L2 penalty.

    Each parameter tensor of ``model`` is paired positionally with a coefficient from
    ``lambda_l2_params`` and contributes ``coefficient / 2 * sum(param ** 2)``. Pairing
    stops at the shorter of the two sequences.
    """
    data_term = F.nll_loss(pred, target)

    weight_decay = 0
    for weights, coeff in zip(model.parameters(), lambda_l2_params):
        weight_decay = weight_decay + (weights**2).sum() * (coeff / 2)

    return data_term + weight_decay

def feval(data, target, model, optimizer, lambda_l2_params, mode='eval', is_cuda=1, opt_type='sgd', N=50000):
    """Evaluate the model on one batch and, depending on ``mode``, take a training step.

    * ``mode == 'eval'``: forward pass under ``no_grad`` in eval mode, nothing else.
    * ``'train'`` in ``mode``: backward pass, optional SGLD noise added to the
      gradients, one optimizer step, and the normalised flat gradient is returned.
    * ``'grad'`` in ``mode`` (and no ``'train'``): backward pass only.

    Returns:
        ``(model, loss_value, accuracy_value, output, noise, grad_vec)``. ``noise`` is
        the noise drawn for the last parameter (``None`` unless SGLD ran) and
        ``grad_vec`` is an empty list unless a training step was taken.
    """
    if mode != 'eval':
        model.train()
        optimizer.zero_grad()
        output = model(data)
    else:
        model.eval()
        with torch.no_grad():
            output = model(data)

    total_loss = criterion(output, target, model, lambda_l2_params)
    predicted = output.argmax(dim=1, keepdim=True).flatten()
    accuracy = predicted.eq(target).float().mean()

    grad_vec, noise = [], None
    if 'train' in mode:
        total_loss.backward()

        use_sgld = opt_type == 'sgld'
        for i, param in enumerate(model.parameters()):
            if use_sgld:
                noise = torch.randn(size=param.shape)
                net = model.module if 'DataParallel' in str(type(model)) else model
                # Per-tensor learning rate when eta is an array, shared scalar otherwise.
                eta_i = net.eta[i] if type(net.eta) == type(np.array([])) else net.eta
                eps = np.sqrt(eta_i*2/ N) * noise  if eta_i > 0 else 0 * noise
                eps = to_torch_variable(eps, is_cuda=is_cuda)
                param.grad.data = param.grad.data + eps.data
            grad_vec.append(param.grad.data.cpu().numpy().flatten())

        # The quotient optimizer exposes its update under a different method name.
        if 'SGD_Quotient_LR' in str(optimizer):
            optimizer.rez_step()
        else:
            optimizer.step()

        flat_grad = np.hstack(grad_vec)
        grad_vec = flat_grad / norm_np(flat_grad)

    elif 'grad' in mode:
        total_loss.backward()

    return model, total_loss.item(), accuracy.item(), output, noise, grad_vec

In [None]:
# evograd parameters
n_model_candidates = 2   # number of perturbed weight copies evaluated per meta step
sigma = 0.001            # size of the sign-noise perturbation added to each weight
temperature = 0.05       # temperature of the softmax over candidate losses

def meta_update(data_vl, target_vl, data_tr, target_tr, model_, optimizer, evo_grad_opt, lambda_l2_params, mlr, noise=None, is_cuda=1):
    """Update the learning rates and the L2 coefficients from one validation batch.

    Two updates happen here:

    1. Learning rates: ``dFdlr`` is advanced with a Hessian-vector product on the
       training batch and ``model.update_eta`` takes a step using the validation
       gradient.
    2. L2 coefficients: ``n_model_candidates`` perturbed copies of the weights are
       scored, combined with softmax weights, and the loss of the combined weights is
       differentiated with respect to ``lambda_l2_params``; ``evo_grad_opt`` then takes
       a step and the coefficients are clamped to ``[0, 0.0002]``.

    Args:
        data_vl, target_vl: validation batch.
        data_tr, target_tr: training batch used for the Hessian-vector product.
        model_: network, optionally wrapped in ``DataParallel``.
        optimizer: training optimizer; its learning rates are refreshed from the model.
        evo_grad_opt: optimizer over ``lambda_l2_params``.
        lambda_l2_params: one learnable L2 coefficient per parameter tensor.
        mlr: meta learning rate passed to ``model.update_eta``.
        noise: forwarded to ``model.update_dFdlr``.
        is_cuda: forwarded to the helper functions.

    Returns:
        ``(model_, loss_valid, optimizer)``.
    """

    model = model_.module if 'DataParallel' in str(type(model_)) else model_

    # --- learning-rate update -------------------------------------------------------
    param_shapes = model.param_shapes
    dFdlr= unflatten_array(model.dFdlr, model.param_cumsum, param_shapes)
    Hv_lr  = compute_HessianVectorProd(model, dFdlr, data_tr, target_tr, is_cuda=is_cuda)

    model, loss_valid, grad_valid = get_grad_valid(model, data_vl, target_vl, is_cuda)

    grad = flatten_array(get_grads(model.parameters(), is_cuda))
    param = flatten_array(model.parameters())
    model.grad_norm = norm(grad)
    model.param_norm = norm(param)
    grad_vl = flatten_array(grad_valid)
    # Cosine between the stored training gradient and the validation gradient.
    model.grad_angle = torch.dot(grad / model.grad_norm, grad_vl / model.grad_norm_vl).item()

    model.update_dFdlr(Hv_lr, param, grad, lambda_l2_params, is_cuda, noise=noise)
    model.update_eta(mlr, val_grad=grad_valid)

    # --- L2-coefficient update --------------------------------------------------------
    model_parameters = [i.detach() for i in model.parameters()]
    theta_list = [[j + sigma*torch.sign(torch.randn_like(j)) for j in model_parameters] for i in range(n_model_candidates)]
    pred_list = [model_patched(model, theta, data_vl) for theta in theta_list]
    # The L2 penalty must be computed on each candidate's OWN weights. Using the
    # unperturbed model weights adds the same constant to every candidate loss; a
    # softmax ignores a constant shift, so the gradient reaching lambda_l2_params
    # would be identically zero and the coefficients would never move.
    loss_list = [
        F.nll_loss(pred, target_vl)
        + sum((w**2).sum() * (lmbda / 2) for w, lmbda in zip(theta, lambda_l2_params))
        for pred, theta in zip(pred_list, theta_list)
    ]
    weights = torch.softmax(-torch.stack(loss_list) / temperature, 0)
    theta_updated = [sum(map(mul, theta, weights)) for theta in zip(*theta_list)]
    preds_meta = model_patched(model, theta_updated, data_vl)
    loss_l2 = loss(preds_meta, target_vl, model)
    evo_grad_opt.zero_grad()
    grads = torch.autograd.grad(loss_l2, lambda_l2_params.parameters(), retain_graph=True)
    for p, g in zip(lambda_l2_params.parameters(), grads):
        p.grad = g
    evo_grad_opt.step()
    # Keep the coefficients inside their allowed range.
    with torch.no_grad():
      for p in lambda_l2_params:
          p.clamp_(min=0.0, max=0.0002)

    optimizer = update_optimizer_hyperparams(model, optimizer)

    return model_, loss_valid, optimizer


In [11]:
def get_grad_valid(model, data, target, is_cuda):
    """Compute the NLL gradient on a batch without touching ``model``'s own gradients.

    The forward and backward passes run on a deep copy of ``model`` in train mode. The
    norm of the flattened gradients is stored on the original as ``grad_norm_vl``.

    Returns:
        ``(model, loss, grads)``: the original model, the loss tensor and the gradients
        collected from the copy.
    """
    scratch_model = deepcopy(model)
    scratch_model.train()

    batch_loss = F.nll_loss(scratch_model(data), target)
    batch_loss.backward()

    batch_grads = get_grads(scratch_model.parameters(), is_cuda)
    model.grad_norm_vl = norm(flatten_array(batch_grads))

    return model, batch_loss, batch_grads

In [None]:
def update_optimizer_hyperparams(model, optimizer):
    """Copy the model's learning rate(s) into the optimizer's first parameter group.

    ``model`` may be wrapped in ``DataParallel``; in that case ``eta`` is read from the
    wrapped module. A copy is stored, so later in-place changes to ``eta`` do not leak
    into the optimizer.

    Returns:
        The same ``optimizer`` object.
    """
    is_wrapped = 'DataParallel' in str(type(model))
    network = model.module if is_wrapped else model

    first_group = optimizer.param_groups[0]
    first_group['lr'] = np.copy(network.eta)
    return optimizer

In [None]:
# Experiment driver: rebuild the data loaders, configure the run, train, report.
dataset = load_cifar10(args)

# --- run configuration ---
lr = 0.1               # initial learning rate for every parameter tensor
is_cuda = 1
mlr = 0.00001          # meta learning rate
opt_type = 'sgd'
reset_freq = 1
upd_freq = 1
model_type = 'arez18'
gamma = 0.97
step_size = 1000
num_epoch = 20
device = 'cuda'

print('==> Building model..')
model = AResNet18(lr)
optimizer = SGD_Multi_LR(model.parameters(), lr=lr)

# Replace the optimizer's default learning rates with the model's per-tensor `eta`.
optimizer = update_optimizer_hyperparams(model, optimizer)


if is_cuda:
    model = torch.nn.DataParallel(model.to(device))
    cudnn.benchmark = True

(Wn_list, l2_list, lr_list, dFdlr_list, dFdl2_list, gang_list,
 tr_epoch, vl_epoch, te_epoch,
 tr_acc_list, te_acc_list,
 tr_loss_list, vl_loss_list, te_loss_list,
 tr_corr_mean_list, tr_corr_std_list) = train(
    dataset, model, optimizer, mlr, opt_type, reset_freq, upd_freq,
    model_type, gamma, step_size, num_epoch, is_cuda=is_cuda)

print('Final test loss %f' % te_loss_list[-1])

==> Building model..
Valid Epoch: 0, Loss 2.302894 Acc 0.095900
Train Epoch: 0, Batch 0 out of 400
Train Epoch: 0, Batch 100 out of 400
Train Epoch: 0, Batch 200 out of 400
Train Epoch: 0, Batch 300 out of 400
Single epoch timing 17.509612
Train Epoch: 0, Tr Loss 1.437332 Vl loss 1.445358 Acc 0.461600 Eta [0.09937508 0.09999824 0.09999823 0.09935156 0.09999925 0.09999917
 0.09949898 0.09999907 0.09999956 0.09970121 0.09999966 0.09999969
 0.099706   0.09999952 0.09999984 0.09973374 0.09999979 0.09999983
 0.09963909 0.09999959 0.09999979 0.09994082 0.09999974 0.09999979
 0.09970518 0.09999985 0.09999989 0.09972945 0.09999978 0.09999992
 0.09972575 0.09999989 0.09999991 0.09961553 0.09999975 0.09999985
 0.09994025 0.09999983 0.09999985 0.0997635  0.09999993 0.09999994
 0.09975645 0.09999989 0.09999992 0.09986623 0.09999997 0.09999997
 0.09971221 0.0999997  0.09999734 0.09977108 0.09999898 0.09999734
 0.09953579 0.09999977 0.09999988 0.09938589 0.09999925 0.09999383
 0.09435606 0.09995991]

In [13]:
# Run the repository's own main.py with the same model type, optimizer, learning rate and
# meta learning rate as the cell above, for 4 epochs, saving under ../../save_dir —
# presumably as a reference run to compare against.
!python -u main.py --is_cuda 1 --ifold 0 --mlr 0.00001 --lr 0.1 --lambda_l2 0.0000 --opt_type sgd --update_freq 1 --save 1  --model_type arez18 --num_epoch 4 --batch_size_vl 1000 --update_lambda 1 --save_dir '../../save_dir'

Model Type: arez18 Opt Type: sgd meta-lr 0.000010 lr 0.100000 l2 0.000000, Update Freq 1 Reset Freq 0 |Nvl| 10000 Epoch 4
==> Building model..
../../save_dir/exp/cifar10/mlr0.000010_lr0.100000_l20.000000/arez18_4epoch_1000vlbz_sgd_1updatefreq_0resetfreq_1updatelabmda_fold0/
Valid Epoch: 0, Loss 2.302152 Acc 0.111600
Single epoch timing 11.966960
Train Epoch: 0, Tr Loss 1.426281 Vl loss 1.406026 Acc 0.475200 Eta [0.09988647 0.10002322 0.1000125  0.10285077 0.09997592 0.09998719
 0.09955115 0.10000604 0.10000124 0.10069115 0.10000653 0.10000471
 0.09969797 0.09999151 0.10000315 0.0998326  0.10000406 0.10000134
 0.10045404 0.10000608 0.09999487 0.10008082 0.10001068 0.09999487
 0.10049646 0.1000002  0.10000003 0.09993367 0.10000161 0.0999985
 0.10007716 0.0999981  0.10000013 0.10025845 0.09999928 0.09999958
 0.09974884 0.10000186 0.09999958 0.09891383 0.09999818 0.09999899
 0.10011662 0.10000145 0.09999988 0.10019482 0.0999999  0.09999965
 0.1010579  0.1000208  0.10003459 0.10049614 0.100