In [153]:
import sys
import random
import time, datetime
import os, shutil
import yaml
import ast, bisect
import csv

import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch import optim
from torch.optim.lr_scheduler import LambdaLR
from torch.autograd import grad
import torchnet as tnt

import dataloader
from dataloader import cutout
from models.resnet import ResNet

# -------------
# Initial setup
# -------------





In [154]:
# args = parser.parse_args()
seed = None
# CUDA info
has_cuda = torch.cuda.is_available()
cudnn.benchmark = True

# Set random seed
if seed is None:
    seed = int(time.time())
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)




In [155]:
datadir = '/home/math/oberman-lab/data/'
test_batch_size = 3
batch_size = 3
args_cutout = 16
args_model = 'ResNet50'

workers = 4
test_loader = getattr(dataloader, 'cifar10')(datadir,
                                             mode='test', transform=False,
                                             batch_size=test_batch_size,
                                             num_workers=workers,
                                             shuffle=False,
                                             pin_memory=has_cuda)

transforms = [cutout(args_cutout, channels=3)]
train_loader = getattr(dataloader, 'cifar10')(datadir,
                                              mode='train', transform=True,
                                              batch_size=batch_size,
                                              training_transforms=transforms,
                                              num_workers=workers,
                                              shuffle=True,
                                              pin_memory=has_cuda,
                                              drop_last=True)

model = ResNet([3,4,6,3],base_channels=64, block='Bottleneck')




Files already downloaded and verified
Files already downloaded and verified


In [156]:
lr = 0.1
decay = 5e-4
momentum = 0.9
lr_schedule = '[[0,1],[60,0.2],[120,0.04],[160,0.008]]'

criterion = nn.CrossEntropyLoss()
train_criterion = nn.CrossEntropyLoss(reduction='none')


if has_cuda:
    criterion = criterion.cuda(0)
    train_criterion = train_criterion.cuda(0)
    model = model.cuda(0)

optimizer = optim.SGD(model.parameters(),
                      lr=lr,
                      weight_decay=decay,
                      momentum=momentum,
                      nesterov=False)


def scheduler(optimizer, lr_schedule):
    """Return a hyperparmeter scheduler for the optimizer"""
    lS = np.array(ast.literal_eval(lr_schedule))
    llam = lambda e: float(lS[max(bisect.bisect_right(lS[:, 0], e) - 1, 0), 1])
    lscheduler = LambdaLR(optimizer, llam)

    return lscheduler


schedule = scheduler(optimizer, lr_schedule)




In [157]:
penalty = 0.0005


ix = 0  # count of gradient steps

tik = penalty

regularizing = tik > 0

h = 1e-2  # finite difference step size



In [158]:
norm = 'L2'
fd_order = 'O2'
log_interval = 100

def train(epoch, ttot):
    global ix

    # Put the model in train mode (unfreeze batch norm parameters)
    model.train()

    # Run through the training data
    if has_cuda:
        torch.cuda.synchronize()
    tepoch = time.perf_counter()


    for batch_ix, (x, target) in enumerate(train_loader):

        if has_cuda:
            x = x.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        if regularizing:
            x.requires_grad_(True)

        prediction = model(x)
        lx = train_criterion(prediction, target)
        loss = lx.mean()

        # Compute finite difference approximation of directional derivative of grad loss wrt inputs
        if regularizing:

            dx = grad(loss, x, retain_graph=True)[0]
            sh = dx.shape
            print(f"sh = {sh}")
            x.requires_grad_(False)

            # v is the finite difference direction.
            # For example, if norm=='L2', v is the gradient of the loss wrt inputs
            v = dx.view(sh[0], -1)
            print(f"v :{v.shape}")
            Nb, Nd = v.shape
            print(f"Nb = {Nb}")
            print(f"Nd = {Nd}")

            if norm == 'L2':
                nv = v.norm(2, dim=-1, keepdim=True)
                print(f"nv: {nv.shape}")
                print(f"nv = {nv}")
                nz = nv.view(-1) > 0
                print(f"nz: {nz.shape}")
                print(f"nz = {nz}")
                v[nz] = v[nz].div(nv[nz])
                print(f"v: {v.shape}")
                print(f"v = {v}")

            if norm == 'L1':
                v = v.sign()
                v = v / np.sqrt(Nd)
            elif norm == 'Linf':
                vmax, Jmax = v.abs().max(dim=-1)
                sg = v.sign()
                I = torch.arange(Nb, device=v.device)
                sg = sg[I, Jmax]

                v = torch.zeros_like(v)
                I = I * Nd
                Ix = Jmax + I
                v.put_(Ix, sg)

            v = v.view(sh)
            print(f"v: {v.shape}")
            xf = x + h * v
            print(f" x: {x.shape}")
            print(f" h = {h}")
            print(f"xf: {xf.shape}")


            mf = model(xf)
            print(f"mf = {mf}")
            print(f"mf : {mf.shape}")
            lf = train_criterion(mf, target)
            print(f"lf = {lf}")
            print(f"lf: {lf.shape}")

            if fd_order == 'O2':
                xb = x - h * v
                mb = model(xb)
                lb = train_criterion(mb, target)
                H = 2 * h
            else:
                H = h
                lb = lx
            dl = (lf - lb) / H
            print(f"dl = {dl}")# This is the finite difference approximation
            # of the directional derivative of the loss
            print(f"dl: {dl.shape}")
            sys.exit(0)

        tik_penalty = torch.tensor(np.nan)
        dlmean = torch.tensor(np.nan)
        dlmax = torch.tensor(np.nan)
        if tik > 0:
            dl2 = dl.pow(2)
            tik_penalty = dl2.mean() / 2
            loss = loss + tik * tik_penalty

        loss.backward()

        optimizer.step()

        if np.isnan(loss.data.item()):
            raise ValueError('model returned nan during training')

        t = ttot + time.perf_counter() - tepoch
        fmt = '{:.4f}'

        if (batch_ix % log_interval == 0 and batch_ix > 0):
            print('[%2d, %3d] penalized training loss: %.3g' %
                  (epoch, batch_ix, loss.data.item()))
        ix += 1

    if has_cuda:
        torch.cuda.synchronize()

    return ttot + time.perf_counter() - tepoch


In [159]:
epochs = 10

def main():
    pct_max = 90.
    fail_count = fail_max = 5
    time = 0.
    pct0 = 100.
    for e in range(epochs):

        # Update the learning rate
        schedule.step()

        time = train(e, time)



        if fail_count < 1:
            raise ValueError('Percent error has not decreased in %d epochs' % fail_max)


In [160]:
main()




sh = torch.Size([3, 3, 32, 32])
v :torch.Size([3, 3072])
Nb = 3
Nd = 3072
nv: torch.Size([3, 1])
nv = tensor([[74.3334],
        [73.5516],
        [71.6138]], device='cuda:0')
nz: torch.Size([3])
nz = tensor([True, True, True], device='cuda:0')
v: torch.Size([3, 3072])
v = tensor([[-0.0009, -0.0042, -0.0024,  ..., -0.0008,  0.0006,  0.0022],
        [-0.0148,  0.0036,  0.0119,  ...,  0.0048,  0.0025, -0.0003],
        [ 0.0021,  0.0059, -0.0071,  ..., -0.0040,  0.0030, -0.0013]],
       device='cuda:0')
v: torch.Size([3, 3, 32, 32])
 x: torch.Size([3, 3, 32, 32])
 h = 0.01
xf: torch.Size([3, 3, 32, 32])
mf = tensor([[0.9512, 0.0000, 0.8275, 0.0000, 0.0000, 1.3871, 1.2602, 0.0000, 0.8439,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9332, 0.0000, 0.0000, 0.0000, 0.6963, 0.0000,
         0.5227],
        [0.3339, 1.2672, 0.6552, 0.4004, 1.4296, 0.0000, 0.0000, 0.7156, 0.5057,
         0.8510]], device='cuda:0', grad_fn=<ReluBackward0>)
mf : torch.Size([3, 10])
lf = tensor([2.138

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
