<a href="https://colab.research.google.com/github/Deeksha-P/Adaptive-gradient-descent-without-descent/blob/master/NN_with_acc_GD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Deeksha-P/Adaptive-gradient-descent-without-descent.git

Cloning into 'Adaptive-gradient-descent-without-descent'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 132 (delta 2), reused 0 (delta 0), pack-reused 126[K
Receiving objects: 100% (132/132), 8.86 MiB | 11.88 MiB/s, done.
Resolving deltas: 100% (59/59), done.


In [None]:
%pwd

'/content'

In [None]:
!ls

Adaptive-gradient-descent-without-descent  sample_data


In [None]:
%cd Adaptive-gradient-descent-without-descent

/content/Adaptive-gradient-descent-without-descent


In [None]:
%pwd

'/content/Adaptive-gradient-descent-without-descent'

In [None]:
import torch
import numpy as np

from torch.optim.optimizer import Optimizer, required

    
class Adsgd(Optimizer):
    """SGD variant whose step size is re-estimated at every step.

    The new step size is the smaller of
      * the previous step size multiplied by ``sqrt(1 + amplifier * theta)``, and
      * ``param_dif_norm / (damping * grad_dif_norm)``,
    where the two norms are produced by :meth:`compute_dif_norms` from the
    parameters/gradients held by a second optimizer instance. ``theta`` is the
    ratio between the last two step sizes.

    Arguments:
        params: iterable of parameters or dicts defining parameter groups.
        lr (float): initial step size (default: 0.2).
        amplifier (float): growth coefficient used in ``sqrt(1 + amplifier * theta)``.
        theta (float): initial ratio of consecutive step sizes (default: 1).
        damping (float): divisor applied to the norm-ratio estimate (default: 1).
        eps (float): constant added to the step size when the norm-ratio
            estimate is available (default: 1e-5).
        weight_decay (float): L2 coefficient added to the gradient (default: 0).

    Raises:
        ValueError: if ``lr`` or ``weight_decay`` is negative.
    """

    def __init__(self, params, lr=0.2, amplifier=0.02, theta=1, damping=1, eps=1e-5, weight_decay=0):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid initial learning rate: {}".format(lr))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, amplifier=amplifier, theta=theta, damping=damping,
                        eps=eps, weight_decay=weight_decay)
        super(Adsgd, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore pickled state and back-fill every key that ``step`` reads."""
        super(Adsgd, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('lr', 0.2)
            group.setdefault('amplifier', 0.02)
            group.setdefault('damping', 1)
            group.setdefault('theta', 1)
            # step() also indexes these two keys, so they need defaults as well.
            group.setdefault('eps', 1e-5)
            group.setdefault('weight_decay', 0)

    def compute_dif_norms(self, prev_optimizer=required):
        """Store the norms of the parameter and gradient differences.

        For every parameter group, writes ``group['grad_dif_norm']`` and
        ``group['param_dif_norm']``: the Euclidean norms, taken over all
        parameters of the group, of ``grad - prev_grad`` and
        ``param - prev_param``.

        Arguments:
            prev_optimizer: optimizer whose parameter groups are paired
                one-to-one (via ``zip``) with the groups of this optimizer.
        """
        for group, prev_group in zip(self.param_groups, prev_optimizer.param_groups):
            grad_dif_sq = 0
            param_dif_sq = 0
            for p, prev_p in zip(group['params'], prev_group['params']):
                # A pair contributes only when both gradients exist; the
                # original dereferenced prev_p.grad unconditionally.
                if p.grad is None or prev_p.grad is None:
                    continue
                grad_dif_sq += (p.grad.data - prev_p.grad.data).norm().item() ** 2
                param_dif_sq += (p.data - prev_p.data).norm().item() ** 2
            group['grad_dif_norm'] = np.sqrt(grad_dif_sq)
            group['param_dif_norm'] = np.sqrt(param_dif_sq)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """

        # TODO: use closure to compute gradient difference
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            eps = group['eps']
            lr = group['lr']
            damping = group['damping']
            amplifier = group['amplifier']
            theta = group['theta']
            weight_decay = group['weight_decay']
            # If compute_dif_norms() has not been called yet the norms are
            # missing; treat them as 0 so the growth-only branch is used
            # instead of raising KeyError.
            grad_dif_norm = group.get('grad_dif_norm', 0)
            param_dif_norm = group.get('param_dif_norm', 0)

            growth_cap = lr * np.sqrt(1 + amplifier * theta)
            if param_dif_norm > 0 and grad_dif_norm > 0:
                lr_new = min(growth_cap, param_dif_norm / (damping * grad_dif_norm)) + eps
            else:
                lr_new = growth_cap
            # Keep plain Python floats in the param group rather than NumPy scalars.
            lr_new = float(lr_new)

            group['theta'] = lr_new / lr
            group['lr'] = lr_new
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # Keyword form of add_; the positional (scalar, tensor)
                    # overload used before is deprecated. Still in place, so
                    # p.grad keeps including the decay term as it did before.
                    d_p.add_(p.data, alpha=weight_decay)
                p.data.add_(d_p, alpha=-lr_new)
        return loss

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions plus a shortcut branch.

    The shortcut is an empty ``nn.Sequential`` (identity) unless the stride or
    the channel count changes, in which case it is a 1x1 convolution followed
    by batch normalisation.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: conv -> bn -> conv -> bn (created in this exact order).
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Shortcut path: projection only when the shapes would not match.
        shapes_match = (stride == 1) and (in_planes == out_planes)
        if shapes_match:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = self.conv1(x)
        hidden = F.relu(self.bn1(hidden))
        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)
        hidden += self.shortcut(x)
        return F.relu(hidden)


class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The last convolution widens the output to ``expansion * planes`` channels.
    The shortcut is an empty ``nn.Sequential`` (identity) unless the stride or
    the channel count changes, in which case it is a 1x1 convolution followed
    by batch normalisation.
    """

    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path (created in this exact order); only the 3x3 conv is strided.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Shortcut path: projection only when the shapes would not match.
        shapes_match = (stride == 1) and (in_planes == out_planes)
        if shapes_match:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        hidden = self.conv3(hidden)
        hidden = self.bn3(hidden)
        hidden += self.shortcut(x)
        return F.relu(hidden)


class ResNet(nn.Module):
    """ResNet backbone: a 3x3 stem, four residual stages and a linear head.

    Arguments:
        block: residual block class exposing an ``expansion`` attribute and
            constructed as ``block(in_planes, planes, stride)``.
        num_blocks: indexable with four entries, the number of blocks per stage.
        num_classes: output size of the final linear layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count entering the next block; advanced by _make_layer.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Stages are registered as layer1..layer4, in that order.
        stage_widths = (64, 128, 256, 512)
        stage_strides = (1, 2, 2, 2)
        for idx, (width, first_stride) in enumerate(zip(stage_widths, stage_strides), start=1):
            stage = self._make_layer(block, width, num_blocks[idx - 1], stride=first_stride)
            setattr(self, 'layer%d' % idx, stage)

        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the stem and four stages, 4x4 average-pool, flatten, classify."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)
        pooled = F.avg_pool2d(features, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ResNet18(num_classes=10):
    """Build a ResNet from BasicBlock with [2, 2, 2, 2] blocks per stage.

    :param num_classes: size of the classification head (default 10).
    """
    return ResNet(BasicBlock, [2,2,2,2], num_classes=num_classes)

def ResNet34(num_classes=10):
    """Build a ResNet from BasicBlock with [3, 4, 6, 3] blocks per stage.

    :param num_classes: size of the classification head (default 10).
    """
    return ResNet(BasicBlock, [3,4,6,3], num_classes=num_classes)

def ResNet50(num_classes=10):
    """Build a ResNet from Bottleneck with [3, 4, 6, 3] blocks per stage.

    :param num_classes: size of the classification head (default 10).
    """
    return ResNet(Bottleneck, [3,4,6,3], num_classes=num_classes)

def ResNet101(num_classes=10):
    """Build a ResNet from Bottleneck with [3, 4, 23, 3] blocks per stage.

    :param num_classes: size of the classification head (default 10).
    """
    return ResNet(Bottleneck, [3,4,23,3], num_classes=num_classes)

def ResNet152(num_classes=10):
    """Build a ResNet from Bottleneck with [3, 8, 36, 3] blocks per stage.

    :param num_classes: size of the classification head (default 10).
    """
    return ResNet(Bottleneck, [3,8,36,3], num_classes=num_classes)

In [None]:
%pwd

'/content/Adaptive-gradient-descent-without-descent'

In [None]:
%cd pytorch 

/content/Adaptive-gradient-descent-without-descent/pytorch


In [None]:
%load optimizer.py

In [None]:
%load utils.py

In [None]:
%load resnet.py

In [None]:
import numpy as np
import os
import random
import torch
import torchvision

import torchvision.transforms as transforms

from pathlib import Path


def seed_everything(seed=1029):
    '''
    Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators
    and ask cuDNN for deterministic behaviour.

    :param seed: integer seed; NumPy integer scalars are accepted and are
        converted to a built-in ``int`` first.
    :return: None
    '''
    # random.seed() rejects non-builtin integer types (e.g. numpy.int64) on
    # recent Python versions, so normalise once and reuse everywhere.
    seed = int(seed)
    random.seed(seed)
    # Exported for interpreters started after this point; the hash seed of the
    # already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may select different algorithms from run to run.
    torch.backends.cudnn.benchmark = False


def load_data(dataset='cifar10', batch_size=128, num_workers=4):
    """
    Build the train and test data loaders for a CIFAR dataset.

    Training images get a padded random crop and a random horizontal flip
    before normalisation; test images are only converted and normalised.
    Data is stored under (and downloaded to) './data'.

    :param dataset: Can be either 'cifar10' or 'cifar100'
    :param batch_size: The desired batch size
    :param num_workers: Number of worker processes given to each DataLoader
    :return: Tuple (train_loader, test_loader, num_classes)
    :raises ValueError: if ``dataset`` is neither 'cifar10' nor 'cifar100'
    """
    print('==> Preparing data..')
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2023, 0.1994, 0.2010)

    train_steps = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    transform_train = transforms.Compose(train_steps)

    test_steps = [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    transform_test = transforms.Compose(test_steps)

    # Pick the dataset class and its label count; both splits are built the same way.
    if dataset == 'cifar10':
        # classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
        dataset_cls, num_classes = torchvision.datasets.CIFAR10, 10
    elif dataset == 'cifar100':
        dataset_cls, num_classes = torchvision.datasets.CIFAR100, 100
    else:
        raise ValueError('Only cifar 10 and cifar 100 are supported')

    trainset = dataset_cls(root='./data', train=True, download=True, transform=transform_train)
    testset = dataset_cls(root='./data', train=False, download=True, transform=transform_test)

    loader_options = dict(batch_size=batch_size, num_workers=num_workers)
    trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_options)
    testloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_options)

    return trainloader, testloader, num_classes
    
    
def accuracy_and_loss(net, dataloader, device, criterion):
    """
    Evaluate ``net`` on every batch of ``dataloader`` without tracking gradients.

    The network is switched to eval mode and left in it.

    :param net: model called as ``net(inputs)``
    :param dataloader: sized iterable of ``(inputs, labels)`` batches
    :param device: device the batches are moved to
    :param criterion: loss callable invoked as ``criterion(outputs, labels)``
    :return: Tuple (fraction of correctly classified samples,
             average of the per-batch criterion values)
    """
    net.eval()
    n_correct, n_seen, mean_loss = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = net(inputs)
            # Index of the largest output along dim 1 is the predicted class.
            predicted = torch.max(logits.data, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
            batch_loss = criterion(logits, labels).cpu().item()
            mean_loss += batch_loss / len(dataloader)

    return n_correct / n_seen, mean_loss

def save_results(losses, test_losses, train_acc, test_acc, it_train, it_test, grad_norms, method='sgd', 
                 lrs=[], experiment='cifar10_resnet18', folder='./', to_save_extra=[], prefixes_extra=[]):
    """
    Save every log as ``<folder>/<experiment>/<method>_<prefix>.npy``.

    The prefixes are 'l', 'tl', 'a', 'ta', 'itr', 'ite', 'gn', 'lr' for
    losses, test_losses, train_acc, test_acc, it_train, it_test, grad_norms
    and lrs respectively, followed by ``prefixes_extra`` for ``to_save_extra``.

    :param method: file-name prefix identifying the optimisation method
    :param lrs: logged step sizes (saved even when empty)
    :param experiment: sub-directory of ``folder`` that receives the files
    :param folder: output root; may be relative or absolute
    :param to_save_extra: additional logs to store
    :param prefixes_extra: one file-name prefix per entry of ``to_save_extra``
    :return: None
    """
    # Join with pathlib instead of f'./{folder}/...': the string form turned an
    # absolute ``folder`` into a path relative to the working directory.
    out_dir = Path(folder) / experiment
    out_dir.mkdir(parents=True, exist_ok=True)
    # The mutable defaults are only read; list(...) builds fresh lists and also
    # lets callers pass tuples.
    to_save = [losses, test_losses, train_acc, test_acc, it_train, it_test, grad_norms, lrs] + list(to_save_extra)
    prefixes = ['l', 'tl', 'a', 'ta', 'itr', 'ite', 'gn', 'lr'] + list(prefixes_extra)
    for log, prefix in zip(to_save, prefixes):
        np.save(str(out_dir / f'{method}_{prefix}.npy'), log)
        
def load_results(method, logs_path, load_lr=False):
    """
    Load the arrays stored as ``<logs_path>/<method>_<prefix>.npy``.

    :param method: file-name prefix identifying the optimisation method
    :param logs_path: directory holding the files; a ``str`` (with or without
        a trailing '/') or a ``pathlib.Path``. An empty string means the
        current directory.
    :param load_lr: when True, the 'lr' array is appended to the result
    :return: Tuple of arrays in the order 'l', 'tl', 'a', 'ta', 'itr', 'ite',
             'gn' (and 'lr' last when requested)
    """
    # Path() handles the trailing separator, empty strings and Path inputs,
    # all of which broke the previous ``logs_path[-1]`` check.
    base = Path(logs_path)
    prefixes = ['l', 'tl', 'a', 'ta', 'itr', 'ite', 'gn']
    if load_lr:
        prefixes.append('lr')
    return tuple(np.load(str(base / f'{method}_{prefix}.npy')) for prefix in prefixes)


In [None]:
import copy
import numpy as np
import torch

from optimizer import Adsgd
from utils import load_data, accuracy_and_loss, save_results, seed_everything


def run_adgd(net, n_epoch=2, amplifier=0.02, damping=1, weight_decay=0, eps=1e-8, checkpoint=125, batch_size=128, noisy_train_stat=True):
    """Train ``net`` with the Adsgd optimizer and collect training logs.

    A deep copy of the network (``prev_net``) holds the parameters from before
    the most recent step. On every batch both networks are evaluated on the
    same inputs so that ``Adsgd.compute_dif_norms`` can compare their
    parameters and gradients before the step is taken.

    NOTE: this function reads the module-level names ``device``,
    ``trainloader``, ``testloader`` and ``N_train``; they are not parameters
    and must be defined before the call.

    Arguments:
        net: model to train; its parameters are updated in place.
        n_epoch: number of passes over ``trainloader``.
        amplifier, damping, weight_decay, eps: forwarded to ``Adsgd`` for the
            optimizer of ``net`` (the optimizer of ``prev_net`` only receives
            ``weight_decay``).
        checkpoint: number of iterations between test-set evaluations.
        batch_size: used only to convert an iteration index into a fractional
            epoch (``epoch + i * batch_size / N_train``).
        noisy_train_stat: if True, log the mini-batch loss every 10 iterations;
            if False, evaluate loss and accuracy on the whole training loader
            at the end of each epoch instead.

    Returns:
        Tuple of NumPy arrays, in this order: (losses, test_losses, train_acc,
        test_acc, it_train, it_test, lrs, grad_norms). ``train_acc`` is only
        filled when ``noisy_train_stat`` is False.
    """
    losses = []
    train_acc = []
    test_losses = []
    test_acc = []
    it_train = []
    it_test = []
    grad_norms = []
    
    # Second copy of the model that lags one step behind ``net``.
    prev_net = copy.deepcopy(net)
    prev_net.to(device)
    net.train()
    prev_net.train()
    lrs = []
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = Adsgd(net.parameters(), amplifier=amplifier, damping=damping, weight_decay=weight_decay, eps=eps)
    # step() is never called on prev_optimizer in this function; it is used for
    # zero_grad() and as the argument of compute_dif_norms().
    prev_optimizer = Adsgd(prev_net.parameters(), weight_decay=weight_decay)
            
    for epoch in range(n_epoch):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad(set_to_none=True)
            prev_optimizer.zero_grad(set_to_none=True)

            # Gradient of the previous iterate on the current mini-batch.
            prev_outputs = prev_net(inputs)
            prev_loss = criterion(prev_outputs, labels)
            prev_loss.backward()

            # Gradient of the current iterate on the same mini-batch.
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # Order matters: the differences are measured first, then prev_net
            # is overwritten with the current state, and only then does net move.
            optimizer.compute_dif_norms(prev_optimizer)
            prev_net.load_state_dict(net.state_dict())
            optimizer.step()

            running_loss += loss.item()
            if (i % 10) == 0:
                if noisy_train_stat:
                    losses.append(loss.cpu().item())
                    it_train.append(epoch + i * batch_size / N_train)
                # Step size of the first parameter group after this step.
                lrs.append(optimizer.param_groups[0]['lr'])

            if i % checkpoint == checkpoint - 1:
                # Print the mean training loss since the last checkpoint, with
                # one extra decimal when it is below 0.01.
                if running_loss / checkpoint < 0.01:
                    print('[%d, %5d] loss: %.4f' %
                          (epoch + 1, i + 1, running_loss / checkpoint), end='')
                else:
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / checkpoint), end='')
                running_loss = 0.0
                test_a, test_l = accuracy_and_loss(net, testloader, device, criterion)
                test_acc.append(test_a)
                test_losses.append(test_l)
                # Sum of the per-parameter gradient norms (not the norm of the
                # concatenated gradient).
                grad_norms.append(np.sum([p.grad.data.norm().item() for p in net.parameters()]))
                # accuracy_and_loss() switched the model to eval mode.
                net.train()
                it_test.append(epoch + i * batch_size / N_train)
                
        if not noisy_train_stat:
            # Full pass over the training loader once per epoch.
            it_train.append(epoch)
            train_a, train_l = accuracy_and_loss(net, trainloader, device, criterion)
            train_acc.append(train_a)
            losses.append(train_l)
            net.train()

    del prev_net
    # NOTE: lrs comes before grad_norms here, unlike the parameter order of save_results().
    return (np.array(losses), np.array(test_losses), np.array(train_acc), np.array(test_acc),
            np.array(it_train), np.array(it_test), np.array(lrs), np.array(grad_norms))


if __name__ == "__main__":
    # Train ResNet18 on Cifar10 data
    from resnet import ResNet18

    # Command-line interface kept for reference; it is disabled and the
    # settings are defined as plain variables below instead.
    # import argparse
    # parser = argparse.ArgumentParser('Adaptive gradient descent (AdGD) on CIFAR-10')
    # parser.add_argument('--lr_amplifier', type=float, default=0.02,
    #     help='Coefficient alpha for multiplying the stepsize by (1+alpha) (default: 0.02).')
    # parser.add_argument('--lr_damping', type=float, default=1.,
    #     help='Divide the inverse smoothness by damping (default: 1.).')
    # parser.add_argument('--weight_decay', type=float, default=0.,
    #     help='Weight decay parameter (default: 0.).')
    # parser.add_argument('--batch_size', type=int, default=128,
    #     help='Mini-batch size (default: 128).')
    # parser.add_argument('--n_epoch', type=int, default=120,
    #     help='Number of passes over the data (default: 120).')
    # parser.add_argument('--n_seeds', type=int, default=1,
    #     help='Number of random seeds to run the method (default: 1).')
    # parser.add_argument('--output_folder', type=str, default='./',
    #     help='Path to the output folder for saving the logs (optional).')
    # args = parser.parse_args()

    # run_adgd() reads device, trainloader, testloader and N_train as globals,
    # so these names must stay defined at module level.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Experiment settings, defined once and reused below.
    batch_size = 128
    n_epoch = 4
    amplifier = 0.02
    damping = 1.
    weight_decay = 0.
    experiment = 'cifar10_resnet18'
    # Relative to the working directory (the notebook changes into `pytorch/`
    # before this cell), so logs land in <repo>/saved_data. The previous
    # absolute '/content/...' path was prefixed with './' by save_results and
    # therefore ended up nested under the working directory.
    output_folder = '../saved_data'

    trainloader, testloader, num_classes = load_data(batch_size=batch_size)
    # Dataset size taken from the loader instead of a hard-coded 50000.
    N_train = len(trainloader.dataset)
    checkpoint = len(trainloader) // 3 + 1

    n_seeds = 1
    max_seed = 424242
    rng = np.random.default_rng(42)
    # int(...) turns the NumPy scalar into a built-in int, which random.seed()
    # requires on recent Python versions.
    seeds = [int(rng.choice(max_seed, size=1, replace=False)[0]) for _ in range(n_seeds)]

    # NOTE: every seed writes to the same file names, so with n_seeds > 1 only
    # the logs of the last run are kept.
    for seed in seeds:
        seed_everything(seed)
        net = ResNet18()
        net.to(device)
        (losses_adgd, test_losses_adgd, train_acc_adgd, test_acc_adgd,
         it_train_adgd, it_test_adgd, lrs_adgd, grad_norms_adgd) = run_adgd(
            net=net, n_epoch=n_epoch, amplifier=amplifier, damping=damping, weight_decay=weight_decay,
            checkpoint=checkpoint, batch_size=batch_size, noisy_train_stat=False
        )
        method = f'adgd_{amplifier}_{damping}'
        save_results(losses_adgd, test_losses_adgd, train_acc_adgd, test_acc_adgd, it_train_adgd, it_test_adgd,
                     lrs=lrs_adgd, grad_norms=grad_norms_adgd, method=method, experiment=experiment,
                     folder=output_folder)

==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified


  cpuset_checked))


[1,   131] loss: 1.938[1,   262] loss: 1.550[2,   131] loss: 1.273[2,   262] loss: 1.170[3,   131] loss: 1.012[3,   262] loss: 0.930[4,   131] loss: 0.848[4,   262] loss: 0.846