# Training Environment

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch  # Package name: torch (for pip), pytorch (for conda)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

class BasicDataset(data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __getitem__(self, idx):
        return dict(X=self.X[idx], y=self.y[idx])
    
    def __len__(self):
        return self.X.shape[0]

## Prepare Data

In [2]:
import torchvision.datasets as datasets

mnist = False
cifar10 = True
cifar100 = False
assert mnist ^ cifar10 ^ cifar100

n_classes = 10
if mnist:
    trainset = datasets.MNIST(root='./data', train=True, download=True, transform=None)
    testset = datasets.MNIST(root='./data', train=False, download=True, transform=None)
if cifar10:
    trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=None)
    testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=None)
if cifar100:
    n_classes = 100
    trainset = datasets.CIFAR100(root='./data', train=True, download=True, transform=None)
    testset = datasets.CIFAR100(root='./data', train=False, download=True, transform=None)

print(trainset)
print(testset)

Files already downloaded and verified
Files already downloaded and verified
Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data
    Split: Train
Dataset CIFAR10
    Number of datapoints: 10000
    Root location: ./data
    Split: Test


In [3]:
def to_rgb(x_grey: torch.Tensor) -> torch.Tensor:
    if len(x_grey.size()) == 3:
        helper = torch.unsqueeze(x_grey, 1)
        return helper.repeat(1, 3, 1, 1).float()
    elif len(x_grey.size()) == 4 and x_grey.size()[1] == 1:
        return x_grey.repeat(1, 3, 1, 1).float()
    elif len(x_grey.size()) == 4 and x_grey.size()[1] == 3:
        return x_grey
    elif len(x_grey.size()) == 4:
        raise ValueError(f'The size of this image tensor is not valid.\
        A 4th order image tensor must have dim1==1 (grey-scale) or dim1==3 (rgb).\
        Unknown format cannot be transformed to rgb.')
    else:
        raise ValueError(f'The size of this image-tensor is not valid.\
        Must be either 3rd (grey-scale) order tensor or 4th order tensor (rgb).\
        Got order {len(x_grey.size())}')
        
def swap_data(X):
    X1 = np.swapaxes(X, 1, 3)
    X2 = np.swapaxes(X1, 2, 3)
    return X2

if mnist:
    X_train_grey = trainset.train_data
    X_train = to_rgb(X_train_grey)
    X_test_grey = testset.test_data
    X_test = to_rgb(X_test_grey)
    y_train = trainset.train_labels
    y_test = testset.test_labels
else:
    X_train = torch.tensor(swap_data(trainset.data)[:128])
    y_train = torch.tensor(trainset.targets[:128])
    X_test = torch.tensor(swap_data(testset.data)[:128])
    y_test = torch.tensor(testset.targets[:128])
    
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: torch.Size([128, 3, 32, 32])
y_train: torch.Size([128])
X_test: torch.Size([128, 3, 32, 32])
y_test: torch.Size([128])


## Model load, modifications and GPU optimization

In [4]:
# https://github.com/kuangliu/pytorch-cifar/blob/49b7aa97b0c12fe0d4054e670403a16b6b834ddd/models/resnet.py

'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion *
                               planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def ResNet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])


def ResNet50():
    return ResNet(Bottleneck, [3, 4, 6, 3])


def ResNet101():
    return ResNet(Bottleneck, [3, 4, 23, 3])


def ResNet152():
    return ResNet(Bottleneck, [3, 8, 36, 3])


def test():
    net = ResNet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())

#test()

In [5]:
from torchvision import models

#model = ResNet18() # set model here
model = models.resnet18(pretrained=False)
in_ftr = model.fc.in_features
out_ftr = n_classes
model.fc = nn.Linear(in_ftr,out_ftr,bias=True)
    
dataset_train = BasicDataset(X_train, y_train)
dataset_test = BasicDataset(X_test, y_test)

Check specs for GPU-based training.

In [6]:
print('cuda available:', torch.cuda.is_available())
print('cuda device name:', torch.cuda.get_device_name())
print('cuda device id', torch.cuda.current_device())

cuda available: True
cuda device name: NVIDIA GeForce 940MX
cuda device id 0


Move model and data to GPU.

In [7]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
model.to(device)
X_train.to(device)
print('worked?', X_train.to(device).is_cuda)

worked? False


In [68]:
from typing import List
import copy

class AdaSecant(optim.Optimizer):
    r"""Documentation
    Basis copied from https://github.com/pytorch/pytorch/blob/master/torch/optim/sgd.py.
    Left out closure, momentum-related stuff, __setstate__ as it does not seem to be necessary.
    """

    def __init__(self, params, max_lr=1.0, lr=None):
        if lr is not None:
            print('Warning: lr is not a parameter for AdaSecant. Your lr will be set to None')
            lr = None
        defaults = dict(lr=lr)
        super().__init__(params, defaults)
        self.ready = False
        self.current_gradients = None
        self.gamma_numerators = []
        self.gamma_denomenators = []
        self.mean_gradients = []
        self.mean_gradient_squares = []
        self.mean_deltas = []
        self.mean_delta_squares = []
        self.mean_alphas = []
        self.mean_alpha_squares = []
        self.mean_delta_times_alphas = []
        self.old_gradients = []
        self.old_deltas = []
        self.taus = []
        # lr threshold for simple adasecant
        self.max_lr = max_lr
        # stop tau from increasing infinetely
        self.upper_bound_tau = 1e7
        self.lower_bound_tau = 1.5
        for group in self.param_groups:
            for p in group['params']:
                if p.requires_grad:
                    self.gamma_numerators.append(torch.zeros_like(p))
                    self.gamma_denomenators.append(torch.zeros_like(p))
                    self.mean_gradients.append(torch.zeros_like(p))
                    self.mean_gradient_squares.append(torch.zeros_like(p))
                    self.mean_deltas.append(torch.zeros_like(p))
                    self.mean_delta_squares.append(torch.zeros_like(p))
                    self.mean_alphas.append(torch.zeros_like(p))
                    self.mean_alpha_squares.append(torch.zeros_like(p))
                    self.mean_delta_times_alphas.append(torch.zeros_like(p))
                    self.taus.append(torch.ones_like(p))
                    self.old_gradients.append(None)
                    self.old_deltas.append(None)        

    @torch.no_grad()
    def step(self, epoch, version='normal'):
        """Performs a single optimization step.
        """

        average_lr = 0
        for group in self.param_groups:
            params_with_grad = []
            next_gradients = []

            for p in group['params']:
                # subgrouping of parameters for each layer, bias and weights separately (each tensor)
                if p.grad is not None:
                    params_with_grad.append(p)
                    next_gradients.append(p.grad)
            
            #print(group)
            #print('enter adasecant')
            if version == 'normal':
                average_lr += adasecant(self, params_with_grad, next_gradients, epoch)
            else:
                average_lr += simple_adasecant(self, params_with_grad, next_gradients, epoch, version)

        return average_lr / len(self.param_groups)

                
def moving_average(mean, new_value, tau):
    return (1 - 1 / tau) * mean + (1 / tau) * new_value


def needs_memory_reset(g, alpha, optimizer, i):
    return torch.logical_or(torch.gt(torch.abs(g - optimizer.mean_gradients[i]),
                                     2 * torch.sqrt(torch.abs(optimizer.mean_gradient_squares[i] 
                                                              - optimizer.mean_gradients[i] ** 2))),
                            torch.gt(torch.abs(alpha - optimizer.mean_alphas[i]),
                                     2 * torch.sqrt(torch.abs(optimizer.mean_alpha_squares[i]
                                                              - optimizer.mean_alphas[i] ** 2))))

simple adasecant variations: delta adasecant, gradient adasecant

In [69]:
def simple_adasecant(optimizer: AdaSecant, params: List[torch.Tensor], gradients: List[torch.Tensor], epoch: int,
                    version: str):

    average_lr = 0

    for i, param in enumerate(params):

        epsilon = 1e-7
        
        g = gradients[i]

        sgd = -0.01 * g
        
        if optimizer.old_gradients[i] is None:
            # alpha = 0 for first iteration because no second derivative can be made yet
            alpha = copy.deepcopy(g)
        else:
            # normal calculation of alpha
            alpha = g - optimizer.old_gradients[i]
            #if (alpha == 0).count_nonzero() > 0:
            #    print(alpha)
        
        if optimizer.old_deltas[i] is None:
            # delta = -lr * corrected_gradient, lr = 1 as initialization
            delta = -copy.deepcopy(g)
        else:
            delta = optimizer.old_deltas[i]      
        
        if version == 'delta':
            d = delta
        elif version == 'gradient':
            d = g
        else:
            raise RuntimeError(f'{version} is unknown version ("normal", "delta", "gradient" are valid versions)')
        lr_threshold = optimizer.max_lr
        lr = torch.abs(d / alpha)
        # version1 - investigate why this does not work as well (lr smaller than version2, but why?):
        #lr = torch.where(lr < float("inf"), lr, torch.full_like(lr, 0.0))
        #lr = torch.where(lr < lr_threshold, lr, torch.full_like(lr, lr_threshold))
        # version2 - worked better:
        lr = lr = torch.where(lr < lr_threshold, lr, torch.full_like(lr, 0.0))
        average_lr += torch.mean(lr).item()
        
        optimizer.taus[i] += 1
        
        if torch.linalg.norm(g) == 0:
            new_delta = 0
        else:
            if version == 'delta':
                new_delta = -lr * g
            elif version == 'gradient':
                new_delta = -lr * g / torch.linalg.norm(g)            
        params[i] += new_delta
        
        debug = False
        if debug:
            if i == 1:
                #print('tau', optimizer.taus[i][0], '\n')
                print('i', i)
                print('g', g[0], '\n')
                if optimizer.old_gradients[i] is not None:
                    print('old g', optimizer.old_gradients[i][0], '\n')
                print('alpha', alpha[0], '\n')
                print('lr', lr[0], '\n')
                #print('corrected g', corrected_gradient[0], '\n')
                #print('new delta', new_delta[0], '\n')
                #print('sgd', sgd[0], '\n')
                #print('params', params[i][0], '\n')        
        
        #optimizer.old_deltas[i] = copy.deepcopy(new_delta)
        optimizer.old_gradients[i] = copy.deepcopy(g)
                         
    return average_lr / len(params)

dataloader

In [70]:
import itertools
from more_itertools import peekable

def adasecant_dataloader(dataset, batch_size, shuffle=False, drop_last=False):
    data_loader = peekable(iter(data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)))
    return data_loader

data_loader = adasecant_dataloader(dataset_test, 60, True, True)
for batch in data_loader:
    #print('current', batch['y'])
    try:
        peek = data_loader.peek()
        #print('next', peek['y'])
    except StopIteration:
        pass

## Training

In [71]:
def evaluate_model(model, dataset, batch_size=100):
    
    data_loader = data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, drop_last=False)
    loss = 0.0
    correct = 0
    
    for batch in data_loader:
        yhat = model.forward(batch['X'].float().to(device))
        y = batch['y'].long().to(device)
        batch_loss = f_loss(yhat, y)
        loss += batch_loss.item() * len(batch['X'])
        correct += (torch.argmax(yhat, dim=1) == y).float().sum().item()
    accuracy = correct / len(dataset)
    print('accuracy:', f'{accuracy * 100}%')
    
    return loss, accuracy

def get_scheduler(optimizer, base_lr, max_lr, epochs_per_cycle, len_dataset, batch_size):
    if epochs_per_cycle is None:
        epochs_per_cycle = epochs
    iterations_per_cycle = epochs_per_cycle * (len_dataset // batch_size)
    return torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr, max_lr, step_size_up=iterations_per_cycle / 2)

In [72]:
def train_SGD(model, dataset, validation_set, base_lr=0.01, max_lr=0.01, batch_size=64, epochs=1, 
              f_loss=F.cross_entropy, epochs_per_cycle=None,
              lr_history=[], validation_accuracy=[], epoch_losses=[], validation_losses=[]):
    
    optimizer = f_opt(model.parameters(), lr=base_lr)
    scheduler = get_scheduler(optimizer, base_lr, max_lr, epochs_per_cycle, len(dataset), batch_size)

    for epoch in range(epochs):
        # training and epoch loss logging
        # drop last to avoid stochastic outliers in gradient update
        data_loader = data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        epoch_loss = 0.0 
        for batch in data_loader:
            model.zero_grad()
            yhat = model.forward(batch['X'].float().to(device))
            batch_loss = f_loss(yhat, batch['y'].long().to(device))
            epoch_loss += batch_loss.item() * len(batch['X'])
            batch_loss.backward()
            optimizer.step()
            scheduler.step()
            lr_history.append(scheduler.get_last_lr()[0])
        print(f'Epoch {epoch+1}/{epochs} - Loss: {epoch_loss / len(dataset)} - LR: {scheduler.get_last_lr()[0]}')
        epoch_losses.append(epoch_loss)
        
        # calculate validation loss and accuracy
        validation_loss, accuracy = evaluate_model(model, validation_set)
        validation_losses.append(validation_loss)
        validation_accuracy.append(accuracy)
    
    return(epoch_losses, validation_losses, validation_accuracy, lr_history, None)

In [73]:
def train_SAS(model, dataset, validation_set,
              max_lr=1.0, last_update_step=None, batch_size=64, epochs=1, f_loss=F.cross_entropy,
              lr_history=[], validation_accuracy=[], epoch_losses=[], validation_losses=[]):
    
    torch.autograd.set_detect_anomaly(True)
    
    optimizer = AdaSecant(model.parameters(), max_lr=max_lr)

    for epoch in range(epochs):
        # training and epoch loss logging
        # drop last to avoid stochastic outliers in gradient update
        data_loader = adasecant_dataloader(dataset=dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        epoch_loss = 0.0
        
        for batch in data_loader:
            model.zero_grad()
            yhat = model.forward(batch['X'].float().to(device))
            batch_loss = f_loss(yhat, batch['y'].long().to(device))
            epoch_loss += batch_loss.item() * len(batch['X'])
            batch_loss.backward()
            lr = optimizer.step(epoch, version='delta')
            lr_history.append(lr)            
        print(f'Epoch {epoch+1}/{epochs} - Loss: {epoch_loss / len(dataset)} - LR: {lr}')
        epoch_losses.append(epoch_loss)
        
        # calculate validation loss and accuracy
        validation_loss, accuracy = evaluate_model(model, validation_set)
        validation_losses.append(validation_loss)
        validation_accuracy.append(accuracy)
    
    return(epoch_losses, validation_losses, validation_accuracy, lr_history)

In [74]:
def train_model(model, dataset, validation_set, clr_min=0.001, clr_max=1.0,
                epochs_switch=10, max_lr=1.0, batch_size=64, epochs=100,
                f_opt=optim.SGD, f_loss=F.cross_entropy, epochs_per_cycle=10):
    
    lr_history = []
    validation_accuracy = []
    epoch_losses = []
    validation_losses = []
    
    # evaluate initial state of model
    initial_training_loss, _ = evaluate_model(model, dataset)
    epoch_losses.append(initial_training_loss)
    validation_loss, accuracy = evaluate_model(model, validation_set)
    validation_losses.append(validation_loss)
    validation_accuracy.append(accuracy)
    training_losses = []
    
    (training_losses,
     validation_losses,
     validation_accuracy,
     lr_history,
     last_update_step) = train_SGD(model=model,
                                   dataset=dataset,
                                   validation_set=validation_set,
                                   base_lr=clr_min,
                                   max_lr=clr_max,
                                   batch_size=batch_size,
                                   epochs=epochs_switch,
                                   f_loss=f_loss,
                                   epochs_per_cycle=epochs_per_cycle,
                                   epoch_losses=training_losses,
                                   validation_losses=validation_losses,
                                   validation_accuracy=validation_accuracy,
                                   lr_history=lr_history)
    
    training_losses, validation_losses, validation_accuracy, lr_history = train_SAS(model=model,
                                                                                    dataset=dataset,
                                                                                    validation_set=validation_set,
                                                                                    max_lr=max_lr,
                                                                                    last_update_step=last_update_step,
                                                                                    batch_size=batch_size,
                                                                                    epochs=epochs-epochs_switch,
                                                                                    f_loss=f_loss,
                                                                                    epoch_losses=training_losses,
                                                                                    validation_losses=validation_losses,
                                                                                    validation_accuracy=validation_accuracy,
                                                                                    lr_history=lr_history)

    
    return (np.array(epoch_losses) / len(dataset), 
            np.array(validation_losses) / len(validation_set), 
            validation_accuracy, 
            lr_history)

In [75]:
clr_min = 0.001
clr_max = 1.0
max_lr = 10.0
batch_size = 128
cycle = 10
epochs_switch = 1
epochs = 100
f_opt = optim.SGD
f_loss = F.cross_entropy

training_loss, validation_loss, validation_accuracy, lr_history = train_model(model.to(device),
                                                                              dataset_train,
                                                                              dataset_test,
                                                                              clr_min,
                                                                              clr_max,
                                                                              epochs_switch,
                                                                              max_lr,
                                                                              batch_size,
                                                                              epochs,
                                                                              cycle,
                                                                              f_loss=f_loss)

<class 'function'>
accuracy: 17.1875%
accuracy: 7.03125%
Epoch 1/1 - Loss: 2.291903018951416 - LR: 0.20080000000000017
accuracy: 7.03125%
Epoch 1/99 - Loss: 2.291698455810547 - LR: 0.8540937530718022
accuracy: 7.03125%
Epoch 2/99 - Loss: 2.23301362991333 - LR: 1.309383293012938
accuracy: 8.59375%
Epoch 3/99 - Loss: 2.1480214595794678 - LR: 1.0087529065147522
accuracy: 14.0625%
Epoch 4/99 - Loss: 2.1317553520202637 - LR: 0.8781761388865209
accuracy: 10.15625%
Epoch 5/99 - Loss: 2.093514919281006 - LR: 0.9906340591128795
accuracy: 10.15625%


KeyboardInterrupt: 

In [None]:
plt.plot(training_loss, label='training loss')
plt.plot(validation_loss, label='validation loss')
plt.xlabel('Epochs')
plt.ylabel('Mean Cross Entropy Loss')
#plt.ylim(0.0, 10)
plt.legend()

In [None]:
plt.plot(lr_history)
plt.ylim(0, 0.001)

In [None]:
plt.plot(validation_accuracy)
print(max(validation_accuracy))

In [None]:
import csv
from google.colab import files

with open('train_loss', 'w') as f:
    write = csv.writer(f)
    write.writerow(training_loss)

with open('val_loss', 'w') as f: 
    write = csv.writer(f)  
    write.writerow(validation_loss)

with open('val_accuracy', 'w') as f: 
    write = csv.writer(f)  
    write.writerow(validation_accuracy)
    
with open('lr_history', 'w') as f: 
    write = csv.writer(f)  
    write.writerow(lr_history)

files.download('train_loss')
files.download('val_loss')
files.download('val_accuracy')
files.download('lr_history')