In [22]:
# Mount the user's Google Drive at /content/drive (requires the Google Colab
# runtime, which provides the google.colab package). Later cells add a Drive
# folder to sys.path and write weights, metrics and figures under the Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
! git clone https://github.com/amirgholami/adahessian.git #original, if we want to modify something need to replace with a forked one

fatal: destination path 'adahessian' already exists and is not an empty directory.


In [24]:
import sys

# Extra import locations, listed in the order they are pushed onto the front of
# sys.path (so the LAST entry ends up with the highest priority):
#   1. the project folder on Drive, to access other files of the project
#   2. the image-classification folder of the cloned adahessian repository
for extra_dir in ('/content/drive/MyDrive/Opti-ML', "adahessian/image_classification/"):
    sys.path.insert(0, extra_dir)
#import os
#assert os.environ['COLAB_TPU_ADDR'], 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [25]:
# Useful starting lines
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
from torchvision import datasets, models, transforms, utils
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import copy
#import torch_xla
#import torch_xla.core.xla_model as xm

from optimizer import set_parameter_requires_grad, initialize_model, train_and_test, test_model
from optim_adahessian import Adahessian

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# CIFAR-10 train/test splits, converted to tensors and rescaled per channel.
is_renormalized = True

# Normalize computes (x - mean) / std per channel with mean 0 here, so the three
# channels are divided by 1e-4, 1 and 1e4 respectively when is_renormalized is
# set; with std = 1 everywhere the step is an identity.
# NOTE(review): the extreme scales look like a deliberate badly-scaled-input
# experiment -- confirm.
channel_std = [0.0001,1,10000] if is_renormalized else [1,1,1]
renormalize = transforms.Normalize([0,0,0], channel_std, inplace=False)
transform = transforms.Compose([ToTensor(), renormalize])

# Arguments shared by both splits; only the `train` flag differs.
cifar_kwargs = dict(root='data', download=True, transform=transform)
train_data = datasets.CIFAR10(train=True, **cifar_kwargs)
test_data = datasets.CIFAR10(train=False, **cifar_kwargs)

Files already downloaded and verified
Files already downloaded and verified


In [27]:
# Optionally replace both splits by small random subsets for quick experiments.
make_small = False  # set to True to work on 1000 train / 200 test samples

# Read the class labels from the underlying dataset instead of hard-coding them:
# a torch.utils.data.Subset has no `.targets` attribute, so unwrap it first
# (this also keeps the cell working when it is re-run after subsetting).
base_train = train_data.dataset if isinstance(train_data, torch.utils.data.Subset) else train_data
labels = np.unique(np.array(base_train.targets))

if make_small:
  subset_seed = 0  # fixed seed so the same samples are drawn on every run
  rng = np.random.default_rng(subset_seed)

  # Sample without replacement so no image appears twice in a subset.
  indices = rng.choice(len(train_data), 1000, replace=False)
  train_data = torch.utils.data.Subset(train_data, indices)

  indices = rng.choice(len(test_data), 200, replace=False)
  test_data = torch.utils.data.Subset(test_data, indices)
print(labels)

[0 1 2 3 4 5 6 7 8 9]


In [28]:
# Sanity check: report the dataset size and the structure of its first sample.
first_sample = train_data[0]
first_image = first_sample[0]
first_label = first_sample[1]

print("Data of length",len(train_data) )
print("samples of type",type(first_sample),"with length",len(first_sample))
print("   - First entry: Image of type",type(first_image),"and size",first_image.size())
print("   - Second entry: Label of type",type(first_label))

Data of length 50000
samples of type <class 'tuple'> with length 2
   - First entry: Image of type <class 'torch.Tensor'> and size torch.Size([3, 32, 32])
   - Second entry: Label of type <class 'int'>


In [29]:
# Collect every (transformed) training image into one array and report
# per-channel statistics. The buffer is hard-coded to 3x32x32 images.
n = len(train_data)
images = np.zeros((n, 3, 32, 32))
for idx in range(n):
  images[idx] = train_data[idx][0]  # element 0 of a sample is the image tensor

# Move the channel axis last: (n, 3, 32, 32) -> (n, 32, 32, 3).
images = images.transpose(0, 2, 3, 1)

# Reduce over samples and both spatial axes, leaving one value per channel.
per_channel_min = images.min(axis=(0, 1, 2))
per_channel_max = images.max(axis=(0, 1, 2))
print(per_channel_min, per_channel_max)

# new method: per-channel standard deviation over the whole training set
print(images.std(axis=(0, 1, 2)))
# Old method (disabled): the code below is kept inside a string literal, so it
# is never executed. Because the string is the last expression of the cell, the
# notebook echoes its text as the cell output.
# It computed mean/std images with images.mean/std(axis=(0,3)), plotted them, and
# built `sigma2`, a rescaled copy of `sigma` (see the `factor` line).
"""
mu = images.mean(axis=(0,3))
sigma = images.std(axis=(0,3))

plt.imshow(mu)
plt.title("mean")
plt.figure()
plt.imshow(sigma)
plt.title("std")
plt.colorbar()

std_max = np.amax(sigma)
std_min = np.amin(sigma)
factor = 10
sigma2 = (sigma - std_min)/(std_max-std_min)*(std_max - std_min/factor) + std_min/factor
print(np.shape(sigma2))
plt.figure()
plt.imshow(sigma2)
plt.title("modified std")
plt.colorbar()
"""

[0. 0. 0.] [1.00000000e+04 1.00000000e+00 9.99999975e-05]
[2.47032241e+03 2.43485131e-01 2.61587844e-05]


'\nmu = images.mean(axis=(0,3))\nsigma = images.std(axis=(0,3))\n\nplt.imshow(mu)\nplt.title("mean")\nplt.figure()\nplt.imshow(sigma)\nplt.title("std")\nplt.colorbar()\n\nstd_max = np.amax(sigma)\nstd_min = np.amin(sigma)\nfactor = 10\nsigma2 = (sigma - std_min)/(std_max-std_min)*(std_max - std_min/factor) + std_min/factor\nprint(np.shape(sigma2))\nplt.figure()\nplt.imshow(sigma2)\nplt.title("modified std")\nplt.colorbar()\n'

In [30]:
# Old manual method (disabled): kept inside a string literal, so nothing here
# runs; the notebook only echoes the string as the cell output.
# It relied on `mu`, `sigma` and `sigma2` from the (also disabled) old statistics
# code to rescale each training image in place.
"""
for i in range(n):
  train_data[i][0] = (train_data[i][0] - mu)/sigma * sigma2 + mu
"""

'\nfor i in range(n):\n  train_data[i][0] = (train_data[i][0] - mu)/sigma * sigma2 + mu\n'

In [31]:
batch_size = 256  # samples per mini-batch; change it to fit our needs

# Training batches are reshuffled at every pass over the data.
trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Evaluation only aggregates over the whole test set, so shuffling it brings
# nothing: keep a fixed order so evaluation is deterministic and does not draw
# from the global random generator between training epochs.
testloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Human-readable class names (presumably in label-index order -- TODO confirm).
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [32]:
# Pick the compute backend in order of preference: CUDA, then MPS, then the CPU
# as a fallback. The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

#device = xm.xla_device()

Using cpu device


In [33]:
# When True, the next cell hands the optimizer an explicit list containing only
# the parameters whose requires_grad flag is set.
req_grad = True

seed = 100
# torch.manual_seed seeds the default (CPU) generator as well as the CUDA ones.
# Seeding only torch.cuda.* has no effect on a CPU run, which would leave the
# weight initialisation and the DataLoader shuffling unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Initialize the model for this run (use_pretrained=False is forwarded to the
# project helper; the helper also returns an input size).
model, input_size = initialize_model(use_pretrained=False)



In [34]:
# Move the model to the selected device before collecting its parameters.
model = model.to(device)

# Every (name, tensor) pair whose requires_grad flag is currently set.
trainable_params = [
    (pname, tensor)
    for pname, tensor in model.named_parameters()
    if tensor.requires_grad
]

print("Params to learn:")
for pname, _tensor in trainable_params:
    print("\t",pname)

# Gather the parameters to be optimized/updated in this run:
#  - req_grad set: an explicit list holding only the parameters that require
#    gradients (what a feature-extraction setup needs);
#  - otherwise: the model's full parameter iterator (finetuning everything).
if req_grad:
    params_to_update = [tensor for _pname, tensor in trainable_params]
else:
    params_to_update = model.parameters()

Params to learn:
	 conv1.weight
	 bn1.weight
	 bn1.bias
	 layer1.0.conv1.weight
	 layer1.0.bn1.weight
	 layer1.0.bn1.bias
	 layer1.0.conv2.weight
	 layer1.0.bn2.weight
	 layer1.0.bn2.bias
	 layer1.1.conv1.weight
	 layer1.1.bn1.weight
	 layer1.1.bn1.bias
	 layer1.1.conv2.weight
	 layer1.1.bn2.weight
	 layer1.1.bn2.bias
	 layer2.0.conv1.weight
	 layer2.0.bn1.weight
	 layer2.0.bn1.bias
	 layer2.0.conv2.weight
	 layer2.0.bn2.weight
	 layer2.0.bn2.bias
	 layer2.0.downsample.0.weight
	 layer2.0.downsample.1.weight
	 layer2.0.downsample.1.bias
	 layer2.1.conv1.weight
	 layer2.1.bn1.weight
	 layer2.1.bn1.bias
	 layer2.1.conv2.weight
	 layer2.1.bn2.weight
	 layer2.1.bn2.bias
	 layer3.0.conv1.weight
	 layer3.0.bn1.weight
	 layer3.0.bn1.bias
	 layer3.0.conv2.weight
	 layer3.0.bn2.weight
	 layer3.0.bn2.bias
	 layer3.0.downsample.0.weight
	 layer3.0.downsample.1.weight
	 layer3.0.downsample.1.bias
	 layer3.1.conv1.weight
	 layer3.1.bn1.weight
	 layer3.1.bn1.bias
	 layer3.1.conv2.weight
	 layer3.1.b

[image_classification/optim_adahessian.py](https://github.com/amirgholami/adahessian/blob/master/image_classification/optim_adahessian.py) on github:


```
"""Implements Adahessian algorithm.
    It has been proposed in `ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning`.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 0.15)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-4)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        hessian_power (float, optional): Hessian power (default: 1). You can also try 0.5. For some tasks we found this to result in better performance.
        single_gpu (Bool, optional): Do you use distributed training or not "torch.nn.parallel.DistributedDataParallel" (default: True)
    """
```
- `hessian_power` is the Hessian power, denoted $k$ in the AdaHessian paper.
- Typical values for `weight_decay` range from 0.1 down to 0.0001.

In [35]:
# ---- Hyper-parameters -------------------------------------------------------
lr=0.001 # default: 0.15
betas = (0.9, 0.999) # default: (0.9, 0.999); only passed to Adahessian
weight_decay = 0.0005 # default: 0
hessian_power = 1 # default: 1; only passed to Adahessian
num_epoch = 3 #360
momentum = 0.95 # 0.95; only passed to SGD

# ---- Optimizer --------------------------------------------------------------
# One of 'adahessian', 'SGD' or 'adam'.
optimizer_name = 'adahessian'

if optimizer_name == 'adahessian':
  optimizer = Adahessian(params_to_update, lr=lr, betas=betas, weight_decay=weight_decay,hessian_power=hessian_power) #set the parameter
elif optimizer_name == 'SGD':
  optimizer = optim.SGD(params_to_update, lr=lr, momentum=momentum, weight_decay=weight_decay)
elif optimizer_name == "adam":
  optimizer = optim.Adam(params_to_update, lr=lr, weight_decay=weight_decay)
else:
  # Fail loudly: otherwise `optimizer` would be undefined, or silently reused
  # from a previous execution of this cell while the file names below change.
  raise ValueError(f"Unknown optimizer_name {optimizer_name!r}; expected 'adahessian', 'SGD' or 'adam'")
criterion = nn.CrossEntropyLoss()

# ---- Learning-rate schedule -------------------------------------------------
# "multilr" multiplies the learning rate by 0.1 at milestones 180 and 270; any
# other name leaves `scheduler` as None.
# NOTE(review): the milestones look tuned for the 360-epoch setting; presumably
# they are never reached when num_epoch is 3 -- confirm how the training helper
# steps the scheduler.
scheduler_name = "multilr"
scheduler = None
if scheduler_name == "multilr":
  scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[180, 270], gamma=0.1)

# ---- Output locations -------------------------------------------------------
# File names encode the data variant and the optimizer so that runs with
# different settings do not overwrite each other.
case_name = "renormalized" if is_renormalized else "unrenormalized"
path= f'./drive/MyDrive/Opti-ML/model_weights/{case_name}_model_{optimizer_name}.pt'
train_loss_path = f'./drive/MyDrive/Opti-ML/loss_and_acc/{case_name}_train_loss_{optimizer_name}.txt'
train_acc_path = f'./drive/MyDrive/Opti-ML/loss_and_acc/{case_name}_train_acc_{optimizer_name}.txt'
test_loss_path = f'./drive/MyDrive/Opti-ML/loss_and_acc/{case_name}_test_loss_{optimizer_name}.txt'
test_acc_path = f'./drive/MyDrive/Opti-ML/loss_and_acc/{case_name}_test_acc_{optimizer_name}.txt'

In [None]:
import os  # local import: only needed here to create the output folders

# Create every output folder before training starts, so that a long run cannot
# fail at the very end because a directory is missing.
# `path` is handed to train_and_test (presumably where it stores the model
# weights -- verify against the helper); the other four are written below.
for out_path in (path, train_loss_path, train_acc_path, test_loss_path, test_acc_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

#check the warning, nothing more on the adahessian git
train_loss, list_testloss, lowest_loss, model_weights, train_acc, test_acc = train_and_test(model,trainloader,testloader,criterion,optimizer,scheduler,num_epoch,path,device)

# Dump each metric history to its own text file, one value per line
# (existing files are overwritten).
metric_files = (
    (train_loss_path, train_loss),
    (train_acc_path, train_acc),
    (test_loss_path, list_testloss),
    (test_acc_path, test_acc),
)
for out_path, values in metric_files:
    with open(out_path, 'w') as fp:
        fp.writelines("%s\n" % item for item in values)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 0/2
----------


In [None]:
# Final evaluation, one sample per batch. The order of the samples does not
# matter for evaluation, so keep it fixed (no shuffling) for a deterministic pass.
# NOTE: this rebinds `testloader`; re-run the DataLoader cell before launching
# another training run so that training uses the batched test loader again.
testloader = DataLoader(test_data, batch_size=1, shuffle=False)
accuracy = test_model(model,testloader,path,device)
print(f"accuracy={accuracy}")

In [None]:
# Overlay the training and test loss histories (one point per entry) on a
# single figure; the legend order matches the plotting order.
for loss_curve in (train_loss, list_testloss):
    plt.plot(loss_curve,'.-')
plt.legend(["train","test"])

plt.xlabel("epoch")
plt.ylabel("loss")

# The title records the run configuration so that the saved figure stands alone.
plt.title(f"Evolution of loss during training: {case_name}, {optimizer_name} \n  accuracy={accuracy} \n lr={lr}, betas={betas}, weight_decay={weight_decay}, hessian_power={hessian_power}")

# Save as PDF under the Drive project folder, named after the data variant and optimizer.
plt.savefig(f"drive/MyDrive/Opti-ML/graphics/{case_name}_{optimizer_name}_loss.pdf")