<img src="../rsag_convex.png" alt="algoconvex" />
<img src="../x_update.png" alt="x_update" />
<img src="../mean.png" alt="mean" />
<img src="../rsag_composite.png" alt="algo" />

__Parameters :__
- $\alpha$: (1-$\alpha$) weight of aggregated x on current state, i.e. momentum
- $\lambda$: learning rate
- $\beta$: change for aggregated x
- $p_k$ termination probability



In [61]:
from torch.optim.optimizer import Optimizer, required
import torch
import copy
from torch.nn import functional as F
from torch import nn
import numpy as np

In [56]:
import torchvision.datasets as dsets
from torchvision.transforms import ToTensor
from torch.utils.data.sampler import SubsetRandomSampler

import torch.utils.data as data_utils
torch.manual_seed(42)

<torch._C.Generator at 0x23aab3208f0>

In [20]:
print('Using PyTorch version:', torch.__version__)
if torch.cuda.is_available():
    print('Using GPU, device name:', torch.cuda.get_device_name(0))
    device = torch.device('cuda')
else:
    print('No GPU found, using CPU instead.') 
    device = torch.device('cpu')

Using PyTorch version: 2.1.2+cu121
Using GPU, device name: NVIDIA GeForce GTX 1660 Ti


In [3]:
import path
import sys
sys.path.append('../')
from models import MLP

In [43]:
class RSAG(Optimizer):
    r"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate (lambda) (required)
        kappa (float): lambda  (default: 1000)
        xi (float, optional): statistical advantage parameter (default: 10)
        smallConst (float, optional): any value <=1 (default: 0.7)
    Example:
        >>> from RSAG import *
        >>> optimizer = RSAG(model.parameters(), lr=0.1, kappa = 1000.0, xi = 10.0)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()
    """

    def __init__(self, params, lr=0.01, alpha = 0.1, beta = 0.1): #, smallConst = 0.7, weight_decay=0):
        #defaults = dict(lr=lr, kappa=kappa, xi, smallConst=smallConst,
                        # weight_decay=weight_decay)
        defaults = dict(lr=lr, alpha=alpha, beta=beta)
        super(RSAG, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RSAG, self).__setstate__(state)

    def step(self, closure=None):
        """ Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            # weight_decay = group['weight_decay']
            lr = group['lr']
            alpha, beta = group['alpha'], group['beta']
            alpha_bar = 1.0-alpha
            momentum_buffer_list = []

            # INITIALIZE GROUPS
            # params_with_grad, d_p_list, momentum_buffer_list = [], [], []
            # for p in group['params']:
            #     if p.grad is not None:
            #         params_with_grad.append(p)b
            #         d_p_list.append(p.grad)
            #         # if p.grad.is_sparse:
            #         #     has_sparse_grad = True

            #         state = self.state[p]
            #         if 'momentum_aggr' not in state:
            #             momentum_buffer_list.append(None)
            #         else:
            #             momentum_buffer_list.append(state['momentum_buffer'])
            
            # UPDATE GROUPS
            for p in group['params']:
                if p.grad is None:
                    continue

                d_w = p.grad.data
                # w = p.data
                param_state = self.state[p]

                # if weight_decay != 0:
                #     grad_d.add_(weight_decay, p.data)
                
                if 'momentum_aggr' not in param_state:
                    param_state['momentum_aggr'] = copy.deepcopy(p.data)
                    param_state['prev_momentum_aggr'] = copy.deepcopy(p.data)
                buf = param_state['momentum_aggr']
                aggr_grad = (buf-param_state['prev_momentum_aggr'])
                aggr_grad.mul_(alpha_bar)
                aggr_grad.add_(d_w, alpha=alpha)
                
                param_state['prev_momentum_aggr'] = copy.deepcopy(buf)
                
                # Update momentum buffer:'
                buf.mul_(alpha_bar)
                buf.add_(p.data, alpha=alpha)
                buf.add_(aggr_grad, alpha=-beta)
                
                p.data.add_(aggr_grad, alpha=-lr)
                # print('aggr_grad', aggr_grad)
            
            # UPDATE MOMENTUM BUFFER
            # for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list):
            #     state = self.state[p]
                
            #     state['momentum_buffer'] = momentum_buffer

        return loss

## Run

In [10]:
train_data = dsets.MNIST(root = './data', train = True, transform = ToTensor(), download = True)
test_data = dsets.MNIST(root = './data', train = False, transform = ToTensor())

In [6]:
# train_data.targets = F.one_hot(train_data.targets)
# indices = torch.arange(10000)
# train_data = data_utils.Subset(train_data, indices)


In [58]:
train_sampler, valid_sampler = get_train_valid_sampler(train_data)
loaders = {}
loaders['train'] = torch.utils.data.DataLoader(train_data, batch_size=100, num_workers=1, sampler=train_sampler)
loaders['valid'] = torch.utils.data.DataLoader(train_data, batch_size=100, num_workers=1, sampler=valid_sampler)
loaders['test'] = torch.utils.data.DataLoader(test_data, batch_size=100, shuffle=True, num_workers=1)

In [28]:
model = MLP().to(device)
print(model)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = RSAG(model.parameters(), lr=1e-4, alpha=1e-2, beta=.1)
# optimizer = torch.optim.Adagrad(mlp.parameters(), lr=1e-4)

MLP(
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [23]:
def calc_accuracy(y_pred, labels):
    predicted_digits = y_pred.argmax(1)                            # pick digit with largest network output
    correct_ones = (predicted_digits == labels).type(torch.float)  # 1.0 for correct, 0.0 for incorrect
    return correct_ones.sum().item()

In [None]:
def run_training(model, loaders, optimizer, loss_function, device, epochs=5, experiment=None):
    log = {}
    log['loss'], log['accuracy'] = [], []
    log['v_loss'], log['v_accuracy'] = [], []
    
    log['v_loss_std'] = []
    log['v_accuracy_std'] = []

    
    log['loss_std'] = []
    log['accuracy_std'] = []

    for epoch in range(0,epochs):
        print(f'Starting Epoch {epoch+1}')

        current_loss, total_acc = [], []
        v_loss, v_acc = [], []

        for data, targets in loaders['train']:
            # inputs, targets = data
            # inputs, targets = inputs.float(), targets.float()
            # targets = targets.reshape((targets.shape[0], 1))
            
            # Copy data and targets to GPU
            data = data.to(device)
            targets = targets.to(device)
            
            optimizer.zero_grad()

            outputs = model(data)

            # Calculate the loss
            loss = loss_function(outputs, targets)
            # current_loss += loss

            # Backpropagation
            loss.backward()
            optimizer.step()

            current_loss.append(loss.item())
            total_acc.append(calc_accuracy(outputs, targets))
            
        # Validation
        model.eval()

        for data, targets in loaders['valid']:
            data = data.to(device)
            targets = targets.to(device)
            
            outputs = model(data)

            loss = loss_function(outputs, targets)
            v_loss.append(loss.item())
            v_acc.append(calc_accuracy(outputs, targets))
            
        experiment.log_metric('v;loss', np.mean(current_loss), epoch)
        

            # if i%10 == 0:
            #     print(f'Loss after mini-batch %5d: %.3f'%(i+1, current_loss/500))
            #     current_loss = 0.0

        print(f'Epoch {epoch+1} finished')
        # current_loss /= len(loaders['train'])
        # total_acc /= len(loaders['train'])
        # print('loss {:.4f}'.format(current_loss))
        # print('Accuracy:  {:.4f}'.format(total_acc))
        
        log['loss_std'].append(np.std(current_loss))
        log['accuracy_std'].append(np.std(total_acc))

        current_loss = sum(current_loss)/len(loaders['train'])
        total_acc = sum(total_acc)/len(loaders['train'])
        log['loss'].append(current_loss)
        log['accuracy'].append(total_acc)

        
        log['v_loss_std'].append(np.std(v_loss))
        log['v_accuracy_std'].append(np.std(v_acc))
        v_loss = sum(v_loss)/len(loaders['valid'])
        v_acc = sum(v_acc)/len(loaders['valid'])
        log['v_loss'].append(v_loss)
        log['v_accuracy'].append(v_acc)


    experiment.log_metrics({"accuracy": log['v_accuracy'], "loss": log['v_loss']}, epoch=epoch)

    print("Training has completed")
    return log

In [63]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

In [78]:
loss_function = torch.nn.CrossEntropyLoss()

lr_values = [1, .01, .001, .0001 , .00001]
alpha_values = [.9, .75, .5, .25,.1]
best_alpha, best_lr = 0.0, 0.0
best_accuracy = 0.0
for alpha in alpha_values:
        for lr in lr_values:
            experiment = Experiment(
                api_key="tP7gJL6gsW7R9Mu1CsUKqc022",
                project_name="general",
                workspace="clemenceg"
            )


            beta = lr*alpha
            
            print(f"----------- Training with alpha={alpha}, lr={lr} -----------------")
            # Update optimizer with new hyperparameters
            # model = MLP().to(device)
            model = MLP()
            # model = nn.DataParallel(model, device_ids=[0])
            model = model.to(device)

            # Report multiple hyperparameters using a dictionary:
            hyper_params = {
            "alpha": alpha,
            "lr": lr,
            "beta": beta,
            }   
            experiment.log_parameters(hyper_params)



            
            optimizer = RSAG(model.parameters(), lr=lr, alpha=alpha, beta=beta)
            log = run_training(model, loaders, optimizer, loss_function, device, epochs=5, experiment=experiment)
            # Seamlessly log your Pytorch model
            log_model(model, "TheModel", model)
            experiment.end()


            if log['v_accuracy'][-1] > best_accuracy:
                print(f"Found a new best accuracy: {log['v_accuracy'][-1]}")
                print(f"best alpha: {alpha}, best lr: {lr}")
                best_accuracy = log['v_accuracy'][-1]
                best_alpha = alpha
                best_lr = lr

            # Create a filename based on hyperparameters
            filename = f"../logs/results_lr_{lr}_alpha_{alpha}.csv"
            with open(filename, 'w') as f:
                f.write(f"loss,{log['loss']}\n" ) 
                f.write(f"accuracy,{log['accuracy']}\n" )
                f.write(f"loss,{log['v_loss']}\n" ) 
                f.write(f"accuracy,{log['v_accuracy']}\n" )

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/clemenceg/general/6ec8447465bb4341a06910af29ae6cd4
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     accuracy [3] : (96.00833333333334, 96.83333333333333)
[1;38;5;39mCOMET INFO:[0m     loss [3]     : (0.10292138303630054, 0.1331247233785689)
[1;38;5;39mCOMET INFO:[0m     v;loss [3]   : (0.07695864576962777, 0.42371859843066584)
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     alpha : 0.9
[1;38;5;39mCOMET INFO:[0m     beta  : 0.9
[1;38;5;39mCOMET INFO:[0m     lr  

----------- Training with alpha=0.9, lr=1 -----------------
Starting Epoch 1
Epoch 1 finished
Starting Epoch 2
Epoch 2 finished
Starting Epoch 3
Epoch 3 finished
Starting Epoch 4
Epoch 4 finished
Starting Epoch 5
Epoch 5 finished
Training has completed


AttributeError: 'DataParallel' object has no attribute 'id'

In [46]:

optimizer = RSAG(model.parameters(), lr=1e-4, alpha=.9, beta=9e-5)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, nesterov=True, momentum=0.9)
run_training(model, loaders, optimizer, loss_function, device, epochs=5)

Starting Epoch 1
Epoch 1 finished
loss 4.6072
Accuracy:  7.4900
Starting Epoch 2
Epoch 2 finished
loss 4.6027
Accuracy:  8.8633
Starting Epoch 3
Epoch 3 finished
loss 4.5982
Accuracy:  10.3017
Starting Epoch 4
Epoch 4 finished
loss 4.5937
Accuracy:  11.6917
Starting Epoch 5
Epoch 5 finished
loss 4.5893
Accuracy:  13.1383
Training has completed
