### Dynamic learning-rate scheduler

This notebook will attempt to write functions with approximately the following structure:

```python
class AdaptiveLROptimizer:

    def __init__(self, model, loss_function, ...):
        # TODO

    def compute_alpha_star(self, direction, data, ...):
        """Compute the optimal alpha_* given the direction `direction` on the data `data`."""
        # TODO

    def apply_step(self, direction, data, ...)
        """Compute the optimal alpha_* for the direction `direction` on the data `data`
        and update the parameters of the model."""
        # TODO
    

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils import parameters_to_vector, vector_to_parameters
# from torch.nn.utils import vector_to_parameters  # this is NOT inverse of params_to_vector
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from warnings import warn
import matplotlib.pyplot as plt
import numpy as np
# from tueplots import axes, bundles, fonts, fontsizes, figsizes  # for consistent plotting

from curvlinops import GGNLinearOperator, HessianLinearOperator
from curvlinops.examples.functorch import functorch_ggn, functorch_hessian
from curvlinops.examples.utils import report_nonclose
from source.plotting import plot_data, get_style

### Custom class: AdaptiveLRoptimizer

In [2]:

class AdaptiveLROptimizer :
    def __init__(self, model, loss_function, optimizer):
        self.model = model
        self.loss_function = loss_function
        self.optimizer = optimizer

    def compute_alpha_star(self, data, target):
        """
        Given data and target, computes the update direction d_unnormalized and the "optimal" step size alpha_star
        """
        # Step 1: Save the current parameters (theta_0)
        theta_0 = parameters_to_vector(self.model.parameters())  # brauche ich hier clone? Lukas fragen.

        # Step 2: Zero the gradients
        self.optimizer.zero_grad()

        # Step 3: Forward pass to compute loss
        output = self.model(data)
        loss = self.loss_function(output, target)

        # Step 4: Backward pass to compute gradients
        loss.backward()

        # Step 5: Prepare data for GGN (Generalized Gauss-Newton)
        Data = [(data, target)]  # Data as expected by GGNLinearOperator
        params = [p for p in self.model.parameters() if p.requires_grad]  # Filter for trainable parameters

        # Assuming GGNLinearOperator is already imported and available
        GGN = GGNLinearOperator(self.model, self.loss_function, params, Data)  # Instantiate GGN operator
        
        # Step 6: Extract gradients and convert to a vector, move to 5
        gradients = parameters_to_vector(param.grad for param in self.model.parameters() if param.grad is not None)

        # Step 7: Perform optimizer step (this will change the parameters temporarily)
        self.optimizer.step()

        # Step 8: Compute the direction of adjustment (d_unnormalized)
        d_unnormalized = parameters_to_vector(self.model.parameters()) - theta_0

        ## Step 9: Revert the model parameters back to the original state (theta_0)
        #for param, original_param in zip(self.model.parameters(), theta_0.split([param.numel() for param in self.model.parameters()])):
        #    param.data.copy_(original_param.data)
        vector_to_parameters(theta_0, self.model.parameters())

        GGNd = GGN @ d_unnormalized.detach().numpy()  # Multiply GGN * d, outputs np array

        # Step 10: Compute alpha_* based on the direction (this is where you would implement your custom logic)
        GGNd_tensor = torch.tensor(GGNd) # from_numpy()
        
        dGGNd = torch.dot(GGNd_tensor, d_unnormalized)
        
        dg = - torch.dot(gradients, d_unnormalized)  # numerator: - d^T*g
        
        alpha_star = dg / dGGNd

        return alpha_star, d_unnormalized
    

    def apply_step(self, alpha_star, direction):
        """
        Apply a step in the direction `direction` with step size `alpha_star`.
        This updates the model parameters by taking a step along the direction of adjustment.
        """
        # Step 1: Scale the direction by alpha_star
        step_direction = alpha_star * direction

        # Step 2: Update the parameters in the direction of `step_direction`
        with torch.no_grad():  # Ensure no gradients are tracked
            for param, step in zip(self.model.parameters(), step_direction.split([param.numel() for param in self.model.parameters()])):
                param.add_(step.view_as(param))  # Apply the update to each parameter

        # No need to return anything, as the model parameters are updated in place



### Test 1: Does the class work? Does the update do the expected?

Evaluated through a simple model and checking the values of direction d, alpha star and the consecutive param update

In [3]:
# Define a simple model
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc = nn.Linear(10, 1)

    def forward(self, x):
        return self.fc(x)

# Define model, loss function, and optimizer
model = SimpleNet()
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)  # Standard optimizer

# Initialize your AdaptiveLROptimizer
adaptive_optimizer = AdaptiveLROptimizer(model, loss_function, optimizer)

# Example input data and target
data = torch.randn(5, 10)  # Batch of 5 samples, each with 10 features
target = torch.randn(5, 1)  # Corresponding target for each sample

# Compute the optimal step size and direction
alpha_star, direction = adaptive_optimizer.compute_alpha_star(data, target)

print("alpha:", alpha_star)
print("direction:", direction)
print("update: d*alpha:", alpha_star*direction)

print("Params before update:")
"""
for name, param in model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"Parameter values: {param.data}")
    print(f"Gradient: {param.grad}")
    print("=" * 50)
"""

print("params as vector:", parameters_to_vector(model.parameters()))

print(" expected updated params:", parameters_to_vector(model.parameters()) + (alpha_star*direction))


# Apply the update step using the computed alpha_star and direction
adaptive_optimizer.apply_step(alpha_star, direction)

print("actual updated params:", parameters_to_vector(model.parameters()))



alpha: tensor(18.2899, grad_fn=<DivBackward0>)
direction: tensor([ 0.0065,  0.0113, -0.0036, -0.0030, -0.0005, -0.0114,  0.0042, -0.0140,
         0.0063,  0.0072, -0.0184], grad_fn=<SubBackward0>)
update: d*alpha: tensor([ 0.1195,  0.2072, -0.0655, -0.0541, -0.0098, -0.2084,  0.0766, -0.2557,
         0.1158,  0.1320, -0.3362], grad_fn=<MulBackward0>)
Params before update:
params as vector: tensor([-0.2978,  0.1916,  0.0600, -0.0790, -0.0622,  0.2537, -0.1593,  0.1321,
        -0.0269,  0.1562, -0.1970], grad_fn=<CatBackward0>)
 expected updated params: tensor([-0.1783,  0.3988, -0.0056, -0.1331, -0.0720,  0.0453, -0.0827, -0.1236,
         0.0889,  0.2882, -0.5332], grad_fn=<AddBackward0>)
actual updated params: tensor([-0.1783,  0.3988, -0.0056, -0.1331, -0.0720,  0.0453, -0.0827, -0.1236,
         0.0889,  0.2882, -0.5332], grad_fn=<CatBackward0>)


### Test 2: One dimensional optimal step

- defining a linear model with one parameter theta = 0, one input x = x, target y = y
- MSE loss and SGD 
- using AdaptiveLROptimizer class for the step

$$
MSE=\frac{1}{N} \sum_{i=1}^n\left(y_i-\hat{y}\right)^2
$$

$\hspace{7cm}$ For $N=1$ :
$$
\begin{aligned}
& MSE=(y-\hat{y})^2 \\
& =(x \cdot \theta-\hat{y})^2 \\
& \frac{\partial M S E}{\partial \theta}=2(x \cdot \theta-\hat{y}) \quad x=0 \\
& \Leftrightarrow \quad 2 x(x \theta-\hat{y})=0 \\
& \Leftrightarrow \quad 2 x^2 \theta-2 x \hat{y}=0 \\
& \Leftrightarrow \quad \theta=\frac{\hat{y}}{x}
\end{aligned}
$$
$\hspace{7cm} \Rightarrow$ After the step $\alpha^* \cdot d  $ we $\operatorname{expect} \theta=\frac{\hat{y}}{x}$

$\hspace{7cm}$ Example: $x=2 ; \hat{y}=5$
$$
\Rightarrow \theta=\frac{5}{2}=2.5
$$

In [None]:
from torch.optim import SGD


def test_adaptive_lr_optimizer():
    # Step 1: Define a simple 1D model
    theta_0 = torch.randn(1)
    theta_old = theta_0.clone()

    class Simple1DModel(nn.Module):
        def __init__(self):
            super(Simple1DModel, self).__init__()
            self.param = nn.Parameter(theta_0)  # Start at w = 0
        
        def forward(self, x):
            return self.param * x  # Linear model: y = w * x

    # Step 2: Use built-in MSELoss, which is a quadratic loss function
    loss_function = nn.MSELoss()


    # Step 3: Set up the model, optimizer, and custom adaptive optimizer
    model = Simple1DModel()
    sgd_optimizer = SGD(model.parameters(), lr=.012)  # Dummy optimizer
    adaptive_optimizer = AdaptiveLROptimizer(model, loss_function, sgd_optimizer)

    # Step 4: 
    target = torch.randn(1) 
    x = torch.randn(1)  # Input (doesn't matter in 1D)
    print("theta before optimizer:", model.param.item())
    # Step 5: Use the AdaptiveLROptimizer to compute alpha_star and direction
    alpha_star, direction = adaptive_optimizer.compute_alpha_star(x, target)
    print(" theta_0 =", model.param.item())
    print("input x =", x.item())
    print("target =", target.item())
    print(f"Computed alpha_star: {alpha_star.item()}")
    print(f"Direction: {direction.item()}")
    print("theta_1 = theta_0 + d * alpha_star =", theta_old.item() + direction.item() * alpha_star.item())


    # Step 6: Apply the step using AdaptiveLROptimizer
    adaptive_optimizer.apply_step(alpha_star, direction)
    
    # Step 7: 
    theta_target = target / x  # Analytical solution for the optimal parameter value
    print(f"Updated parameter value: {model.param.item()}")
    print(f"Target value theta1: {theta_target}")

    assert torch.isclose(model.param, theta_target , atol=1e-6), "The parameter did not step to the right value!"
    print("Test passed: Parameter stepped to: ", model.param.item(), " Analytically it should be:", theta_target.item())

    # Assert that the parameter is now close to the target minimum  
    assert torch.isclose(model(x), target, atol=1e-6), "The parameter model fit is not good enough (should be perfect)!"
    print("Test passed: Model predicts", model(x).item(), "The target is:", target.item())

    assert torch.isclose(loss_function(model(x), target), torch.tensor(0, dtype=torch.float32), atol=1e-6), "The loss is not very close to 0"
    print("Test passed: Loss is :", loss_function(model(x), target).item())

# Run the test
test_adaptive_lr_optimizer()



theta before optimizer: 0.35794949531555176
 theta_0 = 0.35794949531555176
input x = -1.2171871662139893
target = -0.8010745644569397
Computed alpha_star: 28.12376594543457
Direction: 0.010673761367797852
theta_1 = theta_0 + d * alpha_star = 0.6581358617809201
Updated parameter value: 0.6581358909606934
Target value theta1: tensor([0.6581])
Test passed: Parameter stepped to:  0.6581358909606934  Analytically it should be: 0.6581358909606934
Test passed: Model predicts -0.8010745644569397 The target is: -0.8010745644569397
Test passed: Loss is : 0.0


### Update_params method for AlgoPerf

Implement the submission function update_params for testing the algorithm on AlgoPerf using the update by class AdaptiveLROptimizer


```python
def update_params(
    workload: Workload,
    current_param_container: ParameterContainer,
    current_params_types: ParameterTypeTree,
    model_state: ModelAuxiliaryState,
    hyperparameters: Hyperparameters,
    batch: Dict[str, Tensor],
    loss_type: LossType,
    optimizer_state: OptimizerState,
    eval_results: List[Tuple[int, float]],
    global_step: int,
    rng: RandomState
) -> (updated_optimizer_state, updated_variables, updated_model_state)
```

- `current_param_container` is the same kind of nested structure as used by `model_fn` which constitutes a nested collection of `float32` arrays, each endowed with information about what kind of parameter that array represents stored in a parallel structure of `current_params_types`.
  - Parameter kind is one of {"weights", "biases", "embeddings", "conv", "batch norm"}.
- `model_state` holds auxiliary state necessary for some models, such as the current batch norm statistics.
- The loss function will be one of a small set of known possibilities and the update function is allowed to branch on the `loss_type` enum/name.
- The `loss_fn` produces a loss per example and a summed loss (both only for one device), which both can be used.
- Allowed to update state for the optimizer.
- Uses the `model_fn` of the `workload` in order to decouple the loss from the model so that model outputs (forward passes) can be reused (by storing them in the optimizer state).
- The submission can access the target evaluation metric via the `workload` variable.
- **A call to this function will be considered a step**
  - The time between a call to this function and the next call to this function will be considered the per-step time.
- Cannot modify the given hyperparameters in a workload-conditional way (please see the [Valid submission](#valid-submissions) section). This rule is intended to prohibit circumventing the tuning rules by looking up a pre-tuned optimal set of hyperparameters for each workload. It is not intended to prohibit line searches and other similar techniques.
- The fixed `init_model_fn` can optionally be called during training, for example, to reinitialize the model after a failed training effort.
- Cannot replace the model parameters with pre-trained ones.
- This API supports Polyak averaging and similar methods that implement moving averages of model parameters.
- Batch norm should work here because the `model_fn` will return updated batch norm moving averages when it is told to with `update_batch_norm`.




In [6]:
# TO DO
# Need to fork AlgoPerf next to make this work. 
# 


from typing import Dict, List, Tuple  # For type hinting
import torch  # For tensor operations and optimizers
from torch import Tensor  # For the Tensor type
# from your_module import Workload, ParameterContainer, ParameterTypeTree, ModelAuxiliaryState, Hyperparameters, LossType, OptimizerState  # Adjust according to your project structure
# from your_optimizer_module import AdaptiveLROptimizer  # Adjust to import your custom optimizer

def update_params(
    workload: Workload,
    current_param_container: ParameterContainer,
    current_params_types: ParameterTypeTree,
    model_state: ModelAuxiliaryState,
    hyperparameters: Hyperparameters,
    batch: Dict[str, Tensor],
    loss_type: LossType,
    optimizer_state: OptimizerState,
    eval_results: List[Tuple[int, float]],
    global_step: int,
    rng): # rng: RandomState
    
    # Step 1: Initialize your model and loss function
    model = current_param_container.model  # Assuming model is part of your ParameterContainer
    loss_function = loss_type  # This should be defined based on your context

    # Step 2: Create an instance of the AdaptiveLROptimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=hyperparameters.learning_rate)
    adaptive_optimizer = AdaptiveLROptimizer(model, loss_function, optimizer)

    # Step 3: Get the data and target from the batch
    data = batch['data']
    target = batch['target']

    # Step 4: Compute alpha_star and the direction d
    alpha_star, direction = adaptive_optimizer.compute_alpha_star(data, target)

    # Step 5: Apply the update step using the computed values
    adaptive_optimizer.apply_step(alpha_star, direction)

    # Step 6: Update optimizer state if needed
    # (This may depend on how you manage the optimizer state in your framework)

    # Step 7: Return updated states
    updated_optimizer_state = optimizer_state  # Adjust as necessary based on your framework
    updated_variables = current_param_container.variables  # Adjust if necessary
    updated_model_state = model_state  # Update as needed if you modify the model state

    return updated_optimizer_state, updated_variables, updated_model_state


NameError: name 'Workload' is not defined