In [3]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
import numpy as np
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image

In [4]:
class AdaRELU(Optimizable):

    def __init__(self, positive_slope, negative_slope, optimizer):

        self.parameters = {'positive_slope': torch.tensor(positive_slope, requires_grad=True),
                           'negative_slope': torch.tensor(negative_slope, requires_grad=True)}
        self.optimizer = optimizer
        self.all_params_with_gradients = [self.parameters['positive_slope'], self.parameters['negative_slope']]
        
        super().__init__(self.parameters, optimizer)

    def __call__(self, input):
        output = torch.where(input >= 0, input * self.parameters['positive_slope'], input * self.parameters['negative_slope'])
        return output
    
    def step(self):
        self.optimizer.step(self.parameters)

adaRELU = AdaRELU(1., 0.01, gdtuo.SGD(alpha = 0.001))
adaRELU.initialize()

x = torch.tensor([[1., 2., 3.], [7., 12., 8.], [-1., -3., -2000.]], requires_grad=True)
y = torch.tensor([[1.5, 3., 4.5], [10.5, 18., 12.], [-.01, -.03, -20.]], requires_grad=True)

criterion = nn.MSELoss()

In [5]:
for i in range(1000):

    adaRELU.begin()
    adaRELU.zero_grad()


    y_hat = adaRELU(x)
    loss = criterion(y_hat, y)
    loss.backward()
    adaRELU.step()

In [6]:
adaRELU.parameters['positive_slope'], adaRELU.parameters['negative_slope']

(tensor(1.5000, grad_fn=<SubBackward0>),
 tensor(0.0100, grad_fn=<SubBackward0>))

In [7]:
class MNIST_CNN(nn.Module):
    def __init__(self, adaRELU):
        super(MNIST_CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(12544, 128)  # Adjusted input dimensions
        self.fc2 = nn.Linear(128, 10)
        self.adaRELU = adaRELU

    def forward(self, x):
        x = self.conv1(x)
        x = self.adaRELU(x)
        x = self.conv2(x)
        x = self.adaRELU(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = self.adaRELU(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output



BATCH_SIZE = 128
EPOCHS = 5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

mnist_train = torchvision.datasets.MNIST('./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_test = torchvision.datasets.MNIST('./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=BATCH_SIZE, shuffle=False)

adaRELU_MNIST = AdaRELU(1., 0.01, gdtuo.SGD(alpha = 0.001))
#optim = gdtuo.SGD(alpha=0.0769)
model = MNIST_CNN(adaRELU_MNIST).to(DEVICE)
optim = torch.optim.SGD(model.parameters(), lr=0.000769)

#mw = gdtuo.ModuleWrapper(model, optimizer=optim)
adaRELU_MNIST.initialize()
#mw.initialize()

cuda


In [8]:
init_time = time.time()
EPOCHS = 20
negative_slopes_encountered = [model.adaRELU.parameters['negative_slope'].item()]
positive_slopes_encountered = [model.adaRELU.parameters['positive_slope'].item()]
for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    for j, (features_, labels_) in enumerate(dl_train):
        #mw.begin() # call this before each step, enables gradient tracking on desired params
        adaRELU_MNIST.begin()
        adaRELU_MNIST.zero_grad()
        optim.zero_grad()
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = F.nll_loss(pred, labels)
        #mw.zero_grad()

        loss.backward(create_graph=True) # important! use create_graph=True
        #mw.step()
        optim.step()
        adaRELU_MNIST.step()
        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
    
    negative_slopes_encountered.append(model.adaRELU.parameters['negative_slope'].item())
    positive_slopes_encountered.append(model.adaRELU.parameters['positive_slope'].item())
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
print("Time taken: {}".format(time.time() - init_time))

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


EPOCH: 1, TRAIN LOSS: 2.2784707703908285, ACC: 0.16443333333333332
EPOCH: 2, TRAIN LOSS: 2.148403410847982, ACC: 0.40336666666666665
EPOCH: 3, TRAIN LOSS: 1.3096345356941224, ACC: 0.6470333333333333
EPOCH: 4, TRAIN LOSS: 0.6349108519236246, ACC: 0.8056833333333333
EPOCH: 5, TRAIN LOSS: 0.47963878558476764, ACC: 0.8546166666666667
EPOCH: 6, TRAIN LOSS: 0.41281561846733095, ACC: 0.87565
EPOCH: 7, TRAIN LOSS: 0.37324214283625284, ACC: 0.8898166666666667
EPOCH: 8, TRAIN LOSS: 0.34178474605878195, ACC: 0.89905
EPOCH: 9, TRAIN LOSS: 0.32071906592051186, ACC: 0.90605
EPOCH: 10, TRAIN LOSS: 0.30546625185012816, ACC: 0.90925
EPOCH: 11, TRAIN LOSS: 0.28854260341326393, ACC: 0.9147
EPOCH: 12, TRAIN LOSS: 0.2780913937807083, ACC: 0.91835
EPOCH: 13, TRAIN LOSS: 0.2653726577281952, ACC: 0.92235
EPOCH: 14, TRAIN LOSS: 0.255489746538798, ACC: 0.9246666666666666
EPOCH: 15, TRAIN LOSS: 0.25042732381820676, ACC: 0.9268833333333333
EPOCH: 16, TRAIN LOSS: 0.24121063443024954, ACC: 0.92965
EPOCH: 17, TRAIN 

In [9]:
for i in range(len(negative_slopes_encountered)):
    x = np.linspace(-10, 10, 100)
    y = np.where(x >= 0, x * positive_slopes_encountered[i], x * negative_slopes_encountered[i])
    fig, ax = plt.subplots()
    ax.plot(x, y)
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')
    ax.set_title('slopes: {:.3f}, {:.3f}'.format(positive_slopes_encountered[i], negative_slopes_encountered[i]))

    ax.set_xlim([-10, 10])
    ax.set_ylim([-3, 10])

    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/{}.png'.format(i))
    plt.close()

video = './adaRELU.mp4'
imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(negative_slopes_encountered))], fps = 3)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(negative_slopes_encountered))], fps = 3)


In [10]:
class MNIST_CNN(nn.Module):
    def __init__(self):
        super(MNIST_CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(12544, 128)  # Adjusted input dimensions
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output



BATCH_SIZE = 256
EPOCHS = 20
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

mnist_train = torchvision.datasets.MNIST('./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_test = torchvision.datasets.MNIST('./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=256, shuffle=False)


model = MNIST_CNN().to(DEVICE)
optim = torch.optim.SGD(model.parameters(), lr=0.000769)

cuda


In [12]:
init_time = time.time()
EPOCHS = 10
for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    for j, (features_, labels_) in enumerate(dl_train):
        #mw.begin() # call this before each step, enables gradient tracking on desired params
        optim.zero_grad()

        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = F.nll_loss(pred, labels)
        #mw.zero_grad()
        loss.backward(create_graph=True) # important! use create_graph=True
        #mw.step()
        optim.step()
        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
print("Time taken: {}".format(time.time() - init_time))

EPOCH: 1, TRAIN LOSS: 0.8933082212766011, ACC: 0.7489166666666667
EPOCH: 2, TRAIN LOSS: 0.7898162660280863, ACC: 0.77415
EPOCH: 3, TRAIN LOSS: 0.7145514055887858, ACC: 0.7914
EPOCH: 4, TRAIN LOSS: 0.6597309016545614, ACC: 0.8084
EPOCH: 5, TRAIN LOSS: 0.6211429214159647, ACC: 0.81885
EPOCH: 6, TRAIN LOSS: 0.5878936322530111, ACC: 0.82785
EPOCH: 7, TRAIN LOSS: 0.5605762076377868, ACC: 0.8369
EPOCH: 8, TRAIN LOSS: 0.5366340565681458, ACC: 0.8448
EPOCH: 9, TRAIN LOSS: 0.5195656455993652, ACC: 0.8480333333333333
EPOCH: 10, TRAIN LOSS: 0.5052648693084717, ACC: 0.8531666666666666
Time taken: 49.096388816833496
