In [5]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable


In [120]:
class AdaRELU(Optimizable):

    def __init__(self, positive_slope, negative_slope, optimizer):

        self.parameters = {'positive_slope': torch.tensor(positive_slope, requires_grad=True),
                           'negative_slope': torch.tensor(negative_slope, requires_grad=True)}
        self.optimizer = optimizer
        self.all_params_with_gradients = [self.parameters['positive_slope'], self.parameters['negative_slope']]

        super().__init__(self.parameters, optimizer)

    def __call__(self, input):
        output = torch.where(input >= 0, input * self.parameters['positive_slope'], input * self.parameters['negative_slope'])
        return output
    
    def step(self):
        self.optimizer.step(self.parameters)

adaRELU = AdaRELU(1., 0.01, gdtuo.SGD(alpha = 0.001))
adaRELU.initialize()

x = torch.tensor([[1., 2., 3.], [7., 12., 8.], [-1., -3., -2000.]], requires_grad=True)
y = torch.tensor([[1.5, 3., 4.5], [10.5, 18., 12.], [-.01, -.03, -20.]], requires_grad=True)

criterion = nn.MSELoss()

In [121]:
for i in range(1000):

    adaRELU.begin()
    adaRELU.zero_grad()


    y_hat = adaRELU(x)
    loss = criterion(y_hat, y)
    loss.backward()
    adaRELU.step()

In [122]:
adaRELU.parameters['positive_slope'], adaRELU.parameters['negative_slope']

(tensor(1.5000, grad_fn=<SubBackward0>),
 tensor(0.0100, grad_fn=<SubBackward0>))

In [160]:
class MNIST_FullyConnected(nn.Module):
    """
    A fully-connected NN for the MNIST task. This is Optimizable but not itself
    an optimizer.
    """
    def __init__(self, num_inp, num_hid, num_out, adaRELU):
        super(MNIST_FullyConnected, self).__init__()
        self.layer1 = nn.Linear(num_inp, num_hid)
        self.layer2 = nn.Linear(num_hid, num_out)
        self.adaRELU = adaRELU

    def initialize(self):
        nn.init.kaiming_uniform_(self.layer1.weight, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.layer2.weight, a=math.sqrt(5))

    def forward(self, x):
        """Compute a prediction."""
        x = self.layer1(x)
        x = self.adaRELU(x)    ## we want that changed
        x = self.layer2(x)
        x = torch.tanh(x)
        x = F.log_softmax(x, dim=1)
        return x



BATCH_SIZE = 256
EPOCHS = 5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

mnist_train = torchvision.datasets.MNIST('./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_test = torchvision.datasets.MNIST('./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=10000, shuffle=False)

adaRELU_MNIST = AdaRELU(1., 0.01, gdtuo.SGD(alpha = 0.001))
#optim = gdtuo.SGD(alpha=0.0769)
model = MNIST_FullyConnected(28 * 28, 128, 10, adaRELU_MNIST).to(DEVICE)
optim = torch.optim.SGD(model.parameters(), lr=0.000769)

#mw = gdtuo.ModuleWrapper(model, optimizer=optim)
adaRELU_MNIST.initialize()
#mw.initialize()

cpu


In [163]:
init_time = time.time()
EPOCHS = 3
for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    for j, (features_, labels_) in enumerate(dl_train):
        #mw.begin() # call this before each step, enables gradient tracking on desired params
        adaRELU_MNIST.begin()
        features, labels = torch.reshape(features_, (-1, 28 * 28)).to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = F.nll_loss(pred, labels)
        #mw.zero_grad()
        adaRELU_MNIST.zero_grad()
        loss.backward(create_graph=True) # important! use create_graph=True
        #mw.step()
        optim.step()
        adaRELU_MNIST.step()
        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
print("Time taken: {}".format(time.time() - init_time))

EPOCH: 1, TRAIN LOSS: 1.0298290298461914, ACC: 0.8464166666666667
EPOCH: 2, TRAIN LOSS: 1.0173378355026246, ACC: 0.8492333333333333
EPOCH: 3, TRAIN LOSS: 1.017896891816457, ACC: 0.84185
Time taken: 12.62617301940918


In [164]:
model.adaRELU.parameters['positive_slope'], model.adaRELU.parameters['negative_slope']

(tensor(1.0327, grad_fn=<SubBackward0>),
 tensor(-0.0065, grad_fn=<SubBackward0>))

In [158]:
class MNIST_FullyConnected(nn.Module):
    """
    A fully-connected NN for the MNIST task. This is Optimizable but not itself
    an optimizer.
    """
    def __init__(self, num_inp, num_hid, num_out):
        super(MNIST_FullyConnected, self).__init__()
        self.layer1 = nn.Linear(num_inp, num_hid)
        self.layer2 = nn.Linear(num_hid, num_out)

    def initialize(self):
        nn.init.kaiming_uniform_(self.layer1.weight, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.layer2.weight, a=math.sqrt(5))

    def forward(self, x):
        """Compute a prediction."""
        x = self.layer1(x)
        x = F.relu(x)    ## we want that changed
        x = self.layer2(x)
        x = torch.tanh(x)
        x = F.log_softmax(x, dim=1)
        return x



BATCH_SIZE = 256
EPOCHS = 3
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

mnist_train = torchvision.datasets.MNIST('./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
mnist_test = torchvision.datasets.MNIST('./data', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=10000, shuffle=False)


model = MNIST_FullyConnected(28 * 28, 128, 10).to(DEVICE)
optim = torch.optim.SGD(model.parameters(), lr=0.000769)



cpu


In [162]:
init_time = time.time()
EPOCHS = 3
for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    for j, (features_, labels_) in enumerate(dl_train):
        #mw.begin() # call this before each step, enables gradient tracking on desired params
        features, labels = torch.reshape(features_, (-1, 28 * 28)).to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = F.nll_loss(pred, labels)
        #mw.zero_grad()
        loss.backward(create_graph=True) # important! use create_graph=True
        #mw.step()
        optim.step()
        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
print("Time taken: {}".format(time.time() - init_time))

EPOCH: 1, TRAIN LOSS: 1.0661608177185058, ACC: 0.8587833333333333
EPOCH: 2, TRAIN LOSS: 1.0500820800145467, ACC: 0.8460166666666666
EPOCH: 3, TRAIN LOSS: 1.0420895022074381, ACC: 0.8225166666666667
Time taken: 14.543699741363525
