In [1]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
import numpy as np
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

class CNN(nn.Module):
    def __init__(self, adaGeLU1, adaGeLU2, adaGeLU3):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(16384, 128)  # Adjusted input dimensions
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(3)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm1d(16384)

        self.adaGeLU1 = adaGeLU1
        self.adaGeLU2 = adaGeLU2
        self.adaGeLU3 = adaGeLU3
        self.dict_stats = {}
        self.gather_stats = False

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)

        x = self.adaGeLU1(x)

        x = self.bn2(x)
        x = self.conv2(x)

        x = self.adaGeLU2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)

        x = self.bn3(x)
        x = self.fc1(x)

        x = self.adaGeLU3(x)
        x = self.dropout2(x)
        x = self.fc2(x)

        output = F.log_softmax(x, dim=1)
        return output

BATCH_SIZE = 256
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

dataset_train = torchvision.datasets.CIFAR10('./data_cifar', train=True, download=True, transform=torchvision.transforms.ToTensor())
dataset_test = torchvision.datasets.CIFAR10('./data_cifar', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(dataset_test, batch_size=256, shuffle=False)

cuda
Files already downloaded and verified
Files already downloaded and verified


In [2]:
class adaGeLU(Optimizable):

    def __init__(self, optimizer):

        self.parameters = {'direct_coef': torch.tensor(0.5, requires_grad=True),
                           'sqr_coef': torch.tensor(np.sqrt(2/np.pi), requires_grad=True),
                           'add_coef': torch.tensor(1., requires_grad=True)}
        self.optimizer = optimizer
        self.all_params_with_gradients = [self.parameters['direct_coef'], self.parameters['sqr_coef'], self.parameters['add_coef']]
        
        super().__init__(self.parameters, optimizer)

    def __call__(self, input):
        output = self.parameters['direct_coef'] * input * (self.parameters['add_coef'] + F.tanh(self.parameters['sqr_coef'] * (input+ 0.044715*input**3)))
        return output
    
    def step(self):
        self.optimizer.step(self.parameters)

adaGeLU1 = adaGeLU(gdtuo.Adam(alpha = 0.001))
adaGeLU2 = adaGeLU(gdtuo.Adam(alpha = 0.001))
adaGeLU3 = adaGeLU(gdtuo.Adam(alpha = 0.001))
#optim = gdtuo.SGD(alpha=0.0769)
model = CNN(adaGeLU1, adaGeLU2, adaGeLU3).to(DEVICE)
optim = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

#mw = gdtuo.ModuleWrapper(model, optimizer=optim)
adaGeLU1.initialize()
adaGeLU2.initialize()
adaGeLU3.initialize()
#mw.initialize()

In [3]:
init_time = time.time()
EPOCHS = 10
direct_coef1 = [adaGeLU1.parameters['direct_coef'].item()]
sqr_coef1 = [adaGeLU1.parameters['sqr_coef'].item()]
add_coef1 = [adaGeLU1.parameters['add_coef'].item()]
direct_coef2 = [adaGeLU2.parameters['direct_coef'].item()]
sqr_coef2 = [adaGeLU2.parameters['sqr_coef'].item()]
add_coef2 = [adaGeLU2.parameters['add_coef'].item()]
direct_coef3 = [adaGeLU3.parameters['direct_coef'].item()]
sqr_coef3 = [adaGeLU3.parameters['sqr_coef'].item()]
add_coef3 = [adaGeLU3.parameters['add_coef'].item()]

for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    for j, (features_, labels_) in enumerate(dl_train):
        #mw.begin() # call this before each step, enables gradient tracking on desired params
        adaGeLU1.begin()
        adaGeLU2.begin()
        adaGeLU3.begin()
        adaGeLU1.zero_grad()
        adaGeLU2.zero_grad()
        adaGeLU3.zero_grad()
        optim.zero_grad()
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = criterion(pred, labels)
        #mw.zero_grad()

        loss.backward(create_graph=True) # important! use create_graph=True
        #mw.step()
        optim.step()
        adaGeLU1.step()
        adaGeLU2.step()
        adaGeLU3.step()

        adaGeLU2.parameters['direct_coef'].clamp_(0.1, 1)

        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()

        if j%50 == 0:
            direct_coef1.append(adaGeLU1.parameters['direct_coef'].item())
            sqr_coef1.append(adaGeLU1.parameters['sqr_coef'].item())
            add_coef1.append(adaGeLU1.parameters['add_coef'].item())
            direct_coef2.append(adaGeLU2.parameters['direct_coef'].item())
            sqr_coef2.append(adaGeLU2.parameters['sqr_coef'].item())
            add_coef2.append(adaGeLU2.parameters['add_coef'].item())
            direct_coef3.append(adaGeLU3.parameters['direct_coef'].item())
            sqr_coef3.append(adaGeLU3.parameters['sqr_coef'].item())
            add_coef3.append(adaGeLU3.parameters['add_coef'].item())

    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)

    running_acc = 0.0
    with torch.no_grad():
        for j, (features_, labels_) in enumerate(dl_test):
            features, labels = features_.to(DEVICE), labels_.to(DEVICE)
            pred = model.forward(features)
            running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()

    test_acc = running_acc / len(dl_test.dataset)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
    print("EPOCH: {}, TEST ACC: {}\n".format(i, test_acc))
    
print("Time taken: {}".format(time.time() - init_time))

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


EPOCH: 1, TRAIN LOSS: 1.5536134815979004, ACC: 0.46686
EPOCH: 1, TEST ACC: 0.5626

EPOCH: 2, TRAIN LOSS: 1.0915542339897155, ACC: 0.61348
EPOCH: 2, TEST ACC: 0.6182

EPOCH: 3, TRAIN LOSS: 0.9311594277572632, ACC: 0.67114
EPOCH: 3, TEST ACC: 0.6379

EPOCH: 4, TRAIN LOSS: 0.8273474590492248, ACC: 0.70864
EPOCH: 4, TEST ACC: 0.6449

EPOCH: 5, TRAIN LOSS: 0.7317616347885132, ACC: 0.73962
EPOCH: 5, TEST ACC: 0.6555

EPOCH: 6, TRAIN LOSS: 0.6512826685905456, ACC: 0.76852
EPOCH: 6, TEST ACC: 0.6511

EPOCH: 7, TRAIN LOSS: 0.5877226601791382, ACC: 0.79058
EPOCH: 7, TEST ACC: 0.6563

EPOCH: 8, TRAIN LOSS: 0.5146693521499633, ACC: 0.81678
EPOCH: 8, TEST ACC: 0.6502

EPOCH: 9, TRAIN LOSS: 0.4661249402618408, ACC: 0.83398
EPOCH: 9, TEST ACC: 0.6461

EPOCH: 10, TRAIN LOSS: 0.4249654502773285, ACC: 0.84772
EPOCH: 10, TEST ACC: 0.6568

Time taken: 94.51405811309814


In [4]:
for i in range(len(sqr_coef1)):
    x = np.linspace(-10, 10, 100)
    y_baseline = F.gelu(torch.tensor(x)).numpy()
    x = torch.tensor(x)
    y1 = direct_coef1[i] * x * (add_coef1[i] + F.tanh(sqr_coef1[i] * x**3)).numpy()
    y2 = direct_coef2[i] * x * (add_coef2[i] + F.tanh(sqr_coef2[i] * x**3)).numpy()
    y3 = direct_coef3[i] * x * (add_coef3[i] + F.tanh(sqr_coef3[i] * x**3)).numpy()
    fig, ax = plt.subplots()
    ax.plot(x, y_baseline)
    ax.plot(x, y1)
    ax.plot(x, y2)
    ax.plot(x, y3)
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')
    ax.set_title('AdaGELU on all activations for CIFAR10 beats GeLU and ReLU')
    ax.set_xlabel('x')
    ax.set_ylabel('y')

    ax.legend(['baseline (gelu)', 'adagelu 1', 'adagelu 2', 'adagelu 3'])
    ax.set_xlim([-10, 10])
    ax.set_ylim([-3, 10])
    ax.grid()

    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/{}.png'.format(i))
    plt.close()

video = './adaGELU.mp4'
imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(sqr_coef1))], fps = 6)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(sqr_coef1))], fps = 6)


In [5]:
sqr_coef2, add_coef2, direct_coef2

([0.7978845608028654,
  0.7983824299135656,
  0.7881588795446307,
  0.77812698639175,
  0.7734198398798313,
  0.7748930476196605,
  0.7719603933776318,
  0.7662261000451971,
  0.7737046920419368,
  0.780141404020544,
  0.7971055583884593,
  0.8005641595716867,
  0.8133305191451519,
  0.8198475592013611,
  0.8259126556782859,
  0.8370892146675083,
  0.8406932326797115,
  0.8440782785466545,
  0.8465875443084728,
  0.8484573837052274,
  0.854029862716974,
  0.8492907574623916,
  0.8476090401540273,
  0.8483904620060905,
  0.8411608766199564,
  0.8391750740717256,
  0.8321658623930624,
  0.8392712288208038,
  0.8341617508511008,
  0.8348716056705565,
  0.8366236705000815,
  0.8354034552872259,
  0.8375741831213982,
  0.8329751385546096,
  0.8292631060180303,
  0.8250319858951315,
  0.8217098056433473,
  0.8113501232339588,
  0.8017637352428097,
  0.8018442648551201,
  0.8069765330956553],
 [1.0,
  1.0008703470230103,
  0.9792245030403137,
  0.9548307061195374,
  0.9351220726966858,
  0.92

: 