In [7]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
import numpy as np
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
torch.manual_seed(0)
np.random.seed(0)

class adaGeLU(Optimizable):

    def __init__(self, optimizer):

        self.parameters = {'alpha': torch.tensor(1., requires_grad=True),
                           'beta': torch.tensor(np.sqrt(2/np.pi), requires_grad=True),
                           'gamma': torch.tensor(0.044715, requires_grad=True)}
        self.optimizer = optimizer
        self.all_params_with_gradients = [self.parameters['alpha'], self.parameters['beta'], self.parameters['gamma']]
        
        super().__init__(self.parameters, optimizer)

    def __call__(self, input):
        output = (1/2) * input * (1 + F.tanh(self.parameters['beta'] * 
                                             (self.parameters['alpha']*input + 
                                              self.parameters['gamma']*(self.parameters['alpha']*input)**3)))
        return output
    
    def step(self):
        self.optimizer.step(self.parameters)


class MNIST_CNN(nn.Module):
    def __init__(self):
        super(MNIST_CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(16384, 128)  # Adjusted input dimensions
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(3)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm1d(16384)

        self.adaGeLU1 = None
        self.adaGeLU2 = None
        self.adaGeLU3 = None
        self.dict_stats = {}
        self.gather_stats = False

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)
        if self.gather_stats:
            self.dict_stats['conv1_mean'] = x.clone().detach().cpu().numpy().mean()
            self.dict_stats['conv1_std'] = x.clone().detach().cpu().numpy().std()
        x = self.adaGeLU1(x)

        x = self.bn2(x)
        x = self.conv2(x)
        if self.gather_stats:
            self.dict_stats['conv2_mean'] = x.clone().detach().cpu().numpy().mean()
            self.dict_stats['conv2_std'] = x.clone().detach().cpu().numpy().std()
        x = self.adaGeLU2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)

        x = self.bn3(x)
        x = self.fc1(x)
        if self.gather_stats:
            self.dict_stats['fc1_mean'] = x.clone().detach().cpu().numpy().mean()
            self.dict_stats['fc1_std'] = x.clone().detach().cpu().numpy().std()
        x = self.adaGeLU3(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

BATCH_SIZE = 256
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

dataset_train = torchvision.datasets.CIFAR10('./data_cifar', train=True, download=True, transform=torchvision.transforms.ToTensor())
dataset_test = torchvision.datasets.CIFAR10('./data_cifar', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(dataset_test, batch_size=256, shuffle=False)

cuda
Files already downloaded and verified
Files already downloaded and verified


In [8]:
torch.cuda.empty_cache()

#optim = gdtuo.SGD(alpha=0.0769)
model = MNIST_CNN()
#model.load_state_dict(torch.load('../model/model_cifar.pt'))
model.to(DEVICE)

adaGelu1 = adaGeLU(gdtuo.Adam(alpha = 0.01))
adaGelu2 = adaGeLU(gdtuo.Adam(alpha = 0.01))
adaGelu3 = adaGeLU(gdtuo.Adam(alpha = 0.01))

optim = torch.optim.Adam(model.parameters(), lr=0.001)

model.adaGeLU1 = adaGelu1
model.adaGeLU2 = adaGelu2
model.adaGeLU3 = adaGelu3

criterion = nn.CrossEntropyLoss()

#mw = gdtuo.ModuleWrapper(model, optimizer=optim)
model.adaGeLU1.initialize()
model.adaGeLU2.initialize()
model.adaGeLU3.initialize()
#mw.initialize()

In [9]:
init_time = time.time()
EPOCHS = 10
alpha1 = [model.adaGeLU1.parameters['alpha'].item()]
beta1 = [model.adaGeLU1.parameters['beta'].item()]
gamma1 = [model.adaGeLU1.parameters['gamma'].item()]
alpha2 = [model.adaGeLU2.parameters['alpha'].item()]
beta2 = [model.adaGeLU2.parameters['beta'].item()]
gamma2 = [model.adaGeLU2.parameters['gamma'].item()]
alpha3 = [model.adaGeLU3.parameters['alpha'].item()]
beta3 = [model.adaGeLU3.parameters['beta'].item()]
gamma3 = [model.adaGeLU3.parameters['gamma'].item()]

train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    model.train()
    for j, (features_, labels_) in enumerate(dl_train):
        #mw.begin() # call this before each step, enables gradient tracking on desired params
        model.adaGeLU1.begin()
        model.adaGeLU2.begin()
        model.adaGeLU3.begin()

        optim.zero_grad()
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = criterion(pred, labels)
        #mw.zero_grad()
        
        loss.backward(create_graph=True) # important! use create_graph=True
        #mw.step()
        model.adaGeLU1.step()
        model.adaGeLU2.step()
        model.adaGeLU3.step()
        model.adaGeLU1.zero_grad()
        model.adaGeLU2.zero_grad()
        model.adaGeLU3.zero_grad()
        
        optim.step()

        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()

        if j%50 == 0:
            alpha1.append(model.adaGeLU1.parameters['alpha'].item())
            beta1.append(model.adaGeLU1.parameters['beta'].item())
            gamma1.append(model.adaGeLU1.parameters['gamma'].item())
            alpha2.append(model.adaGeLU2.parameters['alpha'].item())
            beta2.append(model.adaGeLU2.parameters['beta'].item())
            gamma2.append(model.adaGeLU2.parameters['gamma'].item())
            alpha3.append(model.adaGeLU3.parameters['alpha'].item())
            beta3.append(model.adaGeLU3.parameters['beta'].item())
            gamma3.append(model.adaGeLU3.parameters['gamma'].item())
    
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)

    running_acc = 0.0
    running_loss = 0.0
    with torch.no_grad():
        model.eval()
        for j, (features_, labels_) in enumerate(dl_test):
            features, labels = features_.to(DEVICE), labels_.to(DEVICE)
            pred = model.forward(features)
            running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
            loss = criterion(pred, labels)
            running_loss += loss.item() * features_.size(0)

    test_loss = running_loss / len(dl_test.dataset)
    test_acc = running_acc / len(dl_test.dataset)
    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
    print("EPOCH: {}, TEST ACC: {}\n".format(i, test_acc))
    
print("Time taken: {}".format(time.time() - init_time))

EPOCH: 1, TRAIN LOSS: 1.487583603477478, ACC: 0.49228
EPOCH: 1, TEST ACC: 0.6428

EPOCH: 2, TRAIN LOSS: 1.0341383296585083, ACC: 0.63588
EPOCH: 2, TEST ACC: 0.6805

EPOCH: 3, TRAIN LOSS: 0.8782855855369568, ACC: 0.69244
EPOCH: 3, TEST ACC: 0.702

EPOCH: 4, TRAIN LOSS: 0.7605376928329468, ACC: 0.73098
EPOCH: 4, TEST ACC: 0.718

EPOCH: 5, TRAIN LOSS: 0.6676008232688904, ACC: 0.7629
EPOCH: 5, TEST ACC: 0.7196

EPOCH: 6, TRAIN LOSS: 0.5919881413936615, ACC: 0.78984
EPOCH: 6, TEST ACC: 0.7288

EPOCH: 7, TRAIN LOSS: 0.521190945854187, ACC: 0.81478
EPOCH: 7, TEST ACC: 0.7242

EPOCH: 8, TRAIN LOSS: 0.4684892978858948, ACC: 0.83106
EPOCH: 8, TEST ACC: 0.7293

EPOCH: 9, TRAIN LOSS: 0.42029150903701784, ACC: 0.8485
EPOCH: 9, TEST ACC: 0.7312

EPOCH: 10, TRAIN LOSS: 0.3856738301849365, ACC: 0.86236
EPOCH: 10, TEST ACC: 0.7325

Time taken: 87.60540199279785


In [11]:
# save training and testing statistics in csv
path = '../results/CIFAR-10'
name = 'adagelu'
np.savetxt(path + '/' + name + '_train_loss.csv', train_loss_list, delimiter=',')
np.savetxt(path + '/' + name + '_train_acc.csv', train_acc_list, delimiter=',')
np.savetxt(path + '/' + name + '_test_loss.csv', test_loss_list, delimiter=',')
np.savetxt(path + '/' + name + '_test_acc.csv', test_acc_list, delimiter=',')

In [4]:
for i in range(len(beta1)):
    x = np.linspace(-10, 10, 100)
    y_baseline = F.gelu(torch.tensor(x)).numpy()
    x = torch.tensor(x)
    y1 = (1/2) * x * (1 + F.tanh(beta1[i] * (alpha1[i]*x + gamma1[i] * (alpha1[i]*x**3)))).numpy()
    y2 = (1/2) * x * (1 + F.tanh(beta2[i] * (alpha2[i]*x + gamma2[i] * (alpha2[i]*x**3)))).numpy()
    y3 = (1/2) * x * (1 + F.tanh(beta3[i] * (alpha3[i]*x + gamma3[i] * (alpha3[i]*x**3)))).numpy()
    fig, ax = plt.subplots()
    ax.plot(x, y_baseline)
    ax.plot(x, y1)
    ax.plot(x, y2)
    ax.plot(x, y3)
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')
    ax.set_title('AdaGELU on all activations for CIFAR10 beats GeLU and ReLU')
    ax.set_xlabel('x')
    ax.set_ylabel('y')

    ax.legend(['baseline (gelu)', 'adagelu 1', 'adagelu 2', 'adagelu 3'])
    ax.set_xlim([-10, 10])
    ax.set_ylim([-3, 10])
    ax.grid()

    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/{}.png'.format(i))
    plt.close()

video = './adaGELU.mp4'
imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(beta1))], fps = 4)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(beta1))], fps = 4)
