In [1]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
import numpy as np
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
torch.manual_seed(0)
np.random.seed(0)

class adaGeLU(nn.Module):

    def __init__(self):

        super(adaGeLU, self).__init__()

        self.parameters = {'alpha': torch.tensor(1., requires_grad=True),
                           'beta': torch.tensor(np.sqrt(2/np.pi), requires_grad=True),
                           'gamma': torch.tensor(0.044715, requires_grad=True)}
        self.all_params_with_gradients = [self.parameters['alpha'], self.parameters['beta'], self.parameters['gamma']]
        
    def forward(self, input):
        output = (1/2) * input * (1 + F.tanh(self.parameters['beta'] * 
                                             (self.parameters['alpha']*input + 
                                              self.parameters['gamma']*(self.parameters['alpha']*input)**3)))
        return output


adaGelu1 = adaGeLU()

adaGelu2 = adaGeLU()

adaGelu3 = adaGeLU()

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(16384, 128)
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(3)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm1d(16384)

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)

        x = adaGelu1(x)

        x = self.bn2(x)
        x = self.conv2(x)

        x = adaGelu2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)

        x = self.bn3(x)
        x = self.fc1(x)

        x = adaGelu3(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

BATCH_SIZE = 256
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

dataset_train = torchvision.datasets.CIFAR10('./data_cifar', train=True, download=True, transform=torchvision.transforms.ToTensor())
dataset_test = torchvision.datasets.CIFAR10('./data_cifar', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(dataset_test, batch_size=256, shuffle=False)

cuda
Files already downloaded and verified
Files already downloaded and verified


In [2]:
torch.cuda.empty_cache()

#optim = gdtuo.SGD(alpha=0.0769)
model = CNN()
#model.load_state_dict(torch.load('../model/model_cifar.pt'))
model.to(DEVICE)



optim = torch.optim.Adam(model.parameters(), lr=0.001)

optimAdaGelu1 = torch.optim.Adam(adaGelu1.all_params_with_gradients, lr=0.01)
optimAdaGelu2 = torch.optim.Adam(adaGelu2.all_params_with_gradients, lr=0.01)
optimAdaGelu3 = torch.optim.Adam(adaGelu3.all_params_with_gradients, lr=0.01)
criterion = nn.CrossEntropyLoss()

In [3]:
init_time = time.time()
EPOCHS = 10
alpha1 = [adaGelu1.parameters['alpha'].item()]
beta1 = [adaGelu1.parameters['beta'].item()]
gamma1 = [adaGelu1.parameters['gamma'].item()]
alpha2 = [adaGelu2.parameters['alpha'].item()]
beta2 = [adaGelu2.parameters['beta'].item()]
gamma2 = [adaGelu2.parameters['gamma'].item()]
alpha3 = [adaGelu3.parameters['alpha'].item()]
beta3 = [adaGelu3.parameters['beta'].item()]
gamma3 = [adaGelu3.parameters['gamma'].item()]

train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    model.train()
    for j, (features_, labels_) in enumerate(dl_train):


        optim.zero_grad()
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = criterion(pred, labels)
        
        loss.backward(create_graph=True) # important! use create_graph=True

        optimAdaGelu1.step()
        optimAdaGelu2.step()
        optimAdaGelu3.step()
        optimAdaGelu1.zero_grad()
        optimAdaGelu2.zero_grad()
        optimAdaGelu3.zero_grad()
        
        optim.step()

        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()


        if j%50 == 0:
            alpha1.append(adaGelu1.parameters['alpha'].item())
            beta1.append(adaGelu1.parameters['beta'].item())
            gamma1.append(adaGelu1.parameters['gamma'].item())
            alpha2.append(adaGelu2.parameters['alpha'].item())
            beta2.append(adaGelu2.parameters['beta'].item())
            gamma2.append(adaGelu2.parameters['gamma'].item())
            alpha3.append(adaGelu3.parameters['alpha'].item())
            beta3.append(adaGelu3.parameters['beta'].item())
            gamma3.append(adaGelu3.parameters['gamma'].item())
    
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)

    running_acc = 0.0
    running_loss = 0.0
    with torch.no_grad():
        model.eval()
        for j, (features_, labels_) in enumerate(dl_test):
            features, labels = features_.to(DEVICE), labels_.to(DEVICE)
            pred = model.forward(features)
            running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
            loss = criterion(pred, labels)
            running_loss += loss.item() * features_.size(0)

    test_loss = running_loss / len(dl_test.dataset)
    test_acc = running_acc / len(dl_test.dataset)
    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
    print("EPOCH: {}, TEST ACC: {}\n".format(i, test_acc))
    
print("Time taken: {}".format(time.time() - init_time))

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


EPOCH: 1, TRAIN LOSS: 1.487693971786499, ACC: 0.4922
EPOCH: 1, TEST ACC: 0.6423

EPOCH: 2, TRAIN LOSS: 1.033458874130249, ACC: 0.63562
EPOCH: 2, TEST ACC: 0.6809

EPOCH: 3, TRAIN LOSS: 0.8784457110404968, ACC: 0.69242
EPOCH: 3, TEST ACC: 0.7029

EPOCH: 4, TRAIN LOSS: 0.7607021634483337, ACC: 0.7313
EPOCH: 4, TEST ACC: 0.7195

EPOCH: 5, TRAIN LOSS: 0.6698470023536682, ACC: 0.76254
EPOCH: 5, TEST ACC: 0.72

EPOCH: 6, TRAIN LOSS: 0.5910242825889588, ACC: 0.78892
EPOCH: 6, TEST ACC: 0.727

EPOCH: 7, TRAIN LOSS: 0.5241399978256226, ACC: 0.8128
EPOCH: 7, TEST ACC: 0.7249

EPOCH: 8, TRAIN LOSS: 0.46582451416015624, ACC: 0.83322
EPOCH: 8, TEST ACC: 0.7295

EPOCH: 9, TRAIN LOSS: 0.42236141349792483, ACC: 0.84768
EPOCH: 9, TEST ACC: 0.7275

EPOCH: 10, TRAIN LOSS: 0.3882057455158234, ACC: 0.85976
EPOCH: 10, TEST ACC: 0.728

Time taken: 96.2098798751831


In [4]:
for i in range(len(beta3)):
    x = np.linspace(-10, 10, 100)
    y_baseline = F.gelu(torch.tensor(x)).numpy()
    x = torch.tensor(x)
    y1 = (1/2) * x * (1 + F.tanh(beta1[i] * (alpha1[i]*x + gamma1[i] * (alpha1[i]*x**3)))).numpy()
    y2 = (1/2) * x * (1 + F.tanh(beta2[i] * (alpha2[i]*x + gamma2[i] * (alpha2[i]*x**3)))).numpy()
    y3 = (1/2) * x * (1 + F.tanh(beta3[i] * (alpha3[i]*x + gamma3[i] * (alpha3[i]*x**3)))).numpy()
    fig, ax = plt.subplots()
    ax.plot(x, y_baseline)
    ax.plot(x, y1)
    ax.plot(x, y2)
    ax.plot(x, y3)
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')
    ax.set_title('AdaGELU on all activations for CIFAR10 beats GeLU and ReLU')
    ax.set_xlabel('x')
    ax.set_ylabel('y')

    ax.legend(['baseline (gelu)', 'adagelu 1', 'adagelu 2', 'adagelu 3'])
    ax.set_xlim([-10, 10])
    ax.set_ylim([-3, 10])
    ax.grid()

    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/{}.png'.format(i))
    plt.close()

video = './adaGELU.mp4'
imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(beta3))], fps = 4)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(beta3))], fps = 4)
