In [1]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import Lambda
import time as time
import numpy as np
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(DEVICE))
torch.manual_seed(0)
np.random.seed(0)

class adaGeLU(nn.Module):

    def __init__(self):

        super(adaGeLU, self).__init__()

        self.parameters = {'alpha': nn.Parameter(torch.tensor(1., requires_grad=True)),
                           'beta': nn.Parameter(torch.tensor(np.sqrt(2/np.pi), requires_grad=True)),
                           'gamma': nn.Parameter(torch.tensor(0.044715, requires_grad=True))}
        self.all_params_with_gradients = [self.parameters['alpha'], self.parameters['beta'], self.parameters['gamma']]
        
    def forward(self, input):
        output = (1/2) * input * (1 + F.tanh(self.parameters['beta'] * 
                                             (self.parameters['alpha']*input + 
                                              self.parameters['gamma']*(self.parameters['alpha']*input)**3)))
        return output

Using cuda:0 device


In [2]:
adaGELURes1 = adaGeLU().to(DEVICE)
adaGELURes2 = adaGeLU().to(DEVICE)

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size = 3, stride = stride, padding = 1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample
        self.out_channels = out_channels
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = adaGELURes2(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = adaGELURes1(out)
        return out
    
adaGELUNet = adaGeLU().to(DEVICE)
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes = 257):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size = 7, stride = 2, padding = 3)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)
        self.layer0 = self._make_layer(block, 64, layers[0], stride = 1)
        self.layer1 = self._make_layer(block, 128, layers[1], stride = 2)
        self.layer2 = self._make_layer(block, 256, layers[2], stride = 2)
        self.layer3 = self._make_layer(block, 512, layers[3], stride = 2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 
        self.fc = nn.Linear(512, num_classes)
        
    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)    

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = adaGELUNet(x)
        x = self.maxpool(x)
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

In [3]:
torch.cuda.empty_cache()

model = ResNet(ResidualBlock, [3, 4, 6, 3])

if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = nn.DataParallel(model)

model.to(DEVICE)

optim = torch.optim.Adam(model.parameters(), lr=0.0007)

optimAdaGelu1 = torch.optim.Adam(adaGELURes1.all_params_with_gradients, lr=0.01)
optimAdaGelu2 = torch.optim.Adam(adaGELURes2.all_params_with_gradients, lr=0.01)
optimAdaGelu3 = torch.optim.Adam(adaGELUNet.all_params_with_gradients, lr=0.01)
criterion = nn.CrossEntropyLoss()

In [4]:
BATCH_SIZE = 256

transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize((224, 224)),
    Lambda(lambda x: x.convert("RGB")),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.485, 0.456, 0.406], 
        std=[0.229, 0.224, 0.225]
    )
])

dataset_glob = torchvision.datasets.Caltech256('./data_caltech256', download=True, transform=transforms)
dataset_train, dataset_test = torch.utils.data.random_split(dataset_glob, [int(len(dataset_glob)*0.8), len(dataset_glob) - int(len(dataset_glob)*0.8)])
dl_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=6)
dl_test = torch.utils.data.DataLoader(dataset_test, batch_size=256, shuffle=False, num_workers=6)

Files already downloaded and verified


In [5]:
init_time = time.time()
EPOCHS = 20
alpha1 = [adaGELURes1.parameters['alpha'].item()]
beta1 = [adaGELURes1.parameters['beta'].item()]
gamma1 = [adaGELURes1.parameters['gamma'].item()]
alpha2 = [adaGELURes2.parameters['alpha'].item()]
beta2 = [adaGELURes2.parameters['beta'].item()]
gamma2 = [adaGELURes2.parameters['gamma'].item()]
alpha3 = [adaGELUNet.parameters['alpha'].item()]
beta3 = [adaGELUNet.parameters['beta'].item()]
gamma3 = [adaGELUNet.parameters['gamma'].item()]

train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    model.train()
    for j, (features_, labels_) in enumerate(dl_train):


        optim.zero_grad()
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = criterion(pred, labels)
        
        loss.backward(create_graph=True) # important! use create_graph=True

        optimAdaGelu1.step()
        optimAdaGelu2.step()
        optimAdaGelu3.step()
        optimAdaGelu1.zero_grad()
        optimAdaGelu2.zero_grad()
        optimAdaGelu3.zero_grad()
        
        optim.step()

        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()

        if j%50 == 0:
            print("EPOCH: {}, BATCH: {}".format(i, j))
            alpha1.append(adaGELURes1.parameters['alpha'].item())
            beta1.append(adaGELURes1.parameters['beta'].item())
            gamma1.append(adaGELURes1.parameters['gamma'].item())
            alpha2.append(adaGELURes2.parameters['alpha'].item())
            beta2.append(adaGELURes2.parameters['beta'].item())
            gamma2.append(adaGELURes2.parameters['gamma'].item())
            alpha3.append(adaGELUNet.parameters['alpha'].item())
            beta3.append(adaGELUNet.parameters['beta'].item())
            gamma3.append(adaGELUNet.parameters['gamma'].item())
    
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)

    running_acc = 0.0
    running_loss = 0.0
    with torch.no_grad():
        model.eval()
        for j, (features_, labels_) in enumerate(dl_test):
            features, labels = features_.to(DEVICE), labels_.to(DEVICE)
            pred = model.forward(features)
            running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
            loss = criterion(pred, labels)
            running_loss += loss.item() * features_.size(0)
            
    test_loss = running_loss / len(dl_test.dataset)
    test_acc = running_acc / len(dl_test.dataset)
    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
    print("EPOCH: {}, TEST ACC: {}\n".format(i, test_acc))
    
print("Time taken: {}".format(time.time() - init_time))

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


OutOfMemoryError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 39.39 GiB total capacity; 38.45 GiB already allocated; 76.88 MiB free; 38.79 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# save training and testing statistics in csv
path = '../results/caltech256'
name = 'adagelu'
np.savetxt(path + '/' + name + '_train_loss.csv', train_loss_list, delimiter=',')
np.savetxt(path + '/' + name + '_train_acc.csv', train_acc_list, delimiter=',')
np.savetxt(path + '/' + name + '_test_loss.csv', test_loss_list, delimiter=',')
np.savetxt(path + '/' + name + '_test_acc.csv', test_acc_list, delimiter=',')

In [None]:
for i in range(len(beta3)):
    x = np.linspace(-10, 10, 100)
    y_baseline = F.gelu(torch.tensor(x)).numpy()
    x = torch.tensor(x)
    y1 = (1/2) * x * (1 + F.tanh(beta1[i] * (alpha1[i]*x + gamma1[i] * (alpha1[i]*x**3)))).numpy()
    y2 = (1/2) * x * (1 + F.tanh(beta2[i] * (alpha2[i]*x + gamma2[i] * (alpha2[i]*x**3)))).numpy()
    y3 = (1/2) * x * (1 + F.tanh(beta3[i] * (alpha3[i]*x + gamma3[i] * (alpha3[i]*x**3)))).numpy()
    fig, ax = plt.subplots()
    ax.plot(x, y_baseline)
    ax.plot(x, y1)
    ax.plot(x, y2)
    ax.plot(x, y3)
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')
    ax.set_title('AdaGELU on all activations for CIFAR10 beats GeLU and ReLU')
    ax.set_xlabel('x')
    ax.set_ylabel('y')

    ax.legend(['baseline (gelu)', 'adagelu 1', 'adagelu 2', 'adagelu 3'])
    ax.set_xlim([-10, 10])
    ax.set_ylim([-3, 10])
    ax.grid()

    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/{}.png'.format(i))
    plt.close()

video = './adaGELU.mp4'
imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(beta3))], fps = 4)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(beta3))], fps = 4)
