In [21]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
import numpy as np
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image

# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)

class AdaRELU(nn.Module):
    """Leaky-ReLU-shaped activation with two learnable scalar slopes.

    For an input ``x`` the output is ``x * positive_slope`` where ``x >= 0``
    and ``x * negative_slope`` elsewhere. The slopes start at 1.0 and 0.1.

    The slopes are registered as ``nn.Parameter`` attributes so that they are
    moved by ``.to(device)``, included in ``state_dict()`` and returned by the
    regular ``parameters()`` method. (They used to live in a dict stored under
    the attribute name ``parameters``, which shadowed ``nn.Module.parameters``.)

    Attributes:
        positive_slope: learnable slope applied to non-negative inputs.
        negative_slope: learnable slope applied to negative inputs.
        all_params_with_gradients: list holding both slopes, kept so that
            callers can hand it directly to an optimizer.
    """

    def __init__(self):
        super(AdaRELU, self).__init__()

        self.positive_slope = nn.Parameter(torch.tensor(1.))
        self.negative_slope = nn.Parameter(torch.tensor(0.1))
        # Same Parameter objects as above; Module.to() updates their data in
        # place, so this list stays valid after the module is moved.
        self.all_params_with_gradients = [self.positive_slope, self.negative_slope]

    def forward(self, input):
        """Apply the two-slope activation element-wise and return the result."""
        return torch.where(input >= 0,
                           input * self.positive_slope,
                           input * self.negative_slope)


# Module-level activation instance; CNN.forward refers to it by this name.
adaRELU1 = AdaRELU()
adaRELU1 = adaRELU1.to(DEVICE)
# Additional instances, currently disabled:
# adaRELU2 = AdaRELU().to(DEVICE)
# adaRELU3 = AdaRELU().to(DEVICE)

class CNN(nn.Module):
    """Small two-convolution classifier producing 10-class log-probabilities.

    Pipeline: BatchNorm -> conv1 -> activation -> BatchNorm -> conv2 ->
    activation -> 2x2 max-pool -> dropout -> flatten -> BatchNorm -> fc1 ->
    activation -> dropout -> fc2 -> log-softmax.

    All three activations call the module-level ``adaRELU1`` object, so it
    must exist before ``forward`` is invoked.

    Attributes:
        gather_stats: when True, ``forward`` records the mean and standard
            deviation of the conv1, conv2 and fc1 pre-activation outputs.
        dict_stats: dict filled with ``'<layer>_mean'`` / ``'<layer>_std'``
            entries while ``gather_stats`` is enabled.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        # The fc1 output is 2-D (batch, features). Dropout2d on 2-D input is
        # deprecated and warns; plain Dropout keeps the same behaviour.
        self.dropout2 = nn.Dropout(0.5)
        # 16384 = 64 channels * 16 * 16, i.e. 32x32 inputs after one 2x2 pool.
        self.fc1 = nn.Linear(16384, 128)
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(3)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm1d(16384)

        self.dict_stats = {}
        self.gather_stats = False

    def _record_stats(self, layer_name, activations):
        """Store mean/std of ``activations`` in ``dict_stats`` if enabled."""
        if not self.gather_stats:
            return
        # One device-to-host copy per layer; detach() alone is enough because
        # the values are only read.
        values = activations.detach().cpu().numpy()
        self.dict_stats[layer_name + '_mean'] = values.mean()
        self.dict_stats[layer_name + '_std'] = values.std()

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for image batch ``x``."""
        x = self.bn1(x)
        x = self.conv1(x)
        self._record_stats('conv1', x)
        x = adaRELU1(x)

        x = self.bn2(x)
        x = self.conv2(x)
        self._record_stats('conv2', x)
        x = adaRELU1(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)

        x = self.bn3(x)
        x = self.fc1(x)
        self._record_stats('fc1', x)
        x = adaRELU1(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

BATCH_SIZE = 256

# CIFAR-10 train/test splits stored under ./data_cifar (downloaded if absent);
# every image is converted to a tensor.
dataset_train = torchvision.datasets.CIFAR10(
    './data_cifar',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
dataset_test = torchvision.datasets.CIFAR10(
    './data_cifar',
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

# Training batches are reshuffled on every pass; test batches keep dataset order.
dl_train = torch.utils.data.DataLoader(
    dataset_train,
    shuffle=True,
    batch_size=BATCH_SIZE,
)
dl_test = torch.utils.data.DataLoader(
    dataset_test,
    shuffle=False,
    batch_size=256,
)

ImportError: cannot import name 'Optimizable' from 'gradient_descent_the_ultimate_optimizer' (/home/infres/egardes-21/.local/lib/python3.10/site-packages/gradient_descent_the_ultimate_optimizer/__init__.py)

In [16]:
model = CNN().to(DEVICE)

# The network weights and the activation slopes are trained by two separate
# Adam optimizers with different learning rates.
optim = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): CNN.forward already ends with F.log_softmax, and
# CrossEntropyLoss applies log-softmax again internally. That is a no-op on
# normalised log-probabilities, so nn.NLLLoss would be the direct equivalent.
criterion = nn.CrossEntropyLoss()

optimAdaRelu1 = torch.optim.Adam(adaRELU1.all_params_with_gradients, lr=0.01)
# Optimizers for the disabled extra activations. Kept as comments rather than
# a trailing string literal, which the notebook echoed as the cell's output.
# optimAdaRelu2 = torch.optim.Adam(adaRELU2.all_params_with_gradients, lr=0.01)
# optimAdaRelu3 = torch.optim.Adam(adaRELU3.all_params_with_gradients, lr=0.01)

'optimAdaRelu2 = torch.optim.Adam(adaRELU2.all_params_with_gradients, lr=0.01)\noptimAdaRelu3 = torch.optim.Adam(adaRELU3.all_params_with_gradients, lr=0.01)'

In [17]:
init_time = time.time()
EPOCHS = 10

# Per-epoch history; the CSV-export cell below writes these lists to disk.
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

for epoch in range(1, EPOCHS + 1):
    # ---- training pass ----
    running_acc = 0.0
    running_loss = 0.0
    model.train()
    for features_, labels_ in dl_train:
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)

        # Clear the gradients of both optimizers before the new backward pass.
        optim.zero_grad()
        optimAdaRelu1.zero_grad()

        # Call the module itself, not .forward(), so that hooks are honoured.
        pred = model(features)
        loss = criterion(pred, labels)

        # Plain first-order backward pass. create_graph=True was only needed
        # by the removed hyper-optimizer wrapper; with ordinary Adam it just
        # builds an unused second-order graph and costs memory.
        loss.backward()

        optimAdaRelu1.step()
        # optimAdaRelu2.step()
        # optimAdaRelu3.step()
        optim.step()

        # Weight the batch-mean loss by batch size so that the epoch average
        # stays correct when the last batch is smaller.
        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()

    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)

    # ---- evaluation pass ----
    running_acc = 0.0
    running_loss = 0.0
    model.eval()
    with torch.no_grad():
        for features_, labels_ in dl_test:
            features, labels = features_.to(DEVICE), labels_.to(DEVICE)
            pred = model(features)
            running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
            loss = criterion(pred, labels)
            running_loss += loss.item() * features_.size(0)

    test_loss = running_loss / len(dl_test.dataset)
    test_acc = running_acc / len(dl_test.dataset)
    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(epoch, train_loss, train_acc))
    print("EPOCH: {}, TEST ACC: {}\n".format(epoch, test_acc))

print("Time taken: {}".format(time.time() - init_time))



EPOCH: 1, TRAIN LOSS: 1.481005996170044, ACC: 0.4851
EPOCH: 1, TEST ACC: 0.6269

EPOCH: 2, TRAIN LOSS: 1.0969895317268372, ACC: 0.61392
EPOCH: 2, TEST ACC: 0.6749

EPOCH: 3, TRAIN LOSS: 0.9527389910888672, ACC: 0.66662
EPOCH: 3, TEST ACC: 0.6907

EPOCH: 4, TRAIN LOSS: 0.8571641386222839, ACC: 0.69694
EPOCH: 4, TEST ACC: 0.6976

EPOCH: 5, TRAIN LOSS: 0.7815189984321594, ACC: 0.725
EPOCH: 5, TEST ACC: 0.7072

EPOCH: 6, TRAIN LOSS: 0.7082464732170105, ACC: 0.74882
EPOCH: 6, TEST ACC: 0.7079

EPOCH: 7, TRAIN LOSS: 0.6522045014381409, ACC: 0.76802
EPOCH: 7, TEST ACC: 0.7142

EPOCH: 8, TRAIN LOSS: 0.5995901018905639, ACC: 0.78654
EPOCH: 8, TEST ACC: 0.7097

EPOCH: 9, TRAIN LOSS: 0.5522414917755127, ACC: 0.80252
EPOCH: 9, TEST ACC: 0.7119

EPOCH: 10, TRAIN LOSS: 0.5124345553779602, ACC: 0.81712
EPOCH: 10, TEST ACC: 0.7156

Time taken: 91.41461443901062


In [18]:
# Save the per-epoch training and testing statistics as CSV files.
path = '../results/CIFAR-10'
name = 'adarelu'
# np.savetxt does not create missing directories, so make sure the target
# exists first (the plotting cell does the same for its 'plots' directory).
os.makedirs(path, exist_ok=True)
for suffix, values in (
    ('train_loss', train_loss_list),
    ('train_acc', train_acc_list),
    ('test_loss', test_loss_list),
    ('test_acc', test_acc_list),
):
    np.savetxt(path + '/' + name + '_' + suffix + '.csv', values, delimiter=',')

In [19]:
# Render one frame per recorded slope snapshot, then stitch the frames into a video.
# NOTE(review): positive_/negative_slopes_encountered{1,2,3} are not defined in
# any cell visible in this notebook, and adaRELU2/adaRELU3 are disabled above.
# Confirm where these lists are recorded before relying on a fresh
# Restart & Run All.
n_frames = len(negative_slopes_encountered1)

# Loop-invariant inputs: the x grid and the fixed leaky-ReLU reference curve.
x = np.linspace(-10, 10, 100)
y_baseline = np.where(x >= 0, x, 0.1*x)
os.makedirs('plots', exist_ok=True)

for i in range(n_frames):
    y1 = np.where(x >= 0, x * positive_slopes_encountered1[i], x * negative_slopes_encountered1[i])
    y2 = np.where(x >= 0, x * positive_slopes_encountered2[i], x * negative_slopes_encountered2[i])
    y3 = np.where(x >= 0, x * positive_slopes_encountered3[i], x * negative_slopes_encountered3[i])
    fig, ax = plt.subplots()
    ax.plot(x, y_baseline)
    ax.plot(x, y1)
    ax.plot(x, y2)
    ax.plot(x, y3)
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')
    ax.set_title('AdaRELU on all activations for CIFAR10 beats GeLU and ReLU')
    ax.set_xlabel('x')
    ax.set_ylabel('y')

    ax.legend(['baseline (leaky-relu)', 'adarelu 1', 'adarelu 2', 'adarelu 3'])
    ax.set_xlim([-10, 10])
    ax.set_ylim([-3, 10])
    ax.grid()

    # Save and close this specific figure so figures do not pile up in memory.
    fig.savefig('plots/{}.png'.format(i))
    plt.close(fig)

video = './adaRELU.mp4'
# The top-level imageio.imread is deprecated in newer imageio releases; use the
# explicit v2 API when it exists and fall back to the top-level one otherwise.
_imread = getattr(imageio, 'v2', imageio).imread
imageio.mimsave(video, [_imread('plots/{}.png'.format(i)) for i in range(n_frames)], fps = 3)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(negative_slopes_encountered1))], fps = 3)
