In [1]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import time as time
import numpy as np
from gradient_descent_the_ultimate_optimizer import gdtuo
from gradient_descent_the_ultimate_optimizer.gdtuo import Optimizable
import os
import matplotlib.pyplot as plt
import imageio
from IPython.display import Video, Image
from poly_fit_relu import train_poly_fit_relu as pfr
from poly_fit_relu import plot_poly_fit_relu as ppfr

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

class MNIST_CNN(nn.Module):
    def __init__(self, poly_act1, poly2, poly3):
        super(MNIST_CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(16384, 128)  # Adjusted input dimensions
        self.fc2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(3)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm1d(16384)

        self.poly_act1 = poly_act1
        self.poly2 = poly2
        self.poly3 = poly3
        self.dict_stats = {}
        self.gather_stats = False

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)
        if self.gather_stats:
            self.dict_stats['conv1_mean'] = x.clone().detach().cpu().numpy().mean()
            self.dict_stats['conv1_std'] = x.clone().detach().cpu().numpy().std()
        x = self.poly_act1(x)

        x = self.bn2(x)
        x = self.conv2(x)
        if self.gather_stats:
            self.dict_stats['conv2_mean'] = x.clone().detach().cpu().numpy().mean()
            self.dict_stats['conv2_std'] = x.clone().detach().cpu().numpy().std()
        x = self.poly2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)

        x = self.bn3(x)
        x = self.fc1(x)
        if self.gather_stats:
            self.dict_stats['fc1_mean'] = x.clone().detach().cpu().numpy().mean()
            self.dict_stats['fc1_std'] = x.clone().detach().cpu().numpy().std()
        x = self.poly3(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

BATCH_SIZE = 256
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

dataset_train = torchvision.datasets.CIFAR10('./data_cifar', train=True, download=True, transform=torchvision.transforms.ToTensor())
dataset_test = torchvision.datasets.CIFAR10('./data_cifar', train=False, download=True, transform=torchvision.transforms.ToTensor())
dl_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = torch.utils.data.DataLoader(dataset_test, batch_size=256, shuffle=False)

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


(<poly_fit_relu.PolyAct object at 0x7f9967ca40a0>, tensor([ 3.7504e-01,  5.0000e-01,  1.1718e-01, -3.1369e-08]))
cuda
Files already downloaded and verified
Files already downloaded and verified


In [2]:
class PolyAct(Optimizable):
        def __init__(self, optimizer, coefs = torch.zeros(6)):
            self.n = coefs.shape[0]
            self.coefs = nn.Parameter(coefs)
            self.parameters = {'coefs': self.coefs}
            self.optimizer = optimizer
            self.all_params_with_gradients = [self.parameters['coefs']]
            super().__init__(self.parameters, self.optimizer)

        def __call__(self, x):
            out = 0
            for i in range(self.n):
                out += self.parameters['coefs'][i] * x ** i
            return out
        
        def step(self):
            self.optimizer.step(self.parameters)


poly_act1, coefs_init = pfr(4)
poly2, _ = pfr(4)
poly3, _ = pfr(4)
poly_act_init = PolyAct(gdtuo.Adam(0.0001), coefs_init)
poly_act1.optimizer = gdtuo.Adam(0.001)
poly2.optimizer = gdtuo.Adam(0.001)
poly3.optimizer = gdtuo.Adam(0.001)

poly_act1.initialize()
poly2.initialize()
poly3.initialize()

model = MNIST_CNN(poly_act1, poly2, poly3).to(DEVICE)
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [3]:
init_time = time.time()
EPOCHS = 10
coefs_list = [coefs_init.detach().cpu().numpy()]
for i in range(1, EPOCHS+1):
    running_acc = 0.0
    running_loss = 0.0
    model.train()
    for j, (features_, labels_) in enumerate(dl_train):

        if j % 100 == 0:
            print('coefs so far', poly_act1.parameters['coefs'].detach())
            coefs_list.append(poly_act1.parameters['coefs'].detach().cpu().numpy())
        poly_act1.begin()
        poly2.begin()
        poly3.begin()
        optim.zero_grad()
        poly_act1.zero_grad()
        poly2.zero_grad()
        poly3.zero_grad()
        features, labels = features_.to(DEVICE), labels_.to(DEVICE)
        pred = model.forward(features)
        loss = F.nll_loss(pred, labels)
        loss.backward(create_graph=True)

        optim.step()
        poly_act1.step()
        poly2.step()
        poly3.step()
        running_loss += loss.item() * features_.size(0)
        running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()
    train_loss = running_loss / len(dl_train.dataset)
    train_acc = running_acc / len(dl_train.dataset)

    running_acc = 0.0
    with torch.no_grad():
        model.eval()
        for j, (features_, labels_) in enumerate(dl_test):
            features, labels = features_.to(DEVICE), labels_.to(DEVICE)
            pred = model.forward(features)
            running_acc += (torch.argmax(pred, dim=1) == labels).sum().item()

    test_acc = running_acc / len(dl_test.dataset)
    print("EPOCH: {}, TRAIN LOSS: {}, ACC: {}".format(i, train_loss, train_acc))
    print("EPOCH: {}, TEST ACC: {}\n".format(i, test_acc))

print("Time taken: {}".format(time.time() - init_time))

coefs so far tensor([ 3.7504e-01,  5.0000e-01,  1.1718e-01, -3.1369e-08])




coefs so far tensor([ 0.3750,  0.4378,  0.1392, -0.0667])
EPOCH: 1, TRAIN LOSS: 1.7395512361907959, ACC: 0.42018
EPOCH: 1, TEST ACC: 0.5917

coefs so far tensor([ 0.3750,  0.3658,  0.1379, -0.1109])
coefs so far tensor([ 0.3751,  0.3442,  0.1464, -0.1217])
EPOCH: 2, TRAIN LOSS: 1.1689998084259032, ACC: 0.58892
EPOCH: 2, TEST ACC: 0.6547

coefs so far tensor([ 0.3751,  0.3222,  0.1592, -0.1336])
coefs so far tensor([ 0.3751,  0.3203,  0.1618, -0.1348])
EPOCH: 3, TRAIN LOSS: 1.0174419497871399, ACC: 0.64468
EPOCH: 3, TEST ACC: 0.68

coefs so far tensor([ 0.3751,  0.3130,  0.1635, -0.1407])
coefs so far tensor([ 0.3751,  0.3108,  0.1700, -0.1404])
EPOCH: 4, TRAIN LOSS: 0.9159049110412598, ACC: 0.68076
EPOCH: 4, TEST ACC: 0.6888

coefs so far tensor([ 0.3750,  0.3074,  0.1786, -0.1434])
coefs so far tensor([ 0.3750,  0.3068,  0.1747, -0.1463])
EPOCH: 5, TRAIN LOSS: 0.8001279209136963, ACC: 0.71754
EPOCH: 5, TEST ACC: 0.6885

coefs so far tensor([ 0.3751,  0.3062,  0.1805, -0.1440])
coefs s

In [4]:
x = np.linspace(-4, 4, 1000)

for i in range(len(coefs_list)):
    curr_poly_act = PolyAct(gdtuo.Adam(0.001), torch.tensor(coefs_list[i]))
    y = curr_poly_act(torch.tensor(x)).detach()

    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.plot(x, np.maximum(x, 0))
    # set small cross at 0.0
    ax.plot([0.0], [0.0], 'x', color='red')

    ax.set_xlim([-4, 4])
    ax.set_ylim([-10, 10])
    ax.set_yscale('linear')
    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/{}.png'.format(i))
    plt.close()

video = './polyact.mp4'
imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(coefs_list))], fps = 3)
#play it here
Video(video)

  imageio.mimsave(video, [imageio.imread('plots/{}.png'.format(i)) for i in range(len(coefs_list))], fps = 3)


In [5]:
mean_weight_CNN = 0
std_weight_CNN = 0

for name, param in model.named_parameters():
    if 'weight' in name:
        mean_weight_CNN += param.data.mean()
        
print(mean_weight_CNN) 


tensor(2.9783, device='cuda:0')
