In [1]:
import sys
sys.path.insert(0, './../../../Models')
from sphere_points import generate_points

import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt

import torch
torch.manual_seed(0)
import torch.nn as nn
from tqdm import tqdm
from torch.optim import SGD
from torch.nn.functional import normalize, one_hot
# import torch.nn.functional as F
import torchvision
from torchvision import transforms
# from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

In [2]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
# Alternatives kept for reference:
# device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
# device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
def plot_losses(losses):
    """Draw one loss curve per layer, side by side in a single figure.

    Args:
        losses: 2-D array-like; row ``i`` holds the per-epoch loss of layer
            ``i + 1`` (it is unpacked as ``(n_layers, n_epochs)``).
    """
    loss_table = np.array(losses)
    n_panels, n_epochs = loss_table.shape
    # Epochs are shown 1-based on the x axis.
    epoch_axis = 1 + np.arange(n_epochs)

    plt.figure(figsize=(12, 5))
    for panel, curve in enumerate(loss_table, start=1):
        plt.subplot(1, n_panels, panel)
        plt.plot(epoch_axis, curve)
        plt.title(f"Layer {panel} Loss")
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
    plt.tight_layout()
    plt.show()

In [4]:
# Number of target classes. Layer.__init__ reads this module-level global and
# stores it as self.num_classes, so it must be defined before any Layer is built.
# model_loss = nn.CrossEntropyLoss()
num_classes = 10

In [5]:
def classifier_head_train(inp_embedding, classifier_weights, labels):
    """Loss that pulls each embedding toward the weight column of its own class.

    Each row of ``inp_embedding`` is L2-normalised and projected onto the
    columns of ``classifier_weights``.  Only the projection onto the column of
    the sample's label is kept, and the loss is ``mean(log(2 - score))``.
    When the columns are unit-norm (``Layer`` builds them that way) the score
    is a cosine similarity in ``[-1, 1]``, so the loss lies in ``[0, log 3]``.

    Args:
        inp_embedding: 2-D float tensor ``(num_data, dim)``.
        classifier_weights: 2-D float tensor ``(dim, n_classes)``, one column
            per class.
        labels: 1-D integer (int64) tensor of class indices, one per row of
            ``inp_embedding``.

    Returns:
        Scalar tensor holding the mean loss (differentiable w.r.t. the
        embedding).
    """
    inp_embedding = normalize(inp_embedding, p=2, dim=-1)
    classifier_output = torch.mm(inp_embedding, classifier_weights)
    # Size the one-hot mask from the weight matrix itself rather than from a
    # module-level global: the mask must match the score matrix column-for-column,
    # and the global can be reassigned elsewhere in the notebook.
    n_classes = classifier_weights.shape[1]
    target_mask = one_hot(labels, num_classes=n_classes).type(torch.float32)
    # Zero every score except the one belonging to the true class, then reduce.
    target_score = torch.sum(classifier_output * target_mask, 1)
    theta = 1  # scale applied to the target score inside the log
    loss = torch.mean(torch.log(2 - (theta * target_score)))
    return loss

In [6]:
def classifier_head(inp_embedding, classifier_weights, labels):
    """Predict a class index for every embedding row.

    Rows of ``inp_embedding`` are L2-normalised, multiplied with
    ``classifier_weights`` (one column per class) and the column with the
    largest score wins.

    Args:
        inp_embedding: 2-D float tensor ``(num_data, dim)``.
        classifier_weights: 2-D float tensor ``(dim, n_classes)``.
        labels: accepted for signature parity with the training head; not used.

    Returns:
        Python list of predicted class indices, one per row.
    """
    unit_rows = normalize(inp_embedding, p=2, dim=-1)
    class_scores = unit_rows.mm(classifier_weights)
    winners = class_scores.argmax(dim=1)
    return winners.tolist()

In [7]:
# Module-level slot that Layer.__init__ overwrites (through `global initial`)
# with a NumPy copy of the class directions it generated. It stays None until
# the first Layer is constructed and always reflects the most recently built Layer.
initial = None
# num_classes = 10

# Data layout expected by Layer: a 2-D tensor (num_data, num_features).
# The rows are the samples; there is no additional batch dimension.
class Layer(nn.Linear):
    """Linear layer + LeakyReLU that is trained on its own, layer-local loss.

    Every layer owns a fixed set of class directions (one vector per class,
    obtained from ``generate_points``).  ``train`` performs one SGD step that
    pulls the normalised activation of each sample toward the direction of its
    label (``classifier_head_train``); ``test`` predicts the class whose
    direction scores highest (``classifier_head``).

    Inputs are 2-D tensors ``(num_data, in_features)`` because the forward pass
    uses ``torch.mm``.

    NOTE(review): ``train(x, labels)`` has a different signature from
    ``nn.Module.train(mode)`` and therefore shadows it, so ``.eval()`` and
    ``.train(False)`` cannot be called on this class.  The name is kept because
    callers rely on it.
    """

    def __init__(self, in_features, out_features, bias, device, lr, apply_dropout=False):
        """Create the layer, initialise its weights and build the class directions.

        Args:
            in_features: input width.
            out_features: output width; also the dimension of the class directions.
            bias: whether the linear map has a bias term.
            device: device for the parameters and the direction tensors.
            lr: learning rate of the per-step SGD optimiser.
            apply_dropout: if True, ``train`` applies dropout (p=0.1) to its input.

        Side effects:
            Reads the module-level ``num_classes`` and overwrites the
            module-level ``initial`` with a NumPy copy of the generated directions.
        """
        super().__init__(in_features, out_features, bias, device)
        self.out_features = out_features
        self.bias_flag = bias
        self.lr = lr
        self.num_classes = num_classes
        self.dimension = out_features

        # Uniform initialisation in [-sqrt(6 / fan_in), +sqrt(6 / fan_in)].
        fc1_limit = np.sqrt(6.0 / in_features)
        torch.nn.init.uniform_(self.weight, a=-fc1_limit, b=fc1_limit)

        self.dropout = nn.Dropout(0.1)
        self.apply_dropout = apply_dropout
        # Built once and shared by train() and test() so both use the same
        # non-linearity (it has no parameters, so self.parameters() is unchanged).
        self.activation = nn.LeakyReLU(negative_slope=0.001)

        global initial
        # generate_points is a project helper; presumably it returns num_classes
        # points of the given dimension spread over a sphere -- TODO confirm.
        self.directions = generate_points(self.num_classes, self.dimension, steps = 10000)
        initial = np.array(self.directions)
        self.directions = [torch.tensor(t, dtype = torch.float32).to(device) for t in self.directions]
        # (dimension, n_directions) matrix whose columns are the unit-norm class
        # directions; it is a constant, never a trainable parameter.
        self.direction_weights = torch.stack(
            [normalize(direction, p = 2, dim=-1) for direction in self.directions], dim=1
        )

    def _forward_activated(self, x):
        """Return ``activation(x @ W.T (+ b))`` for a 2-D input ``x``."""
        if self.bias_flag:
            pre_activation = torch.mm(x, self.weight.T) + self.bias.unsqueeze(0)
        else:
            pre_activation = torch.mm(x, self.weight.T)
        return self.activation(pre_activation)

    def train(self, x, labels):
        """Run one SGD step on this layer only.

        Args:
            x: 2-D input tensor ``(num_data, in_features)``.
            labels: integer class labels, one per row of ``x``.

        Returns:
            ``(loss, y)``: the loss as a Python float and the layer activation
            computed *before* the weight update.  ``y`` is not detached; the
            caller is expected to detach it before feeding the next layer.
        """
        if self.apply_dropout:
            x = self.dropout(x)
        # Plain SGD (no momentum) keeps no state between steps, so creating it
        # per call is behaviour-neutral and picks up any later change of self.lr.
        opt = SGD(self.parameters(), lr=self.lr)
        y = self._forward_activated(x)
        loss = classifier_head_train(y, self.direction_weights, labels)

        opt.zero_grad(set_to_none=True)
        loss.backward(retain_graph = False)
        opt.step()

        return loss.detach().item(), y

    def test(self, x, labels):
        """Predict classes for ``x`` without tracking gradients.

        Args:
            x: 2-D input tensor ``(num_data, in_features)``.
            labels: forwarded to ``classifier_head`` unchanged.

        Returns:
            ``(predictions, y)``: an int64 tensor of predicted class indices on
            the same device as the activations, and the layer activation ``y``.
        """
        with torch.no_grad():
            y = self._forward_activated(x)
            max_idx_list = classifier_head(y, self.direction_weights, labels)
        # Place the predictions next to the activations instead of relying on a
        # module-level `device`; the explicit dtype keeps an empty batch integral.
        return torch.tensor(max_idx_list, dtype=torch.long, device=y.device), y

In [8]:
class Net(nn.Module):
    """Stack of ``Layer`` objects that are trained greedily, one local loss each.

    For every mini-batch each layer takes one optimisation step on its own loss
    and hands its *detached* activation to the next layer, so no gradient flows
    between layers.

    NOTE(review): ``train(train_loader, test_loader)`` shadows
    ``nn.Module.train(mode)``; ``.eval()`` / ``.train(False)`` are therefore not
    usable on this class.  The name is kept because callers rely on it.
    """

    def __init__(self, dims_list, bias, epochs, lr, device):
        """Build ``len(dims_list) - 1`` layers.

        Args:
            dims_list: layer widths; layer ``d`` maps ``dims_list[d]`` to
                ``dims_list[d + 1]``.
            bias: whether the layers use a bias term.
            epochs: number of passes over the training loader in ``train``.
            lr: learning rate handed to every layer.
            device: device for the layers and for the batches.
        """
        super(Net, self).__init__()
        self.dims_list = dims_list
        self.bias = bias
        self.epochs = epochs
        self.lr = lr
        self.device = device
        # ModuleList (not a plain list) so the layers are registered sub-modules
        # and show up in parameters(), state_dict() and .to().
        self.layers = nn.ModuleList()
        n_layers = len(self.dims_list) - 1
        for d in range(n_layers):
            print(f"Initialization {d + 1} / {n_layers}")
            self.layers.append(
                Layer(self.dims_list[d], self.dims_list[d + 1], self.bias, self.device, self.lr)
            )
            print("Complete\n")

    def train(self, train_loader, test_loader):
        """Train all layers for ``self.epochs`` epochs, then evaluate once.

        Args:
            train_loader: iterable of ``(x, label)`` batches with a ``len()``.
            test_loader: iterable of ``(x, label)`` batches used for the final
                test accuracy.

        Returns:
            ``[layer_loss_list, acc_train, acc_test, layer_w]`` where
            ``layer_loss_list[i]`` is the per-epoch mean loss of layer ``i``,
            ``acc_train`` / ``acc_test`` each hold the result of one ``test``
            call after training, and ``layer_w[i]`` holds the final 2-norm of
            layer ``i``'s weight.
        """
        n_layers = len(self.layers)
        layer_loss_list = [[] for _ in range(n_layers)]
        acc_train = []
        acc_test = []
        layer_w = [[] for _ in range(len(self.dims_list) - 1)]

        # The context manager closes the bar even if a training step raises.
        with tqdm(total = self.epochs * len(train_loader) * n_layers,
                  desc = "Training", position = 0, leave = True) as pbar:
            for epoch in range(self.epochs):
                loss_agg = [0] * n_layers
                for dat in train_loader:
                    x, label = dat
                    x = x.to(self.device)
                    label = label.to(self.device)
                    for i, layer in enumerate(self.layers):
                        loss, y = layer.train(x, label)
                        layer.zero_grad(set_to_none=True)
                        # Cut the graph: the next layer must not backpropagate
                        # into this one.
                        x = y.detach()
                        loss_agg[i] += loss / len(train_loader)
                        pbar.update(1)
                pbar.set_postfix(epoch = epoch + 1, loss = loss_agg)
                for i in range(n_layers):
                    layer_loss_list[i].append(loss_agg[i])

        # Final evaluation. Uses self (not a notebook-level `net` variable), so
        # the method works for any instance regardless of what it is bound to.
        with torch.no_grad():
            for i, layer in enumerate(self.layers):
                layer_w[i].append(torch.norm(layer.weight, p=2).item())
            acc_train.append(self.test(train_loader))
            acc_test.append(self.test(test_loader))

        return [layer_loss_list, acc_train, acc_test, layer_w]

    def test(self, data_loader):
        """Measure the classification accuracy of every layer on ``data_loader``.

        Args:
            data_loader: iterable of batches whose first two items are the
                inputs and the integer labels.

        Returns:
            A one-element list containing a NumPy array with one accuracy per
            layer (layer ``i`` classifies from its own activation).
        """
        n_layers = len(self.layers)
        correct = [0] * n_layers
        total = [0] * n_layers
        for dat in data_loader:
            x = dat[0].to(self.device)
            label = dat[1].to(self.device)
            for i, layer in enumerate(self.layers):
                # Each layer predicts from its input and passes its activation on.
                pred, x = layer.test(x, label)
                correct[i] += (pred == label).sum().item()
                total[i] += label.shape[0]

        return [np.array(correct) / np.array(total)]

In [9]:
# Reshape an image tensor (C, H, W) to (C, H*W) and drop size-1 dimensions;
# for single-channel MNIST images this yields a flat vector per sample.
flatten_transform = transforms.Lambda(lambda x: x.view(x.size(0), -1).squeeze())

# ToTensor scales pixel values to [0, 1]. A variant with an additional
# transforms.Normalize((0.,), (0.5,)) step is intentionally not used.
transform = transforms.Compose([transforms.ToTensor(), flatten_transform])

# Standard MNIST train / test splits (downloaded on first use).
trainset, testset = (
    torchvision.datasets.MNIST(root='./../../../Data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Both loaders iterate in a fixed order (no shuffling).
batch_size = 50
trainloader, testloader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (trainset, testset)
)

# full_data_set = torch.utils.data.ConcatDataset([trainset, testset])

In [10]:
# Learning-rate ablation: every learning rate is trained `num_runs` times from
# scratch and the final train / test accuracies of each run are collected.

# ratio = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
ratio = [0.2]
test_acc, train_acc, w_layers, loss = [], [], [], []

num_runs = 2
learning_rate = [0.01, 0.1, 1, 1.5, 2.5, 3]

# A random train/test re-split of the concatenated dataset (driven by `ratio`)
# is disabled; the standard MNIST loaders defined above are used instead.

# Settings shared by every run.
dims_list = [784, 1024, 10]
# dims_list = [784, 1000, 34]
bias = True
epochs = 200
num_classes = 10

for lr in learning_rate:
    for _ in range(num_runs):
        net = Net(dims_list, bias, epochs, lr, device)

        # net.train returns [layer_loss_list, acc_train, acc_test, layer_w].
        layer_loss_list = net.train(trainloader, testloader)

        # plot_losses(layer_loss_list[0])
        # loss.append(layer_loss_list[0])
        train_acc.append(layer_loss_list[1])
        test_acc.append(layer_loss_list[2])
        # w_layers.append(layer_loss_list[3])

Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 480000/480000 [11:16<00:00, 709.65it/s, epoch=200, loss=[0.3999005068590242, 0.27978336771329226]]


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 480000/480000 [12:03<00:00, 663.58it/s, epoch=200, loss=[0.3993406244864071, 0.22044970137377567]]


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 480000/480000 [12:45<00:00, 627.16it/s, epoch=200, loss=[0.35869873131314933, 0.2430819298078617]]


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:02<00:00, 613.34it/s, epoch=200, loss=[0.3590691192199787, 0.266


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:16<00:00, 602.59it/s, epoch=200, loss=[0.3474518474936487, 0.298


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:39<00:00, 585.92it/s, epoch=200, loss=[0.345446881478031, 0.2451


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:35<00:00, 588.77it/s, epoch=200, loss=[0.34670595996081793, 0.25


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:26<00:00, 595.42it/s, epoch=200, loss=[0.34734285498658884, 0.28


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [12:57<00:00, 617.01it/s, epoch=200, loss=[0.34422465180357315, 0.24


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:24<00:00, 596.68it/s, epoch=200, loss=[0.3438337584336595, 0.293


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:00<00:0


Initialization 1 / 2
Complete

Initialization 2 / 2
Complete



Training: 100%|█| 480000/480000 [13:14<00:0


In [11]:
# Show the test accuracies collected by the sweep (one entry per run, each
# holding one accuracy per layer), followed by the best value over all runs
# and layers.
print(test_acc)
print(np.max(test_acc))

[[[array([0.8889, 0.8628])]], [[array([0.8896, 0.9033])]], [[array([0.9124, 0.9437])]], [[array([0.9121, 0.9499])]], [[array([0.9104, 0.9335])]], [[array([0.9114, 0.9266])]], [[array([0.9077, 0.9415])]], [[array([0.9103, 0.9214])]], [[array([0.909 , 0.9047])]], [[array([0.9099, 0.952 ])]], [[array([0.9108, 0.9379])]], [[array([0.9081, 0.9287])]]]
0.952


In [12]:
# Show the train accuracies collected by the sweep (one entry per run, each
# holding one accuracy per layer), followed by the best value over all runs
# and layers.
print(train_acc)
print(np.max(train_acc))

[[[array([0.88256667, 0.86833333])]], [[array([0.88383333, 0.9113    ])]], [[array([0.90856667, 0.94943333])]], [[array([0.90786667, 0.96106667])]], [[array([0.91211667, 0.9386    ])]], [[array([0.91325, 0.93835])]], [[array([0.91051667, 0.94865   ])]], [[array([0.91183333, 0.93308333])]], [[array([0.91178333, 0.90926667])]], [[array([0.91241667, 0.96355   ])]], [[array([0.912  , 0.94515])]], [[array([0.91111667, 0.93833333])]]]
0.96355


In [13]:
# Persist the collected accuracies as .npy arrays.
# NOTE(review): np.save does not create missing directories, so ./../new_data
# presumably has to exist before this cell runs -- confirm.
np.save("./../new_data/ablation_lr_mnist_train_acc_1.npy", np.array(train_acc))
np.save("./../new_data/ablation_lr_mnist_test_acc_1.npy", np.array(test_acc))