In [1]:
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch.nn.functional as F
import numpy as np
import math

''' Parameters (CHange Anything Here!) '''
# Converts each PIL image into a float tensor.
transform = transforms.ToTensor()
# Number of images drawn per mini-batch.
batch_size = 3
# Lifetime sparsity setting.
# NOTE(review): Autoencoder_Test.forward does not read this global, and the
# value 5 looks like a percentage while the sparsity code multiplies the
# batch size by a fraction -- confirm the intended unit before wiring it in.
k_percent = 5


''' Code Starts Here '''
# MNIST training split, downloaded to ./data on first use.
mnist_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
data_loader = torch.utils.data.DataLoader(dataset=mnist_data, batch_size=batch_size, shuffle=True)
dataiter = iter(data_loader)
# Use the built-in next(): DataLoader iterators implement the standard
# iterator protocol, while the legacy `.next()` alias is not available on
# current PyTorch releases.
images, labels = next(dataiter)


# testing model
''' Conv2d layer
    Accessible attributes: .weight (Tensor), .bias (Tensor)
    Signature:
        torch.nn.Conv2d(in_channels, out_channels,
                        kernel_size, stride=1, padding=0,
                        dilation=1, groups=1, bias=True,
                        padding_mode='zeros')
'''
# CONV-WTA criteria
# - The convolution is zero-padded so that each feature map has the same size as the input.
# - The hidden representation is mapped linearly to the output using a deconvolution operation.
# - Parameters are optimized to reduce the mean squared error (MSE).
# - The conv layer uses 5x5 filters; the deconvolution layer uses 11x11 filters.
### In this implementation the deconvolution is realized as a transposed convolution (nn.ConvTranspose2d) to simplify the process.
class Autoencoder_Test(nn.Module):
    """Single-layer convolutional winner-take-all autoencoder.

    The encoder is a 5x5 convolution, the decoder an 11x11 transposed
    convolution. Between them two sparsity constraints are applied:

    * spatial sparsity  - inside every feature map only the maximum
      activation is kept;
    * lifetime sparsity - for every feature only the batch samples with the
      largest winners are kept.

    Both constraints are implemented as 0/1 masks. The masks are computed
    without gradient tracking, but they are multiplied into the activations
    outside of ``torch.no_grad()`` so the surviving activations still
    back-propagate into the encoder.

    Args:
        k_percent: fraction (0..1) of the batch retained per feature by the
            lifetime sparsity step in ``forward``. Defaults to 0.1.
    """

    def __init__(self, k_percent=0.1):
        super().__init__()
        self.k_percent = k_percent

        # 5x5 kernel, stride 1, padding 2: height and width are preserved.
        self.conv1 = nn.Conv2d(1, 2, 5, stride=1, padding=2)
        # 11x11 kernel, stride 1, padding 5: height and width are preserved.
        # NOTE(review): out_channels=3 while conv1 consumes 1 channel, so the
        # reconstruction does not have the input's channel count -- confirm
        # whether 1 was intended before training against the input with MSE.
        self.transConv1 = nn.ConvTranspose2d(in_channels=2, out_channels=3,
                                             kernel_size=11, stride=1, padding=5)

    def forward(self, x):
        """Encode ``x``, apply both sparsity constraints, and decode.

        Args:
            x: tensor of shape (N, 1, H, W).

        Returns:
            Tensor of shape (N, 3, H, W).
        """
        encoded = self.conv1(x)  # (N, 2, H, W)
        hidden, winners = self.spatial_sparsity_(encoded)
        hidden = self.lifetime_sparsity_(hidden, winners, k_percent=self.k_percent)
        decoded = self.transConv1(hidden)
        return decoded

    def spatial_sparsity_(self, hiddenMaps):
        """Keep only the maximum activation of every feature map.

        Args:
            hiddenMaps: tensor of shape (N, C, H, W).

        Returns:
            A pair ``(sparse, maxval)``: ``sparse`` has the shape of
            ``hiddenMaps`` with every non-maximal entry set to zero (ties at
            the maximum are all kept); ``maxval`` has shape (N, C) and holds
            the per-map maximum, detached from the autograd graph.
        """
        n_batches, n_features = hiddenMaps.shape[:2]
        # Only the mask is built without gradient tracking.
        with torch.no_grad():
            maxval = hiddenMaps.reshape(n_batches, n_features, -1).max(dim=2).values
            threshold = maxval.reshape(n_batches, n_features, 1, 1)
            # Deriving the mask from the comparison keeps it on the same
            # device and dtype as the activations.
            drop = (hiddenMaps >= threshold).to(hiddenMaps.dtype)
        # Multiply outside no_grad so the winners keep their gradient path.
        return hiddenMaps * drop, maxval

    def lifetime_sparsity_(self, hiddenMaps, maxval, k_percent):
        """Keep, per feature, only the batch samples with the largest winners.

        Args:
            hiddenMaps: tensor of shape (N, C, H, W).
            maxval: tensor of shape (N, C) with one winner value per map.
            k_percent: fraction (0..1) of the batch to keep per feature.

        Returns:
            Tensor shaped like ``hiddenMaps`` in which every feature map whose
            winner is below the k-th largest winner of that feature (taken
            across the batch) is zeroed.
        """
        n_batches, n_features = hiddenMaps.shape[:2]
        # Clamp k to [1, n_batches]: a floor of 0 would make the threshold
        # slice top_k[k-1:k] empty, and k > n_batches is rejected by topk.
        k = min(max(math.floor(n_batches * k_percent), 1), n_batches)
        with torch.no_grad():
            top_k, _ = torch.topk(maxval, k, dim=0)  # (k, C)
            threshold = top_k[k - 1:k, :]            # (1, C): k-th largest per feature
            drop = (maxval >= threshold).to(hiddenMaps.dtype)  # (N, C)
        # Broadcast the per-(sample, feature) mask over the spatial axes.
        return hiddenMaps * drop.reshape(n_batches, n_features, 1, 1)
    
# Build the network together with its loss function and optimizer.
model = Autoencoder_Test()
generator = model.parameters()  # parameters() hands back a generator
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=1e-3)

In [24]:
# def _lifetime_sparsity(self, h, winner, rate):
#     shape = tf.shape(winner)
#     n = shape[0]
#     c = shape[1]
#     k = tf.cast(rate * tf.cast(n, tf.float32), tf.int32)

#     winner = tf.transpose(winner) # c, n
#     th_k, _ = tf.nn.top_k(winner, k) # c, k

#     shape_t = tf.stack([c, n])
#     drop = tf.where(winner < th_k[:,k-1:k], # c, n
#       tf.zeros(shape_t, tf.float32), tf.ones(shape_t, tf.float32))
#     drop = tf.transpose(drop) # n, c
#     return h * tf.reshape(drop, tf.stack([n, 1, 1, c]))
import math

# Lifetime-sparsity walkthrough on a synthetic (4, 3, 4, 4) activation tensor.
k_percent = 0.5
hiddenMaps = torch.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4)
shape = hiddenMaps.shape  # (batch, feature, height, width)
n_batches, n_features, size = shape[:3]
k = math.floor(k_percent * n_batches)

# Winner of every feature map (same reduction as the spatial-sparsity step).
flatten = hiddenMaps.reshape(n_batches, n_features, -1)
maxval, _ = flatten.max(dim=2)  # (n, c)
print(maxval)

# Step 1: the k largest winners of each feature, taken along the batch axis.
top_k, _ = maxval.topk(k, dim=0)  # (k, c)
print(top_k)

# Work in (c, n) layout, as the TensorFlow reference above does.
maxval_t = maxval.transpose(0, 1)  # (c, n)
top_k = top_k.transpose(0, 1)      # (c, k)
print(maxval_t)
print(top_k[:, k - 1:k])

# Step 2: start from an all-ones mask and zero every (feature, sample) entry
# whose winner is below that feature's k-th largest winner.
drop = torch.ones((n_features, n_batches)).masked_fill(maxval_t < top_k[:, k - 1:k], 0.0)

print(drop)
print(drop.t())
drop = drop.t()  # back to (n, c)
print(drop.reshape(n_batches, n_features, 1, 1))

# Zero the losing feature maps: (4, 3, 4, 4) * (4, 3, 1, 1).
hiddenMaps * drop.reshape(n_batches, n_features, 1, 1)

tensor([[ 15,  31,  47],
        [ 63,  79,  95],
        [111, 127, 143],
        [159, 175, 191]])
tensor([[159, 175, 191],
        [111, 127, 143],
        [ 63,  79,  95]])
tensor([[ 15,  63, 111, 159],
        [ 31,  79, 127, 175],
        [ 47,  95, 143, 191]])
tensor([[63],
        [79],
        [95]])
tensor([[0., 1., 1., 1.],
        [0., 1., 1., 1.],
        [0., 1., 1., 1.]])
tensor([[0., 0., 0.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[[[0.]],

         [[0.]],

         [[0.]]],


        [[[1.]],

         [[1.]],

         [[1.]]],


        [[[1.]],

         [[1.]],

         [[1.]]],


        [[[1.]],

         [[1.]],

         [[1.]]]])


tensor([[[[  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.]],

         [[  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.]],

         [[  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.],
          [  0.,   0.,   0.,   0.]]],


        [[[ 48.,  49.,  50.,  51.],
          [ 52.,  53.,  54.,  55.],
          [ 56.,  57.,  58.,  59.],
          [ 60.,  61.,  62.,  63.]],

         [[ 64.,  65.,  66.,  67.],
          [ 68.,  69.,  70.,  71.],
          [ 72.,  73.,  74.,  75.],
          [ 76.,  77.,  78.,  79.]],

         [[ 80.,  81.,  82.,  83.],
          [ 84.,  85.,  86.,  87.],
          [ 88.,  89.,  90.,  91.],
          [ 92.,  93.,  94.,  95.]]],


        [[[ 96.,  97.,  98.,  99.],
          [100., 101., 102., 103.],
          [104., 105., 106., 107.],
          [1

In [40]:
import math
k_percent = 0.5
# hiddenMaps = torch.arange(4*3*2*2).view(4,3,2,2)

# Four batches , 3 featureMaps generated per batch

# Batch 1
hiddenMaps = torch.tensor([[[[0,   1],
                             [99,  3]],
                            
                            [[4,   5],
                             [96,  7]],

                            [[8,  97],
                             [10, 5]]],

# Batch 2
                           [[[12, 13],
                             [98, 15]],

                            [[95, 17],
                             [18, 19]],

                            [[20, 21],
                             [22, 90]]],

# Batch 3
                           [[[24, 89],
                             [26, 27]],

                            [[28, 29],
                             [93, 31]],

                            [[32, 94],
                             [34, 35]]],

# Batch 4
                           [[[36, 37],
                             [91, 39]],

                            [[88, 41],
                             [42, 43]],

                            [[92, 45],
                             [46, 47]]]])
# print(hiddenMaps)
spatial = hiddenMaps.clone()
shape = hiddenMaps.shape  #torch.Size([batch_size, feature_num, 26, 26])
n_batches = shape[0]
n_features = shape[1]
size = shape[2]
k = 2
# k = torch.tensor([2])

# from spatial sparsity
flatten = hiddenMaps.view(n_batches, n_features, -1)
maxval, _ = torch.max(flatten, 2)
maxval_p = torch.reshape(maxval, (n_batches, n_features, 1, 1))
drop = torch.where(hiddenMaps < maxval_p, 
                   torch.zeros((n_batches, n_features, size, size)), 
                   torch.ones((n_batches,n_features, size, size)))
spatial.data = hiddenMaps.data*drop.data
print(spatial)
# Step 1: pick the max of dim-0, along the batch axis
# if input size is (n, c), returns (k, c) if operate over dim-0
# maxval_t = torch.transpose(maxval, 0, 1) # c, n
# print(maxval_t)
# top_k, _ = torch.topk(maxval_t, k.item(), 1) #c, k
# print(top_k)
# print(top_k[:,k-1:k])
top_k, _ = torch.topk(maxval, k, 0) #k, c, maxval: n, c
# print(top_k)
# print(top_k)
# print(top_k)
# print(top_k[k-1:k,: ])
# temp = torch.tensor([[64, 79, 95]])
# Step 2: creating "drop" Array to be multiplied into featureMaps, dropping loser values
drop = torch.where(maxval < top_k[k-1:k,: ],  
                   torch.zeros((n_batches, n_features)), 
                   torch.ones((n_batches, n_features)))
# print(drop)
# dropping all them loser batches to zero
# hidden map size = (4,3,4,4), drop size = (4, 3, 1, 1)
spatial * drop.reshape(n_batches, n_features, 1, 1)

tensor([[[[ 0.,  0.],
          [99.,  0.]],

         [[ 0.,  0.],
          [96.,  0.]],

         [[ 0., 97.],
          [ 0.,  0.]]],


        [[[ 0.,  0.],
          [98.,  0.]],

         [[95.,  0.],
          [ 0.,  0.]],

         [[ 0.,  0.],
          [ 0., 90.]]],


        [[[ 0., 89.],
          [ 0.,  0.]],

         [[ 0.,  0.],
          [93.,  0.]],

         [[ 0., 94.],
          [ 0.,  0.]]],


        [[[ 0.,  0.],
          [91.,  0.]],

         [[88.,  0.],
          [ 0.,  0.]],

         [[92.,  0.],
          [ 0.,  0.]]]])


tensor([[[[ 0.,  0.],
          [99.,  0.]],

         [[ 0.,  0.],
          [96.,  0.]],

         [[ 0., 97.],
          [ 0.,  0.]]],


        [[[ 0.,  0.],
          [98.,  0.]],

         [[95.,  0.],
          [ 0.,  0.]],

         [[ 0.,  0.],
          [ 0.,  0.]]],


        [[[ 0.,  0.],
          [ 0.,  0.]],

         [[ 0.,  0.],
          [ 0.,  0.]],

         [[ 0., 94.],
          [ 0.,  0.]]],


        [[[ 0.,  0.],
          [ 0.,  0.]],

         [[ 0.,  0.],
          [ 0.,  0.]],

         [[ 0.,  0.],
          [ 0.,  0.]]]])

In [10]:
# Spatial-sparsity check on a synthetic (3, 2, 4, 4) activation tensor.
hiddenMaps = torch.arange(3 * 2 * 4 * 4).reshape(3, 2, 4, 4)
shape = hiddenMaps.shape  # (batch, feature, height, width)
n_batches, n_features, size = shape[:3]

# Step 1: flatten the spatial axes; this reduction runs over the batch axis
# (dim 0), so max_val and batch_idx have shape (n_features, size * size).
flatten = hiddenMaps.reshape(n_batches, n_features, -1)
max_val, batch_idx = flatten.max(dim=0)

# Step 2: per-map maximum, then an all-ones mask with every entry below that
# maximum set to zero.
maxval, _ = flatten.max(dim=2)  # (n, c)
maxval_p = maxval.reshape(n_batches, n_features, 1, 1)
drop = torch.ones((n_batches, n_features, size, size)).masked_fill(hiddenMaps < maxval_p, 0.0)
print(hiddenMaps)
print(hiddenMaps * drop, "\n", maxval)

tensor([[[[ 0,  1,  2,  3],
          [ 4,  5,  6,  7],
          [ 8,  9, 10, 11],
          [12, 13, 14, 15]],

         [[16, 17, 18, 19],
          [20, 21, 22, 23],
          [24, 25, 26, 27],
          [28, 29, 30, 31]]],


        [[[32, 33, 34, 35],
          [36, 37, 38, 39],
          [40, 41, 42, 43],
          [44, 45, 46, 47]],

         [[48, 49, 50, 51],
          [52, 53, 54, 55],
          [56, 57, 58, 59],
          [60, 61, 62, 63]]],


        [[[64, 65, 66, 67],
          [68, 69, 70, 71],
          [72, 73, 74, 75],
          [76, 77, 78, 79]],

         [[80, 81, 82, 83],
          [84, 85, 86, 87],
          [88, 89, 90, 91],
          [92, 93, 94, 95]]]])
tensor([[[[ 0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.],
          [ 0.,  0.,  0., 15.]],

         [[ 0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.],
          [ 0.,  0.,  0., 31.]]],


        [[[ 0.,  0.,  0.,  0.],
          [ 0.,  