# LiSET

Versions of libraries used:
- `torch==1.9.0`
- `torchvision==0.9.1+cu102`
- `numpy==1.18.4`
- `tqdm==4.60.0`

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import numpy as np

# Run on the first CUDA device when one is visible to PyTorch, else on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Datasets

- MNIST - provided by torchvision https://pytorch.org/vision/0.8/datasets.html#mnist
- ISOLET - https://archive.ics.uci.edu/ml/datasets/ISOLET

In [None]:
# Select the dataset and build `train_loader` / `test_loader` for it.
DATABASE = 'MNIST'
batch_size = 100

if DATABASE == 'MNIST':
    # Images are only converted to tensors; no further normalisation is applied.
    transform = transforms.Compose([transforms.ToTensor()])

    train_set = MNIST("./", train=True, download=True, transform=transform)
    test_set = MNIST("./", train=False, download=True, transform=transform)

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

elif DATABASE == "ISOLET":
    # pandas is needed only for this branch, so it is imported lazily here
    # (it was previously used without being imported at all).
    import pandas as pd

    class _IsoletSplit(Dataset):
        """One ISOLET CSV file held in memory as (features, label) tensors.

        Every column except the last is treated as a feature; the last column
        is the label, shifted down by one so that labels start at 0.
        """

        def __init__(self, csv_path):
            # NOTE(review): read_csv infers a header row by default - confirm
            # the files at these paths really have one, otherwise the first
            # sample of each file is silently dropped.
            # np.float64 replaces the deprecated `np.float` alias (same dtype).
            data = pd.read_csv(csv_path, index_col=False).to_numpy(dtype=np.float64)
            self.x = torch.from_numpy(data[:, :-1]).float()
            self.y = torch.from_numpy(data[:, -1] - 1).type(torch.LongTensor)

        def __getitem__(self, idx):
            return self.x[idx], self.y[idx]

        def __len__(self):
            return len(self.x)

    class TrainLoader(_IsoletSplit):
        """Training split (file isolet1+2+3+4.data)."""

        def __init__(self):
            super().__init__("uci-isolet/original/isolet1+2+3+4.data")

    class TestLoader(_IsoletSplit):
        """Test split (file isolet5.data)."""

        def __init__(self):
            super().__init__("uci-isolet/original/isolet5.data")

    # Use the shared `batch_size` setting instead of a second hard-coded 100.
    train_loader = DataLoader(TrainLoader(), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(TestLoader(), batch_size=batch_size, shuffle=False)

else:
    # Fail loudly instead of leaving train_loader / test_loader undefined.
    raise ValueError(f"Unknown DATABASE {DATABASE!r}; expected 'MNIST' or 'ISOLET'")

# Function used to initialize sparse matrices

In [None]:
def create_sparse_weights(epsilon, rows, cols):
    """Draw a random 0/1 mask of shape ``(cols, rows)``.

    Each entry is set to 1 independently with probability
    ``epsilon * (rows + cols) / (rows * cols)``. The number of ones is
    printed, and the mask is returned as a float tensor.
    """
    density = epsilon * (rows + cols) / (rows * cols)
    uniform_draws = torch.rand(cols, rows)
    # An entry is kept when its uniform draw falls strictly below the density.
    mask_weights = torch.lt(uniform_draws, density).float()
    n_params = torch.sum(mask_weights)
    print(f"Created {cols} x {rows} sparse matrix with {n_params} parameters, {rows} rows, {cols} cols")
    return mask_weights

# Implementation of LiSET for MLP
`PyTorch provides ambiguous support for unstructured sparsity, but it is not used in this implementation. Sparsity is achieved by applying binary masks to the weight matrices.`

In [None]:
class LiSET_MLP(nn.Module):
    """MLP whose hidden layers are kept sparse by binary masks that evolve.

    ``dims`` lists the layer widths (input, hidden..., output). Every linear
    layer except the last one has a 0/1 mask that is multiplied into its
    weight matrix before each forward pass. ``evolution()`` prunes the
    smallest-magnitude weights of every masked layer and regrows the same
    number of connections at currently empty positions. Each hidden stage
    also has a dense "skip" layer that maps the stage's input straight to the
    output; the L1 norms of its weights bias where new connections are grown.

    Args:
        *dims: layer widths; at least three values are required.
        EPSILON: density factor passed to ``create_sparse_weights``.
        ZETA: fraction of positive / negative weights pruned per evolution.
        GAMMA: scale applied to the normalised skip-weight norms before the
            sigmoid in ``add_weights`` (0 gives every position equal weight).
        device: device on which the layers and masks are placed.

    Note: ``skip_layers`` holds ``len(dims) - 1`` modules, but only the first
    ``len(dims) - 2`` are ever used by ``forward`` and ``evolution``. The
    masks are stored in a plain list, so they are not part of ``state_dict``.
    """

    def __init__(self, *dims, EPSILON=20, ZETA=0.3, GAMMA=0, device="cpu"):
        assert len(dims) > 2, "need at least input, one hidden and output size"
        super().__init__()

        self.dims = dims
        self.ZETA = ZETA
        self.GAMMA = GAMMA
        self.device = device

        n_layers = len(dims) - 1
        self.layers = nn.ModuleList([nn.Linear(dims[i], dims[i + 1]) for i in range(n_layers)]).to(device)
        self.skip_layers = nn.ModuleList([nn.Linear(dims[i], dims[-1]) for i in range(n_layers)]).to(device)
        # One mask per layer except the final (dense) output layer.
        self.masks = [create_sparse_weights(EPSILON, dims[i], dims[i + 1]).to(device) for i in range(n_layers - 1)]
        self.apply_masks()

    def forward(self, x, dropout=0.3, with_skip=True, skip_dropout=0.5):
        """Return log-probabilities of shape ``(batch, dims[-1])``.

        Args:
            x: input batch; flattened from dimension 1 onwards.
            dropout: dropout probability after each hidden activation;
                ``0`` disables dropout.
            with_skip: add the skip-layer outputs to the final logits.
            skip_dropout: probability of omitting each individual skip
                connection in this call (drawn with ``np.random.rand``).
        """
        # Re-impose sparsity: optimizer steps may have made masked weights non-zero.
        self.apply_masks()
        x = torch.flatten(x, 1)

        if with_skip:
            result = torch.zeros(x.shape[0], self.dims[-1]).to(x)
        for i, l in enumerate(self.layers[:-1]):
            if with_skip and np.random.rand() > skip_dropout:
                result += self.skip_layers[i](x)

            x = F.leaky_relu(l(x))
            if dropout > 0:
                # Honour the requested probability and the module's train/eval
                # mode; F.dropout's defaults (p=0.5, training=True) would
                # otherwise ignore both.
                x = F.dropout(x, p=dropout, training=self.training)

        x = self.layers[-1](x)
        return F.log_softmax(x + result if with_skip else x, dim=1)

    def apply_mask(self, layer, mask):
        """Zero the entries of ``layer.weight`` where ``mask`` is 0 (in place)."""
        assert layer.weight.data.shape == mask.shape
        with torch.no_grad():
            layer.weight.data *= mask

    def apply_masks(self):
        """Apply every stored mask to its corresponding layer."""
        for l, m in zip(self.layers[:-1], self.masks):
            self.apply_mask(l, m)

    def remove_weights(self, weights):
        """Select the weights that survive pruning.

        Among the negative weights, those at or above the value found at
        sorted index ``int((1 - ZETA) * n_neg)`` are dropped; among the
        positive weights, those at or below the value at sorted index
        ``int(ZETA * n_pos)`` are dropped. Zero weights are never kept.

        Returns:
            ``(weight_mask_core, n_removed)``: a float 0/1 mask of surviving
            weights and the number of non-zero weights that were pruned.
        """
        keep = torch.zeros_like(weights, dtype=torch.bool)

        negative_weights = weights[weights < 0].sort().values
        if negative_weights.numel() > 0:
            # Clamp so ZETA == 0 cannot index one past the end.
            idx = min(int((1 - self.ZETA) * len(negative_weights)), len(negative_weights) - 1)
            keep |= weights < negative_weights[idx]

        positive_weights = weights[weights > 0].sort().values
        if positive_weights.numel() > 0:
            # Clamp so ZETA == 1 cannot index one past the end.
            idx = min(int(self.ZETA * len(positive_weights)), len(positive_weights) - 1)
            keep |= weights > positive_weights[idx]

        weight_mask_core = keep.float()
        n_removed = int(((weights != 0) & ~keep).sum().item())

        return weight_mask_core, n_removed

    def add_weights(self, mask, n_to_add, skip):
        """Choose empty positions of ``mask`` at which to grow new weights.

        Every empty position gets a random score scaled by
        ``sigmoid(GAMMA * w / max(w))``, where ``w`` is the L1 norm of the
        skip-layer weights over dimension 0 (one value per column of
        ``mask``). The ``n_to_add`` highest-scoring positions are selected.

        Returns:
            A float 0/1 mask marking the newly added positions.
        """
        if n_to_add <= 0:
            return torch.zeros_like(mask)

        w = torch.norm(skip, p=1, dim=0)
        # Guard the normalisation: an all-zero skip matrix would give 0 / 0 = NaN.
        w_max = torch.clamp(torch.max(w), min=torch.finfo(w.dtype).tiny)
        w_star = torch.sigmoid(w / w_max * self.GAMMA)

        probs = (mask == 0) * torch.rand(mask.shape).to(self.device) * w_star
        flat = probs.reshape(-1)
        if n_to_add >= flat.numel():
            # More requested than positions exist: take every empty position.
            return (probs > 0).float()

        # Strictly above the (n_to_add + 1)-th largest score -> n_to_add picks.
        threshold = torch.sort(flat, descending=True).values[n_to_add]
        return (probs > threshold).float()

    def evolution(self):
        """Prune and regrow connections in every masked layer.

        For each masked layer: pruned weights are zeroed, the same number of
        new weights is created with values drawn uniformly from
        ``[-limit, limit]`` where ``limit = sqrt(1 / out_features)``, the
        stored mask is replaced by the new topology, and the matching skip
        layer's weights are re-drawn from ``[-limit / 100, limit / 100]``.
        """
        for i, (layer, skip) in enumerate(zip(self.layers[:-1], self.skip_layers)):
            self.apply_mask(layer, self.masks[i])
            mask_core, n_removed = self.remove_weights(layer.weight.data)
            new_weights_mask = self.add_weights(mask_core, n_removed, skip.weight.data)

            limit = np.sqrt(1. / float(layer.weight.data.shape[0]))
            with torch.no_grad():
                fresh = torch.empty_like(layer.weight.data).uniform_(-limit, limit)
                layer.weight.data = layer.weight.data * mask_core + new_weights_mask * fresh
                skip.weight.data = torch.empty_like(skip.weight.data).uniform_(-limit / 100, limit / 100)

            # Persist the new topology. Rebinding only a loop variable would
            # leave self.masks stale, and the next apply_masks() call would
            # erase every newly grown weight.
            self.masks[i] = mask_core + new_weights_mask

    def calculate_sparsity_percentage(self):
        """Return the percentage (2 decimals) of zero weights in the masked layers."""
        self.apply_masks()
        zero_weights = 0
        total_weights = 0
        for l in self.layers[:-1]:
            vals = l.weight.data.reshape(-1)
            zero_weights += torch.sum(vals == 0).item()
            total_weights += vals.shape[0]
        return round(zero_weights / total_weights * 100, 2)

In [None]:
# 784 -> 300 -> 100 -> 10 network, placed on the selected device.
net = LiSET_MLP(
    28 * 28, 300, 100, 10,
    EPSILON=20,
    GAMMA=0,
    device=device,
)
# Percentage of zero weights in the masked layers right after initialisation.
net.calculate_sparsity_percentage()

In [None]:
EPOCHS = 30

# Plain SGD over every parameter of the network.
optimizer = optim.SGD(net.parameters(), lr=0.01)

pbar = tqdm(range(EPOCHS))
for epoch in pbar:
    # One full pass over the training set.
    for data in train_loader:
        X = data[0].to(device)
        y = data[1].to(device)

        net.zero_grad()
        output = net(X)
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

    # Rewire the sparse layers after every epoch except the final one, so the
    # topology used in the last epoch is left untouched.
    if epoch + 1 < EPOCHS:
        net.evolution()

    # The progress bar shows the loss of the last mini-batch of the epoch.
    pbar.set_description(f"Loss {round(loss.item(), 4)}")

In [None]:
# Test-set accuracy with dropout and skip connections switched off.
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        X, y = data[0].to(device), data[1].to(device)
        output = net(X, dropout=0, with_skip=False)
        # Score the whole batch at once instead of looping over samples in
        # Python (which forces one tensor-to-bool conversion per sample).
        predictions = torch.argmax(output, dim=1)
        correct += (predictions == y).sum().item()
        total += output.shape[0]

print(f"Accuracy: {round(correct / total * 100, 3)}")

<br>

## Proposed evaluation metric for measuring the speed-up in convergence

In [None]:
def calculate_ara(accuracy_list, b=0.5):
    """Return a linearly weighted average of ``accuracy_list``.

    The weights grow linearly from ``n * b`` for the first entry to ``n`` for
    the last entry (``n = len(accuracy_list)``), and the weighted sum is
    divided by the total weight ``n ** 2 * (b + 1) / 2``. With ``b = 1`` this
    is the plain mean; smaller ``b`` gives later entries more influence.

    Args:
        accuracy_list: sequence of accuracy values, in order.
        b: ratio of the first weight to the last weight.

    Returns:
        The weighted average. A single-element list returns that element.

    Raises:
        ValueError: if ``accuracy_list`` is empty.
    """
    n = len(accuracy_list)
    if n == 0:
        raise ValueError("accuracy_list must contain at least one value")
    if n == 1:
        # The slope below divides by (n - 1); the weighted average of a
        # single value is the value itself.
        return accuracy_list[0]

    # Line through (1, n * b) and (n, n): weight(i) = i * a_const + b_const.
    a_const = (n - n * b) / (n - 1)
    b_const = n - a_const * n
    s = sum(
        accuracy * (i * a_const + b_const)
        for i, accuracy in enumerate(accuracy_list, start=1)
    )
    return s / (n ** 2 * (b + 1) / 2)