# Homework 5

Starting from the implementation contained within the notebook `05-pruning.ipynb`, extend the  `magnitude_pruning` function to allow for incremental (iterative) pruning. Right now, the magnitude_pruning routine is thought for one-shot pruning. If you try pruning one more time, you'll notice that it will not work as there's no way to communicate to the future calls of magnitude_pruning to ignore the parameters which have already been pruned. Find a way to enhance the routine s.t. it can effectively prune networks in a sequential fashion (i.e., if we passed an MLP already pruned of 20% of its parameters, we want to prune *another* 20% of parameters).

Hint: use the mask!

In [1]:
import torch

sys.path.append("../labs")
from scripts import mnist, train_utils, architectures, train
from scripts.train_utils import accuracy, AverageMeter

In [2]:
layers = [
    {"n_in": 784, "n_out": 16, "batchnorm": False},
    {"n_out": 32, "batchnorm": True},
    {"n_out": 64, "batchnorm": True},
    {"n_out": 10, "batchnorm": True}
]
net = architectures.MLPCustom(layers)
print(net)

MLPCustom(
  (layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=16, bias=True)
    (2): ReLU()
    (3): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=16, out_features=32, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Linear(in_features=32, out_features=64, bias=True)
    (8): ReLU()
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Linear(in_features=64, out_features=10, bias=True)
    (11): ReLU()
  )
)


In [3]:
def magnitude_pruning(model, pruning_rate, layers_to_prune=["1", "4", "7", "10"]):
    # 1. vectorize distribution of abs(parameter)
    '''
    model.named_parameters returns a tuple:
      element 0 is the name of the parameter
      element 1 is the parameter itself
    any(<list of booleans>)
    checks whether any of the conditions inside the list is True.
    [l in pars[0] for l in layers_to_prune] checks whether "1", "4", "7", and "10" are inside the parameter name
    hence, there will be up to 1 True in the corresponding list: in that case, any(...) returns True
    '''
    params_to_prune = [pars[1] for pars in model.named_parameters() if any([l in pars[0] for l in layers_to_prune])]
    flat = torch.cat([pars.abs().flatten() for pars in params_to_prune], dim=0)

    # 2. sort this distribution
    flat = flat.sort()[0]

    # 3. obtain the threshold
    position = int(pruning_rate * flat.shape[0])
    thresh = flat[position]

    # 4. binarize the parameters & 5. compose these booleans into the mask
    # 6. obtain the new structure of parameters
    '''
    I do this process with a for cycle instead of a list comprehension for clarity
    * if the layer is a layer to prune → populate the mask with 1s and 0s
    * otherwise → just populate the mask with ones
    By doing so, I can immediately apply the mask to the model as well...
    '''
    mask = []
    for pars in model.named_parameters():
        if any([l in pars[0] for l in layers_to_prune]):
            m = torch.where(pars[1].abs() >= thresh, 1, 0)
            mask.append(m)
            pars[1].data *= m
        else:
            mask.append(torch.ones_like(pars[1]))
            # no need to multiply as it's all 1s

    # 7. what do we need to return?
    return mask

In [4]:
mask = magnitude_pruning(net, .2)

In [5]:
def number_of_ones_in_mask(mask):
    return sum([m.sum().item() for m in mask]) / sum([m.numel() for m in mask])

In [6]:
# Proportion of parameters which have survived the pruning
print("Number of ones in mask:", number_of_ones_in_mask(mask), "\n")
# I'm pruning the first 20 parameters of the network and the mask to see if it actually works
print(mask[0][0,:20], "\n", next(net.parameters())[0,:20])

Number of ones in mask: 0.8027967681789931 

tensor([1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1]) 
 tensor([ 0.0147, -0.0278,  0.0000, -0.0165, -0.0348, -0.0251, -0.0291,  0.0202,
        -0.0220, -0.0000, -0.0103,  0.0091,  0.0000, -0.0341, -0.0285, -0.0217,
         0.0174, -0.0110,  0.0000,  0.0265], grad_fn=<SliceBackward>)


In [7]:
trainloader, testloader, _, _ = mnist.get_data(data_root="../labs/datasets/")
loss_fn = torch.nn.CrossEntropyLoss()

In [8]:
# load pretrained model
state_dict = torch.load("../labs/models_push/mlp_custom_mnist/mlp_custom_mnist.pt")
net.load_state_dict(state_dict)

<All keys matched successfully>

In [9]:
train.test_model(net, testloader, loss_fn=loss_fn)

  return torch._C._cuda_getDeviceCount() > 0
TESTING - loss 7349.624471664429 - performance 0.9631


(7349.624471664429, 0.9631333333333333)

In [10]:
mask = magnitude_pruning(net, pruning_rate=0.25)
print("Number of ones in mask:", number_of_ones_in_mask(mask), "\n")

0.7535114978247358

In [11]:
train.test_model(net, testloader, loss_fn=loss_fn)

TESTING - loss 7674.807939052582 - performance 0.9612


(7674.807939052582, 0.9612333333333334)

We see that there's a drop in performance. This is a characteristic of pruning. Rarely a simple application of pruning leaves the performance untouched or better. Due to that, we must procede with a re-training of the ANN.

In [12]:
def train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, mask, layers_to_prune=["1", "4", "7", "10"]):
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad() 

        y_hat = model(X)

        loss = loss_fn(y_hat, y)

        loss.backward()

        ##### we must neutralize the gradient on the pruned params before the optimizer takes a step ####

        if mask is not None:
            for (name, param), m in zip(model.named_parameters(), mask):
                if any([l in name for l in layers_to_prune]):
                    param.grad *= m

        ######

        optimizer.step()

        acc = performance(y_hat, y)

        loss_meter.update(val=loss.item(), n=X.shape[0])
        performance_meter.update(val=acc, n=X.shape[0])

In [13]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs, checkpoint_loc=None, checkpoint_name="checkpoint.pt", performance=accuracy, lr_scheduler=None, device=None, mask=None, params_type_to_prune=["weight", "bias"]):
    if checkpoint_loc is not None:
        os.makedirs(checkpoint_loc, exist_ok=True)

    if device is None:
        device = use_gpu_if_possible()
    
    model = model.to(device)
    model.train()

    for epoch in range(num_epochs):
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        print(f"Epoch {epoch+1} --- learning rate {optimizer.param_groups[0]['lr']:.5f}")

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, mask)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        if checkpoint_name is not None and checkpoint_loc is not None:
            checkpoint_dict = {
                "parameters": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "epoch": epoch
            }
            torch.save(checkpoint_dict, os.path.join(checkpoint_loc, checkpoint_name))
        
        if lr_scheduler is not None:
            lr_scheduler.step()

    return loss_meter.sum, performance_meter.avg

In [14]:
num_epochs = 10
lr = 0.01
optimizer = torch.optim.Adam(net.parameters(), lr = lr)

train_model(net, trainloader, loss_fn, optimizer, num_epochs, device="cpu", mask=mask)

Epoch 1 --- learning rate 0.01000
Epoch 1 completed. Loss - total: 12252.428242206573 - average: 0.20420713737010956; Performance: 0.9375666666666667
Epoch 2 --- learning rate 0.01000
Epoch 2 completed. Loss - total: 9087.08480644226 - average: 0.15145141344070434; Performance: 0.9534833333333333
Epoch 3 --- learning rate 0.01000
Epoch 3 completed. Loss - total: 8068.963097572327 - average: 0.13448271829287212; Performance: 0.9578833333333333
Epoch 4 --- learning rate 0.01000
Epoch 4 completed. Loss - total: 7334.235938549042 - average: 0.12223726564248404; Performance: 0.9618
Epoch 5 --- learning rate 0.01000
Epoch 5 completed. Loss - total: 6821.295444488525 - average: 0.11368825740814209; Performance: 0.9642666666666667
Epoch 6 --- learning rate 0.01000
Epoch 6 completed. Loss - total: 6592.9484832286835 - average: 0.10988247472047806; Performance: 0.9655
Epoch 7 --- learning rate 0.01000
Epoch 7 completed. Loss - total: 6120.685695171356 - average: 0.10201142825285593; Performance:

(5353.642367362976, 0.9707333333333333)

In [15]:
train.test_model(net, testloader, loss_fn=loss_fn)

TESTING - loss 3899.6647787094116 - performance 0.9791


(3899.6647787094116, 0.9790666666666666)

In [16]:
mask = magnitude_pruning(net, pruning_rate=0.25)
print("Number of ones in mask:", number_of_ones_in_mask(mask), "\n")

0.7535114978247358

In [17]:
train.test_model(net, testloader, loss_fn=loss_fn)

TESTING - loss 3899.6647787094116 - performance 0.9791


(3899.6647787094116, 0.9790666666666666)

We can see that the number of ones in the mask is the same as before! So we couldn't prune another 25% of the parameters.

## Iterative Magnitude Pruning (IMP)

To be able to prune again the parameters in the network, we compute the threshold removing the parameters that are already zero, so that we compute the pruning rate only on the non-zero parameters.

In [61]:
def magnitude_pruning(model, pruning_rate, layers_to_prune=["1", "4", "7", "10"]):
    # 1. vectorize distribution of abs(parameter)
    '''
    model.named_parameters returns a tuple:
      element 0 is the name of the parameter
      element 1 is the parameter itself
    any(<list of booleans>)
    checks whether any of the conditions inside the list is True.
    [l in pars[0] for l in layers_to_prune] checks whether "1", "4", "7", and "10" are inside the parameter name
    hence, there will be up to 1 True in the corresponding list: in that case, any(...) returns True
    '''
    params_to_prune = [pars[1] for pars in model.named_parameters() if any([l in pars[0] for l in layers_to_prune])]
    flat = torch.cat([pars.abs().flatten() for pars in params_to_prune], dim=0)
    print("Parameters dimensions: ", end="")
    print(flat.shape)

    # 2. sort this distribution
    flat = flat.sort()[0]

    # 2.5. remove the zeros, thus the parameters already pruned
    flat = flat[flat.nonzero().detach()]
    print("Pruned parameters dimensions: ", end="")
    print(flat.shape)

    # 3. obtain the threshold
    position = int(pruning_rate * flat.shape[0])
    thresh = flat[position]

    # 4. binarize the parameters & 5. compose these booleans into the mask
    # 6. obtain the new structure of parameters
    '''
    I do this process with a for cycle instead of a list comprehension for clarity
    * if the layer is a layer to prune → populate the mask with 1s and 0s
    * otherwise → just populate the mask with ones
    By doing so, I can immediately apply the mask to the model as well...
    '''
    mask = []
    for pars in model.named_parameters():
        if any([l in pars[0] for l in layers_to_prune]):
            m = torch.where(pars[1].abs() >= thresh, 1, 0)
            mask.append(m)
            pars[1].data *= m
        else:
            mask.append(torch.ones_like(pars[1]))
            # no need to multiply as it's all 1s

    # 7. what do we need to return?
    return mask

In [62]:
# load pretrained model
state_dict = torch.load("../labs/models_push/mlp_custom_mnist/mlp_custom_mnist.pt")
net.load_state_dict(state_dict)

train.test_model(net, testloader, loss_fn=loss_fn)

TESTING - loss 7349.624471664429 - performance 0.9631


(7349.624471664429, 0.9631333333333333)

In [63]:
for i in range(10):
    print(f"Iteration {i+1}.\n")
    mask = magnitude_pruning(net, pruning_rate=0.25)
    print("Number of ones in mask:", number_of_ones_in_mask(mask), "\n")

    optimizer = torch.optim.Adam(net.parameters(), lr = lr)
    train_model(net, trainloader, loss_fn, optimizer, num_epochs, device="cpu", mask=mask)
    train.test_model(net, testloader, loss_fn=loss_fn)
    print("\n")

Iteration 1.

Parameters dimensions: torch.Size([15866])
Pruned parameters dimensions: torch.Size([15866, 1])
Number of ones in mask: 0.7535114978247358 

Epoch 1 --- learning rate 0.01000
Epoch 1 completed. Loss - total: 12903.265724182129 - average: 0.21505442873636882; Performance: 0.93435
Epoch 2 --- learning rate 0.01000
Epoch 2 completed. Loss - total: 9346.30019569397 - average: 0.15577166992823283; Performance: 0.9516166666666667
Epoch 3 --- learning rate 0.01000
Epoch 3 completed. Loss - total: 7972.33712720871 - average: 0.13287228545347848; Performance: 0.9576666666666667
Epoch 4 --- learning rate 0.01000
Epoch 4 completed. Loss - total: 7332.6753578186035 - average: 0.12221125596364339; Performance: 0.9619166666666666
Epoch 5 --- learning rate 0.01000
Epoch 5 completed. Loss - total: 6764.026443362236 - average: 0.11273377405603727; Performance: 0.9646
Epoch 6 --- learning rate 0.01000
Epoch 6 completed. Loss - total: 6354.145076751709 - average: 0.10590241794586182; Perfor

We can see that 10 iterations of IMP result in a final sparsity of the ANN of about $0.75^{10}\approx 0.0563$ (ours is a bit higher since we prune only the parameters in the Linear layers and not in the BatchNorm layers).

We can also notice that with iteration 5 the performance starts to drop, expecially in the last iteration where we have a drammatic drop to a test accuracy of $0.0986$.