# Auto Compression

- [Tutorial](https://github.com/hyperopt/hyperopt/wiki/FMin)

In [1]:
from hyperopt import hp
from hyperopt import fmin
import hyperopt
import math
import numpy as np
import argparse
import time
import datetime
import os
import sys
import torch
import torch.nn as nn
import torch.optim
from torch.autograd import Variable
# script_dir = os.path.dirname(__file__)
# module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
# try:
#     import distiller
# except ImportError:
#     sys.path.append(module_path)
#     import distiller
import distiller
import apputils
from models import ALL_MODEL_NAMES, create_model

In [12]:
def float_range(val_str):
    val = float(val_str)
    if val < 0 or val >= 1:
        raise argparse.ArgumentTypeError('Must be >= 0 and < 1 (received {0})'.format(val_str))
    return val

def get_space():
    space = {}
    for name, parameter in model.named_parameters():
        if 'conv' in name and 'weight' in name:
            space[name] = hp.uniform(name, 0.01, 0.99)
    return space

## Argument Settings

In [13]:
parser = argparse.ArgumentParser(description='Distiller image classification model compression')
parser.add_argument('data', metavar='DIR', help='path to dataset')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet20_cifar',
                    choices=ALL_MODEL_NAMES,
                    help='model architecture: ' +
                    ' | '.join(ALL_MODEL_NAMES) +
                    ' (default: resnet20_cifar)')
parser.add_argument('-r', '--rounds', default=10, type=int,
                    metavar='R', help='max rounds (default: 10)')
parser.add_argument('--epochs', default=120, type=int,
                    metavar='E', help='epochs (default: 120)')
parser.add_argument('-j', '--workers', default=1, type=int, metavar='N',
                    help='number of data loading workers (default: 1)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
                    metavar='N', help='mini-batch size (default: 128)')
parser.add_argument('--gpus', metavar='DEV_ID', default=None,
                    help='Comma-separated list of GPU device IDs to be used (default is to use all available devices)')
parser.add_argument('--lr', '--learning-rate', default=0.01, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--validation-size', '--vs', type=float_range, default=0.1,
                    help='Portion of training dataset to set aside for validation')
parser.add_argument('--deterministic', '--det', action='store_true',
                    help='Ensure deterministic execution for re-producible results.')

# [Manual setting hyperparameters]
# If execute in command line, use the following line instead:
#    args = parser.parse_args()  
args = parser.parse_args(args=[
    '/tmp/dataset-nctu',
    '-a', 'resnet56_cifar',
    '--gpus', '0'
])

## Data Loading & Model Creation

In [14]:
args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
if args.gpus is not None:
    try:
        args.gpus = [int(s) for s in args.gpus.split(',')]
    except ValueError:
        exit(1)
    available_gpus = torch.cuda.device_count()
    for dev_id in args.gpus:
        if dev_id >= available_gpus:
            exit(1)
    # Set default device in case the first one on the list != 0
    torch.cuda.set_device(args.gpus[0])

model = create_model(False, args.dataset, args.arch, device_ids=args.gpus) # Get arch state_dict
train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)

Files already downloaded and verified
Files already downloaded and verified


## Functions

###  1. Trainning and Validation

In [15]:
def train(epoch, criterion, optimizer, compression_scheduler):
    correct = 0
    total = 0
    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    for train_step, (inputs, targets) in enumerate(train_loader):
        compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)
        inputs, targets = inputs.cuda(), targets.cuda()
        outputs = model(inputs.cuda())
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum().data.numpy()
        loss = criterion(outputs, targets)
        compression_scheduler.before_backward_pass(epoch, train_step, steps_per_epoch, loss,
                                                   optimizer=optimizer, return_loss_components=True)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)
    accuracy = 100. * correct / total    
    return accuracy

In [16]:
def validate():
    model.eval() 
    correct = 0
    total = 0
    with torch.no_grad():
        for test_step, (inputs, targets) in enumerate(val_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum().data.numpy()
    accuracy = 100. * correct / total    
    return accuracy

### 2. Testing

In [17]:
def test():
    model.eval() 
    correct = 0
    total = 0
    with torch.no_grad():
        for test_step, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum().data.numpy()
    accuracy = 100. * correct / total    
    return accuracy

### 3. Objective Function

In [18]:
count = 0
def objective(space):
    global model
    global count
    #Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    count += 1
    # Objective function: F(Acc, Lat) = (1 - Acc.) + (alpha * Sparsity)
    accuracy = 0
    alpha = 0.2 # Super-parameter: the importance of inference time
    latency = 0.0
    sparsity = 0.0
    # Training hyperparameter
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    """
    distiller/distiller/config.py
        # Element-wise sparsity
        sparsity_levels = {net_param: sparsity_level}
        pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
        policy = distiller.PruningPolicy(pruner, pruner_args=None)
        scheduler = distiller.CompressionScheduler(model)
        scheduler.add_policy(policy, epochs=[0, 2, 4])
        # Local search 
        add multiple pruner for each layer
    """
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    compression_scheduler.add_policy(policy, epochs=[90])
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=90, frequency=1)
    """
    distiller/example/classifier_compression/compress_classifier.py
    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i,criterion, optimizer, compression_scheduler)
        val_accuracy = validate() # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler, train_accuracy, False,
                                         'hyperopt', './')
    test_accuracy = test() # Validate hyperparameter setting
    score = (1-(val_accuracy/100.)) + (alpha * (1-sparsity/100.)) # objective funtion here
    print('{} trials: score: {:.4f}\ttrain acc:{:.4f}\tval acc:{:.4f}\ttest acc:{:.4f}\tsparsity:{:.4f}'.format(count, 
                                      score, 
                                      train_accuracy, 
                                      val_accuracy, 
                                      test_accuracy,
                                      sparsity))
    return score

## A. Algorithm: TPE

### 1. No Constraint

In [19]:
def main():
    space = get_space()
    best = fmin(objective, space, algo=hyperopt.tpe.suggest, max_evals=args.rounds)
    print(best)

In [20]:
main()

1 trials: score: 0.2734	train acc:84.8556	val acc:82.5400	test acc:81.9300	sparsity:50.6034
2 trials: score: 0.2712	train acc:84.3733	val acc:81.7000	test acc:81.5400	sparsity:55.8900
3 trials: score: 0.2945	train acc:82.1822	val acc:79.9400	test acc:80.3600	sparsity:53.0353
4 trials: score: 0.2776	train acc:83.3778	val acc:81.7600	test acc:81.4100	sparsity:52.4167
5 trials: score: 0.2688	train acc:87.3289	val acc:83.7800	test acc:84.0000	sparsity:46.6922
6 trials: score: 0.2716	train acc:87.0333	val acc:83.4400	test acc:84.0700	sparsity:47.0190
7 trials: score: 0.2830	train acc:85.8867	val acc:82.8600	test acc:83.1400	sparsity:44.1788
8 trials: score: 0.2770	train acc:86.4289	val acc:82.9200	test acc:83.1200	sparsity:46.8751
9 trials: score: 0.2883	train acc:85.6889	val acc:82.7200	test acc:82.9700	sparsity:42.2566
10 trials: score: 0.2723	train acc:85.5978	val acc:83.1400	test acc:83.1300	sparsity:48.1252
{'module.conv1.weight': 0.31489126241236465, 'module.layer1.0.conv1.weight': 0.

## B. Algorithm: Random Search

### 1. No Constraint

In [None]:
def main():
    space = get_space()
    best = fmin(objective, space, algo=hyperopt.random.suggest, max_evals=args.rounds)
    print(best)

In [None]:
main()