In [1]:
import logging
import sys
from pathlib import Path


# Route every record (DEBUG and above) to the notebook output in a compact
# "LEVEL: message" layout. force=True replaces any handlers already attached
# to the root logger, so re-running this cell re-applies the configuration
# instead of being silently ignored.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s: %(message)s",
    force=True,
)

logger = logging.getLogger(__name__)

# Make the `src` package importable. The directory added is the one two levels
# above the current working directory (it has to *contain* `src`, since later
# cells import `src.…`), not the `src` directory itself.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = str(Path.cwd().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from src.datasets.vertebral_dataset import VertebralDataset

from src.models.mlp import (
    MLPEvaluator,
    MLPParams,
    FCParams,
    FCLayerParams,
    WeightQuantMode,
)
from src.models.nn import ActivationModule, ActivationParams, NNTrainParams
from src.models.quant.enums import QMode

# --- MLP experiment -----------------------------------------------------------
# Dataset selection is kept in one alias so the rest of the cell does not name
# the concrete dataset class.
DatasetClass = VertebralDataset
train_loader, test_loader = DatasetClass.get_dataloaders(batch_size=32)

# Training settings shared by every run of the evaluator.
train_params = NNTrainParams(
    train_loader,
    test_loader,
    epochs=15,
    learning_rate=0.01,
    weight_decay=0.0001,
    early_stop_patience=10,
)

# Layer stack: an NBITS (16) entry sized by the dataset input, two identical
# 32-wide BINARY entries, and a BINARY entry sized by the dataset output.
# NOTE(review): the first positional argument looks like the layer width and
# the third like a bit width - confirm against FCLayerParams.
mlp_layers = [
    FCLayerParams(DatasetClass.input_size, WeightQuantMode.NBITS, 16),
    *(FCLayerParams(32, WeightQuantMode.BINARY) for _ in range(2)),
    FCLayerParams(DatasetClass.output_size, WeightQuantMode.BINARY),
]
mlp_activation = ActivationParams(ActivationModule.BINARIZE_RESTE)

fc_params = FCParams(
    layers=mlp_layers,
    activation=mlp_activation,
    qmode=QMode.DET,
    dropout_rate=0.0,
)
mlp_params = MLPParams(fc=fc_params, train=train_params)

# The evaluation call stays the last expression so its result is rendered as
# the cell output.
evaluator = MLPEvaluator(mlp_params)
evaluator.evaluate_model(times=1)

INFO: Loading cached vertebral from /home/nur/Projects/vut-ip1-nn-quantization/datasets_cache/vertebral_cache.pkl
DEBUG: Train Epoch:  1 [  32/248] Loss: 1.0933
DEBUG: Train Epoch:  1 [ 192/248] Loss: 0.8740
DEBUG: Test set: Average loss: 0.9553, Accuracy: 36/62 (58.06%)
DEBUG: Train Epoch:  2 [  32/248] Loss: 0.8578
DEBUG: Train Epoch:  2 [ 192/248] Loss: 0.9169
DEBUG: Test set: Average loss: 0.6077, Accuracy: 45/62 (72.58%)
DEBUG: Train Epoch:  3 [  32/248] Loss: 0.9042
DEBUG: Train Epoch:  3 [ 192/248] Loss: 0.8703
DEBUG: Test set: Average loss: 0.6276, Accuracy: 40/62 (64.52%)
DEBUG: Train Epoch:  4 [  32/248] Loss: 0.9083
DEBUG: Train Epoch:  4 [ 192/248] Loss: 0.9114
DEBUG: Test set: Average loss: 0.5405, Accuracy: 49/62 (79.03%)
DEBUG: Train Epoch:  5 [  32/248] Loss: 0.8263
DEBUG: Train Epoch:  5 [ 192/248] Loss: 0.8478
DEBUG: Test set: Average loss: 0.5457, Accuracy: 46/62 (74.19%)
DEBUG: Train Epoch:  6 [  32/248] Loss: 0.8802
DEBUG: Train Epoch:  6 [ 192/248] Loss: 0.8124
DE

{'max': 79.03225806451613,
 'mean': np.float64(79.03225806451613),
 'std': np.float64(0.0)}

In [3]:
from src.constants import DEVICE
from src.datasets.mnist_dataset import MiniMNISTDataset
from src.models.cnn import CNN, CNNParams, ConvLayerParams, ConvParams

# --- CNN configuration --------------------------------------------------------
CNNDatasetClass = MiniMNISTDataset
cnn_train_loader, cnn_test_loader = CNNDatasetClass.get_dataloaders()

# The same input bit width is handed to both ConvParams and CNNParams below.
# NOTE(review): presumably the two values must agree - confirm.
cnn_in_bitwidth = 8

# Convolutional part: two 3x3, stride-1, padding-1 layers; only the second one
# requests pooling (kernel size 2).
conv_layers = [
    ConvLayerParams(channels=16, kernel_size=3, stride=1, padding=1),
    ConvLayerParams(
        channels=32,
        kernel_size=3,
        stride=1,
        padding=1,
        pooling_kernel_size=2,
    ),
]
conv_params = ConvParams(
    in_channels=CNNDatasetClass.input_channels,
    in_dimensions=CNNDatasetClass.input_dimensions,
    in_bitwidth=cnn_in_bitwidth,
    out_height=CNNDatasetClass.output_size,
    layers=conv_layers,
    activation=ActivationModule.BINARIZE,
    qmode=QMode.DET,
    dropout_rate=0.1,
)

# Fully connected head.
# NOTE(review): the -1 in the first entry is presumably a placeholder for a
# size derived from the conv output - confirm against FCLayerParams / CNN.
cnn_fc_layers = [
    FCLayerParams(-1, WeightQuantMode.NBITS, 16),
    *(FCLayerParams(32, WeightQuantMode.BINARY) for _ in range(2)),
    FCLayerParams(CNNDatasetClass.output_size, WeightQuantMode.BINARY),
]
cnn_fc_params = FCParams(
    layers=cnn_fc_layers,
    activation=ActivationParams(ActivationModule.BINARIZE_RESTE),
    qmode=QMode.DET,
    dropout_rate=0.0,
)

# Single-epoch training settings for the CNN run.
cnn_train_params = NNTrainParams(
    cnn_train_loader,
    cnn_test_loader,
    epochs=1,
    learning_rate=0.01,
    weight_decay=0.0001,
    early_stop_patience=10,
)

cnn_params = CNNParams(
    in_bitwidth=cnn_in_bitwidth,
    conv=conv_params,
    fc=cnn_fc_params,
    train=cnn_train_params,
)

# Build the model from the assembled parameters and move it to DEVICE.
cnn = CNN(cnn_params).to(DEVICE)

In [4]:
# Log the tensor shape reported after each convolutional layer and the
# resulting FC input size (see the recorded INFO lines below this cell).
# NOTE(review): the recorded shapes are 26x26 after the first layer and 12x12
# after the second (pooled) layer. With kernel_size=3, stride=1, padding=1 a
# convolution keeps the spatial size, so 26 -> 12 does not fit; the numbers
# match an unpadded 28 -> 26 -> 24 -> pool -> 12 chain instead. Confirm that
# `padding` is actually applied by the conv layers, or that this output is
# not stale.
cnn.inspect_conv_layers()

INFO: Inspecting convolutional layers...
INFO: Next layer shape: torch.Size([1, 16, 26, 26]), equating to 10816 inputs
INFO: Next layer shape: torch.Size([1, 32, 12, 12]), equating to 4608 inputs
INFO: FC input size is 4608


In [5]:
from src.models.cnn import CNNEvaluator


# Run the CNN evaluation for `cnn_params`. Per the recorded output this logs
# training progress and displays a dict with 'max', 'mean' and 'std'.
# NOTE(review): the `cnn` instance built in an earlier cell is not passed in
# here; presumably the evaluator constructs its own model from `cnn_params` -
# confirm.
# NOTE(review): the recorded output prints the same "Test set" line twice;
# check whether the test pass is run twice per evaluation.
cnn_evaluator = CNNEvaluator(cnn_params)
cnn_evaluator.evaluate_model()


DEBUG: Train Epoch:  1 [ 128/4000] Loss: 2.4356
DEBUG: Train Epoch:  1 [ 768/4000] Loss: 1.5990
DEBUG: Train Epoch:  1 [1408/4000] Loss: 0.9766
DEBUG: Train Epoch:  1 [2048/4000] Loss: 0.8390
DEBUG: Train Epoch:  1 [2688/4000] Loss: 0.7298
DEBUG: Train Epoch:  1 [3328/4000] Loss: 0.4537
DEBUG: Train Epoch:  1 [3968/4000] Loss: 0.5184
DEBUG: Test set: Average loss: 0.5844, Accuracy: 654/800 (81.75%)
DEBUG: Test set: Average loss: 0.5844, Accuracy: 654/800 (81.75%)


{'max': 81.75, 'mean': np.float64(81.75), 'std': np.float64(0.0)}

# Quantization Notes

## Techniques

- Bitwidth quantization
- BNN: Activation func (Using STE for gradients)
- BNN_ReSTE: Activation func (Changing gradients: ReSTE)
- TNN: Activation func (Differentiable? I'll need to implement it)
- ... Find more?

## How to integrate them

Bitwidth quantization needs:
- Quantization mode (Deterministic, Stochastic)
- Per-layer: quantization level
- Advanced: per-perceptron quantization level, at least for the first layer

Input bitwidth quantization is closely tied to BNNs, as every other layer works with binary inputs. It needs:
- Per-input-neuron quantization level

### BNN
The idea is that every hidden layer only computes using binary (0/1, -1/+1) inputs & weights. AFAIK, bias isn't present. The FC layer is simplified using a popcount + ... operation; the CNN is handled in a similar manner.
- In the paper, they use hardtanh and ReLU. They also state that binarization itself is a form of non-linearity (used for hidden units), yet they still use hardtanh. That's weird.

Parameters:
- None, I suppose.

### BNN ReSTE
Same as BNN, except that:
1. ReSTE is used instead of STE. ReSTE specifies a function to better estimate the quantized activation gradient.
2. Gradients smaller than -1 and bigger than 1 are set to 0

Parameters
- o: Used for backprop. Modifies the approximated gradient.
- t: Threshold
- ...

### TWN, Ternary weight networks
Weights are one of: (-1, 0, 1). The paper focuses on CNNs. Nothing is mentioned about activation quantization. The typical pipeline is used: Conv -> BatchNorm -> Activation -> maybe Pooling (every 2 conv layers?), followed by FC layers.

Notes:
- The paper uses SGD.
- We will likely combine it with neuron quantization.
- FC & Conv do not have a bias.
- I could apply something like ReSTE to this by using $y = \frac{2\arctan(10x^3)}{\pi}$

Parameters:
- Threshold within which weight is set to 0?
- ...

### Idea: adaptive quantization -> static quantization
During training use adaptive quantization, and for inference convert it to static quantization.
- A potential problem: overflow, underflow?
- ...

### Some recap

TWN and BNN do the same thing at their core: quantize weights. Extra things can be added, like:
- binary or ternary activation (activation returns either (-1/1) or (-1/0/1)), or ReLU + bitwidth quantization.
- Input-layer bitwidth quantization. (I should prefer per-input quantization.)
