In [5]:
import logging
import sys
from pathlib import Path


# Send every record (DEBUG and above) to the notebook output in a compact
# "LEVEL: message" form. force=True removes any handlers already attached to
# the root logger, so re-running this cell does not stack duplicate handlers.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s: %(message)s",
    force=True,
)

logger = logging.getLogger(__name__)

# Make the `src` package importable by putting the directory two levels above
# the current working directory on the import path (presumably the repository
# root; this depends on the kernel's working directory - confirm).
# Guarded so that re-running the cell does not keep appending the same entry.
project_root = str(Path.cwd().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [15]:
from src.datasets.breast_cancer_dataset import BreastCancerDataset
from src.datasets.cardio_dataset import CardioDataset
from src.datasets.vertebral_dataset import VertebralDataset
from src.models.eval import NNEvaluator
from src.models.mlp import (
    MLPParams,
    FCParams,
    FCLayerParams,
    WeightQuantMode,
)
from src.models.nn import ActivationModule, ActivationParams, NNTrainParams
from src.models.quant.enums import QMode

# Dataset used for this MLP experiment; the loaders are reused by later cells.
DatasetClass = CardioDataset
train_loader, test_loader = DatasetClass.get_dataloaders(batch_size=32)

# Optimisation settings shared by every evaluator built from `mlp_params`.
train_params = NNTrainParams(
    DatasetClass,
    train_loader,
    test_loader,
    epochs=15,
    learning_rate=0.01,
    weight_decay=0.0001,
    early_stop_patience=10,
)

# Layer stack: one NBITS layer sized by the dataset input, two identical
# BINARY layers of width 32, and a BINARY layer sized by the dataset output.
# NOTE(review): the third positional argument (32) on the first layer is
# presumably the weight bitwidth used in NBITS mode - confirm against
# FCLayerParams.
mlp_first_layer = FCLayerParams(DatasetClass.input_size, WeightQuantMode.NBITS, 32)
mlp_middle_layers = [FCLayerParams(32, WeightQuantMode.BINARY) for _ in range(2)]
mlp_last_layer = FCLayerParams(DatasetClass.output_size, WeightQuantMode.BINARY)

fc_params = FCParams(
    layers=[mlp_first_layer, *mlp_middle_layers, mlp_last_layer],
    activation=ActivationParams(ActivationModule.BINARIZE),
    qmode=QMode.DET,
    dropout_rate=0.0,
)
mlp_params = MLPParams(fc=fc_params, train=train_params)

# Single evaluation run; the returned summary is the cell's displayed value.
evaluator = NNEvaluator(mlp_params)
evaluator.evaluate_model(times=1)

DEBUG: Train Epoch:  1 [  32/1700] Loss: 2.3406
DEBUG: Train Epoch:  1 [ 192/1700] Loss: 2.2761
DEBUG: Train Epoch:  1 [ 352/1700] Loss: 2.2498
DEBUG: Train Epoch:  1 [ 512/1700] Loss: 2.2506
DEBUG: Train Epoch:  1 [ 672/1700] Loss: 2.2468
DEBUG: Train Epoch:  1 [ 832/1700] Loss: 2.2235
DEBUG: Train Epoch:  1 [ 992/1700] Loss: 2.2529
DEBUG: Train Epoch:  1 [1152/1700] Loss: 2.2088
DEBUG: Train Epoch:  1 [1312/1700] Loss: 2.2309
DEBUG: Train Epoch:  1 [1472/1700] Loss: 2.2440
DEBUG: Train Epoch:  1 [1632/1700] Loss: 2.2237
DEBUG: Test set: Average loss: 1.9310, Accuracy: 126/426 (29.58%)
DEBUG: Train Epoch:  2 [  32/1700] Loss: 2.2159
DEBUG: Train Epoch:  2 [ 192/1700] Loss: 2.2257
DEBUG: Train Epoch:  2 [ 352/1700] Loss: 2.2079
DEBUG: Train Epoch:  2 [ 512/1700] Loss: 2.1841
DEBUG: Train Epoch:  2 [ 672/1700] Loss: 2.2087
DEBUG: Train Epoch:  2 [ 832/1700] Loss: 2.2451
DEBUG: Train Epoch:  2 [ 992/1700] Loss: 2.2073
DEBUG: Train Epoch:  2 [1152/1700] Loss: 2.1827
DEBUG: Train Epoch:  2

{'max': 23.474178403755868,
 'mean': np.float64(23.474178403755868),
 'std': np.float64(0.0),
 'accuracies': [23.474178403755868]}

In [13]:
# Peek at the first (features, label) pair of the first training batch.
# The None default keeps an empty loader silent, like the for/break form did.
first_train_batch = next(iter(train_loader), None)
if first_train_batch is not None:
    print(first_train_batch[0][0], first_train_batch[1][0])

tensor([133.0000,  -0.3333,   1.3333,  -0.6000,   0.0000,   0.0000,   0.0000,
         53.0000,   0.6000,  13.0000,   9.9000,  22.0000, 125.0000, 147.0000,
          2.0000,   0.0000, 137.0000, 136.0000, 137.0000,   1.0000,   0.0000]) tensor(0)


In [14]:
from src.models.eval import KFoldNNEvaluator


# Cross-validated run over the same MLP configuration; the returned summary is
# the cell's displayed value.
kfold_evaluator = KFoldNNEvaluator(mlp_params)
kfold_evaluator.evaluate_model(1)

DEBUG: Train Epoch:  1 [  32/1700] Loss: 2.2793
DEBUG: Train Epoch:  1 [ 192/1700] Loss: 2.2707
DEBUG: Train Epoch:  1 [ 352/1700] Loss: 2.2429
DEBUG: Train Epoch:  1 [ 512/1700] Loss: 2.2157
DEBUG: Train Epoch:  1 [ 672/1700] Loss: 2.2517
DEBUG: Train Epoch:  1 [ 832/1700] Loss: 2.2598
DEBUG: Train Epoch:  1 [ 992/1700] Loss: 2.2356
DEBUG: Train Epoch:  1 [1152/1700] Loss: 2.2262
DEBUG: Train Epoch:  1 [1312/1700] Loss: 2.2187
DEBUG: Train Epoch:  1 [1472/1700] Loss: 2.2034
DEBUG: Train Epoch:  1 [1632/1700] Loss: 2.2012
DEBUG: Test set: Average loss: 1.9607, Accuracy: 155/426 (36.38%)
DEBUG: Train Epoch:  2 [  32/1700] Loss: 2.2184
DEBUG: Train Epoch:  2 [ 192/1700] Loss: 2.2041
DEBUG: Train Epoch:  2 [ 352/1700] Loss: 2.2157
DEBUG: Train Epoch:  2 [ 512/1700] Loss: 2.2072
DEBUG: Train Epoch:  2 [ 672/1700] Loss: 2.1964
DEBUG: Train Epoch:  2 [ 832/1700] Loss: 2.2123
DEBUG: Train Epoch:  2 [ 992/1700] Loss: 2.1833
DEBUG: Train Epoch:  2 [1152/1700] Loss: 2.1929
DEBUG: Train Epoch:  2

{'max': 37.88235294117647,
 'mean': np.float64(30.385086992543496),
 'std': np.float64(4.662392316841478),
 'accuracies': [31.690140845070424,
  37.88235294117647,
  27.294117647058822,
  31.058823529411764,
  24.0]}

In [5]:
from src.constants import DEVICE
from src.datasets.mnist_dataset import MiniMNISTDataset
from src.models.cnn import CNN, CNNParams, ConvLayerParams, ConvParams

# Dataset used for the CNN experiment; loaders use the dataset's default settings.
CNNDatasetClass = MiniMNISTDataset
cnn_train_loader, cnn_test_loader = CNNDatasetClass.get_dataloaders()

# Input bitwidth, passed to both ConvParams and CNNParams so the two stay in sync.
cnn_in_bitwidth = 8

# Convolutional part: two 3x3, stride-1, padding-1 layers; only the second one
# specifies a pooling kernel.
cnn_conv_layers = [
    ConvLayerParams(channels=16, kernel_size=3, stride=1, padding=1),
    ConvLayerParams(channels=32, kernel_size=3, stride=1, padding=1, pooling_kernel_size=2),
]
conv_params = ConvParams(
    in_channels=CNNDatasetClass.input_channels,
    in_dimensions=CNNDatasetClass.input_dimensions,
    in_bitwidth=cnn_in_bitwidth,
    out_height=CNNDatasetClass.output_size,
    layers=cnn_conv_layers,
    activation=ActivationModule.BINARIZE,
    qmode=QMode.DET,
    reste_o=3,
    reste_threshold=1.5,
    dropout_rate=0.1,
)

# Fully connected head.
# NOTE(review): the -1 on the first layer is presumably a placeholder that the
# model resolves from the conv output size - confirm against CNN / FCLayerParams.
cnn_fc_layers = [
    FCLayerParams(-1, WeightQuantMode.NBITS, 16),
    FCLayerParams(32, WeightQuantMode.BINARY),
    FCLayerParams(32, WeightQuantMode.BINARY),
    FCLayerParams(CNNDatasetClass.output_size, WeightQuantMode.BINARY),
]
cnn_fc_params = FCParams(
    layers=cnn_fc_layers,
    activation=ActivationParams(ActivationModule.BINARIZE_RESTE),
    qmode=QMode.DET,
    dropout_rate=0.0,
)

# Training settings: a single epoch, same optimiser hyper-parameters as the MLP cell.
cnn_train_params = NNTrainParams(
    CNNDatasetClass,
    cnn_train_loader,
    cnn_test_loader,
    epochs=1,
    learning_rate=0.01,
    weight_decay=0.0001,
    early_stop_patience=10,
)

# Bundle everything and instantiate the model on the configured device.
cnn_params = CNNParams(
    in_bitwidth=cnn_in_bitwidth,
    conv=conv_params,
    fc=cnn_fc_params,
    train=cnn_train_params,
)
cnn = CNN(cnn_params).to(DEVICE)

  self.X = torch.tensor(X, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.int64)


In [6]:
# Show the label tensor of the first CNN training batch.
# The None default keeps an empty loader silent, like the for/break form did.
first_cnn_batch = next(iter(cnn_train_loader), None)
if first_cnn_batch is not None:
    print(first_cnn_batch[1])

tensor([7, 5, 6, 7, 6, 5, 3, 0, 7, 8, 7, 3, 1, 8, 1, 3, 2, 4, 9, 8, 2, 5, 8, 1,
        0, 5, 1, 3, 7, 6, 3, 3, 6, 7, 8, 4, 5, 6, 6, 3, 9, 7, 6, 1, 3, 3, 3, 2,
        5, 3, 1, 8, 4, 2, 9, 6, 6, 9, 0, 6, 9, 0, 3, 9, 6, 3, 5, 4, 5, 3, 7, 7,
        2, 7, 5, 3, 2, 3, 9, 0, 6, 2, 5, 9, 0, 0, 4, 1, 1, 3, 3, 5, 3, 1, 3, 0,
        1, 3, 7, 3, 0, 8, 6, 8, 9, 2, 8, 0, 7, 7, 3, 2, 3, 0, 5, 3, 3, 1, 1, 4,
        4, 4, 7, 2, 1, 7, 8, 7])


In [7]:
# Display the first slice of the weight tensor held by the first module of the
# first conv block of `cnn`.
cnn_first_conv_module = cnn.conv_layers[0][0]
cnn_first_conv_module.weight[0]

tensor([[[ 0.1702,  0.1702, -0.1702],
         [-0.1702, -0.1702, -0.1702],
         [ 0.1702, -0.1702,  0.1702]]], grad_fn=<SelectBackward0>)

In [10]:
from src.models.cnn import CNNEvaluator


# One evaluation run of the CNN configuration; the returned summary is the
# cell's displayed value.
single_run_cnn_evaluator = CNNEvaluator(cnn_params)
single_run_cnn_evaluator.evaluate_model(times=1)

DEBUG: Train Epoch:  1 [ 128/3200] Loss: 2.2389
DEBUG: Train Epoch:  1 [ 768/3200] Loss: 1.9715
DEBUG: Train Epoch:  1 [1408/3200] Loss: 1.1675
DEBUG: Train Epoch:  1 [2048/3200] Loss: 0.8088
DEBUG: Train Epoch:  1 [2688/3200] Loss: 0.6602
DEBUG: Test set: Average loss: 0.8538, Accuracy: 580/800 (72.50%)
DEBUG: Test set: Average loss: 0.8538, Accuracy: 580/800 (72.50%)


{'max': 72.5,
 'mean': np.float64(72.5),
 'std': np.float64(0.0),
 'accuracies': [72.5]}

In [None]:
from src.models.nn import save_model


# Save `cnn` to "cnn_model.pth" (a path relative to the kernel's working directory).
# NOTE(review): `cnn` is the instance built from cnn_params in the configuration
# cell; the evaluator cells are given cnn_params rather than this object, so it
# may still hold its initial weights - confirm this is the model meant to be saved.
save_model(cnn, "cnn_model.pth")

In [None]:
from src.models.nn import load_model


# Load "cnn_model.pth" using the existing `cnn` instance as the target model.
# NOTE(review): if load_model fills the passed model in place and returns it,
# `cnn2` is the same object as `cnn` rather than an independent copy - confirm.
cnn2 = load_model(cnn, "cnn_model.pth")

In [None]:
# Display the first slice of the weight tensor held by the first module of the
# first conv block of the reloaded model.
cnn2_first_conv_module = cnn2.conv_layers[0][0]
cnn2_first_conv_module.weight[0]

tensor([[[-0.1843, -0.1843, -0.1843],
         [ 0.1843,  0.1843,  0.1843],
         [ 0.1843,  0.1843,  0.1843]]], grad_fn=<SelectBackward0>)

In [None]:
# Log the tensor shape after each convolutional layer and the resulting FC
# input size (emitted at INFO level, as seen in the recorded output).
cnn.inspect_conv_layers()

INFO: Inspecting convolutional layers...
INFO: Next layer shape: torch.Size([1, 16, 28, 28]), equating to 12544 inputs
INFO: Next layer shape: torch.Size([1, 32, 14, 14]), equating to 6272 inputs
INFO: FC input size is 6272


In [None]:
from src.models.cnn import CNNEvaluator


# Build an evaluator from the shared CNN configuration and run it with the
# evaluator's default arguments (no `times` value is passed here).
cnn_evaluator = CNNEvaluator(cnn_params)
cnn_evaluator.evaluate_model()


DEBUG: Train Epoch:  1 [ 128/4000] Loss: 2.4665
DEBUG: Train Epoch:  1 [ 768/4000] Loss: 1.4402
DEBUG: Train Epoch:  1 [1408/4000] Loss: 1.1744
DEBUG: Train Epoch:  1 [2048/4000] Loss: 1.0672
DEBUG: Train Epoch:  1 [2688/4000] Loss: 0.8749
DEBUG: Train Epoch:  1 [3328/4000] Loss: 0.5647
DEBUG: Train Epoch:  1 [3968/4000] Loss: 0.4820
DEBUG: Test set: Average loss: 0.8719, Accuracy: 579/800 (72.38%)
DEBUG: Test set: Average loss: 0.8719, Accuracy: 579/800 (72.38%)


{'max': 72.375, 'mean': np.float64(72.375), 'std': np.float64(0.0)}

# Quantization Notes

## Techniques

- Bitwidth quantization
- BNN: Activation func (Using STE for gradients)
- BNN_ReSTE: Activation func (Changing gradients: ReSTE)
- TNN: Activation func (Differentiable? I'll need to implement it)
- ... Find more?

## How to integrate them

Bitwidth quantization needs:
- Quantization mode (Deterministic, Stochastic)
- Per-layer: quantization level
- Advanced: per-perceptron quantization level, at least for the first layer

Input bitwidth quantization. Closely tied to BNNs, as every other layer works with binary inputs
- Per-input-neuron quantization level

### BNN
The idea is that every hidden layer only computes using binary (0/1, -1/+1) inputs & weights. Afaik bias isn't present. The FC layer is simplified using an XNOR + popcount operation. CNN layers are simplified in a similar manner.
- In the paper, they use hardtanh and ReLU. They also state that binarization itself is a form of non-linearity (used for hidden units). It's weird that they still use hardtanh even though they state that binarization itself is a form of non-linearity.

Parameters:
- None, I suppose.

### BNN ReSTE
Same as BNN, except that:
1. ReSTE is used instead of STE. ReSTE specifies a function to better estimate the quantized activation gradient.
2. Gradients smaller than -1 and bigger than 1 are set to 0

Parameters
- o: Used for backprop. Modifies the approximated gradient.
- t: Threshold
- ...

### TWN, Ternary weight networks
Weights are one of: (-1, 0, 1). The paper focuses on CNNs. Nothing is mentioned about activation quantization. The typical pipeline is used: Conv -> BatchNorm -> Activation -> Maybe Pooling (every 2 conv layers?). Then FC layers.

Notes:
- The paper uses SGD.
- We will likely combine it with neuron quantization.
- FC & Conv do not have a bias.
- I could apply something like ReSTE to this by using $y = \frac{2\arctan(10x^3)}{\pi}$

Parameters:
- Threshold within which weight is set to 0?
- ...

### Idea: adaptive quantization -> static quantization
During training use adaptive quantization, and for inference convert it to static quantization.
- A potential problem: overflow, underflow?
- ...

### Some recap

TWN and BNN do the same thing at their core: quantize weights. Extra things can be added, like:
- binary or ternary activation (activation returns either (-1/1) or (-1/0/1)), or ReLU + bitwidth quantization.
- Input layer bitwidth quantization. (I should prefer per input quantization)
