In [1]:
# Remove any previous `sngp_wrapper` directory so the clone in the next cell starts from a clean state.
!rm -rf sngp_wrapper

This is a custom implementation of spectral normalization and a Gaussian process (via Laplace approximation).




In [2]:
!git clone https://github.com/iamownt/sngp_wrapper.git
!mv sngp_wrapper/sngp_wrapper/* sngp_wrapper/
!rm -r sngp_wrapper/sngp_wrapper
! pip install timm

Cloning into 'sngp_wrapper'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 39 (delta 8), reused 36 (delta 8), pack-reused 0 (from 0)[K
Unpacking objects: 100% (39/39), 21.44 KiB | 510.00 KiB/s, done.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import timm
import torch.nn as nn
import torch
import torchvision
from sngp_wrapper.covert_utils import convert_to_sn_my, replace_layer_with_gaussian


class ConvNextTinyGP(nn.Module):
    """ConvNeXt-Tiny feature extractor followed by a separate, replaceable classification head.

    The torchvision head is removed from the backbone and a fresh ``nn.Linear``
    is exposed as ``self.classifier`` so that it can later be swapped for
    another layer (e.g. a Gaussian-process head) by attribute name.
    """

    def __init__(self, num_classes: int):
        """Build the pretrained backbone and a linear head with ``num_classes`` outputs."""
        super().__init__()
        feature_extractor = torchvision.models.convnext_tiny(weights="ConvNeXt_Tiny_Weights.IMAGENET1K_V1")
        # Read the feature width from the original head before discarding it,
        # instead of hard-coding it (768 for ConvNeXt-Tiny).
        in_features = feature_extractor.classifier[-1].in_features
        # NOTE(review): replacing the whole head also drops any layers torchvision
        # keeps in front of the final Linear — confirm that this is intended.
        feature_extractor.classifier = nn.Identity()
        self.feature_extractor = feature_extractor
        self.flatten = nn.Flatten(start_dim=1, end_dim=-1)
        self.classifier = nn.Linear(in_features, num_classes)

    def forward(self, x, kwargs=None):
        """Run the backbone and the classification head.

        Args:
            x: input batch passed to the feature extractor.
            kwargs: optional dict forwarded as keyword arguments to
                ``self.classifier``. ``None`` (the default) forwards nothing,
                which is what a plain ``nn.Linear`` head requires.

        Returns:
            Whatever ``self.classifier`` returns for the flattened features.
        """
        features = self.flatten(self.feature_extractor(x))
        return self.classifier(features, **(kwargs or {}))

def _count_parameters(module):
    """Return the total number of elements across all parameters of ``module``."""
    return sum(param.numel() for param in module.parameters())


# Instantiate the 1000-class model and report its size before conversion.
model = ConvNextTinyGP(num_classes=1000)
print("parameters before conversion", _count_parameters(model))

# Convert the listed layer types to their spectral-normalized form.
# NOTE(review): in the model printout further down, Linear layers appear as
# ParametrizedLinear while Conv2d layers are unchanged — "Conv2D" may not match
# PyTorch's "Conv2d" class name; confirm against convert_to_sn_my.
sigma_reparam_model = convert_to_sn_my(
    model,
    spec_norm_replace_list=["Linear", "Conv2D"],
    spec_norm_bound=2.,
)
print("parameters after conversion", _count_parameters(sigma_reparam_model))
# print(sigma_reparam_model)

parameters before conversion 28587592
parameters after conversion 28587592


In [4]:
# Settings forwarded to replace_layer_with_gaussian when the model's
# "classifier" layer is swapped for the Gaussian-process head.
GP_KWARGS = dict(
    num_inducing=2048,
    gp_scale=1.0,
    gp_bias=0.,
    gp_kernel_type='gaussian',  # alternative: 'linear'
    gp_input_normalization=True,
    gp_cov_discount_factor=-1,
    gp_cov_ridge_penalty=1.,
    gp_output_bias_trainable=False,
    gp_scale_random_features=False,
    gp_use_custom_random_features=True,
    gp_random_feature_type='orf',
    gp_output_imagenet_initializer=True,
    num_classes=1000,  # keep in sync with the num_classes used to build the model
)
replace_layer_with_gaussian(
    container=sigma_reparam_model,
    signature="classifier",
    **GP_KWARGS,
)

Model is equipped with gaussian process (laplace approximation)

In [5]:
# Inspect the module tree. The printout of `model` (not only `sigma_reparam_model`)
# lists ParametrizedLinear layers, i.e. the original module tree carries the conversion.
print(model)

ConvNextTinyGP(
  (feature_extractor): ConvNeXt(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
        (1): LayerNorm2d((96,), eps=1e-06, elementwise_affine=True)
      )
      (1): Sequential(
        (0): CNBlock(
          (block): Sequential(
            (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
            (1): Permute()
            (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
            (3): ParametrizedLinear(
              in_features=96, out_features=384, bias=True
              (parametrizations): ModuleDict(
                (weight): ParametrizationList(
                  (0): _SpectralNorm()
                )
              )
            )
            (4): GELU(approximate='none')
            (5): ParametrizedLinear(
              in_features=384, out_features=96, bias=True
              (parametrizations): ModuleDict(
                (weight): P

In [6]:
# Forward pass with every optional GP output and matrix update switched off.
kwargs = dict.fromkeys(
    (
        "return_random_features",
        "return_covariance",
        "update_precision_matrix",
        "update_covariance_matrix",
    ),
    False,
)
demo_inputs = torch.randn(10, 3, 224, 224)
output = sigma_reparam_model(demo_inputs, kwargs)
print(output)

tensor([[ 0.2264, -0.4514, -0.0212,  ...,  0.4668,  0.1289,  0.0222],
        [ 0.2262, -0.4517, -0.0217,  ...,  0.4671,  0.1295,  0.0222],
        [ 0.2252, -0.4493, -0.0212,  ...,  0.4673,  0.1247,  0.0394],
        ...,
        [ 0.2240, -0.4505, -0.0231,  ...,  0.4672,  0.1316,  0.0290],
        [ 0.2262, -0.4519, -0.0215,  ...,  0.4671,  0.1295,  0.0221],
        [ 0.2280, -0.4500, -0.0224,  ...,  0.4689,  0.1285,  0.0234]],
       grad_fn=<AddBackward0>)


**Simple Example**

In [7]:
# Two random batches: the "in-domain" one and a copy of the distribution shifted by +1 as "OOD".
ind_data = torch.randn(10, 3, 224, 224)
ood_data = 1 + torch.randn(10, 3, 224, 224)

# Accumulate the precision matrix on the in-domain batch (we remember the in-domain data),
# then update the covariance matrix once before querying uncertainties.
fit_kwargs = {"update_precision_matrix": True}
for _ in range(10):
    sigma_reparam_model(ind_data, fit_kwargs)
sigma_reparam_model.classifier.update_covariance_matrix()

# Query both batches without touching the precision matrix, asking for the covariance as well.
eval_kwargs = {"update_precision_matrix": False, "return_covariance": True}
ind_output = sigma_reparam_model(ind_data, eval_kwargs)
ood_output = sigma_reparam_model(ood_data, eval_kwargs)
(ind_prob, ind_cov), (ood_prob, ood_cov) = ind_output, ood_output


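We expect the mean uncertainty to differ between in-domain and OOD data. Note that in the run recorded below the two means are nearly identical (0.0427 vs. 0.0426), so this random-noise toy example does not by itself demonstrate the gap.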

In [8]:
# The diagonal of each returned covariance matrix is used as the per-input uncertainty.
ind_uncertainty = ind_cov.diagonal(offset=0)
ood_uncertainty = ood_cov.diagonal(offset=0)

ind_mean = ind_uncertainty.mean()
ood_mean = ood_uncertainty.mean()
print("ind_uncertainty", ind_uncertainty, "ind mean", ind_mean)
print("ood_uncertainty", ood_uncertainty, "ood mean", ood_mean)

ind_uncertainty tensor([0.0240, 0.0131, 0.1162, 0.0139, 0.0626, 0.0131, 0.0585, 0.0295, 0.0833,
        0.0132], grad_fn=<DiagonalBackward0>) ind mean tensor(0.0427, grad_fn=<MeanBackward0>)
ood_uncertainty tensor([0.1526, 0.0941, 0.0273, 0.0205, 0.0270, 0.0205, 0.0225, 0.0205, 0.0205,
        0.0205], grad_fn=<DiagonalBackward0>) ood mean tensor(0.0426, grad_fn=<MeanBackward0>)


**Important Notes for users:**

1. The RFF-GP layer is implemented based on `tfm.nlp.layers.RandomFeatureGaussianProcess`. I have tested it on several foundation models at the 0.5B and 1B scale (as well as an Inception architecture), and things work well.
2. In the forward pass, you can set `update_precision_matrix` to control whether the precision matrix is updated (default: True).
3. Remember to call `model.classifier.update_covariance_matrix()` **once** before evaluating the model with uncertainty quantification.
4. Remember to call `model.classifier.reset_covariance_matrix()` **at the beginning of each epoch**.
5. When you want to test OOD detection ability, set `return_covariance=True` to also get the covariance matrix of the inputs; you can then take its diagonal entries as the uncertainty values.
6. If you struggle to tune the hyperparameters, note that with a higher spectral norm bound the constraint becomes loose and may not affect the model. You can also safely change the Gaussian kernel to a linear kernel, in which case the model recovers the original model.
7. The SN module can be absorbed into the weights at inference time to speed up inference, but I haven't implemented this in this repository (maybe in others).