<h1 align=center>Compute the Effective Dimensionality of a Model</h1>

> :warning: The following notebook is wrong: it computes the singular values of each batch separately and then sums all the singular values across batches.
> The correct way to do it is to stack all the batches on top of each other and then compute the singular values of the stacked matrix. See [ed-calculation.ipynb](./ed-calculation.ipynb).

**TODO**: Add comments and a description to the notebook.

In [None]:
import os
import numpy as np

In [None]:
# Table of model identifiers, each mapped to a list of four layer names.
# The keys are what the driver loop passes to compute(), which uses them as
# sub-folder names under output_dir.
# NOTE(review): the layer lists are not read by any cell visible here, and the
# meaning of the four positions is undocumented (some entries repeat a layer) —
# confirm against whatever produced this table.
models = {
  'alexnet': ['features.2', 'features.7', 'features.7', 'features.12'],
  'deit_base_imagenet_full_seed-0': ['blocks.3.mlp.fc1', 'blocks.8.norm2', 'blocks.3.mlp.act', 'blocks.9.norm2'],
  'resnet-50-robust': ['layer3.0.downsample.0', 'layer4.0.downsample.0', 'layer3.0.downsample.0', 'layer4.0.downsample.0'],
  'deit_large_imagenet_full_seed-0': ['blocks.4.norm1', 'blocks.18.norm2', 'blocks.9.norm1', 'blocks.20.norm2'],
  'resnet152_imagenet_full': ['layer1.0.bn1', 'layer3.3.bn3', 'layer3.0.bn3', 'layer3.34.bn3'],
  'resnext101_32x32d_wsl': ['layer1.0.relu', 'layer3.0.relu', 'layer2.0.relu', 'layer3.21.relu'],
  'convnext_small_imagenet_100_seed-0': ['features.5.2.block.0', 'features.5.17.block.0', 'features.4.0', 'features.5.9.block.0'],
  'convnext_small_imagenet_10_seed-0': ['features.5.2.block.0', 'features.5.17.block.0', 'features.4.0', 'features.5.9.block.0'],
  'resnext101_32x48d_wsl': ['layer2.2.relu', 'layer3.0.relu', 'layer2.0.relu', 'layer3.20.relu'],
  'resnet50_ecoset_full': ['layer1.0.bn1', 'layer4.0.conv2', 'layer3.0.conv1', 'layer4.0.relu'],
  'resnet50_imagenet_100_seed-0': ['layer1.0.conv1', 'layer3.5.bn3', 'layer3.0.conv1', 'layer4.0.relu'],
  'resnet101_ecoset_full': ['layer1.0.bn1', 'layer3.4.relu', 'layer3.0.bn3', 'layer4.0.relu'],
  'resnext101_32x8d_wsl': ['layer2.3.relu', 'layer3.4.relu', 'layer2.1.relu', 'layer3.3.relu'],
  'convnext_small_imagenet_full_seed-0': ['features.5.2.block.0', 'features.5.17.block.0', 'features.4.0', 'features.5.9.block.0'],
  'convnext_tiny_imagenet_full_seed-0': ['features.6.0', 'features.5.4.block.0', 'features.4.0', 'features.5.4.block.0'],
  'deit_small_imagenet_100_seed-0': ['blocks.2.norm1', 'blocks.6.norm2', 'blocks.5.norm1', 'blocks.9.norm2'],
  'convnext_base_imagenet_full_seed-0': ['features.5.7.block.0', 'features.5.12.block.0', 'features.4.0', 'features.5.11.block.0'],
  'resnet50_tutorial': ['layer2', 'layer2', 'layer2', 'layer3'],
  'resnet101_imagenet_full': ['layer1.0.bn1', 'layer4.0.bn1', 'layer3.0.bn3', 'layer4.0.relu'],
  'convnext_large_imagenet_full_seed-0': ['features.5.7.block.5', 'features.5.7.block.0', 'features.4.1', 'features.5.11.block.0'],
  'resnet50_imagenet_full': ['layer1.0.conv1', 'layer3.5.bn3', 'layer3.0.conv1', 'layer4.0.relu'],
  'resnet18_imagenet_full': ['layer1.0.bn1', 'layer3.0.conv2', 'layer2.0.bn2', 'layer4.0.bn1'],
  'resnet152_ecoset_full': ['layer1.0.bn1', 'layer3.3.bn3', 'layer3.0.bn3', 'layer4.0.relu'],
  'resnet18_ecoset_full': ['layer1.0.conv1', 'layer3.0.conv1', 'layer2.0.bn2', 'layer4.0.bn1'],
  'resnet-152_v2_pytorch': ['avgpool', 'layer4.1.relu', 'layer4.1.relu', 'layer4.1.bn2'],
  'resnet34_ecoset_full': ['layer1.0.bn1', 'layer3.1.conv1', 'layer3.0.conv1', 'layer4.0.conv1'],
  'resnet18_imagenet21kP': ['layer2.0.relu', 'layer2.0.relu', 'layer2.0.relu', 'layer4.0.relu'],
  'deit_small_imagenet_full_seed-0': ['blocks.2.norm1', 'blocks.6.norm2', 'blocks.5.norm1', 'blocks.9.norm2']
}

In [None]:
def flatten(layer_output):
  """Collapse every axis after the first into one, keeping the first axis as-is.

  Args:
    layer_output: array exposing `.shape` and `.reshape`.

  Returns:
    2D array of shape (layer_output.shape[0], product of the remaining axes).
  """
  leading = layer_output.shape[0]
  return layer_output.reshape((leading, -1))

In [None]:
def global_avg_pooling(layer_output):
  """Average a 4D array over its last two axes.

  Args:
    layer_output: array with exactly four dimensions. Axes 2 and 3 are averaged
      away (presumably batch x channels x height x width — confirm against the
      code that saved the activations).

  Returns:
    2D array of shape `layer_output.shape[:2]`.

  Raises:
    ValueError: if `layer_output` does not have exactly four dimensions.
  """
  ndim = len(layer_output.shape)
  if ndim != 4:
    # Report the rank (e.g. "2D"); the shape tuple itself is appended for context.
    raise ValueError(
      f"Input features must be a 4D array instead of {ndim}D (shape {layer_output.shape})"
    )
  return layer_output.mean(axis=(2, 3))

In [None]:
# Root directory (relative to the current working directory) under which
# compute() looks for one sub-folder of .npz files per model
output_dir = os.getcwd() + '/data/output'

In [None]:
# Module-level accumulators and a counter.
# NOTE(review): none of these are read by the cells visible here — confirm
# whether a later cell still needs them.
aggregated_sums_num, aggregated_sums_denom = {}, {}
file_count = 0

In [None]:
# Pre-processing switches read by compute() for every loaded array:
# pooling is applied first (when enabled), then flattening (when enabled).
use_global_avg_pooling = True
use_flatten = False

In [None]:
def compute(key):
  """Compute and save the effective dimensionality of every array stored for one model.

  Every `.npz` file in `<output_dir>/<key>` is loaded. Each stored array is
  optionally pooled and/or flattened (module-level `use_global_avg_pooling` and
  `use_flatten` flags), and the arrays from all files are stacked row-wise per
  array name. For each stacked matrix the value
  `(sum of singular values)**2 / (sum of squared singular values)` is computed.
  The results are printed and written to `<output_dir>/<key>/ed.npz`.

  Args:
    key: name of the model sub-folder inside `output_dir`.

  Returns:
    dict mapping each array name found in the files to its effective
    dimensionality.

  Raises:
    FileNotFoundError: if the folder contains no `.npz` activation files (or the
      folder itself is missing, raised by `os.listdir`).
  """
  folder = os.path.join(output_dir, key)
  output_name = 'ed.npz'

  # Gather the per-file arrays first; stacking must only happen once every file
  # has been read, otherwise the per-layer lists turn into arrays mid-loop.
  batches = {}
  # Sorted for a reproducible load order. The result file of a previous run lives
  # in the same folder and must not be read back as activations.
  for file_name in sorted(os.listdir(folder)):
    if not file_name.endswith('.npz') or file_name == output_name:
      continue
    file_path = os.path.join(folder, file_name)
    print(f'loading {file_path}')
    # The context manager closes the archive's underlying file handle.
    with np.load(file_path) as data:
      for layer_name in data.files:
        array = data[layer_name]
        if use_global_avg_pooling:
          array = global_avg_pooling(array)
        if use_flatten:
          array = flatten(array)
        print(array.shape)
        batches.setdefault(layer_name, []).append(array)

  if not batches:
    raise FileNotFoundError(f'no .npz activation files found in {folder}')

  # NOTE(review): the ratio is taken over the raw singular values of the stacked,
  # uncentered matrix — confirm this is the intended definition.
  effective_dimensionality = {}
  for layer_name, arrays in batches.items():
    stacked = np.vstack(arrays)
    print(stacked.shape)
    singular_values = np.linalg.svd(stacked, compute_uv=False)
    effective_dimensionality[layer_name] = (
      singular_values.sum() ** 2 / np.sum(singular_values ** 2)
    )

  output_file = os.path.join(folder, output_name)
  print(effective_dimensionality)
  np.savez(output_file, **effective_dimensionality)
  return effective_dimensionality

In [None]:
# Run compute() once per entry of the models table, passing the model name.
# `layers` is unpacked but not used inside this loop.
for key, layers in models.items():
  compute(key)