In [1]:
import numpy as np
import os

def flatten(layer_output):
    return layer_output.reshape(layer_output.shape[0], -1)

def global_avg_pooling(layer_output):
    if len(layer_output.shape) != 4:
        raise ValueError(f"Input features must be a 4D array instead of {layer_output.shape}D")
    return layer_output.mean(axis=(2, 3))

# Define the directory containing .npz files
folder_path = 'extracted_data'
output_path = f'{folder_path}/ed'

use_global_avg_pooling = False
use_flatten = False

# Initialize a dictionary to store covariance matrices
covariance_matrices = {}
total_samples = {}

for file_name in os.listdir(folder_path):
    if file_name.endswith('.npz'):
        file_path = os.path.join(folder_path, file_name)
        print(f'loading {file_path}')
        data = np.load(file_path)
        for key in data.files:
            array = data[key]
            array = array[:, :1024]

            if use_global_avg_pooling:
                array = global_avg_pooling(array)

            if use_flatten:
                array = flatten(array)

            if key not in covariance_matrices:
                # Initialize covariance matrix and sample count for this key
                feature_dim = array.shape[1]
                covariance_matrices[key] = np.zeros((feature_dim, feature_dim))
                total_samples[key] = 0

            # Update covariance matrix incrementally
            covariance_matrices[key] += array.T @ array
            total_samples[key] += array.shape[0]

# Compute effective dimensionality
effective_dimensionality = {}
for key, cov_matrix in covariance_matrices.items():
    # Normalize by total samples to get the final covariance matrix
    cov_matrix /= total_samples[key]

    # Compute singular values (sqrt of eigenvalues)
    singular_values = np.sqrt(np.linalg.eigvalsh(cov_matrix))
    effective_dimensionality[key] = (singular_values.sum())**2 / (np.sum(singular_values**2))

# Save results
output_file = os.path.join(output_path, 'new-ed.npz')
print(effective_dimensionality)
np.savez(output_file, **effective_dimensionality)

loading extracted_data/deit_large_imagenet_full_seed-0-11.npz
loading extracted_data/deit_large_imagenet_full_seed-0-18.npz
loading extracted_data/deit_large_imagenet_full_seed-0-39.npz
loading extracted_data/deit_large_imagenet_full_seed-0-14.npz
loading extracted_data/deit_large_imagenet_full_seed-0-49.npz
loading extracted_data/deit_large_imagenet_full_seed-0-24.npz
loading extracted_data/deit_large_imagenet_full_seed-0-32.npz
loading extracted_data/deit_large_imagenet_full_seed-0-19.npz
loading extracted_data/deit_large_imagenet_full_seed-0-34.npz
loading extracted_data/deit_large_imagenet_full_seed-0-8.npz
loading extracted_data/deit_large_imagenet_full_seed-0-50.npz
loading extracted_data/deit_large_imagenet_full_seed-0-28.npz
loading extracted_data/deit_large_imagenet_full_seed-0-46.npz
loading extracted_data/deit_large_imagenet_full_seed-0-26.npz
loading extracted_data/deit_large_imagenet_full_seed-0-7.npz
loading extracted_data/deit_large_imagenet_full_seed-0-45.npz
loading ex

  singular_values = np.sqrt(np.linalg.eigvalsh(cov_matrix))


KeyboardInterrupt: 

In [2]:
print(total_samples)

{'blocks.0.norm1': 50000, 'blocks.0.norm2': 50000, 'blocks.1.norm1': 50000, 'blocks.1.norm2': 50000, 'blocks.2.norm1': 50000, 'blocks.2.norm2': 50000, 'blocks.3.norm1': 50000, 'blocks.3.norm2': 50000, 'blocks.4.norm1': 50000, 'blocks.4.norm2': 50000, 'blocks.5.norm1': 50000, 'blocks.5.norm2': 50000, 'blocks.6.norm1': 50000, 'blocks.6.norm2': 50000, 'blocks.7.norm1': 50000, 'blocks.7.norm2': 50000, 'blocks.8.norm1': 50000, 'blocks.8.norm2': 50000, 'blocks.9.norm1': 50000, 'blocks.9.norm2': 50000, 'blocks.10.norm1': 50000, 'blocks.10.norm2': 50000, 'blocks.11.norm1': 50000, 'blocks.11.norm2': 50000, 'blocks.12.norm1': 50000, 'blocks.12.norm2': 50000, 'blocks.13.norm1': 50000, 'blocks.13.norm2': 50000, 'blocks.14.norm1': 50000, 'blocks.14.norm2': 50000, 'blocks.15.norm1': 50000, 'blocks.15.norm2': 50000, 'blocks.16.norm1': 50000, 'blocks.16.norm2': 50000, 'blocks.17.norm1': 50000, 'blocks.17.norm2': 50000, 'blocks.18.norm1': 50000, 'blocks.18.norm2': 50000, 'blocks.19.norm1': 50000, 'bloc

In [7]:
print(total_samples['blocks.21.norm2'])

50000


In [4]:
layers = ['blocks.4.norm1', 'blocks.18.norm2', 'blocks.9.norm1', 'blocks.20.norm2']

In [5]:
for key in layers:
    total_samples[key] = np.vstack(total_samples[key])

effective_dimensionality = {}
for key, array in total_samples.items():
    singular_values = np.linalg.svd(array, compute_uv=False)
    effective_dimensionality[key] = (singular_values.sum())**2 / (np.sum(singular_values**2))

output_file = os.path.join(output_path, 'new-ed.npz')
print(effective_dimensionality)

TypeError: arrays to stack must be passed as a "sequence" type such as list or tuple.