A script trying to implement and make sense of the different similarity metrics

In [123]:
import importlib
import numpy as np
import sklearn
from matplotlib import pyplot as plt
from os.path import join
import os
import seaborn as sns
import lib.utils_RSA as rsa
from lib.algos import *
from scipy.spatial import procrustes as scipro
import lib.utils_CKA as cka

# Reload the local helper modules so that edits made to their source files
# are picked up without restarting the kernel.
importlib.reload(rsa)
importlib.reload(cka)

<module 'lib.utils_CKA' from '/home/alban/SAYCam_Vs_EGO4D/lib/utils_CKA.py'>

In [72]:
### Read, for every model, the list of image paths and the stored activations
dataset = 'ecoVal'
models  = ['ego', 'saycam', 'imagenet', 'supervised', 'random', 'resnet']
path2activations = f'/data/alban/activations_datadriven/%s_{dataset}/'  # %s is filled in with the model name

imagelists, activations = {}, {}
for model in models:
    model_dir = path2activations % model
    # one image path per line, in the order the images were presented
    with open(join(model_dir, 'imagepaths.txt'), 'r') as f:
        imagelists[model] = [line.strip() for line in f]
    activations[model] = np.load(join(model_dir, 'cls_tokens.npy'))

# display the shape of the last loaded activation array
activations[models[-1]].shape

(28250, 2048)

In [73]:
### Check that images were shown in the same order to every model.
# A bare `==` expression in the middle of a cell is evaluated and then discarded,
# so the check is made explicit with an assertion and extended to all loaded models.
models_out_of_order = [name for name in imagelists if imagelists[name] != imagelists['ego']]
assert not models_out_of_order, f'image order differs from ego for: {models_out_of_order}'
imagelist = imagelists['ego'] # since they are the same, only consider one list

#### Check that each category has the same number of images and list all categories in listcat
CATEGORY_PATH_INDEX = 7  # position of the category name in the '/'-split image path
listcat = list()         # category names, one entry per run of consecutive images
images_per_cat = list()  # number of images in each of those runs (parallel to listcat)
for imgp in imagelist:
    current_cat = imgp.split('/')[CATEGORY_PATH_INDEX]
    if listcat and listcat[-1] == current_cat:
        images_per_cat[-1] += 1
    else:
        # first image of a new run: open a new category
        listcat.append(current_cat)
        images_per_cat.append(1)

assert images_per_cat, 'imagelist is empty'
# the subsampling done later with [::nb_per_cat] is only valid if every category has the same size
assert len(set(images_per_cat)) == 1, f'categories have different numbers of images: {sorted(set(images_per_cat))}'

nb_per_cat = images_per_cat[0] # in val, 50 images per category


In [74]:
### Only select one image per category to play with metrics as a toy example.
# The subsampling overwrites `activations`, so it is guarded to keep the cell safe to re-run:
# an array is sliced only while it still has one row per image in `imagelist`.
nb_images = len(imagelist)
nb_kept = len(range(0, nb_images, nb_per_cat))  # rows left after keeping every nb_per_cat-th image
for model in models:
    nb_rows = len(activations[model])
    if nb_rows == nb_images:
        activations[model] = activations[model][::nb_per_cat]
    elif nb_rows != nb_kept:
        # neither the full set nor an already subsampled one: rows cannot be matched to images
        raise ValueError(
            f'{model}: expected {nb_images} (full) or {nb_kept} (subsampled) activation rows, got {nb_rows}'
        )

One thing I really want to check and understand is the supposed equivalence of linear CKA and RSA with L2squared similarity (cf. Williams, 2024)

In [124]:
### Build one RDM per (model, dissimilarity metric) pair
metrics = ['pearson', 'L2', 'L2squared', 'L2_normalize', 'L2squared_normalize']
RDMs = {}
for model in models:
    # inner dictionary: metric name -> RDM computed from this model's activations
    RDMs[model] = {
        metric: rsa.compute_RDMs(activations[model], metric = metric, display = False, title = f'{model}_{metric}')
        for metric in metrics
    }

In [102]:
### Mean absolute difference between the RDMs of every pair of metrics (one value per model)
for first, metric1 in enumerate(metrics[:-1]):
    # only pair metric1 with the metrics that come after it, so each pair is printed once
    for metric2 in metrics[first + 1:]:
        mean_abs_diffs = [
            np.absolute(RDMs[model][metric1] - RDMs[model][metric2]).mean()
            for model in models
        ]
        print(f'{metric1} VS {metric2} is {[float(x) for x in mean_abs_diffs]}')


pearson VS L2 is [32.50833648805175, 21.752096620230294, 19.497312252587925, 250.87047294874097, 29.655785706540296, 21.271819621948634]
pearson VS L2squared is [1110.4616597742029, 503.07821215580566, 407.18682490978625, 64490.55313326643, 1005.4890152237215, 499.64711311789944]
pearson VS L2_normalize is [0.4731603241367399, 0.494150933651756, 0.4934584030220974, 0.4396358181617593, 0.4387060263826232, 0.05448409969106015]
pearson VS L2squared_normalize is [0.7469598066657239, 0.5762094870679715, 0.5928130206772492, 0.8981276256973455, 0.6550420013816447, 0.12124288074351242]
L2 VS L2squared is [1077.953323286151, 481.3261155355754, 387.68951265719835, 64239.6826603177, 975.8332295171813, 478.3752934959509]
L2 VS L2_normalize is [32.035176163915004, 21.257945686578534, 19.003853849565825, 250.43083713057916, 29.217079680157685, 21.237889837978376]
L2 VS L2squared_normalize is [31.761376681386018, 21.175887133162323, 18.904499231910673, 249.97234532304358, 29.00074370515865, 21.385043

In absolute values, we find vastly different RDMs. How about in terms of correlations?

In [100]:
### Compute correlations between the RDMs obtained with the different metrics
# NOTE(review): the whole RDM is flattened, so the diagonal entries are part of the
# correlation (and presumably both symmetric halves) -- confirm this is intended.
for first, metric1 in enumerate(metrics[:-1]):
    # each unordered pair of metrics is visited exactly once
    for metric2 in metrics[first + 1:]:
        corrs = list()
        for model in models:
            flat1 = RDMs[model][metric1].flatten()
            flat2 = RDMs[model][metric2].flatten()
            pearson_r = np.corrcoef(flat1, flat2)[0,1]
            corrs.append(np.round(pearson_r, 3))
        print(f'{metric1} VS {metric2} is {[float(x) for x in corrs]}')


pearson VS L2 is [0.871, 0.893, 0.899, 0.212, 0.982, 0.567]
pearson VS L2squared is [0.849, 0.876, 0.868, 0.13, 1.0, 0.494]
pearson VS L2_normalize is [0.948, 0.963, 0.96, 0.952, 0.982, 0.862]
pearson VS L2squared_normalize is [1.0, 1.0, 1.0, 1.0, 1.0, 0.846]
L2 VS L2squared is [0.959, 0.977, 0.977, 0.984, 0.982, 0.976]
L2 VS L2_normalize is [0.894, 0.891, 0.881, 0.28, 1.0, 0.599]
L2 VS L2squared_normalize is [0.871, 0.893, 0.899, 0.212, 0.982, 0.588]
L2squared VS L2_normalize is [0.787, 0.819, 0.796, 0.161, 0.982, 0.533]
L2squared VS L2squared_normalize is [0.85, 0.876, 0.869, 0.13, 1.0, 0.546]
L2_normalize VS L2squared_normalize is [0.948, 0.963, 0.96, 0.952, 0.982, 0.982]


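We find that the RDMs obtained with the various metrics are all highly correlated, except for the ResNet trained on SAYCam (last model) --> thus an effect of architecture. What does that mean? Note that the supervised model (fourth value) also shows low correlations whenever an unnormalized L2 metric is compared with pearson or with a normalized metric (e.g. 0.212 for pearson VS L2).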

The highest correlations are found for pearson and L2squared_normalize, showing they are almost perfectly equivalent!

In [111]:
### Compare the RDMs of every pair of models, for each RDM metric and each similarity measure
sim_metrics = ['cosine', 'pearson'] # We only consider cosine and pearson as similarity metrics for now

SIMs = {}     # SIMs[sim_metric][metric][model1][model2] -> similarity value
list_sim = {} # list_sim[sim_metric][metric] -> same values as a flat list, to compare with CKA later
for sim_metric in sim_metrics:
    SIMs[sim_metric] = {}
    list_sim[sim_metric] = {}
    for metric in metrics:
        per_model = {}
        flat_values = list()
        for pos, model1 in enumerate(models[:-1]):
            per_model[model1] = {}
            # pair model1 only with the models that come after it
            for model2 in models[pos + 1:]:
                score = rsa.Compute_sim_RDMs(RDMs[model1][metric], RDMs[model2][metric], center = True, metric = sim_metric)
                score = float(np.round(score, 3))
                per_model[model1][model2] = score
                flat_values.append(score)
        SIMs[sim_metric][metric] = per_model
        list_sim[sim_metric][metric] = flat_values

In [110]:
### Linear CKA between the activations of every pair of models
CKA = {}        # CKA[model1][model2] -> linear CKA value
list_cka = []   # same values as a flat list, to compare with the similarities computed previously
for pos, model1 in enumerate(models[:-1]):
    row = {}
    # pair model1 only with the models that come after it
    for model2 in models[pos + 1:]:
        value = cka.linear_CKA(activations[model1], activations[model2])
        value = float(np.round(value, 3))
        row[model2] = value
        list_cka.append(value)
    CKA[model1] = row

In [121]:
### Correlate each list of model-pair similarities with the linear CKA values
cka_values = np.array(list_cka)  # identical for every comparison, so it is built once
for sim_metric in sim_metrics:
    for metric in metrics:
        similarity_values = np.array(list_sim[sim_metric][metric])
        print(f'{sim_metric} {metric}')
        print(np.corrcoef(similarity_values, cka_values)[0,1])

cosine pearson
0.9636555047137862
cosine L2
0.9668148899558918
cosine L2squared
0.9687370579151874
cosine L2_normalize
0.96215011651911
cosine L2squared_normalize
0.9635437111837866
pearson pearson
0.9634542377369398
pearson L2
0.9661662314053684
pearson L2squared
0.9687370579151874
pearson L2_normalize
0.9614215826299306
pearson L2squared_normalize
0.9633040736089267


It seems that, independently of the similarity measure used here, the resulting similarities are equally well correlated with linear CKA, at around 0.96.

We can look at other metrics, such as CCA and Procrustes

In [82]:
### Run custom procrustes analysis
#d, Z, T = procrustes(activations['saycam'], activations['ego'])
### Run scipy procrustes analysis as a control
#mtx1, mtx2, dsci = scipro(activations['saycam'], activations['ego'])
#print([d, dsci])
### --> Both algos agree with each other, and the disparity measures are pretty high (somewhat unexpectedly)