In [None]:
import importlib
import numpy as np
import sklearn
from matplotlib import pyplot as plt
from os.path import join
import os
import seaborn as sns
from torchvision.ops.misc import interpolate
from tqdm.notebook import tqdm

#### Custom libraries
import lib.algos_maxRSA as max_rsa
import lib.utils_RSA as rsa
import lib.utils_CKA as cka
from lib.algos import *


# Re-import the project modules so that edits made to them on disk are picked
# up without restarting the kernel.
importlib.reload(rsa)
importlib.reload(cka)
importlib.reload(max_rsa)

In [None]:
# --- Experiment configuration ---
dataset = 'ecoLennyTest'
arch = 'vgg'
models = ['faces', 'dual', 'objects', 'random']
# Alternative model set: ['ego', 'saycam']
# The '%s' placeholder is filled in with the model name at load time.
path2activations = f'/home/alban/Documents/activations_datadriven/{arch}%s_{dataset}/'

# --- Load the image paths and the fc1 activations of every model ---
imagelists = {}
activations = {}
for model in models:
    model_dir = path2activations % model
    with open(join(model_dir, 'imagepaths.txt'), 'r') as f:
        imagelists[model] = [line.strip() for line in f]
    activations[model] = np.load(join(model_dir, 'fc1_outputs.npy'))

# `model` still refers to the last entry of `models` at this point.
imagelist = imagelists[model]
activations[model].shape

In [None]:
#### Scale every activation vector (row) to unit L2 norm
for model in models:
    acts = activations[model]
    activations[model] = acts / np.linalg.norm(acts, axis=1, keepdims=True)

In [None]:
### Check that the images were shown in the same order to every model
### (previously only the first two models were compared)
reference_model = models[0]
for model in models[1:]:
    assert imagelists[model] == imagelists[reference_model], (
        f'Image order differs between {reference_model} and {model}')
imagelist = imagelists[reference_model]  # identical for all models, so keep a single list

#### List the categories in order of appearance and check that each of them
#### contributes the same number of images
listcat = []         # category name of each contiguous run of images
images_per_run = []  # number of images in the matching run
for imgp in imagelist:
    current_cat = imgp.split('/')[-2]  # category = name of the parent directory
    if listcat and listcat[-1] == current_cat:
        images_per_run[-1] += 1
    else:
        listcat.append(current_cat)
        images_per_run.append(1)

# The later reshape into blocks of nb_per_cat rows is only valid when every
# category has the same number of images, so fail loudly otherwise.
assert len(set(images_per_run)) <= 1, (
    f'Categories have different numbers of images: {dict(zip(listcat, images_per_run))}')

nb_per_cat = images_per_run[-1] if images_per_run else 0  # in val, 50 images per category

nb_per_cat

In [None]:
### Reshape each model's activations to (-1, nb_per_cat, n_features), i.e. one
### block of nb_per_cat consecutive rows per category
cat_activations = dict(activations)

for model in models:
    n_features = activations[model].shape[-1]
    cat_activations[model] = activations[model].reshape(-1, nb_per_cat, n_features)

In [None]:

savedir = 'results/compactness/fisher_discriminant_vggDual/'
compactness_files = {
    name: join(savedir, f'{name}.npy')
    for name in ('compactness', 'sorted_compactness', 'sorted_compact_categories')
}

# Decide on the presence of the cached files rather than of the directory:
# an interrupted run can leave the directory behind without its results.
if all(os.path.isfile(path) for path in compactness_files.values()):
    # Computed and saved before: just load it. The objects were written with
    # np.save, hence allow_pickle=True and .item() to unwrap them.
    compactness = np.load(compactness_files['compactness'], allow_pickle=True).item()
    sorted_compactness = np.load(compactness_files['sorted_compactness'], allow_pickle=True).item()
    sorted_compact_categories = np.load(compactness_files['sorted_compact_categories'], allow_pickle=True).item()
else:
    os.makedirs(savedir, exist_ok=True)
    sorted_compactness, sorted_compact_categories, compactness = max_rsa.compute_compactness(cat_activations, models, listcat, measure = 'Fisher_discriminant')
    np.save(compactness_files['compactness'], compactness)
    np.save(compactness_files['sorted_compactness'], sorted_compactness)
    np.save(compactness_files['sorted_compact_categories'], sorted_compact_categories)

In [None]:
# Plot the sorted compactness of every model (the last argument presumably
# holds the axis labels — confirm against max_rsa.plot_stats_one).
fig_compactness, ax_compactness = max_rsa.plot_stats_one(
    sorted_compactness,
    models,
    ['Categories', 'Normalized var'],
)

In [None]:
def check_list_similarity(list1, list2):
    '''Return the overlap between two collections as a percentage.

    Duplicates and ordering are ignored: both inputs are converted to sets and
    the size of their intersection is divided by the size of the larger set.
    Returns 0 when both inputs are empty.'''
    unique1, unique2 = set(list1), set(list2)
    larger = max(len(unique1), len(unique2))
    if larger <= 0:
        return 0
    shared = unique1.intersection(unique2)
    return (len(shared) / larger) * 100

import seaborn as sns


# Percentage of categories shared by each pair of models among the first
# `top_n` entries of their sorted_compact_categories lists.
top_n = 50
model_overlap_matrix = np.zeros((len(models), len(models)))

for m1, model1 in enumerate(models):
    for m2, model2 in enumerate(models):
        model_overlap_matrix[m1, m2] = check_list_similarity(
            sorted_compact_categories[model1][:top_n],
            sorted_compact_categories[model2][:top_n])

plt.figure(figsize=(6, 6))
plt.rcParams['axes.grid'] = False
# 'gray' is the canonical colormap name and the one used by the other cells;
# the 'grey' spelling is not accepted by every Matplotlib version.
sns.heatmap(model_overlap_matrix,
        annot=True, fmt='.1f', cmap='gray',
        xticklabels=models, yticklabels=models, cbar=False)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

In [None]:
def sample_catrdm_pairs(cat_activations, submodels, n_samples=1000, nb_subcategories=12, nb_per_category = 50,
                                    batch_size=10, seed=None):
    """
    Sample random category subsets and compare two models' RDMs on each subset.

    For every sample, `nb_subcategories` distinct category indices are drawn
    without replacement. The activations of those categories are flattened to
    (nb_subcategories * nb_per_category, -1) and turned into one RDM per model
    with `rsa.compute_RDMs(..., metric='L2squared')`. The two RDMs are then
    compared with `rsa.Compute_sim_RDMs(..., center=False, metric='pearson')`.

    Parameters:
    -----------
    cat_activations : dict
        Maps a model name to an array whose first axis indexes categories.
    submodels : sequence of str
        Model names; the first two entries are the pair that is compared.
    n_samples : int
        Number of random category subsets to draw (default: 1000).
    nb_subcategories : int
        Number of categories per subset (default: 12).
    nb_per_category : int
        Number of images per category, used to flatten the selected
        activations (default: 50). Must match the layout of `cat_activations`.
    batch_size : int
        Number of samples handled per progress-bar step (default: 10).
    seed : int, optional
        If given, passed to `np.random.seed` before sampling. Note that this
        reseeds NumPy's global random state.

    Returns:
    --------
    sim_samples : np.ndarray, shape (n_samples,)
        Similarity between the two models' RDMs for each sampled subset.
    indices_used : np.ndarray of int, shape (n_samples, nb_subcategories)
        Category indices drawn for each sample.
    """

    if seed is not None:
        np.random.seed(seed)

    dissimilarity_metric = 'L2squared'
    model_a, model_b = submodels[0], submodels[1]
    nb_categories = len(cat_activations[model_a])
    nb_images = nb_subcategories * nb_per_category

    total = max(n_samples, 0)
    sim_samples = np.zeros(total)
    indices_used = np.zeros((total, nb_subcategories), dtype=int)
    if total == 0:
        # Nothing to sample: return empty, correctly shaped arrays instead of
        # failing when concatenating an empty list of batches.
        return sim_samples, indices_used

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    n_batches = (n_samples + batch_size - 1) // batch_size
    print(f"Processing {n_samples} samples in {n_batches} batches of {batch_size}...")

    def _sub_rdm(model, cat_indices):
        # RDM over all images of the selected categories. Cast to float64 to
        # match the float64 buffers the RDMs used to be stored in.
        selected = cat_activations[model][cat_indices].reshape(nb_images, -1)
        rdm = rsa.compute_RDMs(selected, metric=dissimilarity_metric, display=False)
        return np.asarray(rdm, dtype=np.float64)

    for batch_idx in tqdm(range(n_batches)):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, n_samples)
        for i in range(start_idx, end_idx):
            # Randomly select categories
            cat_indices = np.random.choice(nb_categories, size=nb_subcategories, replace=False)

            # Only one RDM per model is alive at a time; results are written
            # straight into the preallocated output arrays.
            rdm_a = _sub_rdm(model_a, cat_indices)
            rdm_b = _sub_rdm(model_b, cat_indices)
            sim_samples[i] = rsa.Compute_sim_RDMs(rdm_a, rdm_b, center = False, metric = 'pearson' )
            indices_used[i] = cat_indices

    return sim_samples, indices_used




In [None]:
# Reference distribution: RDM similarity over random 12-category subsets for
# every pair of models, leaving out the last entry of `models` ('random').
os.makedirs('results', exist_ok=True)  # np.save does not create missing directories
for i, model1 in enumerate(models[:-2]):
    for model2 in models[i+1:-1]: # ignore random
        sim_samples, indices_used = sample_catrdm_pairs(
            cat_activations, [model1, model2], n_samples=50, nb_subcategories = 12,
            nb_per_category = nb_per_cat,  # must match the reshape that built cat_activations
            batch_size=10, seed=None)
        np.save(f'results/categories_sim_samples_{arch}_{model1}_{model2}_{dataset}.npy', sim_samples)

We now have an idea of the average Pearson similarity obtained when 12 categories are selected at random - pretty high!
Can we find a subset of 12 categories with a much lower similarity than that? For example, the categories with the lowest correlations between the models.

In [None]:
def subsimilar_categories(cat_activations, submodels, dissimilarity_metric = 'L2squared', similarity_metric = 'pearson', nb_subcategories = 12):
    """Pick the categories on which two models' category-level RDMs agree least.

    For each of the two models, the activations are averaged over axis 1 and
    an RDM is computed from those averages with `rsa.compute_RDMs`. Each RDM is
    centred by subtracting its global mean, the two centred RDMs are multiplied
    element-wise and the product is summed over axis 0. The result is one
    unnormalised agreement score per RDM column (presumably one per category
    — confirm against rsa.compute_RDMs).

    Parameters
    ----------
    cat_activations : dict
        Maps a model name to an array; the first two dimensions must match
        between the two models.
    submodels : sequence of str
        Exactly two model names.
    dissimilarity_metric : str
        Passed to `rsa.compute_RDMs` as `metric`.
    similarity_metric : str
        Currently unused; kept for backward compatibility.
    nb_subcategories : int
        Number of lowest-scoring indices to return.

    Returns
    -------
    correlations : np.ndarray
        The per-column scores described above (not normalised to [-1, 1]).
    subsimiliar_categories : np.ndarray
        Indices of the `nb_subcategories` smallest scores, in ascending order
        of score.
    """
    assert len(submodels)== 2
    assert cat_activations[submodels[0]].shape[:2] == cat_activations[submodels[1]].shape[:2]

    centered_rdms = []
    for model in submodels:
        mean_cat_activations = cat_activations[model].mean(axis = 1)
        rdm = rsa.compute_RDMs(mean_cat_activations,
                               metric=dissimilarity_metric, display=False)
        centered_rdms.append(rdm - np.mean(rdm))

    correlations = np.sum(centered_rdms[0] * centered_rdms[1], axis=0)
    subsimiliar_categories = np.argsort(correlations)[:nb_subcategories]

    return correlations, subsimiliar_categories




In [None]:
# For every pair of models: take the 12 categories returned by
# subsimilar_categories, let max_rsa.find_subsimilar_subset build the RDMs,
# and record the Pearson similarity of both RDM pairs it returns.
cat_similarities = {}
similarities = {}
for i, model1 in enumerate(models[:-1]):
    for model2 in models[i+1:]:
        pair = f'{model1}_{model2}'
        correlations, subsimilar_cats = subsimilar_categories(cat_activations, [model1, model2], nb_subcategories = 12)
        RDM1, RDM2, RDM1_sorted, RDM2_sorted, sorted_indices = max_rsa.find_subsimilar_subset(
            cat_activations, [model1, model2], subsimilar_cats,
            images_per_subset = 4,
            nb_per_category = nb_per_cat)  # use the measured count rather than a hard-coded 50
        cat_sim = rsa.Compute_sim_RDMs(RDM1, RDM2, metric = 'pearson')
        sim = rsa.Compute_sim_RDMs(RDM1_sorted, RDM2_sorted, metric = 'pearson')

        cat_similarities[pair] = cat_sim
        similarities[pair] = sim
        print(np.array(listcat)[subsimilar_cats])

        # Side-by-side view of the two sorted RDMs, titled with their similarity
        fig, subs = plt.subplots(1,2, sharex=True, sharey=True)
        for ax, rdm in zip(subs, (RDM1_sorted, RDM2_sorted)):
            ax.imshow(rdm, cmap='gray')
            ax.axis('off')
        fig.suptitle(f'{sim}')
        fig.tight_layout()
        plt.show()
        plt.close(fig)

In [None]:
nb_seleted_categories = 12
subrdm_figdir = 'figures/compactness/subRDMs_vgg'
os.makedirs(subrdm_figdir, exist_ok=True)  # fig.savefig does not create missing directories

# For every pair of models: select categories with max_rsa.max_compactness_difference,
# build the RDMs with max_rsa.find_subsimilar_subset and record their Pearson similarity.
cat_similarities_compact = {}
similarities_compact = {}
sorted_indices = {}
maxdiffs = {}
for i, model1 in enumerate(models[:-1]):
    for model2 in models[i+1:]:
        pair = f'{model1}_{model2}'
        labels, sortedmaxdiffcats, maxdiffs[pair] = max_rsa.max_compactness_difference(
                sorted_compact_categories, compactness, listcat, models = [model1, model2],
                nb_considered_categories = nb_seleted_categories, compactness_diff_measure = 'normalizedDiff'
            )
        RDM1, RDM2, RDM1_sorted, RDM2_sorted, sorted_indices[pair] = max_rsa.find_subsimilar_subset(
            cat_activations, [model1, model2], labels[:nb_seleted_categories],
            images_per_subset = 4, nb_per_category = 50)
        cat_sim = rsa.Compute_sim_RDMs(RDM1, RDM2, metric = 'pearson')
        sim = rsa.Compute_sim_RDMs(RDM1_sorted, RDM2_sorted, metric = 'pearson')
        cat_similarities_compact[pair] = cat_sim
        similarities_compact[pair] = sim

        savename = f'Truenormalize_Fisher_discriminant_corr_{arch}_{model1}_{model2}_ecoLennyTest'
        fig, subs = plt.subplots(1,2, sharex=True, sharey=True)
        for ax, rdm in zip(subs, (RDM1_sorted, RDM2_sorted)):
            ax.imshow(rdm, cmap='gray')
            ax.axis('off')
        # Reuse `sim` instead of recomputing it inside the f-string; reusing the
        # same quote character inside an f-string is a SyntaxError before Python 3.12.
        fig.suptitle(f'{sim}')
        fig.tight_layout()
        fig.savefig(f'{subrdm_figdir}/{savename}.png')
        plt.show()
        plt.close(fig)  # do not keep one open figure per model pair


In [None]:
import glob
# One panel per saved sample file: histogram of the random-subset similarities
# (bars scaled so the tallest is 1), with the similarity from
# cat_similarities in green and from cat_similarities_compact in red.
listsamples = sorted(glob.glob(f'results/categories_sim_samples_{arch}_*_{dataset}.npy'))  # sorted: stable panel order
nb_cols = max(len(listsamples), 1)  # one column per file instead of a fixed 3
fig, subs = plt.subplots(nrows = 1, ncols = nb_cols, figsize = (25,10), sharex = True, sharey = True, squeeze = False)
for ax, file in zip(subs[0], listsamples):
    sample = np.load(file)
    hist, bin_edges = np.histogram(sample, 100)
    ax.bar(bin_edges[:-1],hist/max(hist), width = bin_edges[1] - bin_edges[0], linewidth = 0, align = 'edge')
    ax.set_xlabel('Similarity')
    ax.set_ylabel('Density')
    # File names end in ..._{model1}_{model2}_{dataset}.npy. The name is built
    # once, without nesting the same quote character inside an f-string
    # (a SyntaxError before Python 3.12).
    name_parts = file.split('_')
    name = f'{name_parts[-3]}_{name_parts[-2]}'
    ax.set_title(name)
    ax.vlines(cat_similarities[name],0,1, 'g')
    ax.vlines(cat_similarities_compact[name],0,1, 'r')


plt.tight_layout()
plt.show()
plt.close(fig)

In [None]:
# Position of the '0001_man' category in two models' sorted category lists
for model_name in ('objects', 'faces'):
    print(list(sorted_compact_categories[model_name]).index('0001_man'))


In [None]:
# Position of '0001_man' in `sortedmaxdiffcats`.
# NOTE(review): `sortedmaxdiffcats` is reassigned on every iteration of the
# model-pair loop above, so this inspects whichever pair was processed last —
# confirm that this is the intended pair.
list(sortedmaxdiffcats).index('0001_man')

In [None]:
# Scratch check of integer (floor) division: 3 // 4 evaluates to 0.
3//4

In [None]:
## Inspect the selected images
# Rewrite the stored image paths so that they point at the local dataset copy
remote_root = '/raid/leonard_vandyck/datasets/genloc/'
local_root = '/home/alban/Documents/ecoLennyTest/'
imagelist = [img.replace(remote_root, local_root) for img in imagelist]
# For ecoset: replace '/raid/shared/datasets/visoin/ecoset/' with '/home/alban/Documents/ecoset/'
imagespaths = {}
for i, model1 in enumerate(models[:-1]):
    for model2 in models[i+1:]:
        pair = f'{model1}_{model2}'
        savename = f'Truenormalize_Fisher_discriminant_corr_{arch}_{model1}_{model2}_ecoLennyTest'
        images, imagespaths[pair] = max_rsa.display_low_similarity_images(
            imagelist, sorted_indices[pair], maxdiffs[pair][:nb_seleted_categories],
            n_images=48, grid_cols=8, figsize=(20, 12),
            save_path=f'figures/compactness/subset_vgg/{savename}.png')

In [None]:
import glob
# Grid of histograms, one per saved sample file (bars scaled so the tallest is 1),
# with the similarity from cat_similarities in green and from
# cat_similarities_compact in red.
listsamples = sorted(glob.glob(f'results/categories_sim_samples*{dataset}.npy'))  # sorted: stable panel order
nb_cols = 5
nb_rows = max(2, -(-len(listsamples) // nb_cols))  # keep the 2-row layout, grow only if more files exist
fig, subs = plt.subplots(nrows = nb_rows, ncols = nb_cols, figsize = (25,10), sharex = True, sharey = True)
for f, file in enumerate(listsamples):
    ax = subs[f // nb_cols, f % nb_cols]
    sample = np.load(file)
    hist, bin_edges = np.histogram(sample, 100)
    ax.bar(bin_edges[:-1],hist/max(hist), width = bin_edges[1] - bin_edges[0], linewidth = 0, align = 'edge')
    ax.set_xlabel('Similarity')
    ax.set_ylabel('Density')
    # File names end in ..._{model1}_{model2}_{dataset}.npy. The name is built
    # once, without nesting the same quote character inside an f-string
    # (a SyntaxError before Python 3.12).
    name_parts = file.split('_')
    name = f'{name_parts[-3]}_{name_parts[-2]}'
    ax.set_title(name)
    ax.vlines(cat_similarities[name],0,1, 'g')
    ax.vlines(cat_similarities_compact[name],0,1, 'r')


plt.tight_layout()
os.makedirs('figures/compactness', exist_ok=True)  # fig.savefig does not create missing directories
fig.savefig(f'figures/compactness/categories_selectionVSdistribution_{dataset}.png')
plt.show()
plt.close(fig)

In [None]:
import pickle
listpickles_ecoLennyTest = glob.glob(f'/home/alban/Documents/results_image_selection/{dataset}_*.pkl')

# NOTE: pickle.load can execute arbitrary code — only load files produced by trusted runs.
RESULTS = {}
for pkl in listpickles_ecoLennyTest:
    name = os.path.splitext(os.path.basename(pkl))[0]  # file name without the '.pkl' extension
    with open(pkl, "rb") as f:  # the context manager closes the file even if unpickling fails
        RESULTS[name] = pickle.load(f)

list_names = list(RESULTS)

# Similarity of every pair stored under the silhouette-score run
silhouette_key = f'{dataset}_Truenormalize_silhouette_score_normalizedDiff_pearson'
similarity_dict = RESULTS[silhouette_key]['similarity_dict']
similarities_compactness = [similarity_dict[pair]['similarity'] for pair in similarity_dict.keys()]

In [None]:
## Inspect the selected images
# Rewrite the stored image paths so that they point at the local dataset copy
remote_root = '/raid/leonard_vandyck/datasets/genloc/'
local_root = '/home/alban/Documents/ecoLennyTest/'
imagelist = [img.replace(remote_root, local_root) for img in imagelist]
# For ecoset: replace '/raid/shared/datasets/visoin/ecoset/' with '/home/alban/Documents/ecoset/'
imagespaths = {}
for i, model1 in enumerate(models[:-1]):
    for model2 in models[i+1:]:
        pair = f'{model1}_{model2}'
        savename = f'Truenormalize_Fisher_discriminant_corr_{arch}_{model1}_{model2}_ecoLennyTest'
        images, imagespaths[pair] = max_rsa.display_low_similarity_images(
            imagelist, sorted_indices[pair], maxdiffs[pair][:nb_seleted_categories],
            n_images=48, grid_cols=8, figsize=(20, 12),
            save_path=f'figures/compactness/subset_vgg/{savename}.png')

In [None]:
# `maxdiffs` is a dict keyed by '<model1>_<model2>', so it cannot be sliced
# directly; show the first nb_seleted_categories entries of every pair instead.
{pair: diffs[:nb_seleted_categories] for pair, diffs in maxdiffs.items()}

In [None]:
# Preview the first few paths rather than dumping the whole list into the output.
imagelist[:5]

In [None]:
# One RDM per model, computed from its activations with the same metric throughout
metric = 'L2squared'
RDMs = {}
for model in models:
    print(model)
    rdm_title = f'{model}_{metric}'
    RDMs[model] = rsa.compute_RDMs(activations[model], metric=metric, display=False, title=rdm_title)

In [None]:
# Average each model's RDM within every (category, category) block, giving one
# len(listcat) x len(listcat) matrix per model.
n_cats = len(listcat)
mean_RDM = {}
for model in models:
    blocks = RDMs[model].reshape(n_cats, nb_per_cat, n_cats, nb_per_cat)
    blocks = blocks.transpose(0, 2, 1, 3)  # bring the two category axes to the front
    mean_RDM[model] = blocks.mean(axis = (2,3))

# NOTE(review): `submodels` was never defined at notebook level (it only exists
# as a function parameter), so this cell raised NameError on a fresh kernel.
# The first two models are compared here — adjust the pair if another one was intended.
submodels = models[:2]
mean_sim = rsa.Compute_sim_RDMs(mean_RDM[submodels[0]], mean_RDM[submodels[1]], metric = 'pearson')

fig, subs = plt.subplots(1,2, sharex=True, sharey=True)
for ax, model in zip(subs, submodels):
    ax.imshow(mean_RDM[model], cmap='gray')
    ax.axis('off')
# The similarity is computed beforehand: reusing the same quote character
# inside an f-string is a SyntaxError before Python 3.12.
fig.suptitle(f'{mean_sim}')
fig.tight_layout()