<a href="https://colab.research.google.com/github/BangachevKiril/RepresentationLearningTheory/blob/main/GeometryofTrainedModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Geometry of Representations Notebook

This notebook computes the relative bias and margin of eight pretrained SigLIP models from their embeddings of the ImageNet validation set.

In [1]:
from datasets import load_dataset
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchvision import transforms
import torchvision

In [2]:
# Optional: mount Google Drive so that embeddings can be saved once processed.
# Later cells in this notebook write to and read from 'drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# get libraries necessary for Hugging Face models
!pip install -U transformers
import requests
from transformers import AutoProcessor, AutoModel

## Model Names and Parameters

In [4]:
# Hugging Face Hub identifiers of the SigLIP checkpoints that are compared.
# Keep the order stable: a later cell indexes this list with [-1].
varieties_siglip = [
    'google/siglip-so400m-patch14-384',
    'google/siglip-base-patch16-224',
    'google/siglip-base-patch16-384',
    'google/siglip-large-patch16-256',
    'google/siglip-so400m-patch14-224',
    'google/siglip-base-patch16-256',
    'google/siglip-base-patch16-512',
    'google/siglip-large-patch16-384',
]

In [None]:
# Per-checkpoint scalars read off the pretrained weights:
#   bias                -> model.logit_bias
#   inverse_temperature -> exp(model.logit_scale)
#   relative_bias       -> bias / inverse_temperature
model_params = {}

for variety in varieties_siglip:
  params = model_params[variety] = {}
  model = AutoModel.from_pretrained(variety)
  params['bias'] = model.logit_bias.item()
  params['inverse_temperature'] = np.exp(model.logit_scale.item())
  params['relative_bias'] = params['bias'] / params['inverse_temperature']

# Getting Data and Embedding

In [None]:
def calculate_xi(image_embeddings, text_embeddings, rng=None):
  """Summarise the gap between paired image and text embeddings.

  Args:
    image_embeddings: 2-D array with one image embedding per row.
    text_embeddings: array of the same shape; row i is paired with image row i.
    rng: optional random source exposing ``permutation`` (for example
      ``np.random.default_rng(seed)``). Pass one to make the shuffled baseline
      reproducible. When omitted, NumPy's global random state is used, as before.

  Returns:
    np.ndarray of length 3:
      [0] mean over rows of the squared norm of (image - text),
      [1] squared norm of the mean of (image - text),
      [2] the same quantity as [0] after randomly permuting the image rows,
          i.e. with the image/text pairing broken.
  """
  diff = image_embeddings - text_embeddings
  mean_of_norms = np.mean(np.linalg.norm(diff, axis=1)**2)
  norm_of_mean = np.linalg.norm(np.mean(diff, axis=0))**2

  # Baseline: pair every text row with a randomly chosen image row.
  indices = np.arange(image_embeddings.shape[0])
  if rng is None:
    shuffled = np.random.permutation(indices)
  else:
    shuffled = rng.permutation(indices)
  random_diff = image_embeddings[shuffled, :] - text_embeddings
  random_mean_of_norms = np.mean(np.linalg.norm(random_diff, axis=1)**2)

  return np.array([mean_of_norms, norm_of_mean, random_mean_of_norms])

In [None]:
!mkdir ImageNetVal
%cd ImageNetVal
!wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar

In [None]:
!mkdir val
# extract
!tar -xvf ILSVRC2012_img_val.tar -C val

In [None]:
# Read labels_text.txt, a list of labels in English (one per line),
# dropping the trailing newline of every line.
labels_lookup = []
with open('labels_text.txt', 'r') as f:
  labels_lookup.extend(line.partition('\n')[0] for line in f)

In [None]:
# Split the ImageNet validation set into one folder per class label.
import shutil

# Paths (relative to the current working directory)
val_dir = "val"                          # folder with 50k images
mapping_file = "labels_text.txt"         # class labels; line i belongs to validation image i+1
output_dir = "val_unpacked"              # where we unpack into 998 folders (two classes have 100 instead of 50 images)

# Create output folder
os.makedirs(output_dir, exist_ok=True)

# Read mapping
with open(mapping_file, "r") as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    # Validation files are numbered from 1 with an 8-digit, zero-padded index.
    filename = 'ILSVRC2012_val_' + str(i + 1).zfill(8) + '.JPEG'
    src = os.path.join(val_dir, filename)
    dst_dir = os.path.join(output_dir, str(line.split('\n')[0]))
    os.makedirs(dst_dir, exist_ok=True)
    dst = os.path.join(dst_dir, filename)
    # Re-running the cell must not crash on files that an earlier run already moved.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

# Report the number of class folders actually present instead of a fixed count.
n_folders = sum(os.path.isdir(os.path.join(output_dir, name))
                for name in os.listdir(output_dir))
print("Done! Validation set organized into " + str(n_folders) + " folders.")

Done! Validation set organized into 1000 folders.


### Run Inference

In [None]:
# siglip: embed every ImageNet validation image and its class label with each checkpoint.

# The relative paths used here and in the later cells ('ImageNetVal/...',
# 'drive/My Drive/...') are rooted at /content, where Drive is mounted. The download
# cell leaves the kernel inside ImageNetVal via %cd, so return to the root first.
os.chdir('/content')

imagenet_root = 'ImageNetVal/val_unpacked'
save_prefix = 'drive/My Drive/Research/SigLIP/imagenetval'
# Make sure the target folder exists before the long-running inference starts.
os.makedirs(os.path.dirname(save_prefix), exist_ok=True)

# Sorted, and listed once, so that every checkpoint sees the classes in the same
# order: the statistics cells reuse the `splits` of a single checkpoint for all of them.
class_names = sorted(os.listdir(imagenet_root))

for variety in varieties_siglip:
  # load model
  print(variety)
  processor = AutoProcessor.from_pretrained(variety)
  model = AutoModel.from_pretrained(variety).to('cuda')
  # set up data
  image_embeddings = []   # one array of image embeddings per class
  text_embeddings = []    # the class text embedding, repeated once per image
  splits = []             # index of the first image of every class
  ordered_labels = []     # class label of every image, in embedding order
  so_far = 0
  # embed, one class at a time
  for label in class_names:
    class_dir = os.path.join(imagenet_root, label)
    filenames = sorted(os.listdir(class_dir))
    # get data; convert() returns an independent copy, so the file can be closed right away
    images = []
    for filename in filenames:
      with Image.open(os.path.join(class_dir, filename)) as img:
        images.append(img.convert('RGB'))
    ordered_labels.extend([label] * len(images))
    with torch.no_grad():  # inference only, no gradients needed
      # process data
      inputs = processor(images=images, text=[label],
                         return_tensors="pt", padding="max_length").to(model.device)
      # find representations
      outputs = model(**inputs)  # forward pass
    # record
    splits.append(so_far)
    image_embeddings.append(outputs.image_embeds.detach().cpu().numpy())
    # repeat the single text embedding so that row i of text pairs with row i of image
    text_embeddings.append(np.outer(np.ones(len(images)),
                                    outputs.text_embeds.detach().cpu().numpy()))
    so_far += len(images)
  # after all is embedded
  image_embeddings = np.concatenate(image_embeddings, axis=0)
  text_embeddings = np.concatenate(text_embeddings, axis=0)
  splits = np.array(splits)
  ordered_labels = np.array(ordered_labels)
  xi = calculate_xi(image_embeddings, text_embeddings)
  np.savez(save_prefix + variety.split('/')[-1] + '.npz',
           text=text_embeddings,
           image=image_embeddings,
           splits=splits,
           ordered_labels=ordered_labels,
           xi=xi)

# Experiments

# Get statistics

In [52]:
# Load the saved arrays of the last checkpoint in the list; its class boundaries
# (`splits`) and per-image labels are used to form the similarity boolean matrix.
prefix = 'drive/My Drive/Research/SigLIP/imagenetval'
data = np.load(prefix + varieties_siglip[-1].rsplit('/', 1)[-1] + '.npz')
splits, labels = data['splits'], data['ordered_labels']

In [53]:
# correct_label[i, c] is True exactly when image row i lies in the c-th class block,
# where block c starts at row splits[c].
n_samples = data['text'].shape[0]
n_classes = splits.shape[0]
correct_label = np.zeros((n_samples, n_classes), dtype=bool)
c = 0
for i in range(n_samples):
  # step to the next block once its first row is reached
  next_c = c + 1
  if next_c < n_classes and splits[next_c] == i:
    c = next_c
  correct_label[i, c] = True

In [54]:
def get_cos_similarity_statistics(text_embeddings, image_embeddings, temperature, bias,
                                  class_starts=None, match_mask=None):
  """Mean image-text inner product for matching and for mismatching pairs.

  Args:
    text_embeddings: 2-D array of text embeddings, one row per image; only the
      rows selected by ``class_starts`` (one per class) are used.
    image_embeddings: 2-D array with one image embedding per row.
    temperature: unused; kept so existing calls keep working.
    bias: unused; kept so existing calls keep working.
    class_starts: row index of the first sample of every class. Defaults to the
      notebook-level ``splits``.
    match_mask: boolean array of shape (number of images, number of classes),
      True where the image belongs to the class. Defaults to the notebook-level
      ``correct_label``.

  Returns:
    np.ndarray ``[mean over matching pairs, mean over mismatching pairs]``.
    These are raw inner products; they equal cosine similarities only when the
    embedding rows have unit norm.
  """
  # Fall back to the notebook globals so that calls without the new arguments behave as before.
  if class_starts is None:
    class_starts = splits
  if match_mask is None:
    match_mask = correct_label

  # (images x classes) matrix of inner products with one text embedding per class
  ips = image_embeddings @ text_embeddings[class_starts, :].T
  # stats of inner products
  matching = ips[match_mask]
  mismatching = ips[~match_mask]
  return np.array([np.mean(matching), np.mean(mismatching)])

In [55]:
# For every checkpoint: load its saved embeddings and record the mean matching /
# mismatching inner product. The class boundaries come from the `splits` and
# `correct_label` built in the cells above.
prefix = 'drive/My Drive/Research/SigLIP/imagenetval'
vals = []
for variety in varieties_siglip:
  print(variety)
  data = np.load(prefix + variety.split('/')[-1] + '.npz')

  text_full, image_full = data['text'], data['image']
  params = model_params[variety]
  stats = get_cos_similarity_statistics(text_full, image_full,
                                        params['inverse_temperature'], params['bias'])
  vals.append(stats)

google/siglip-so400m-patch14-384
google/siglip-base-patch16-224
google/siglip-base-patch16-384
google/siglip-large-patch16-256
google/siglip-so400m-patch14-224
google/siglip-base-patch16-256
google/siglip-base-patch16-512
google/siglip-large-patch16-384


In [56]:
import pandas as pd

# One row per checkpoint, one column per statistic returned by get_cos_similarity_statistics.
summary = pd.DataFrame(
    data=vals,
    index=varieties_siglip,
    columns=['mean_pos_cos', 'mean_neg_cos'],
)

In [57]:
# margin        = half the gap between the mean matching and mean mismatching value
# relative_bias = midpoint of the mean matching and mean mismatching value
pos_cos = summary['mean_pos_cos']
neg_cos = summary['mean_neg_cos']
summary['margin'] = (pos_cos - neg_cos)/2
summary['relative_bias'] = (pos_cos + neg_cos)/2
summary[['mean_pos_cos', 'mean_neg_cos', 'margin', 'relative_bias']]

Unnamed: 0,mean_pos_cos,mean_neg_cos,margin,relative_bias
google/siglip-so400m-patch14-384,0.137568,-0.001476,0.069522,0.068046
google/siglip-base-patch16-224,0.094982,-0.030491,0.062736,0.032245
google/siglip-base-patch16-384,0.096593,-0.031865,0.064229,0.032364
google/siglip-large-patch16-256,0.102251,-0.03585,0.069051,0.0332
google/siglip-so400m-patch14-224,0.136509,-0.002205,0.069357,0.067152
google/siglip-base-patch16-256,0.100431,-0.029425,0.064928,0.035503
google/siglip-base-patch16-512,0.097063,-0.032233,0.064648,0.032415
google/siglip-large-patch16-384,0.095794,-0.038382,0.067088,0.028706
