# Choose settings

##### Choose your settings here

In [23]:
DOG_OR_CAT = "cat"  # class under investigation; can also be "dog"
RUN = 2  # run id that becomes part of RESULTS_FILE; change it to avoid overwriting earlier results
TYPE_EXTERNAL_POLLUTION = "cross-label"  # other option: "imagenet_only". With cross-label: 1% pollution, 1/2 of that cross-label, other 1/2 imagenet
MODEL = "CLIP"  # "CLIP" or "ALIGN"; any other value is rejected when the model is loaded


# choose dataset
DATASET_NAME = 'cats-vs-dogs-large'  # needs to match folder name in FM/datasets
LOAD_AND_EMBED_DATASET_IN_BATCHES = True  # True for large datasets, False for small ones
USE_CACHED_EMBEDDINGS = f'{MODEL}_cats-vs-dogs-large.pkl'  # '' for loading the dataset normally, 'CREATE__{x}.pkl' for creating the cache file {x}.pkl, '{x}.pkl' for loading the cache file {x}.pkl
MISLABELED_INSTANCES = 'mislabeled_instances_cats-vs-dogs.pkl'  # if not '', but e.g. 'mislabeled_instances_cats-vs-dogs.pkl', the pickle file specifies which files to drop from the loaded embeddings

# choose how many image-label mismatches to insert
MISMATCH_PORTION = 0.01  # fraction in [0, 1] of mismatching image-label pairs added (0.01 = 1%)
MANIPULATION_TYPES = [0.5, 0.5, 0.0, 0.0]  # how much of the MISMATCH_PORTION to produce by [exchanging images between classes, inserting images from other datasets, inserting randomly generated images, inserting placeholder images]; must sum to 1
IMAGENET_EMBEDDINGS = f'{MODEL}_imagenet-subset.pkl'  # specify if MANIPULATION_TYPES[1] > 0

# base names (without the '.pkl' extension) of the cached text-embedding files
CORRECT_TEXT_EMBEDDINGS_NAME_BASE = f"text_embeddings_correct_{MODEL.lower()}_{DOG_OR_CAT}"
RANDOM_TEXT_EMBEDDINGS_NAME_BASE = f"text_embeddings_random_{MODEL.lower()}_{DOG_OR_CAT}"

# base name of the result pickles; a suffix for the evaluation type is appended when saving
RESULTS_FILE = f"results_{MODEL.lower()}_{DOG_OR_CAT}_run_{RUN}"

##### This part is calculated automatically

In [2]:
import math  # cell-local import: needed for the tolerant float comparisons below

# Root folder of all datasets and cache files on the mounted drive
datasets_path = '/content/drive/My Drive/FM/datasets/'
# Folder of the selected dataset (trailing slash is required by the loaders)
dataset_path = datasets_path + DATASET_NAME + '/'

# Class names per dataset; the loaders match them against folder/file names
if DATASET_NAME in ('cats-vs-dogs-large', 'train-small'):
  LABELS = ['cat', 'dog']
elif DATASET_NAME == 'jellyfish-classification':
  LABELS = ['barrel jellyfish', 'compass jellyfish', 'lions mane jellyfish', 'moon jellyfish']
elif DATASET_NAME == 'traffic-signs':
  LABELS = ['30 kilometers per hour speed limit traffic sign', '80 kilometers per hour speed limit traffic sign', '100 kilometers per hour speed limit traffic sign', 'give way traffic sign', 'no entry traffic sign', 'no overtaking traffic sign', 'priority over oncoming traffic sign', 'stop sign']
else:
  raise ValueError('Invalid dataset selected or labels not set!')

# Sanity checks of the user settings. The sums are compared with a tolerance because
# float shares such as [0.7, 0.2, 0.1, 0.0] do not add up to exactly 1.0.
assert 0 <= MISMATCH_PORTION <= 1, f'MISMATCH_PORTION must be in [0, 1] but is {MISMATCH_PORTION}'
assert len(MANIPULATION_TYPES) == 4 and math.isclose(sum(MANIPULATION_TYPES), 1.0), f'MANIPULATION_TYPES must contain 4 entries that sum up to 1.0 but is {MANIPULATION_TYPES}'
assert math.isclose(MANIPULATION_TYPES[0] + MANIPULATION_TYPES[1], 1.0), 'At the moment, only interclass corruption and imagenet corruption are implemented!'

# Load libraries

In [3]:
# "Run all" guard: detect whether torch is already imported in this runtime so that the
# pip installs in the next cell can be skipped.
try:
  torch.tensor([[0]])
  libraries_already_loaded = True
except NameError:
  # `torch` is not defined yet, i.e. the import cell has not been executed in this runtime
  libraries_already_loaded = False

In [4]:
if not libraries_already_loaded:
  ! pip install ftfy regex tqdm
  ! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-59ltn__f
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-59ltn__f
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=1669c4f020ccbf7645ee05eba55cd06ec09106dc1a382da40841

In [5]:
import torch
from torchvision import transforms
import clip
from transformers import AutoImageProcessor, ViTModel, AlignProcessor, AlignModel, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
from pkg_resources import packaging
import os
from google.colab import drive
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
import glob
import pickle
from scipy.spatial.distance import cosine

print("Torch version:", torch.__version__)

#from sklearn.metrics.pairwise import cosine_similarity #as cosine_similarity_sklearn
#def cosine_similarity(X, Y=None):
#  X = X.clone().detach().numpy()
#  if Y is not None:
#    Y = Y.clone().detach().numpy()
#  return torch.tensor(cosine_similarity_sklearn(X, Y))

Torch version: 2.1.0+cu121


In [6]:
def calculate_column_average(matrix):
    """
    Calculates the per-column average of a self-similarity matrix without the diagonal entry.

    For every column, 1 is subtracted from the column sum and the result is divided by
    (number of rows - 1). This is the mean of the off-diagonal entries only if each
    diagonal entry equals 1, as in a cosine self-similarity matrix.

    Parameters:
    - matrix: torch.Tensor
        Two-dimensional tensor with at least two rows.

    Returns:
    - torch.Tensor
        A 1D tensor containing one average value per column of the input matrix.

    Raises:
    - TypeError:
        Raises an error if the input is not a PyTorch tensor.
    - ValueError:
        Raises an error if the input tensor is not two-dimensional, is empty,
        has zero columns or has fewer than two rows.
    """

    # Checking if the input is a PyTorch tensor
    if not isinstance(matrix, torch.Tensor):
        raise TypeError("Input should be a PyTorch tensor.")

    # A 1D tensor has no column dimension to index
    if matrix.dim() != 2:
        raise ValueError("Input tensor must be two-dimensional.")

    # Checking if the input tensor is empty or has zero columns
    if matrix.numel() == 0 or matrix.size(1) == 0:
        raise ValueError("Input tensor is empty or has zero columns.")

    # With a single row nothing is left once the diagonal entry is removed
    n_rows = matrix.size(0)
    if n_rows < 2:
        raise ValueError("Input tensor needs at least two rows.")

    # Dividing by a Python scalar keeps the result on the device of the input
    column_sums = torch.sum(matrix, dim=0)
    column_averages = (column_sums - 1) / (n_rows - 1)

    return column_averages

def cosine_similarity_matrix(embeddings_tensor):
    """
    Compute the pairwise cosine similarity between all rows of a tensor.

    Parameters:
    - embeddings_tensor: torch.Tensor
        Two-dimensional tensor; every row is treated as one embedding.

    Returns:
    - torch.Tensor
        Square tensor whose element (i, j) is the cosine similarity between
        row i and row j of the input (1 for identical directions).
    """

    # Rows of unit length turn the dot product into the cosine similarity
    unit_rows = torch.nn.functional.normalize(embeddings_tensor, dim=1)

    return unit_rows @ unit_rows.T

from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn
def cosine_similarity(x, y=None):
  """
  Cosine similarity with an optional second operand.

  With only `x` given, the similarity of `x` with itself is computed by
  `cosine_similarity_matrix`; with `y` given, the call is forwarded to
  scikit-learn's pairwise `cosine_similarity`.
  """
  if y is not None:
    return cosine_similarity_sklearn(x, y)
  return cosine_similarity_matrix(x)

# Mounting storage

In [7]:
drive.mount('/content/drive')
!ls "{datasets_path}"

Mounted at /content/drive
AlexNet_cats-vs-dogs-large.pkl			 mislabeled_instances_cats-vs-dogs.pkl
AlexNet_imagenet-subset.pkl			 note.txt
AlexNet_traffic-signs.pkl			 text_dog_embeddings.pkl
ALIGN_cats-vs-dogs-large.pkl			 text_embeddings_correct_align_cat.pkl
ALIGN_imagenet-subset.pkl			 text_embeddings_correct_align_dog.pkl
ALIGN_traffic-signs.pkl				 text_embeddings_correct_clip_cat.pkl
cats-dogs-big_ids.pkl				 text_embeddings_correct_clip_dog.pkl
cats-dogs-big.pkl				 text_embeddings_random_align_cat.pkl
cats-vs-dogs-large				 text_embeddings_random_align_dog.pkl
CLIP_cats-vs-dogs-large.pkl			 text_embeddings_random_clip_cat.pkl
CLIP_imagenet-subset.pkl			 text_embeddings_random_clip_dog.pkl
CLIP_traffic-signs.pkl				 text_random_embeddings.pkl
dog_wrong_2_12.txt				 traffic-signs
dog_wrong.txt					 train-small
image_embeddings__cats-vs-dogs.pkl		 ViT-CLS_cats-vs-dogs-large.pkl
image_embeddings__traffic-signs.pkl		 ViT-CLS_imagenet-subset.pkl
imagenet_one-of-each-class-except-cats

# Define dataset loader

In [None]:
def load_dataset(folder_path, labels):
    """
    Load every readable image below `folder_path` and group the images by label.

    The label of a file is the one entry of `labels` that occurs as a substring in the
    last two components of its path (parent folder and file name).

    Parameters:
    - folder_path: str
        Dataset folder; must end with '/' because the glob pattern is appended directly.
    - labels: list of str
        Label names to look for in the file paths.

    Returns:
    - dict
        Maps every label to the list of its images, each converted to RGB.

    Raises:
    - ValueError:
        If the folder does not exist, or if a readable image matches no label
        or more than one label.
    """

    # Checking if the provided folder path exists
    if not os.path.exists(folder_path):
        raise ValueError("Folder path does not exist.")

    images = {label: [] for label in labels}

    # Keeps the final summary line valid when the folder contains no files at all
    i = -1

    # Looping through all files in the folder
    for i, filename in enumerate(glob.glob(folder_path + '**/*', recursive=True)):

        if i % 1000 == 0:
            print(i, 'files loaded')

        try:
            # convert() returns an independent, fully loaded image, so the file
            # handle can be released right away
            with Image.open(filename) as opened:
                img = opened.convert('RGB')
        except Exception:
            # Best effort: directories and non-image files are skipped.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            continue

        path_tail = '/'.join(filename.split('/')[-2:])
        matching_labels = [label for label in labels if label in path_tail]

        if len(matching_labels) > 1:
            raise ValueError(f"Label of {filename} is ambiguous.")
        if not matching_labels:
            raise ValueError(f"No label for {filename} found.")

        images[matching_labels[0]].append(img)

    print(i+1, 'files loaded')

    return images

def get_embeddings_dict_batchwise(folder_path, labels, model, preprocess, batch_size=64):
  """
  Embed all readable images below `folder_path` batch by batch, grouped by label.

  Images are collected until the running file index reaches a multiple of `batch_size`;
  the collected images are then preprocessed and encoded (one model call per label) and
  released again, so only one batch of images is held at a time. Images left over after
  the last file are encoded as well.

  Parameters:
  - folder_path: str
      Dataset folder; must end with '/' because the glob pattern is appended directly.
  - labels: list of str
      Label names; the label of a file is the one entry that occurs as a substring in
      the last two components of its path.
  - model:
      Object with an `encode_image(batch)` method.
  - preprocess:
      Callable that turns one image into a tensor without batch dimension.
  - batch_size: int
      Number of globbed paths after which the collected images are encoded.

  Returns:
  - dict
      Maps every label to the concatenation of its embedding batches.

  Raises:
  - ValueError:
      If the folder does not exist, or if a readable image matches no label
      or more than one label.

  Note: batches are moved to the module-level `device` before encoding. `torch.cat`
  raises if a label received no image at all.
  """

  # Checking if the provided folder path exists
  if not os.path.exists(folder_path):
    raise ValueError("Folder path does not exist.")

  image_embeddings = {label: [] for label in labels}
  images = {label: [] for label in labels}

  def encode_pending_images():
    # Encode the images collected so far and empty the per-label buffers
    for label in labels:
      if len(images[label]) == 0:
        continue
      with torch.no_grad():
        processed_images = torch.cat([preprocess(img).unsqueeze(0) for img in images[label]])
        image_embeddings[label].append(model.encode_image(processed_images.to(device)))
        del processed_images
      images[label] = []

  # Looping through all files in the folder
  all_files = glob.glob(folder_path + '**/*', recursive=True)
  for n_instances_processed, filename in enumerate(all_files):

    try:
      # convert() returns an independent, fully loaded image, so the file handle
      # can be released right away
      with Image.open(filename) as opened:
        img = opened.convert('RGB')
    except Exception:
      # Best effort: directories and non-image files are skipped
      continue

    # Find label of the image
    path_tail = '/'.join(filename.split('/')[-2:])
    matching_labels = [label for label in labels if label in path_tail]
    if len(matching_labels) > 1:
      raise ValueError(f"Label of {filename} is ambiguous.")
    if not matching_labels:
      raise ValueError(f"No label for {filename} found.")
    images[matching_labels[0]].append(img)

    # Get embeddings if a batch is full
    if n_instances_processed % batch_size == 0 and n_instances_processed > 0:
      encode_pending_images()
      print(n_instances_processed, 'loaded and encoded')

  # Encode what is left. Doing this after the loop (instead of on the last index) also
  # covers the case where the last globbed path is not a readable image.
  if any(images.values()):
    encode_pending_images()
    print(len(all_files) - 1, 'loaded and encoded')

  # Convert list of embeddings to tensor
  for label in labels:
    image_embeddings[label] = torch.cat((image_embeddings[label]))

  return image_embeddings

# Load and embed dataset

In [None]:
# Use the GPU when torch reports one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"


class Align(torch.nn.Module):
  """Wrapper around the "kakaobrain/align-base" checkpoint that offers `encode_image` / `encode_text` methods."""

  def __init__(self):
    super().__init__()
    checkpoint_name = "kakaobrain/align-base"
    self.align = AlignModel.from_pretrained(checkpoint_name)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Pass `x` straight through to the wrapped ALIGN model."""
    output = self.align(x)
    return output

  def encode_image(self, img: torch.Tensor) -> torch.Tensor:
    """Return the image features of the wrapped model for `img`."""
    image_features = self.align.get_image_features(img)
    return image_features

  def encode_text(self, text: BatchEncoding) -> torch.Tensor:
    """Return the text features of the wrapped model; `text` is unpacked into keyword arguments."""
    text_features = self.align.get_text_features(**text)
    return text_features


def align_preprocessor_with_memory_fix(img) -> torch.Tensor:
  """
  Preprocess one image for ALIGN and return its pixel values without the batch dimension.

  The processor is created inside the call and deleted again before returning; the
  result is moved to the module-level `device`.
  """
  align_processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
  with torch.no_grad():
    batch = align_processor(images=img, return_tensors="pt").to(device)
    pixel_values = batch.pixel_values.squeeze(0)
  del align_processor
  return pixel_values


# Load the selected foundation model. Both branches provide the same three names:
# `model` (with encode_image / encode_text), `preprocess` (image -> tensor) and
# `tokenize` (string -> model input for encode_text).
if MODEL == 'CLIP':
  model, preprocess = clip.load("ViT-B/32", device=device)
  tokenize = clip.tokenize
elif MODEL == 'ALIGN':
  model = Align().to(device)
  preprocess = align_preprocessor_with_memory_fix
  tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")
  # wraps the single string in a list before handing it to the tokenizer
  tokenize = lambda s: tokenizer([s], padding=True, return_tensors="pt")
else:
  raise ValueError(f'Invalid model {MODEL} selected!')

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 129MiB/s]


In [None]:
# Obtain `image_embeddings` (dict: label -> embedding tensor), either from a cache file
# or by embedding the dataset. A USE_CACHED_EMBEDDINGS value starting with 'CREATE__'
# means "embed now and store the result under the name after the prefix".
# NOTE: pickle.load executes code from the file; only load cache files you created yourself.
if USE_CACHED_EMBEDDINGS != '' and USE_CACHED_EMBEDDINGS.split('__')[0] != 'CREATE':

  # load embeddings of previous execution from pickle file
  # (loaded as stored; the cpu/float conversion below only runs in the else branch)
  pickle_file = datasets_path + USE_CACHED_EMBEDDINGS
  with open(pickle_file, 'rb') as f:
    image_embeddings = pickle.load(f)

  print('Embeddings loaded from', pickle_file)

else:

  if LOAD_AND_EMBED_DATASET_IN_BATCHES:

    # load and embed images in batches (to save GPU memory and especially RAM)
    image_embeddings = get_embeddings_dict_batchwise(dataset_path, LABELS, model, preprocess, batch_size=512)

  else:

    # load images
    images = load_dataset(dataset_path, LABELS)

    # embed images: all images of a label are encoded in one single batch
    image_embeddings = {}
    for label in LABELS:
      processed_images = torch.cat(([preprocess(img).unsqueeze(0) for img in images[label]])).to(device)
      with torch.no_grad():
        image_embeddings[label] = model.encode_image(processed_images)

  # move embeddings to cpu and convert to suitable datatype for further analysis
  for key in image_embeddings:
    image_embeddings[key] = image_embeddings[key].cpu().type(torch.float)

  # save embeddings in pickle file if desired (enabled to reload them later on)
  if USE_CACHED_EMBEDDINGS != '' and USE_CACHED_EMBEDDINGS.split('__')[0] == 'CREATE':
    pickle_filename = '__'.join(USE_CACHED_EMBEDDINGS.split('__')[1:])  # remove prefix 'CREATE__'
    pickle_file = datasets_path + pickle_filename
    with open(pickle_file, 'wb') as f:
      pickle.dump(image_embeddings, f)
    print('Embeddings stored in', pickle_file)

# save embedding dimension for creation of reference vectors etc.
embedding_dim = image_embeddings[LABELS[0]].shape[1]


Embeddings loaded from /content/drive/My Drive/FM/datasets/CLIP_cats-vs-dogs-large.pkl


In [None]:
# Extend `image_embeddings` with derived entries and track all keys in LABELS_POLLUTION_ALSO
LABELS_POLLUTION_ALSO = LABELS.copy()
mislabeled_indices = None

if MISLABELED_INSTANCES != '':
  # mislabeled_indices[label] is used as a boolean mask over image_embeddings[label]:
  # True entries go to '<label>_pollution', the others to '<label>_clean'
  with open(datasets_path + MISLABELED_INSTANCES, 'rb') as f:
    mislabeled_indices = pickle.load(f)
  for label in mislabeled_indices:
    # labels without any marked instance get no extra entries
    if len(mislabeled_indices[label]) == 0 or sum(mislabeled_indices[label]) == 0:
      continue
    image_embeddings[label + '_pollution'] = image_embeddings[label][mislabeled_indices[label]].clone().detach()
    image_embeddings[label + '_clean'] = image_embeddings[label][[not i for i in mislabeled_indices[label]]].clone().detach()
    LABELS_POLLUTION_ALSO += [label + '_pollution', label + '_clean']

if MANIPULATION_TYPES[1] > 0:
  # embeddings of images from another dataset, stored under the key 'val' of the pickle
  pickle_file = datasets_path + IMAGENET_EMBEDDINGS
  with open(pickle_file, 'rb') as f:
    image_embeddings['imagenet_subset'] = pickle.load(f)['val']
  print('Embeddings loaded from', pickle_file)
  LABELS_POLLUTION_ALSO += ['imagenet_subset']

# overview: number of embeddings per entry
print('LABELS =', LABELS)
print('LABELS_POLLUTION_ALSO =', LABELS_POLLUTION_ALSO)
for k in image_embeddings:
  print(k, len(image_embeddings[k]))

Embeddings loaded from /content/drive/My Drive/FM/datasets/CLIP_imagenet-subset.pkl
LABELS = ['cat', 'dog']
LABELS_POLLUTION_ALSO = ['cat', 'dog', 'cat_pollution', 'cat_clean', 'dog_pollution', 'dog_clean', 'imagenet_subset']
cat 12502
dog 12499
cat_pollution 25
cat_clean 12477
dog_pollution 24
dog_clean 12475
imagenet_subset 869


In [None]:
# Load the per-label mislabel flags again, independent of the MISLABELED_INSTANCES setting,
# because the test-set construction further down always needs them
MISLABELED_INSTANCES_LIST = 'mislabeled_instances_cats-vs-dogs.pkl'

with open(datasets_path + MISLABELED_INSTANCES_LIST, 'rb') as f:
  mislabeled_indices = pickle.load(f)

ImageNet embeddings are stored as a list of arrays to simplify dataset creation (used as external pollution in the test phase).

In [None]:
# one numpy row vector per ImageNet embedding
imagenet_arr = [row.numpy() for row in image_embeddings["imagenet_subset"]]

# DL Dataset preparation

## General preparations


### Load textual embeddings

In [9]:
def load_from_pickle(datasets_path, USE_CACHED_EMBEDDINGS="cats-vs-dogs.pkl"):
  """
  Unpickle and return the content of the file `datasets_path + USE_CACHED_EMBEDDINGS`.

  The two parts are concatenated as-is, so `datasets_path` has to end with a path separator.
  """
  with open(datasets_path + USE_CACHED_EMBEDDINGS, 'rb') as handle:
    return pickle.load(handle)

In [None]:
# Cached text embeddings: the "random" file and the "correct" file for the class DOG_OR_CAT.
# Despite its name, text_dog_embeddings holds the embeddings of whichever class is selected.
text_random_embeddings = load_from_pickle (datasets_path, USE_CACHED_EMBEDDINGS =f"{RANDOM_TEXT_EMBEDDINGS_NAME_BASE}.pkl" )
text_dog_embeddings = load_from_pickle (datasets_path, USE_CACHED_EMBEDDINGS =f"{CORRECT_TEXT_EMBEDDINGS_NAME_BASE}.pkl" )

In [None]:
len(text_random_embeddings)

1103

In [None]:
len(text_dog_embeddings)

511

### Functions for similarity calculation

In [None]:
import numpy as np
import torch
from scipy.spatial.distance import cosine

def calc_similarity_matrix_embeddings(embeddings_tensor, embeddings_tensor2 = True):
    """
    Function to compute a cosine-similarity matrix between two sets of embeddings.

    Parameters:
    - embeddings_tensor: torch.Tensor
        Two-dimensional tensor; every row is one embedding.
    - embeddings_tensor2: torch.Tensor, bool or None
        Second set of embeddings (rows). If omitted, or if a bool or None is passed,
        `embeddings_tensor` is compared with itself.

    Returns:
    - similarity_matrix: torch.Tensor
        Element (i, j) is the cosine similarity between row i of `embeddings_tensor`
        and row j of `embeddings_tensor2` (1 for identical directions).
    """

    # The boolean default acts as the "no second tensor" marker; None is accepted as well
    if embeddings_tensor2 is None or isinstance(embeddings_tensor2, bool):
        embeddings_tensor2 = embeddings_tensor

    # Normalizing the embeddings so that the dot product equals the cosine similarity
    embeddings_tensor = torch.nn.functional.normalize(embeddings_tensor, dim=1)
    embeddings_tensor2 = torch.nn.functional.normalize(embeddings_tensor2, dim=1)

    # Computing the similarity matrix using dot product
    similarity_matrix = torch.matmul(embeddings_tensor, embeddings_tensor2.T)

    return similarity_matrix

In [None]:
def calculate_column_average(matrix):
    """
    Calculates the per-column average of a self-similarity matrix without the diagonal entry.

    For every column, 1 is subtracted from the column sum and the result is divided by
    (number of rows - 1). This is the mean of the off-diagonal entries only if each
    diagonal entry equals 1, as in a cosine self-similarity matrix.

    Parameters:
    - matrix: torch.Tensor
        Two-dimensional tensor with at least two rows.

    Returns:
    - torch.Tensor
        A 1D tensor containing one average value per column of the input matrix.

    Raises:
    - TypeError:
        Raises an error if the input is not a PyTorch tensor.
    - ValueError:
        Raises an error if the input tensor is not two-dimensional, is empty,
        has zero columns or has fewer than two rows.
    """

    # Checking if the input is a PyTorch tensor
    if not isinstance(matrix, torch.Tensor):
        raise TypeError("Input should be a PyTorch tensor.")

    # A 1D tensor has no column dimension to index
    if matrix.dim() != 2:
        raise ValueError("Input tensor must be two-dimensional.")

    # Checking if the input tensor is empty or has zero columns
    if matrix.numel() == 0 or matrix.size(1) == 0:
        raise ValueError("Input tensor is empty or has zero columns.")

    # With a single row nothing is left once the diagonal entry is removed
    n_rows = matrix.size(0)
    if n_rows < 2:
        raise ValueError("Input tensor needs at least two rows.")

    # Dividing by a Python scalar keeps the result on the device of the input
    column_sums = torch.sum(matrix, dim=0)
    column_averages = (column_sums - 1) / (n_rows - 1)

    return column_averages

In [None]:
# Pairwise cosine similarities of all images of the selected class, then for every image
# its average similarity to the other images of that class
similarity_matrix = calc_similarity_matrix_embeddings(image_embeddings[DOG_OR_CAT])
avg_similarity_imgs = calculate_column_average(similarity_matrix)

## Approach 1: Training on embeddings

### Helper functions

In [None]:
import random
import math

def generate_random_vectors(n: int, size: int) -> list:
    """Return `n` lists of `size` floats, each drawn with `random.uniform(-1, 1)`."""
    return [[random.uniform(-1, 1) for _ in range(size)] for _ in range(n)]

def normalize_vectors(vectors: list) -> list:
    """Divide every vector by its Euclidean length and return the results as torch tensors."""

    def _to_unit_tensor(vector):
        magnitude = math.sqrt(sum([x**2 for x in vector]))
        return torch.tensor([x / magnitude for x in vector])

    return [_to_unit_tensor(vector) for vector in vectors]


In [None]:
def create_train_assumed_correct(dataset_img_embeddings, text_random_embeddings,
                                 text_class_embeddings, avg_similarity_imgs,
                                  random_vec_size = 512):
  """
  Build a binary training set of embeddings without using ground-truth labels.

  Positive samples (label 1):
  - the image embeddings whose average similarity ranks between 3/6 and 4/6 of the
    ascending ranking of `avg_similarity_imgs` (the "assumed correct" images),
  - all rows of `text_class_embeddings`.

  Negative samples (label 0):
  - normalized random vectors of length `random_vec_size`,
  - all rows of `text_random_embeddings`.

  Parameters:
  - dataset_img_embeddings: indexable by image position, e.g. a 2D torch.Tensor
  - text_random_embeddings: 2D tensor, one row per negative text embedding
  - text_class_embeddings: 2D tensor, one row per positive text embedding
  - avg_similarity_imgs: one average-similarity value per image, in the same order as
    `dataset_img_embeddings`
  - random_vec_size: int, length of the generated random vectors

  Returns:
  - (X_train, y_train): list of embeddings and the list of their labels (1 / 0),
    positives first.
  """

  n = len(avg_similarity_imgs)

  # Image positions ranked by ascending average similarity (sorted() is stable)
  ranked_indices = sorted(range(n), key=lambda index: avg_similarity_imgs[index])

  # Keep the slice between 3/6 and 4/6 of the ranking as "assumed correct"
  lower_index = (3 * n) // 6
  upper_index = (4 * n) // 6
  assumed_correct_indices = ranked_indices[lower_index:upper_index]

  # Number of random negatives; a non-positive value simply yields no random vectors
  n_random_vectors = int(4*n/18) + text_class_embeddings.shape[0] - text_random_embeddings.shape[0]
  normalized_random_vectors = normalize_vectors(generate_random_vectors(n_random_vectors, random_vec_size))

  positives = [dataset_img_embeddings[index] for index in assumed_correct_indices]
  positives += [text_class_embeddings[i] for i in range(text_class_embeddings.shape[0])]

  negatives = normalized_random_vectors + [text_random_embeddings[i] for i in range(text_random_embeddings.shape[0])]

  X_train = positives + negatives
  y_train = [1] * len(positives) + [0] * len(negatives)

  return X_train, y_train

### Dataset prep

#### Train dataset

In [None]:
# Two random vectors with the dimension of the text embeddings
# NOTE(review): these two globals do not seem to be used by the cells below — confirm
random_vectors = generate_random_vectors(2, text_random_embeddings[0].shape[0])

# Normalize the random vectors
normalized_random_vectors = normalize_vectors(random_vectors)

In [None]:
# Self-similarity of the images of the selected class (DOG_OR_CAT, not necessarily "dog")
# and the average similarity of every image to the other images of that class
similarity_matrix_dog = calc_similarity_matrix_embeddings(image_embeddings[DOG_OR_CAT])
avg_similarity_dog_imgs = calculate_column_average(similarity_matrix_dog)

In [None]:
# Training set for all embedding-based classifiers below; the random vectors get the same
# dimension as the text embeddings
X_train_embeddings, y_train_embeddings = create_train_assumed_correct(dataset_img_embeddings = image_embeddings[DOG_OR_CAT],
                                                                      text_random_embeddings = text_random_embeddings,
                                                                      text_class_embeddings = text_dog_embeddings,
                                                                      avg_similarity_imgs = avg_similarity_dog_imgs, random_vec_size = text_random_embeddings[0].shape[0])


#### Test dataset for internal pollution

In [None]:
# Internal test set: every image embedding of the selected class, labelled 1 when its
# mislabel flag is False and 0 otherwise
X_test_embeddings_internal = []
y_test_embeddings_internal = []
for embedding, mislabel_flag in zip(image_embeddings[DOG_OR_CAT], mislabeled_indices[DOG_OR_CAT]):
  X_test_embeddings_internal.append(embedding)
  # the comparison is kept literal because the element type of the flags is not visible here
  y_test_embeddings_internal.append(1 if mislabel_flag == False else 0)

#### Test dataset for external pollution

In [None]:
# External test set: the clean embeddings of the selected class (label 1) followed by
# foreign embeddings acting as pollution (label 0).
clean_embeddings = image_embeddings[f"{DOG_OR_CAT}_clean"]
size_clean = len(clean_embeddings)

if TYPE_EXTERNAL_POLLUTION == "cross-label":
  if DOG_OR_CAT =="cat":
    other_label = "dog"
  else:
    other_label = "cat"

  # about 1% pollution in total: one half from ImageNet, one half from the other class
  n_per_source = int(size_clean/198)
  # NOTE(review): the other-class samples are taken from the unfiltered entry, which may
  # itself contain mislabeled instances — confirm that this is intended
  pollution_embeddings = imagenet_arr[:n_per_source] + image_embeddings[other_label][:n_per_source].tolist()

else:
  # pollute with the whole ImageNet subset
  pollution_embeddings = list(imagenet_arr)

X_test_embeddings_external = clean_embeddings.tolist() + pollution_embeddings
# one 0-label per sample actually appended, so X and y always have the same length
y_test_embeddings_external = [1] * size_clean + [0] * len(pollution_embeddings)

# Models internal pollution

## Setup

In [None]:
from collections import Counter

In [None]:
# Result stores: model name -> predictions on the internal / external test set
predictions_internal ={}
predictions_external = {}
# NOTE(review): not filled anywhere in the visible part of the notebook — confirm if still needed
predicted_filenames_internal = {}

## On embeddings

### MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# Small MLP (two hidden layers with 5 and 2 units) trained with SGD; fixed seed for reproducibility
clf_mlp = MLPClassifier(solver='sgd',  hidden_layer_sizes=(5, 2), random_state=1)

# the training embeddings are torch tensors and are handed over as numpy arrays
clf_mlp.fit([x.numpy() for x in X_train_embeddings], y_train_embeddings)



In [None]:
# Hard class predictions of the MLP on both test sets
predictions_internal["Embeddings: MLP (ReLu + SGD)"] = clf_mlp.predict(np.stack(X_test_embeddings_internal))
predictions_external["Embeddings: MLP (ReLu + SGD)"] = clf_mlp.predict(np.stack(X_test_embeddings_external))

### MLP with threshold

In [None]:
from sklearn.neural_network import MLPClassifier

# Same configuration and seed as the MLP cell above; the difference lies in how the
# predicted probabilities are thresholded in the next cell
clf_mlp = MLPClassifier(solver='sgd',  hidden_layer_sizes=(5, 2), random_state=1)

clf_mlp.fit([x.numpy() for x in X_train_embeddings], y_train_embeddings)



In [None]:
def detect_outlier(x):
  """Map a probability to a label: 0 if it is above 0.4, otherwise 1."""
  return 0 if x > 0.4 else 1

# the first column of predict_proba is thresholded (presumably the probability of class 0 — verify via clf_mlp.classes_)
proba_internal = clf_mlp.predict_proba(np.stack(X_test_embeddings_internal))
predictions_internal["Embeddings: MLP with threshold (ReLu + SGD)"] = [detect_outlier(row[0]) for row in proba_internal]

proba_external = clf_mlp.predict_proba(np.stack(X_test_embeddings_external))
predictions_external["Embeddings: MLP with threshold (ReLu + SGD)"] = [detect_outlier(row[0]) for row in proba_external]

### SVC with sigmoid

In [None]:
from sklearn import svm

# Support vector classifier with sigmoid kernel
clf_sigmoid = svm.SVC(kernel='sigmoid', gamma='scale')

# np.stack turns the lists of embeddings / labels into arrays
clf_sigmoid.fit(np.stack(X_train_embeddings), np.stack(y_train_embeddings))

In [None]:
# Predictions of the sigmoid-kernel SVC; both stores use the same key so that the internal
# and external result logs can be matched by model name
predictions_internal["Embeddings: SVC with sigmoid"] = clf_sigmoid.predict(np.stack(X_test_embeddings_internal))
predictions_external["Embeddings: SVC with sigmoid"] = clf_sigmoid.predict(np.stack(X_test_embeddings_external))


### SVC with RBF

In [None]:
from sklearn import svm

# Support vector classifier with RBF kernel.
# NOTE: the variable name clf_sigmoid is reused here although the kernel is 'rbf'.
clf_sigmoid = svm.SVC(kernel='rbf', gamma='scale')

clf_sigmoid.fit(np.stack(X_train_embeddings), np.stack(y_train_embeddings))

In [None]:
# clf_sigmoid refers to the RBF-kernel SVC here, provided the preceding cell was run last
predictions_internal["Embeddings: SVC with RBF"] = clf_sigmoid.predict(np.stack(X_test_embeddings_internal))
predictions_external["Embeddings: SVC with RBF"] = clf_sigmoid.predict(np.stack(X_test_embeddings_external))

## Ensembles

In [None]:
# number of image embeddings of the selected class (equals the size of the internal test set)
num_instances= len(image_embeddings[DOG_OR_CAT])

### Ensemble Embeddings MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# Ensemble of 30 MLPs with random seeds. A model only takes part in the vote for a test
# set if it labels enough instances as 1 (more than 99.5% of num_instances). An instance
# is finally labelled 0 as soon as one participating model labels it 0.
y=[]
y_external = []
detect_outlier = lambda x: 0 if x >0.48 else 1

def _combine_flags(combined, new_flags):
  """Merge a new vote: the first vote is taken as is, later ones add their 0-labels."""
  if combined == []:
    return new_flags
  return [0 if new == 0 else old for old, new in zip(combined, new_flags)]

# loop-invariant conversions are done once instead of once per model
X_train_np = [x.numpy() for x in X_train_embeddings]
X_internal_np = np.stack(X_test_embeddings_internal)
X_external_np = np.stack(X_test_embeddings_external)

for r in [random.randint(1,10000)  for i in range(30)]:
  clf_mlp = MLPClassifier(solver='sgd',  hidden_layer_sizes=(5, 2), random_state=r)

  clf_mlp.fit(X_train_np, y_train_embeddings)

  # predict_proba is evaluated once per test set and reused for the filter and the vote
  y_pred = [detect_outlier(i[0]) for i in clf_mlp.predict_proba(X_internal_np)]
  if np.count_nonzero(y_pred)>0.995*num_instances:
    y = _combine_flags(y, y_pred)

  y_pred = [detect_outlier(i[0]) for i in clf_mlp.predict_proba(X_external_np)]
  if np.count_nonzero(y_pred)>0.995*num_instances:
    y_external = _combine_flags(y_external, y_pred)



In [None]:
# store the combined votes of the MLP ensemble
predictions_internal["Embeddings Ensemble MLP (ReLu + SGD)"] = y
predictions_external["Embeddings Ensemble MLP (ReLu + SGD)"] = y_external

In [None]:
np.count_nonzero(y)

12415

### Ensemble Embeddings SVC and MLP

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn import svm

# Ensemble of 30 MLPs with random seeds plus one sigmoid-kernel SVC. A model only takes
# part in the vote for a test set if it labels enough instances as 1 (MLP: more than 99%
# of num_instances, SVC: more than 80%). An instance is finally labelled 0 as soon as one
# participating model labels it 0.
y=[]
y_external =[]
detect_outlier = lambda x: 0 if x >0.48 else 1

def _combine_flags(combined, new_flags):
  """Merge a new vote: the first vote is taken as is, later ones add their 0-labels."""
  if combined == []:
    return new_flags
  return [0 if new == 0 else old for old, new in zip(combined, new_flags)]

# loop-invariant conversions are done once instead of once per model
X_train_np = [x.numpy() for x in X_train_embeddings]
X_internal_np = np.stack(X_test_embeddings_internal)
X_external_np = np.stack(X_test_embeddings_external)

for r in [random.randint(1,10000)  for i in range(30)]:
  clf_mlp = MLPClassifier(solver='sgd',  hidden_layer_sizes=(5, 2), random_state=r)

  clf_mlp.fit(X_train_np, y_train_embeddings)

  # predict_proba is evaluated once per test set and reused for the filter and the vote
  y_pred = [detect_outlier(i[0]) for i in clf_mlp.predict_proba(X_internal_np)]
  if np.count_nonzero(y_pred)>0.99*num_instances:
    y = _combine_flags(y, y_pred)

  y_pred = [detect_outlier(i[0]) for i in clf_mlp.predict_proba(X_external_np)]
  if np.count_nonzero(y_pred)>0.99*num_instances:
    y_external = _combine_flags(y_external, y_pred)


clf_sigmoid = svm.SVC(kernel='sigmoid', gamma='scale')

clf_sigmoid.fit(np.stack(X_train_embeddings), np.stack(y_train_embeddings))

# If no MLP passed its filter, the SVC vote alone forms the ensemble result
# (indexing into the still empty list would fail otherwise).
y_pred = clf_sigmoid.predict(X_internal_np)
if np.count_nonzero(y_pred)>0.8*num_instances:
  y = _combine_flags(y, y_pred.tolist())

y_pred = clf_sigmoid.predict(X_external_np)
if np.count_nonzero(y_pred)>0.8*num_instances:
  y_external = _combine_flags(y_external, y_pred.tolist())



In [None]:
# store the combined votes of the MLP + SVC ensemble
predictions_internal["Embeddings: Ensemble MLP and SVC"] = y
predictions_external["Embeddings: Ensemble MLP and SVC"] = y_external

# Evaluation

## Helper functions

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(prediction, ground_truth):
    """
    Compute precision, recall and F1 with label 0 as the positive class.

    A true positive is a pair where prediction and ground truth are both 0. The metrics
    are printed and returned; a metric whose denominator is zero is reported as 0.

    Parameters:
    - prediction: sequence of predicted labels
    - ground_truth: sequence of true labels, same length as `prediction`

    Returns:
    - dict with the keys "tp", "p" (precision), "r" (recall) and "f1"

    Raises:
    - ValueError: if both sequences differ in length
    """

    if len(prediction) != len(ground_truth):
        raise ValueError("Lengths of prediction and ground truth arrays should be equal.")

    counts = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
    for pred, truth in zip(prediction, ground_truth):
        if pred == truth:
            outcome = "tp" if pred == 0 else "tn"
        else:
            outcome = "fp" if pred == 0 else "fn"
        counts[outcome] += 1

    tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]

    prec = tp / (tp+fp) if (tp+fp) > 0 else 0
    rec = tp / (tp+fn) if (tp+fn) > 0 else 0
    f1 = 2* prec* rec/(prec+rec) if prec != 0 and rec != 0 else 0

    for line in ("-----", f"tp: {tp}", f"Precision: {prec}", f"recall: {rec}", f"f1: {f1}"):
        print(line)

    return {"tp":tp,"p":prec,"r":rec,"f1":f1}

## Eval core

In [None]:
# folder on the mounted drive that receives the result pickles
path_store_results = "/content/drive/My Drive/FM/results/ML_approaches/"

Internal

In [None]:
# Evaluate every model on the internal test set and collect one metrics dict per model
log = []
for key, predictions in predictions_internal.items():
  print()
  print()
  print(key)
  metrics = calculate_metrics(predictions, y_test_embeddings_internal)
  # number of instances labelled 0, relative to the predictions that were evaluated
  metrics["0_count"] = len(predictions) - np.count_nonzero(predictions)
  metrics["id"] = key
  log.append(metrics)
  # distribution of the predicted labels
  label_counts = Counter(predictions)
  print(label_counts.keys())
  print(label_counts.values())



Embeddings: MLP (ReLu + SGD)
-----
tp: 13
Precision: 0.7647058823529411
recall: 0.52
f1: 0.6190476190476191
dict_keys([1, 0])
dict_values([12485, 17])


Embeddings: MLP with threshold (ReLu + SGD)
-----
tp: 13
Precision: 0.6842105263157895
recall: 0.52
f1: 0.5909090909090909
dict_keys([1, 0])
dict_values([12483, 19])


Embeddings: SVC with sigmoid
-----
tp: 17
Precision: 0.5483870967741935
recall: 0.68
f1: 0.6071428571428571
dict_keys([1, 0])
dict_values([12471, 31])


Embeddings: SVC with RBF
-----
tp: 9
Precision: 1.0
recall: 0.36
f1: 0.5294117647058824
dict_keys([1, 0])
dict_values([12493, 9])


Embeddings Ensemble MLP (ReLu + SGD)
-----
tp: 19
Precision: 0.21839080459770116
recall: 0.76
f1: 0.3392857142857143
dict_keys([1, 0])
dict_values([12415, 87])


Embeddings: Ensemble MLP and SVC
-----
tp: 17
Precision: 0.20238095238095238
recall: 0.68
f1: 0.3119266055045872
dict_keys([1, 0])
dict_values([12418, 84])


In [None]:
with open(path_store_results + f"{RESULTS_FILE}_internal.pkl" , 'wb') as file:
  # Writing the list of per-model metric dictionaries to the pickle file
  pickle.dump(log, file)

External

In [None]:
TYPE_EXTERNAL_POLLUTION

'cross-label'

In [None]:
# Evaluate every model on the external test set and collect one metrics dict per model
log_external = []
for key, predictions in predictions_external.items():
  print()
  print()
  print(key)
  metrics = calculate_metrics(predictions, y_test_embeddings_external)
  # The external test set is larger than num_instances, so the count of 0-labels has to
  # be derived from the length of the predictions themselves (it was negative otherwise)
  metrics["0_count"] = len(predictions) - np.count_nonzero(predictions)
  metrics["id"] = key
  log_external.append(metrics)
  # distribution of the predicted labels
  label_counts = Counter(predictions)
  print(label_counts.keys())
  print(label_counts.values())




Embeddings: MLP (ReLu + SGD)
-----
tp: 74
Precision: 0.9487179487179487
recall: 0.5873015873015873
f1: 0.7254901960784315
dict_keys([1, 0])
dict_values([12525, 78])


Embeddings: MLP with threshold (ReLu + SGD)
-----
tp: 83
Precision: 0.9325842696629213
recall: 0.6587301587301587
f1: 0.7720930232558139
dict_keys([1, 0])
dict_values([12514, 89])


Embeddings: SVC with sigmoid)
-----
tp: 125
Precision: 0.8992805755395683
recall: 0.9920634920634921
f1: 0.9433962264150942
dict_keys([1, 0])
dict_values([12464, 139])


Embeddings: SVC with RBF
-----
tp: 22
Precision: 1.0
recall: 0.1746031746031746
f1: 0.29729729729729726
dict_keys([1, 0])
dict_values([12581, 22])


Embeddings Ensemble MLP (ReLu + SGD)
-----
tp: 124
Precision: 0.6458333333333334
recall: 0.9841269841269841
f1: 0.779874213836478
dict_keys([1, 0])
dict_values([12411, 192])


Embeddings: Ensemble MLP and SVC
-----
tp: 125
Precision: 0.6510416666666666
recall: 0.9920634920634921
f1: 0.7861635220125787
dict_keys([1, 0])
dict_valu

In [None]:
with open(path_store_results + f"{RESULTS_FILE}_external_{TYPE_EXTERNAL_POLLUTION}.pkl" , 'wb') as file:
  # Writing the list of per-model metric dictionaries to the pickle file
  pickle.dump(log_external, file)

# Prompt engineering

In [None]:
def clip_generated_label(img_encoding, text_features_normalized):
  """
  Zero-shot decision per image: 1 if the first text prompt gets a softmax probability
  above 0.5, otherwise 0.

  Parameters:
  - img_encoding: 2D tensor, one image embedding per row
  - text_features_normalized: 2D tensor, one text embedding per row; only the
    probability of the first row is evaluated

  Returns:
  - list of int (1 or 0), one entry per image
  """
  # source: https://github.com/openai/CLIP
  logits = 100.0 * img_encoding @ text_features_normalized.T
  probabilities = logits.softmax(dim=-1)
  first_prompt_wins = (probabilities[:, 0] > 0.5).tolist()
  return [1 if wins else 0 for wins in first_prompt_wins]

In [None]:
# Zero-shot baseline: compare every image with the prompt for the selected class and with
# the generic prompt "a photo of a something"
text_inputs = [f"a photo of a {c}" for c in [DOG_OR_CAT,"something"]]

# Calculate features
text_features=[]
for i in text_inputs:
  with torch.no_grad():
    text_features.append(model.encode_text(tokenize(i).to(device)).cpu())

text_features=torch.cat(text_features, 0)
images = image_embeddings[DOG_OR_CAT]
# Normalize image and text embeddings to unit length
# NOTE(review): assumes both have the same dtype for the matrix product — confirm for GPU runs
img_encoding = images/ images.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# 1 = class prompt is more likely than the generic prompt, 0 = otherwise
predicted = clip_generated_label(img_encoding, text_features)


calculate_metrics(predicted,  y_test_embeddings_internal)
print(Counter(predicted).keys())
print(Counter(predicted).values())

-----
tp: 13
Precision: 0.65
recall: 0.52
f1: 0.5777777777777778
dict_keys([1, 0])
dict_values([12482, 20])
