This notebook generates the text embeddings needed to create the labeled training dataset for the ML-based approaches.

Choose whether to use CLIP or ALIGN via the `MODEL` parameter in the Setup section.

# Setup

In [None]:
# Embedding model used to generate the embeddings; supported values: "CLIP" or "ALIGN".
MODEL = 'CLIP'

# Base names under which the embeddings are stored (lower-cased model name as suffix).
CORRECT_TEXT_EMBEDDINGS_NAME_BASE = "text_embeddings_correct_" + MODEL.lower()
RANDOM_TEXT_EMBEDDINGS_NAME_BASE = "text_embeddings_random_" + MODEL.lower()

# --- Dataset selection ---
# Needs to match the folder name in FM/datasets.
DATASET_NAME = 'cats-vs-dogs-large'
# True for large datasets, False for small ones.
LOAD_AND_EMBED_DATASET_IN_BATCHES = True
# '' loads the dataset normally, 'CREATE__{x}.pkl' creates the cache file {x}.pkl,
# '{x}.pkl' loads the cache file {x}.pkl.
# NOTE(review): the saved folder listing in this notebook shows upper-case names such as
# 'CLIP_cats-vs-dogs-large.pkl' while this value is lower-case - confirm the expected casing.
USE_CACHED_EMBEDDINGS = MODEL.lower() + '_cats-vs-dogs-large.pkl'
# If not '' (e.g. 'mislabeled_instances_cats-vs-dogs.pkl'), this pickle file specifies
# which files to drop from the loaded embeddings.
MISLABELED_INSTANCES = 'mislabeled_instances_cats-vs-dogs.pkl'

# --- Image-label mismatches to insert ---
# Percentage of mismatching image-label pairs added.
# NOTE(review): presumably a fraction (0.01 == 1 %) rather than a percent value - confirm with the consumer.
MISMATCH_PORTION = 0.01
# How much of MISMATCH_PORTION to produce by, in this order:
# [exchanging images between classes, inserting images from other datasets,
#  inserting randomly generated images, inserting placeholder images].
MANIPULATION_TYPES = [0.5, 0.5, 0.0, 0.0]
# Only needs to be specified if MANIPULATION_TYPES[1] > 0.
IMAGENET_EMBEDDINGS = MODEL.lower() + '_imagenet-subset.pkl'


In [None]:
# Google Drive locations: the project root, the folder holding all datasets, and the
# folder of the dataset selected via DATASET_NAME. Every path keeps its trailing '/'.
root_path = '/content/drive/My Drive/FM/'
datasets_path = f'{root_path}datasets/'
dataset_path = f'{datasets_path}{DATASET_NAME}/'

In [None]:
# Guard for "Run all": detect whether the import cell already ran in this kernel so the
# installs/imports below can be skipped. On a fresh kernel `torch` is not bound yet and
# the lookup raises NameError. Only that error is caught, so interrupts and genuine
# torch failures are no longer silently swallowed.
try:
  torch.tensor([[0]])
  libraries_already_loaded = True
except NameError:
  libraries_already_loaded = False

In [None]:
if not libraries_already_loaded:
  ! pip install ftfy regex tqdm
  ! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-oh1x8e0m
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-oh1x8e0m
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=919ce0f617f6df9a32f71febae854aa7907ae82f7d126e919125

In [None]:
# Import everything the notebook needs, but only when the guard flag says the libraries
# are not loaded yet (avoids re-importing on "Run all").
if not libraries_already_loaded:
  import torch
  from torchvision import transforms
  import clip
  from transformers import AutoImageProcessor, ViTModel, AlignProcessor, AlignModel, AutoTokenizer
  from transformers.tokenization_utils_base import BatchEncoding
  import numpy as np
  from matplotlib import pyplot as plt
  from PIL import Image
  from pkg_resources import packaging
  import os
  from google.colab import drive
  from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
  import glob
  import pickle
  from scipy.spatial.distance import cosine

# Printed unconditionally: torch is bound either by the import above or by an earlier run.
print("Torch version:", torch.__version__)

Torch version: 2.1.0+cu121


In [None]:
# Mount Google Drive at /content/drive (root_path points into it) and list the
# datasets folder as a quick check of what is available.
drive.mount('/content/drive')
!ls "{datasets_path}"

Mounted at /content/drive
AlexNet_cats-vs-dogs-large.pkl			 mislabeled_instances_cats-vs-dogs_CATS_ONLY.pkl
AlexNet_imagenet-subset.pkl			 mislabeled_instances_cats-vs-dogs.pkl
AlexNet_traffic-signs.pkl			 note.txt
ALIGN_cats-vs-dogs-large.pkl			 text_dog_embeddings.pkl
ALIGN_imagenet-subset.pkl			 text_embeddings_correct_align_cat.pkl
ALIGN_traffic-signs.pkl				 text_embeddings_correct_align_dog.pkl
cats-dogs-big_ids.pkl				 text_embeddings_correct_clip_dog.pkl
cats-dogs-big.pkl				 text_embeddings_random_align_cat.pkl
cats-vs-dogs-large				 text_embeddings_random_align_dog.pkl
CLIP_cats-vs-dogs-large.pkl			 text_embeddings_random_clip_dog.pkl
CLIP_imagenet-subset.pkl			 text_random_embeddings.pkl
CLIP_traffic-signs.pkl				 traffic-signs
dog_wrong_2_12.txt				 train-small
dog_wrong.txt					 ViT-CLS_cats-vs-dogs-large.pkl
image_embeddings__cats-vs-dogs.pkl		 ViT-CLS_imagenet-subset.pkl
image_embeddings__traffic-signs.pkl		 ViT-CLS_traffic-signs.pkl
imagenet_one-of-each-class-except-cat

# Load foundation models

In [None]:
def get_embeddings_batchwise(all_labels, model, preprocess, batch_size=64, tokenize_fn=None, target_device=None):
  """Encode text labels with `model.encode_text` and return them as one tensor.

  Every label is tokenized and encoded on its own (one `encode_text` call per
  label); `batch_size` only controls how often a progress line is printed.

  Args:
    all_labels: Sequence of text labels; must support `len()`.
    model: Object exposing `encode_text(tokens)`.
    preprocess: Unused; kept so existing positional calls keep working.
    batch_size: Print a progress line after every `batch_size` encoded labels
      (and once more after the final label).
    tokenize_fn: Callable mapping one label to the model input. Defaults to the
      notebook-level `tokenize` chosen in the model-loading cell.
    target_device: Device the tokenized input is moved to. Defaults to the
      notebook-level `device`.

  Returns:
    The per-label `encode_text` results concatenated with `torch.cat`, in the
    order of `all_labels`.

  Note:
    An empty `all_labels` is not supported: `torch.cat` is then called on an
    empty list.
  """
  # Fall back to the notebook globals so existing three-argument calls behave as before.
  if tokenize_fn is None:
    tokenize_fn = tokenize
  if target_device is None:
    target_device = device

  n_labels = len(all_labels)
  embeddings = []
  with torch.no_grad():
    # Count from 1 so the progress line reports how many labels are done, and so the
    # first report comes after exactly `batch_size` labels rather than `batch_size + 1`.
    for n_done, label in enumerate(all_labels, start=1):
      embeddings.append(model.encode_text(tokenize_fn(label).to(target_device)))
      if n_done % batch_size == 0 or n_done == n_labels:
        print(n_done, 'loaded and encoded')

  return torch.cat(embeddings)

## Load CLIP or ALIGN

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"


class Align(torch.nn.Module):
  """Wrapper around the Hugging Face ALIGN checkpoint "kakaobrain/align-base".

  Provides `encode_image` and `encode_text` methods so the wrapped model can be
  used wherever the notebook calls `model.encode_text(...)`.
  """

  def __init__(self):
    super().__init__()
    checkpoint_name = "kakaobrain/align-base"
    self.align = AlignModel.from_pretrained(checkpoint_name)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Pass `x` straight to the wrapped ALIGN model and return its result.

    NOTE(review): not called in the visible notebook; confirm that a single
    tensor is a valid input and that the result really is a Tensor.
    """
    result = self.align(x)
    return result

  def encode_image(self, img: torch.Tensor) -> torch.Tensor:
    """Return the wrapped model's image features for `img`."""
    image_features = self.align.get_image_features(img)
    return image_features

  def encode_text(self, text: BatchEncoding) -> torch.Tensor:
    """Return the wrapped model's text features; `text` is unpacked as keyword arguments."""
    text_features = self.align.get_text_features(**text)
    return text_features


def align_preprocessor_with_memory_fix(img) -> torch.Tensor:
  """Convert `img` into ALIGN pixel values on `device`.

  An AlignProcessor is loaded from "kakaobrain/align-base" on every call and
  deleted again before returning. The processor output is moved to `device`
  and `squeeze(0)` is applied to its `pixel_values`.
  """
  processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
  with torch.no_grad():
    encoded = processor(images=img, return_tensors="pt")
    pixel_values = encoded.to(device).pixel_values
    processed = pixel_values.squeeze(0)
  del processor
  return processed


# Bind `model`, `preprocess` and `tokenize` for the backbone selected via MODEL.
# Anything other than 'CLIP' or 'ALIGN' is rejected before any model is loaded.
if MODEL not in ('CLIP', 'ALIGN'):
  raise ValueError(f'Invalid model {MODEL} selected!')

if MODEL == 'CLIP':
  model, preprocess = clip.load("ViT-B/32", device=device)
  tokenize = clip.tokenize
else:
  model = Align().to(device)
  preprocess = align_preprocessor_with_memory_fix
  tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

  def tokenize(s):
    # Wrap the single string in a list and return PyTorch tensors.
    return tokenizer([s], padding=True, return_tensors="pt")


100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 100MiB/s]


# DOG case
Generate and store only the text embeddings needed to create the dataset for the ML approach:
- randomized text embeddings for correct and incorrect images

In [None]:
# Non-dog classes used to build "a picture of <class>" prompts for the random-text set.
# Each class appears once (the earlier duplicates of 'sunset', 'portrait', 'wildlife' and
# 'wedding' were removed, original order kept) and 'hapiness' is now spelled 'happiness'.
additional_classes = [
    'car', 'bicycle', 'flower', 'sunset', 'beach', 'mountain', 'skyline', 'food', 'people',
    'portrait', 'wildlife', 'landscape', 'cityscape', 'architecture', 'street', 'travel', 'vacation',
    'sports', 'basketball', 'soccer', 'tennis', 'gymnastics', 'swimming', 'yoga', 'music', 'concert',
    'festival', 'guitar', 'piano', 'art', 'painting', 'sculpture', 'abstract', 'fashion', 'clothing',
    'jewelry', 'makeup', 'hair', 'wedding', 'technology', 'smartphone', 'laptop', 'camera', 'drone', 'nature',
    'forest', 'river', 'desert', 'bird', 'insect', 'fish', 'reptile', 'fruit', 'vegetable',
    'coffee', 'tea', 'wine', 'beer', 'cocktail', 'vintage', 'retro', 'urban', 'rural', 'selfie',
    'family', 'friends', 'love', 'romance', 'baby', 'childhood', 'home', 'garden', 'interior',
    'exterior', 'sunrise', 'moon', 'stars', 'space', 'astronomy', 'weather', 'rain',
    'snow', 'fireworks', 'festive', 'celebration', 'holiday', 'halloween', 'christmas', 'easter',
    'surprise', 'temple', 'sadness', 'happiness', 'game', 'cat', 'person', 'plant', 'furniture', 'memory',
]

In [None]:
import random
import string

# Random-text prompts: 1000 strings of 2-10 random uppercase letters/digits, followed by
# one "a picture of <class>" prompt per entry of additional_classes.
text_random = [
    ''.join(random.choice(string.ascii_uppercase + string.digits) for _char in range(random.randint(2, 10)))
    for _sample in range(1000)
]
text_random += ["a picture of {}".format(obj) for obj in additional_classes]

# Dog prompts: eleven fixed phrasings, then five prompt stems that are each extended
# 100 times with a random suffix of 2-10 uppercase letters/digits (stems in listed order).
text_dog = [
    "a dog",
    "a picture of dog",
    "a picture of people with dog",
    "a picture of dog playing",
    "a picture of dog sleeping",
    "dog picture",
    "dog drawing",
    "a drawing of dog",
    "a picture of many dogs",
    "a picture of dog with objects",
    "a picture of dog in nature",
] + [
    stem + ''.join(random.choice(string.ascii_uppercase + string.digits) for _char in range(random.randint(2, 10)))
    for stem in (
        'a picture of dog ',
        'a picture of people with dog ',
        'a picture of dog with objects ',
        'a picture of many dogs ',
        'a picture of dog with ',
    )
    for _sample in range(100)
]



In [None]:
# Encode the random-text prompts of the DOG case.
text_random_embeddings = get_embeddings_batchwise(all_labels=text_random, model=model, preprocess=preprocess)

64 loaded and encoded
128 loaded and encoded
192 loaded and encoded
256 loaded and encoded
320 loaded and encoded
384 loaded and encoded
448 loaded and encoded
512 loaded and encoded
576 loaded and encoded
640 loaded and encoded
704 loaded and encoded
768 loaded and encoded
832 loaded and encoded
896 loaded and encoded
960 loaded and encoded
1024 loaded and encoded
1088 loaded and encoded
1102 loaded and encoded


In [None]:
# Encode the dog prompts.
text_dog_embeddings = get_embeddings_batchwise(all_labels=text_dog, model=model, preprocess=preprocess)

64 loaded and encoded
128 loaded and encoded
192 loaded and encoded
256 loaded and encoded
320 loaded and encoded
384 loaded and encoded
448 loaded and encoded
510 loaded and encoded


In [None]:
# Target files in the datasets folder for the two embedding sets of the DOG case.
pickle_file_rand = f"{datasets_path}{RANDOM_TEXT_EMBEDDINGS_NAME_BASE}_dog.pkl"
pickle_file_dog = f"{datasets_path}{CORRECT_TEXT_EMBEDDINGS_NAME_BASE}_dog.pkl"

# Write the random-text embeddings first ...
with open(pickle_file_rand, mode='wb') as f:
  pickle.dump(text_random_embeddings, f)

# ... then the dog-prompt embeddings.
with open(pickle_file_dog, mode='wb') as f:
  pickle.dump(text_dog_embeddings, f)

# CAT case

In [None]:
# Non-cat classes used to build "a picture of <class>" prompts for the random-text set.
# Each class appears once (the earlier duplicates of 'sunset', 'portrait', 'wildlife' and
# 'wedding' were removed, original order kept) and 'hapiness' is now spelled 'happiness'.
additional_classes = [
    'car', 'bicycle', 'flower', 'sunset', 'beach', 'mountain', 'skyline', 'food', 'people',
    'portrait', 'wildlife', 'landscape', 'cityscape', 'architecture', 'street', 'travel', 'vacation',
    'sports', 'basketball', 'soccer', 'tennis', 'gymnastics', 'swimming', 'yoga', 'music', 'concert',
    'festival', 'guitar', 'piano', 'art', 'painting', 'sculpture', 'abstract', 'fashion', 'clothing',
    'jewelry', 'makeup', 'hair', 'wedding', 'technology', 'smartphone', 'laptop', 'camera', 'drone', 'nature',
    'forest', 'river', 'desert', 'bird', 'insect', 'fish', 'reptile', 'fruit', 'vegetable',
    'coffee', 'tea', 'wine', 'beer', 'cocktail', 'vintage', 'retro', 'urban', 'rural', 'selfie',
    'family', 'friends', 'love', 'romance', 'baby', 'childhood', 'home', 'garden', 'interior',
    'exterior', 'sunrise', 'moon', 'stars', 'space', 'astronomy', 'weather', 'rain',
    'snow', 'fireworks', 'festive', 'celebration', 'holiday', 'halloween', 'christmas', 'easter',
    'surprise', 'temple', 'sadness', 'happiness', 'game', 'dog', 'person', 'plant', 'furniture', 'memory',
]

In [None]:
import random
import string

# Random-text prompts: 1000 strings of 2-10 random uppercase letters/digits, followed by
# one "a picture of <class>" prompt per entry of additional_classes.
text_random = [
    ''.join(random.choice(string.ascii_uppercase + string.digits) for _char in range(random.randint(2, 10)))
    for _sample in range(1000)
] + [f"a picture of {obj}" for obj in additional_classes]

# Cat prompts: eleven fixed phrasings, then five prompt stems that are each extended
# 100 times with a random suffix of 2-10 uppercase letters/digits (stems in listed order).
# The fourth stem reads 'many cats' (previously 'many cat'), matching the fixed prompt
# "a picture of many cats" in this list.
text_cat = [
    "a cat",
    "a picture of cat",
    "a picture of people with cat",
    "a picture of cat playing",
    "a picture of cat sleeping",
    "cat picture",
    "cat drawing",
    "a drawing of cat",
    "a picture of many cats",
    "a picture of cat with objects",
    "a picture of cat in nature",
] + [
    stem + ''.join(random.choice(string.ascii_uppercase + string.digits) for _char in range(random.randint(2, 10)))
    for stem in (
        'a picture of cat ',
        'a picture of people with cat ',
        'a picture of cat with objects ',
        'a picture of many cats ',
        'a picture of cat with ',
    )
    for _sample in range(100)
]



In [None]:
# Encode the random-text prompts of the CAT case.
text_random_embeddings = get_embeddings_batchwise(all_labels=text_random, model=model, preprocess=preprocess)

64 loaded and encoded
128 loaded and encoded
192 loaded and encoded
256 loaded and encoded
320 loaded and encoded
384 loaded and encoded
448 loaded and encoded
512 loaded and encoded
576 loaded and encoded
640 loaded and encoded
704 loaded and encoded
768 loaded and encoded
832 loaded and encoded
896 loaded and encoded
960 loaded and encoded
1024 loaded and encoded
1088 loaded and encoded
1102 loaded and encoded


In [None]:
# Encode the cat prompts.
text_cat_embeddings = get_embeddings_batchwise(all_labels=text_cat, model=model, preprocess=preprocess)

64 loaded and encoded
128 loaded and encoded
192 loaded and encoded
256 loaded and encoded
320 loaded and encoded
384 loaded and encoded
448 loaded and encoded
510 loaded and encoded


In [None]:
# Target files in the datasets folder for the two embedding sets of the CAT case.
pickle_file_rand = f"{datasets_path}{RANDOM_TEXT_EMBEDDINGS_NAME_BASE}_cat.pkl"
pickle_file_cat = f"{datasets_path}{CORRECT_TEXT_EMBEDDINGS_NAME_BASE}_cat.pkl"

# Write the random-text embeddings first ...
with open(pickle_file_rand, mode='wb') as f:
  pickle.dump(text_random_embeddings, f)

# ... then the cat-prompt embeddings.
with open(pickle_file_cat, mode='wb') as f:
  pickle.dump(text_cat_embeddings, f)