In [2]:
import os
import sys
import torch
import random
import configs
import numpy as np
import transformers
import torch.nn as nn
from PIL import Image
import tensorflow as tf
from tqdm.auto import tqdm
import torch.nn.functional as F
from training_utils import BottleneckTrainer

2023-11-30 00:47:59.970701: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-30 00:47:59.970787: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-30 00:47:59.970810: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-30 00:47:59.978534: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Seed the run through the project's `configs` helper so results are repeatable
# (presumably seeds the Python / NumPy / PyTorch RNGs — TODO confirm in configs).
configs.set_seed(42)

In [6]:
# Pick the compute device through the `configs` helper; `device` is later used to move
# the CLIP model and its inputs. The argument 2 is presumably a GPU index — TODO confirm.
device = configs.set_device(2)

There are 8 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-80GB


In [7]:
from datasets import load_dataset, DatasetDict

# Fetch the custom CIFAR-10 dataset and carve its "train" split into
# train / validation / test parts (80% / 10% / 10%).
hf_dataset = load_dataset("Andron00e/CIFAR10-custom")

# First cut: hold out 20% of the rows.
dataset = hf_dataset["train"].train_test_split(test_size=0.2)
# Second cut: halve the held-out rows into validation and test.
val_test = dataset["test"].train_test_split(test_size=0.5)

dataset = DatasetDict(
    train=dataset["train"],
    validation=val_test["train"],
    test=val_test["test"],
)

In [8]:
# Display the DatasetDict built above (split names, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 48000
    })
    validation: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 6000
    })
})

In [9]:
# Checkpoint identifier shared by the model and its processor.
model_name = "openai/clip-vit-base-patch32"
# Pretrained CLIP model; later cells call it for text features and image-text logits.
clip = transformers.CLIPModel.from_pretrained(model_name)
# Matching processor; later cells use `processor.tokenizer` for text-only inputs and
# `processor(text=..., images=...)` for joint text/image inputs.
processor = transformers.CLIPProcessor.from_pretrained(model_name)

In [10]:
def remove_prefixes(strings):
    """Strip a leading English article ("a", "an", "the") from each string.

    Args:
        strings: iterable of strings (e.g. concept names).

    Returns:
        A new list with one entry per input string. When the first
        whitespace-separated word is an article (case-insensitive), the entry
        is the remaining words joined by single spaces; otherwise the input
        string is returned unchanged. Empty or whitespace-only strings are
        passed through unchanged instead of raising ``IndexError``.
    """
    prefixes = {'a', 'an', 'the'}
    result = []

    for string in strings:
        words = string.split()
        # `words` is empty for "" or whitespace-only input (e.g. a blank line in a
        # text file), so guard before looking at the first word.
        if words and words[0].lower() in prefixes:
            result.append(' '.join(words[1:]))
        else:
            result.append(string)

    return result

# Read the concept vocabulary: one concept per line, lower-cased.
with open("conceptnet_cifar10_filtered_new.txt", "r", encoding="utf-8") as f:
    # Drop blank lines (a trailing newline would otherwise yield an empty concept).
    concepts = [line for line in f.read().lower().split("\n") if line.strip()]

# Strip leading articles once the file is closed; no file handle is needed for this step.
concepts = remove_prefixes(concepts)

In [11]:
def contrastive_loss(logits, dim):
    """Cross-entropy of the diagonal (matched) entries of a similarity matrix.

    Args:
        logits: similarity scores; log-softmax is taken along ``dim``.
        dim: axis over which the scores are normalised.

    Returns:
        Scalar tensor: the negated mean of the diagonal log-probabilities.
    """
    log_probs = F.log_softmax(logits, dim=dim)
    matched_log_probs = torch.diag(log_probs)
    return -matched_log_probs.mean()
    
def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    """Symmetric contrastive loss over a similarity matrix.

    The contrastive loss is evaluated with normalisation along dim 0 (the
    caption term) and along dim 1 (the image term); the result is their average.
    """
    caption_term, image_term = (
        contrastive_loss(similarity, dim=axis) for axis in (0, 1)
    )
    return (caption_term + image_term) / 2.0

In [12]:
# Integer label -> class name; keys 0..9 follow the order of the names below.
classes = dict(enumerate((
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)))

def label_to_word(label: int) -> str:
    """Return the class name for an integer label (raises KeyError if unknown)."""
    return classes[label]

In [13]:
from tensorflow.keras.datasets import cifar10

# Load the CIFAR-10 arrays through Keras.
(train_images, train_labels), (test_images, test_labels) = cifar10.load_data()


# Parallel lists: image_paths[k] is the file written for the k-th image and
# labels[k] is its class id. Train images come first, then test images.
image_paths = []
labels = []

save_dir = 'cifar10_images'
# exist_ok=True makes this a no-op on re-runs and avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

# Write every image to disk as "<split>_image_<i>.jpg" and record its path and label.
for split_name, split_images, split_labels in (
    ("train", train_images, train_labels),
    ("test", test_images, test_labels),
):
    for i in range(len(split_images)):
        image_path = os.path.join(save_dir, f"{split_name}_image_{i}.jpg")
        tf.keras.preprocessing.image.save_img(image_path, split_images[i])
        image_paths.append(image_path)
        # Each label row is indexed with [0] to obtain the scalar class id.
        labels.append(split_labels[i][0])

print(len(train_images), "\n")
print(len(test_images), "\n")
print(len(image_paths), "\n")

class CLIPDataset():
    """Map-style dataset pairing image files on disk with their labels.

    Attributes are kept as parallel sequences: ``image_path[i]`` is the file
    for sample ``i`` and ``title[i]`` is its label.
    """

    def __init__(self, list_image_path, list_txt):
        """Store the image paths and the per-image labels ("titles")."""
        self.image_path = list_image_path
        self.title = list_txt

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.title)

    def __getitem__(self, idx):
        """Open the image at position ``idx`` and return ``(image, title)``."""
        sample_title = self.title[idx]
        sample_image = Image.open(self.image_path[idx])
        return (sample_image, sample_title)


def collate_fn(batch):
    """Collate ``(image, title, ...)`` samples into a dict of parallel lists.

    Returns:
        ``{'image': [...], 'title': [...]}`` with entries in batch order.
    """
    images, titles = [], []
    for sample in batch:
        images.append(sample[0])
        titles.append(sample[1])
    return {'image': images, 'title': titles}

# Wrap the on-disk images and their integer labels; textual class names could be
# passed as `list_txt` instead if text annotations are wanted.
dataset = CLIPDataset(list_image_path=image_paths, list_txt=labels)

# Split exactly once. Re-splitting after the loaders are created would leave
# train_dataset / val_dataset / test_dataset pointing at different random subsets
# than the ones the loaders iterate over.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [50000, 3000, 7000])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, pin_memory=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, pin_memory=True)

# Sanity check: one dataset entry per saved image.
assert len(dataset) == len(image_paths)
print("Dataset size: {}".format(len(dataset)), "\n")

print("Train set: {}".format(len(train_dataset)), "\n")
print("Validation set: {}".format(len(val_dataset)), "\n")
print("Test set: {}".format(len(test_dataset)))

50000 

10000 

60000 

Dataset size: 60000 

Train set: 50000 

Validation set: 3000 

Test set: 7000


In [14]:
from transformers import CLIPModel, CLIPProcessor, AutoTokenizer

def preprocess_loader(loader, processor=None):
    """Run the CLIP processor over every batch of ``loader`` and keep the results.

    Args:
        loader: iterable of batches as produced by ``collate_fn`` (dicts with
            ``'image'`` and ``'title'`` entries).
        processor: optional, already-loaded CLIP processor to reuse. When
            omitted, the "openai/clip-vit-base-patch32" processor is loaded,
            which is what this function always did before the parameter existed.

    Returns:
        A list with one ``preprocess_batch`` result per batch, in iteration order.
        All preprocessed batches are held in memory at once.
    """
    if processor is None:
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    return [preprocess_batch(batch, processor) for batch in tqdm(loader)]

def preprocess_batch(batch, processor):
    """Encode one collated batch against the full list of class names.

    Args:
        batch: dict with ``'image'`` (list of images) and ``'title'`` (labels).
        processor: callable accepting ``text``, ``images``, ``return_tensors``
            and ``padding`` keyword arguments.

    Returns:
        ``(encoded_inputs, titles)`` — the processor output and the batch labels.
    """
    class_prompts = list(classes.values())
    encoded_inputs = processor(
        text=class_prompts,
        images=batch['image'],
        return_tensors="pt",
        padding=True,
    )
    return encoded_inputs, batch['title']

In [15]:
# Preprocess each loader once up front; every call returns a list holding all of that
# loader's preprocessed batches, so the three lists stay resident in memory.
# For train_loader (shuffle=True) the batch order is whatever this single pass produced.
train_loader_preprocessed = preprocess_loader(train_loader)
val_loader_preprocessed = preprocess_loader(val_loader)
test_loader_preprocessed = preprocess_loader(test_loader)

  0%|          | 0/1563 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

classes text features

In [16]:
# Tokenize the class names and embed them with CLIP's text tower.
classes_inputs = processor.tokenizer(list(classes.values()), padding=True, return_tensors="pt")
# These embeddings are computed once and reused, so skip autograd bookkeeping
# instead of keeping the text encoder's graph alive.
with torch.no_grad():
    classes_features = clip.get_text_features(**classes_inputs)
    # L2-normalise each embedding (unit norm along the last dimension).
    classes_features /= classes_features.norm(dim=-1, keepdim=True)

concepts text features

In [17]:
# Tokenize the concept vocabulary and embed it with CLIP's text tower.
concepts_inputs = processor.tokenizer(concepts, padding=True, return_tensors="pt")
# These embeddings are computed once and reused, so skip autograd bookkeeping
# instead of keeping the text encoder's graph alive for every concept.
with torch.no_grad():
    concepts_features = clip.get_text_features(**concepts_inputs)
    # L2-normalise each embedding (unit norm along the last dimension).
    concepts_features /= concepts_features.norm(dim=-1, keepdim=True)

dict of concepts encodings

In [18]:
# Concept string -> its row of `concepts_features`; a repeated concept keeps its last row.
concept_encodings = dict(zip(concepts, concepts_features))

dict of classes encodings

In [19]:
# Class name -> its row of `classes_features`.
classes_encodings = {}
for class_name, class_vector in zip(classes.values(), classes_features):
    # Write into the class dict; storing these in `concept_encodings` would leave
    # this dict empty and mix class names into the concept vocabulary.
    classes_encodings[class_name] = class_vector

In [20]:
def aggregate_similarity(similarity_matrix_chunk, aggregation_method='mean'):
    """Reduce a similarity matrix along dim 1.

    Args:
        similarity_matrix_chunk: tensor to reduce.
        aggregation_method: one of ``'max'``, ``'sum'`` or ``'mean'`` (default).

    Returns:
        The per-row maximum values, sums or means.

    Raises:
        ValueError: if ``aggregation_method`` is not one of the supported names.
    """
    if aggregation_method == 'max':
        # `.max(dim=...)` yields (values, indices); only the values are wanted.
        row_max, _ = similarity_matrix_chunk.max(dim=1)
        return row_max
    if aggregation_method == 'sum':
        return similarity_matrix_chunk.sum(dim=1)
    if aggregation_method == 'mean':
        return similarity_matrix_chunk.mean(dim=1)
    raise ValueError("Unknown aggregate_similarity")

In [193]:
# Single-batch check: score one training batch against the class names and the concept
# vocabulary, then print the mean image-to-concept logit per image.
# NOTE: this cell rebinds the notebook-level names `classes_inputs` and `labels`.
for batch_num, batch in enumerate(tqdm(train_loader)):
    # collate_fn only emits 'image' and 'title'; asking for 'title-text' raises KeyError.
    images, labels = batch['image'], batch['title']
    classes_inputs = processor(text=list(classes.values()), images=images, padding=True, return_tensors="pt")
    concept_inputs = processor(text=concepts, images=images, padding=True, return_tensors="pt")
    # Pure inference: no autograd graph is needed for a printed diagnostic.
    with torch.no_grad():
        classes_out = clip(**classes_inputs)
        concept_out = clip(**concept_inputs)
    print(aggregate_similarity(concept_out.logits_per_image))
    break  # one batch is enough for this check

  0%|          | 0/1563 [00:00<?, ?it/s]

tensor([20.2827, 20.0167, 20.6020, 19.9099, 20.7902, 21.9529, 21.8211, 20.2437,
        21.4387, 21.5623, 20.5779, 21.0109, 20.2587, 21.6531, 21.0925, 22.0862,
        19.8738, 21.6007, 21.1770, 21.8955, 20.9331, 21.1743, 20.9983, 21.2279,
        21.7373, 21.4769, 21.1718, 21.0591, 20.6278, 20.7612, 21.3855, 21.0181],
       grad_fn=<MeanBackward1>)


In [None]:
# Zero-shot classification over the preprocessed training batches: for each image take
# the class name with the highest image-to-text logit and collect it next to the label.
all_preds = []
all_labels = []
clip.to(device)
# Inference only — disabling autograd avoids building a graph for every batch.
with torch.no_grad():
    for batch_num, batch in enumerate(tqdm(train_loader_preprocessed)):
        inputs, labels = batch
        inputs = inputs.to(device)
        classes_out = clip(**inputs).logits_per_image
        probs = classes_out.softmax(dim=1)
        preds = torch.argmax(probs, dim=1)
        # Store plain Python ints on both sides so predictions and references share a type.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(int(label) for label in labels)

  0%|          | 0/1563 [00:00<?, ?it/s]

In [None]:
import datasets
# Accuracy metric object; its `.compute(predictions=..., references=...)` is called below.
# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate `evaluate`
# package in newer `datasets` releases — confirm against the installed version.
metric = datasets.load_metric("accuracy")

In [None]:
# Compare the collected zero-shot predictions with the collected labels.
accuracy = metric.compute(predictions=all_preds, references=all_labels)

In [None]:
# Display the result returned by `metric.compute`.
accuracy

## Cumulative similarity

In [None]:
def get_cummulative_similarity()