 # 1. Prep Work

### 1.1 Imports

In [133]:
import os
import numpy as np
import pandas as pd
import pprint

import pickle # Load refs and annotations
from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset

import torchvision
import torchmetrics


import pytorch_lightning as pl
from pytorch_lightning.utilities.types import STEP_OUTPUT

from transformers import T5Tokenizer, T5ForConditionalGeneration 

from tqdm import tqdm

from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import clip
from ultralytics import YOLO


import torch
import clip
from PIL import Image, ImageDraw
import cv2
import numpy as np


import matplotlib.pyplot as plt
import torch
import numpy as np
import os
import json

from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image, ImageDraw


### 1.2 Dataset

In [134]:
from torch.utils.data import Dataset

import json

class RefCOCOg(Dataset):
    """
    Args:
        dataset: a list of dictionaries containing:
        {
            'file_name': # path of the image, images will be loaded on the fly
            'caption': # referring caption
            'ann_id': # annotation ID (one per caption), taken from 'file_name'
            'bbox': # coordinates (xmin, ymin, xmax, ymax) of the bounding box
        }
    """
    def __init__(self, refs, annotations, split="train"):

        self.dataset = [{"file_name": os.path.join("./refcocog/images/", f'{"_".join(elem["file_name"].split("_")[:3])}.jpg'),
                            "caption": elem["sentences"][0]["raw"],
                            "ann_id": int(elem["file_name"].split("_")[3][:-4]),
                            "bbox": annotations[int(elem["file_name"].split("_")[3][:-4])]}
                        for elem in [d for d in refs if d["split"]==split]]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def __call__(self, idx):
        print(json.dumps(self.dataset[idx], indent=4))

In [135]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pickle
from clip.simple_tokenizer import SimpleTokenizer

class RefCOCOGDataset(Dataset):
    def __init__(self, root_dir, split_type, transform=None):
        self.root_dir = root_dir
        self.split_type = split_type
        self.transform = transform
        self.annotations = self._load_annotations()

    def _load_annotations(self):
        annotations_file = os.path.join(self.root_dir + '/annotations/', 'refs(umd).p')
        with open(annotations_file, 'rb') as f:
            annotations = pickle.load(f, encoding='latin1')
        return annotations

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        annotation = self.annotations[idx]
        image_id = annotation['image_id']
        image_path = os.path.join(self.root_dir, 'images', f'COCO_train2014_{str(image_id).zfill(12)}.jpg')
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        caption = annotation['sentences'][0]['raw']

        return image, caption

### 1.2 Load annotations

In [136]:
with open("./refcocog/annotations/refs(umd).p", "rb") as fp:
  refs = pickle.load(fp)

# 'annotations' will be a dict object mapping the 'annotation_id' to the 'bbox' to make search faster
with open("./refcocog/annotations/instances.json", "rb") as fp:
  data = json.load(fp)
  annotations = dict(sorted({ann["id"]: ann["bbox"] for ann in data["annotations"]}.items()))

In [5]:
# with open("./refcocog/annotations/refs(umd).p", "rb") as fp:
#     obj = pickle.loads(fp)
# pprint.pprint(obj)

### 1.3 Dataloader

In [137]:
def pad_image(image):
    """
    Performs bottom-right padding of the original image to 640x640 (max size of images in the dataset).
    Bottom-right padding prevents corruption of bounding boxes.

    ### Arguments
    image: a PIL.Image to transform
    """
    original_width, original_height = image.size
    padded_width, padded_height = 640, 640

    #pad_width = padded_width - original_width
    #pad_height = padded_height - original_height

    padded_image = Image.new(image.mode, (padded_width, padded_height), (0, 0, 0))
    padded_image.paste(image, (0, 0))

    return padded_image

def collate_fn(batch):
    images = [] #tensors of images
    for sample in batch:
        image = Image.open(sample["file_name"]).convert("RGB")
        image = pad_image(image=image)
        images.append(transform(image))
        #images.append(sample["file_name"])

    images = torch.stack(images, dim=0)
    
    data = {} #captions, cap_id, bbox
    for key in batch[0].keys():
        if key != "file_name":
            data[key] = [sample[key] for sample in batch]
    
    return images, data

def collate_fn2(batch):
    images, caption_tokens = zip(*batch)
    print(images)
    # Process images
    transform = transforms.Compose([
        transforms.Pad(0, fill=0),  # Replace padding_value with your desired padding
        transforms.Resize((224, 224)),  # ResNet-50 input size
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image_tensors = torch.stack([transform(img) for img in images])

    # Tokenize the captions using the CLIP tokenizer
    #caption_lengths = [len(tokens) for tokens in caption_tokens]
    #max_caption_length = max(caption_lengths)
    captions = [item[1] for item in batch]
    #added_caption_tokens = [tokens + [0] * (max_caption_length - len(tokens)) for tokens in caption_tokens]
    caption_tensors = torch.tensor(captions)

    return {'images': image_tensors, 'captions': caption_tensors, 'caption_lengths': caption_lengths}

transform = transforms.Compose([
    transforms.Pad(0, fill=0),  # Replace padding_value with your desired padding
    transforms.Resize((224, 224)),  # ResNet-50 input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# create dataset and dataloader
"""
dataset_train = RefCOCOg(refs, annotations, split="train")
dataset_test = RefCOCOg(refs, annotations, split="test")
plt.imshow(Image.open(dataset_train[2]["file_name"]))
plt.imshow(Image.open(dataset_test[2]["file_name"]))
dataloader_train = DataLoader(dataset_train, batch_size=1, collate_fn=collate_fn)
dataloader_test = DataLoader(dataset_test, batch_size=1, collate_fn=collate_fn)
"""

dataset_root = "./refcocog"
test = RefCOCOg(refs, annotations, split="test")
#print(len(test))
#display(test[0][0])
#print(test[0])
dataloader_test = DataLoader(test[:1000], batch_size=1, collate_fn=collate_fn)


In [138]:
for inputs,outputs in dataloader_test:
    print(inputs)
    print("-----------")
    print(outputs["caption"][0])
    break


ciao = Image.open('./refcocog/images/COCO_train2014_000000380440.jpg')

tensor([[[[ 1.2899,  1.2557,  1.1700,  ...,  1.4612,  1.4954,  1.4783],
          [ 1.3242,  1.3242,  1.3242,  ...,  1.4440,  1.4269,  1.4098],
          [ 1.3413,  1.3413,  1.3413,  ...,  1.3242,  1.3927,  1.3413],
          ...,
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

         [[ 1.4482,  1.4307,  1.3782,  ...,  1.6232,  1.6583,  1.6408],
          [ 1.5007,  1.4832,  1.4832,  ...,  1.6057,  1.5882,  1.5707],
          [ 1.5007,  1.5007,  1.5007,  ...,  1.5007,  1.5532,  1.5007],
          ...,
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],

         [[ 1.6988,  1.6988,  1.6465,  ...,  1.8034,  1.8383,  1.8383],
          [ 1.7511,  1.7337,  

### 1.5 Utils

In [139]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cpu


In [140]:
def load_image_plt(path):
    """ Load image with matplotlib"""
    return plt.imread(path)

def load_image_pil(path):
    """ Load image with PIL"""
    return Image.open(path)

def get_distance_box_iou_accuracy(box_pred, box_true, iou_threshold=0.5):
    """
    Given the target boxes and the prediction return the
    accuracy of the prediction. The accuracy is computed as
    the percentage of boxes that have an IoU > iou_threshold
    with the target box.

    Args:
    @params box_pred: tensor of shape (batch_size, n_boxes, 4)
    @params box_true: tensor of shape (batch_size, n_boxes, 4)
    @params iou_threshold: float

    Returns:
    @params accuracy: float

    """

    iou = torchvision.ops.box_iou(box_pred, box_true).diagonal()
    giou = torchvision.ops.generalized_box_iou(box_pred, box_true).diagonal()

    return (iou > iou_threshold).float().mean(), iou.mean(), giou.mean()

# 2. Baseline

### 2.1 Gigiate Random

In [141]:
model, preprocess = clip.load("RN50")
model = model.eval()
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7fe6ff7112d0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

CELLE SINGOLE PER L'ENCODING DI IMMAGINE E TESTO

In [142]:
#CODICE PER DATALOADER CON SOLO I NOMI DELLE IMMAGINI

imgs = []
txts = []
for input, output in dataloader_test:
    imgs.append(input)
    txts.append(output["caption"][0])
print("PRIME 3 IMMAGINI: \n", imgs[:3])
print("PRIME 3 CAPTIONS: \n", txts[:3])

print(len(imgs))
print(len(txts))
print("DIM: ", imgs[0].dim())
porzione_imgs = imgs[:1000]
porzione_txts = txts[:1000]


#CODICE PER DATALOADER CON LE IMMAGINI DA ELABORATE

PRIME 3 IMMAGINI: 
 [tensor([[[[ 1.2899,  1.2557,  1.1700,  ...,  1.4612,  1.4954,  1.4783],
          [ 1.3242,  1.3242,  1.3242,  ...,  1.4440,  1.4269,  1.4098],
          [ 1.3413,  1.3413,  1.3413,  ...,  1.3242,  1.3927,  1.3413],
          ...,
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
          [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

         [[ 1.4482,  1.4307,  1.3782,  ...,  1.6232,  1.6583,  1.6408],
          [ 1.5007,  1.4832,  1.4832,  ...,  1.6057,  1.5882,  1.5707],
          [ 1.5007,  1.5007,  1.5007,  ...,  1.5007,  1.5532,  1.5007],
          ...,
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
          [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],

         [[ 1.6988,  1.6988,  1.6465,  ...,  1.8034,  1.8383,  1.8383],
         

In [37]:
#CONVERSIONE DEL PATH IN IMMAGINE

#CODICE PER DATALOADER CON SOLO I NOMI DELLE IMMAGINI

#images = [(Image.open(image[0]).convert("RGB")) for image in porzione_imgs]

#print("PRIME 3 IMMAGINI CONV: \n", images[:3])

PRIME 3 IMMAGINI CONV: 
 [<PIL.Image.Image image mode=RGB size=640x376 at 0x7FE6DAF8C1F0>, <PIL.Image.Image image mode=RGB size=640x431 at 0x7FE6DAF8C100>, <PIL.Image.Image image mode=RGB size=640x426 at 0x7FE6DAF8CBE0>]


In [143]:

#CODICE PER DATALOADER CON SOLO I NOMI DELLE IMMAGINI
tmp = []
for i in images:
    tmp.append(preprocess(i))


print("PRIME 3 IMMAGINI-TENSORI: \n", tmp[:3])

KeyboardInterrupt: 

In [89]:
#tensor stack & TOKENIZER


#CODICE PER DATALOADER CON SOLO I NOMI DELLE IMMAGINI

# image_input = torch.tensor(np.stack(imgs))
# text_tokens = clip.tokenize(["This is " + desc for desc in porzione_txts])

#CODICE PER DATALOADER CON LE IMMAGINI DA ELABORATE
image_input = torch.tensor(np.stack(imgs))
text_tokens = clip.tokenize(["This is " + desc for desc in porzione_txts])

print("\nDIM image_input: ", image_input[0].dim())
#print("\nDIM tmp: ", tmp[0].dim())


print("PRIME 3 IMMAGINI-TENSORI (after stack): \n", image_input[:3])
print("PRIME 3 TEXT-TOKEN: \n", text_tokens[:3])


DIM image_input:  4
PRIME 3 IMMAGINI-TENSORI (after stack): 
 tensor([[[[[ 1.2899,  1.2557,  1.1700,  ...,  1.4612,  1.4954,  1.4783],
           [ 1.3242,  1.3242,  1.3242,  ...,  1.4440,  1.4269,  1.4098],
           [ 1.3413,  1.3413,  1.3413,  ...,  1.3242,  1.3927,  1.3413],
           ...,
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
           [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

          [[ 1.4482,  1.4307,  1.3782,  ...,  1.6232,  1.6583,  1.6408],
           [ 1.5007,  1.4832,  1.4832,  ...,  1.6057,  1.5882,  1.5707],
           [ 1.5007,  1.5007,  1.5007,  ...,  1.5007,  1.5532,  1.5007],
           ...,
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
           [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],

          [[ 1.6988,  1.6

In [94]:
#ENCODING

#CODICE PER DATALOADER CON SOLO I NOMI DELLE IMMAGINI

# with torch.no_grad():
#     image_features = model.encode_image(image_input).float()
#     text_features = model.encode_text(text_tokens).float()


#CODICE PER DATALOADER CON LE IMMAGINI DA ELABORATE
with torch.no_grad():
    #image_features = model.encode_image(image_input).float()
    text_features = model.encode_text(text_tokens).float()


#print("PRIME 3 IMAGE FEATUREs: \n", image_input[:3])
print("PRIME 3 TEXT FEATURES: \n", text_features[:3])

PRIME 3 TEXT FEATURES: 
 tensor([[-0.0478,  0.0035, -0.1372,  ...,  0.1532, -0.0260,  0.1411],
        [-0.1248,  0.0529, -0.0958,  ...,  0.1181, -0.0588, -0.3231],
        [-0.0469,  0.0723, -0.0057,  ..., -0.1106, -0.4775,  0.3507]])


In [144]:
def encode_data(images_fp: list[str], texts: list[str]):
  # preprocess the images to transform from filenames to images to tensors
  images = [preprocess(Image.open(image[0]).convert("RGB")) for image in images_fp]
  
  # preprocess the texts to transform from text to tensors
  images_input = torch.tensor(np.stack(images))
  
  text_tokens = clip.tokenize(["This is " + desc for desc in texts])

  # encode the inputs
  with torch.no_grad():
    images_z = model.encode_image(images_input).float()
    texts_z = model.encode_text(text_tokens).float()
  
  return images_z, texts_z

# def encode_text(images_fp: list[torch.Tensor], texts: list[str]):

#   # preprocess the texts to transform from text to tensors
#   image_input = torch.tensor(np.stack(images_fp))
#   text_tokens = clip.tokenize(["This is " + desc for desc in texts])
#   print("YEEE CI SONO")
#   # encode the inputs
#   with torch.no_grad():
#     images_z = model.encode_image(image_input).float()
#     texts_z = model.encode_text(text_tokens).float()
#   print("E Qua ci sono? SIII")
#   return images_z, texts_z

def cosine_similarity(images_z: torch.Tensor, texts_z: torch.Tensor):
  # normalise the image and the text
  images_z /= images_z.norm(dim=-1, keepdim=True)
  texts_z /= texts_z.norm(dim=-1, keepdim=True)

  # evaluate the cosine similarity between the sets of features
  similarity = (texts_z @ images_z.T)

  return similarity.cpu()

# def get_data():
#   a = 0
#   images = []
#   texts = []

#   for d in dataloader:
#     texts.append(d[1]["caption"][0])
#     if a == 4:
#         break
#     a += 1
#   a = 0
#   for d in dataset:
#     images.append(d['file_name'])
#     if a == 4:
#         break
#     a += 1
    
#   return images, texts

# images_fp, texts = get_data()


In [12]:
imgs = []
txts = []
for input, output in dataloader_test:
    imgs.append(input)
    txts.append(output["caption"][0])
#print(imgs[0][0])
images_z, texts_z = encode_data(imgs, txts)
#print("IMAGES_Z", images_z[:3])
print("TEXTS_Z", txts[:3])
# encoded_texts = encode_text(images_z, texts_z)
#print("Encoded text: ", encoded_texts[:3])
# similarity = cosine_similarity(images_z, texts_z)
# print(similarity)

KeyboardInterrupt: 

In [145]:
# Constants and models

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# YOLO
model_yolo = YOLO("yolov8x.pt")

# CLIP
clip_model, clip_preprocess = clip.load("RN50", device=device)

### 2.2 FineTuning

#### 2.2.1 Creating train, validation and test set

In [None]:
def SplitData(dataset_name, batch_size=64, transform=None, test_batch_size=64):
  dataset = dataset_name

  if not transform:
    # convert the PIL images to Tensors
    transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

  # load data
  # full_training_data = dataset('./data', train=True, transform=transform, download=True)
  # test_data = dataset('./data', train=False, transform=transform, download=True)

  # create train and validation splits
  num_samples = len(dataset)
  training_samples = int(num_samples * 0.5 + 1)
  validation_samples = num_samples - training_samples

  training_data, validation_data = torch.utils.data.random_split(dataset, [training_samples, validation_samples])

  # initialize dataloaders
  train_loader = DataLoader(training_data, batch_size=1, collate_fn=collate_fn)
  val_loader = DataLoader(validation_data, batch_size=1, collate_fn=collate_fn)
  #test_loader = torch.utils.data.DataLoader(test_data, 1, shuffle=False)

  return train_loader, val_loader

#### 2.2.2 Test Step Zero-Shot Clip

In [152]:
def test_step_zero_shot_clip(net, data_loader, texts_z, device='cpu'):
  samples = 0.0
  cumulative_accuracy = 0.0

  # set the network to evaluation mode
  net.eval()

  # disable gradient computation (we are only testing, we do not want our model to be modified in this step!)
  with torch.no_grad():
    # iterate over the test set
    for inputs, targets in data_loader:
      #image = preprocess(Image.open(inputs[0]).convert("RGB"))
      #print("DIM: ", inputs.dim())
      #image_input = torch.tensor(np.stack(image))
      #print("DIM: ", image_input.dim())
      #print("\nimage: ", inputs)
      #print(data.keys())

      #print(targets)
      # load data into GPU
      # inputs = inputs.to(device)
      # targets = targets.to(device)
      
      text = clip.tokenize(targets["caption"][0])
      #print("TEXT: ", text)
      texts_z = model.encode_text(text).float()
      #print("TEXT encoded: ", texts_z2)
      # forward pass
      # these two lines are different from the "traditional" ones
      images_z = model.encode_image(inputs).float()
      print("\nImages_z: ", images_z.dim())


      images_z /= images_z.norm(dim=-1, keepdim=True)
      texts_z /= texts_z.norm(dim=-1, keepdim=True)
      
      outputs = (100 * images_z @ texts_z.T).softmax(dim=-1)
      print("\nOutput: ", outputs)

      # fetch prediction and loss value
      samples += inputs.shape[0]
      _, predicted = outputs.max(1)
      print("\nPREDICTED: ", predicted)

      # compute accuracy
      cumulative_accuracy += predicted.eq(texts_z).sum().item()
      #print("prediction: ", predicted.eq(texts_z).sum().item())
      print("Cumultative acc: ", cumulative_accuracy)

  return cumulative_accuracy / samples * 100

In [153]:
acc = test_step_zero_shot_clip(clip_model, dataloader_test, text_features)
print("ACCURACY: ", acc)

4

Images_z:  2

Output:  tensor([[1.0255e-05, 5.7962e-07, 1.4600e-08, 1.3623e-07, 3.4612e-07, 1.5488e-06, 3.3096e-08, 4.1697e-08, 1.0388e-07, 3.7865e-07, 9.0177e-09, 1.2125e-08, 1.4915e-07, 7.6058e-07, 1.2214e-06, 2.8844e-09, 1.0834e-02, 1.5411e-06, 7.9783e-10, 2.6430e-06, 1.6294e-07, 3.8926e-08, 1.2348e-06, 4.6986e-06, 2.5332e-10, 5.4078e-07,
         7.9448e-07, 2.2818e-07, 7.3196e-09, 7.5219e-08, 1.4211e-06, 3.7168e-07, 1.1598e-07, 3.6080e-08, 3.0425e-07, 2.0356e-06, 2.2827e-09, 3.9302e-09, 3.1921e-07, 2.6633e-07, 1.2088e-07, 4.7990e-08, 2.0044e-06, 5.3220e-07, 6.1258e-08, 2.3051e-08, 1.8669e-10, 3.4970e-06, 9.1164e-09, 7.7477e-08, 2.1703e-06, 6.1974e-08,
         1.1276e-10, 1.2411e-07, 1.6891e-07, 7.5512e-08, 2.0662e-07, 1.5273e-05, 9.9806e-08, 2.5086e-07, 5.1209e-08, 5.3706e-07, 3.3018e-09, 2.6585e-05, 1.8335e-06, 1.9704e-05, 9.2189e-06, 8.5769e-09, 2.1551e-08, 3.2507e-08, 1.3491e-04, 1.5022e-07, 7.5394e-07, 7.3412e-08, 1.3203e-07, 6.0027e-09, 3.2181e-08, 6.0032e-08,
         2.

KeyboardInterrupt: 

In [92]:
def test_step_zero_shot_clip2(net, data_loader, cost_function, device='cpu'):
  samples = 0.0
  cumulative_loss = 0.0
  cumulative_accuracy = 0.0


  # set the network to evaluation mode
  net.eval()

  # disable gradient computation (we are only testing, we do not want our model to be modified in this step!)
  with torch.no_grad():
    # iterate over the test set
    for batch_idx, (inputs, targets) in enumerate(data_loader):
      # load data into GPU
      inputs = inputs.to(device)
      targets = targets.to(device)

      # forward pass
      outputs = net(inputs)

      # loss computation
      loss = cost_function(outputs, targets)

      # fetch prediction and loss value
      samples += inputs.shape[0]
      cumulative_loss += loss.item() # Note: the .item() is needed to extract scalars from tensors
      _, predicted = outputs.max(1)

      # compute accuracy
      cumulative_accuracy += predicted.eq(targets).sum().item()

  return cumulative_loss / samples, cumulative_accuracy / samples * 100

In [13]:
def get_cost_function():
  cost_function = torch.nn.CrossEntropyLoss()
  return cost_function

#### 2.2.3 Tokenize 

In [None]:
for d in dataset:
    print(d['caption'])
    break

NameError: name 'dataset' is not defined

In [None]:
def tokenize_batches(dataset, batch_size):
    # Extract descriptions from the dataset
    descriptions = [data['target'] for data in dataset]
    
    # Initialize lists to store tokenized textual features
    all_texts_z = []

    # Tokenize and encode in batches
    for batch_start in range(0, len(descriptions), batch_size):
        batch_descriptions = descriptions[batch_start:batch_start + batch_size]

        # Tokenize batch and move to GPU
        text_tokens = clip.tokenize(batch_descriptions).cuda()

        # Encode batch and normalize
        with torch.no_grad():
            texts_z = model.encode_text(text_tokens).float()
            texts_z /= texts_z.norm(dim=-1, keepdim=True)

        all_texts_z.append(texts_z)

    # Concatenate and return all batches
    return torch.cat(all_texts_z, dim=0)

In [None]:
model, preprocess = clip.load("RN50")
model = model.cuda().eval()

In [None]:
#dataset_name = "cifar10"

batch_size_token = 64

#_, _, test_loader = SplitData(transform=preprocess)
texts_z = tokenize_batches(test,batch_size_token)


AttributeError: 'list' object has no attribute 'find'

In [None]:
print(test_loader[0])

TypeError: 'DataLoader' object is not subscriptable

In [None]:
cost_function = get_cost_function()

test_accuracy = test_step_zero_shot_clip2(model, dataloader_test, cost_function)

print("Test accuracy {:.2f}".format(test_accuracy))

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>