In [7]:
from datasets import load_dataset

imagenette = load_dataset(
    'frgfm/imagenette',
    '160px',
    split='train',
    ignore_verifications=False  # set to True if seeing splits Error
)
imagenette

Downloading and preparing dataset imagenette/160px (download: 95.08 MiB, generated: 129.48 MiB, post-processed: Unknown size, total: 129.48 MiB) to /home/gk/.cache/huggingface/datasets/frgfm___imagenette/160px/1.0.0/38929285b8abcae5c1305418e9d8fea5dd6b189bbbd22caba5f5537c7fa0f01f...


Downloading data: 100%|██████████| 99.0M/99.0M [01:52<00:00, 882kB/s] 
Downloading data: 100%|██████████| 500k/500k [00:00<00:00, 573kB/s]
Downloading data: 100%|██████████| 199k/199k [00:00<00:00, 374kB/s]t]
Downloading data files: 100%|██████████| 2/2 [00:04<00:00,  2.15s/it]
                                                                                         

Dataset imagenette downloaded and prepared to /home/gk/.cache/huggingface/datasets/frgfm___imagenette/160px/1.0.0/38929285b8abcae5c1305418e9d8fea5dd6b189bbbd22caba5f5537c7fa0f01f. Subsequent calls will reuse this data.




Dataset({
    features: ['image', 'label'],
    num_rows: 9469
})

In [8]:
from transformers import CLIPTokenizerFast, CLIPProcessor, CLIPModel
import torch

# if you have CUDA or MPS, set it to the active device like this
device = "cuda" if torch.cuda.is_available() else \
         ("mps" if torch.backends.mps.is_available() else "cpu")
model_id = "openai/clip-vit-base-patch32"

# we initialize a tokenizer, image processor, and the model itself
tokenizer = CLIPTokenizerFast.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id).to(device)

Downloading: 100%|██████████| 568/568 [00:00<00:00, 192kB/s]
Downloading: 100%|██████████| 862k/862k [00:01<00:00, 669kB/s] 
Downloading: 100%|██████████| 525k/525k [00:01<00:00, 389kB/s]  
Downloading: 100%|██████████| 2.22M/2.22M [00:04<00:00, 546kB/s] 
Downloading: 100%|██████████| 389/389 [00:00<00:00, 171kB/s]
Downloading: 100%|██████████| 4.19k/4.19k [00:00<00:00, 1.08MB/s]
Downloading: 100%|██████████| 316/316 [00:00<00:00, 67.4kB/s]
Downloading: 100%|██████████| 605M/605M [07:20<00:00, 1.38MB/s]  


In [25]:
prompt = "a dog running in the snow"

# create transformer-readable tokens
inputs = tokenizer(prompt, return_tensors="pt")

In [26]:
text_emb = model.get_text_features(**inputs)
text_emb.shape

torch.Size([1, 512])

In [27]:
image = processor(
    text=None,
    images=imagenette[0]['image'],
    return_tensors='pt'
)['pixel_values'].to(device)
image.shape

torch.Size([1, 3, 224, 224])

In [28]:
img_emb = model.get_image_features(image)
img_emb.shape

torch.Size([1, 512])

In [29]:
import numpy as np

np.random.seed(0)
# select 100 random image index values
sample_idx = np.random.randint(0, len(imagenette)+1, 100).tolist()
# extract the image sample from the dataset
images = [imagenette[i]['image'] for i in sample_idx]

In [30]:
len(images)

100

In [31]:
from tqdm.auto import tqdm

batch_size = 16
image_arr = None

for i in tqdm(range(0, len(images), batch_size)):
    # select batch of images
    batch = images[i:i+batch_size]
    # process and resize
    batch = processor(
        text=None,
        images=batch,
        return_tensors='pt',
        padding=True
    )['pixel_values'].to(device)
    # get image embeddings
    batch_emb = model.get_image_features(pixel_values=batch)
    # convert to numpy array
    batch_emb = batch_emb.squeeze(0)
    batch_emb = batch_emb.cpu().detach().numpy()
    # add to larger array of all image embeddings
    if image_arr is None:
        image_arr = batch_emb
    else:
        image_arr = np.concatenate((image_arr, batch_emb), axis=0)
image_arr.shape

100%|██████████| 7/7 [00:14<00:00,  2.02s/it]


(100, 512)

In [32]:
image_arr

array([[ 1.18991256e-01,  4.21024971e-02, -3.51747051e-02, ...,
         3.99638116e-01, -9.78647694e-02,  7.43066221e-02],
       [ 6.63230896e-01,  2.78828859e-01, -1.59405559e-01, ...,
         4.26265538e-01,  8.68764073e-02, -2.41833195e-01],
       [ 3.30069304e-01,  1.47955149e-01,  2.93400437e-02, ...,
         8.18970680e-01,  1.49025507e-02,  1.00412294e-01],
       ...,
       [ 1.70199797e-01, -3.23256552e-02,  7.37652183e-04, ...,
         5.50143361e-01,  3.10972035e-01,  2.53675759e-01],
       [-9.71966684e-01,  2.79788256e-01, -3.36869240e-01, ...,
         1.02926981e+00,  1.21567607e-01, -2.04559550e-01],
       [-8.52783561e-01,  1.90348431e-01, -4.11760002e-01, ...,
         6.10124826e-01,  4.70097624e-02, -8.22311565e-02]], dtype=float32)

In [33]:
image_arr.min(), image_arr.max()

(-8.291456, 3.392732)

In [34]:
image_arr = image_arr / np.linalg.norm(image_arr, axis=0)
image_arr.min(), image_arr.max()

(-0.3697751, 0.45963725)

In [35]:
text_emb = text_emb.cpu().detach().numpy()
scores = np.dot(text_emb, image_arr.T)
scores.shape

(1, 100)

In [36]:
scores

array([[ 3.27038944e-01,  9.76806104e-01,  5.27521074e-01,
         7.48146653e-01,  1.17575407e+00,  1.38256431e-01,
         3.08556700e+00, -9.47804570e-01,  2.96096206e-01,
         1.42296898e+00,  2.64722109e-03,  5.62113702e-01,
         1.54908824e+00,  5.19008160e-01,  1.27888322e+00,
         2.89413738e+00,  5.99182010e-01,  2.46326065e+00,
         2.32161164e+00,  5.26606321e-01, -2.15892017e-01,
         9.51878488e-01,  1.73784709e+00, -1.94650337e-01,
         2.22960281e+00,  4.99060065e-01,  3.87086868e-01,
         6.71090066e-01,  7.41404712e-01,  8.30270052e-01,
         4.06762362e-02,  2.90264398e-01,  7.15774894e-01,
         5.07093191e-01,  1.70750093e+00, -1.57645404e-01,
         9.45403934e-01, -3.77126962e-01,  8.90902042e-01,
         1.58476162e+00,  1.20106602e+00,  2.00123191e-02,
         2.56577396e+00,  7.64355659e-01,  2.33283252e-01,
         1.62110567e+00,  1.90073028e-01,  1.22596169e+00,
        -8.91583622e-01, -1.02959074e-01,  1.04290307e+0