Task1: Download the Flickr8K dataset.

You will get it from Kaggle.

After downloading check if you have around 8000 images and 40K around captions.

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr8k")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/flickr8k


Task 2: Dump and save CLIP embeddings as .pt

In [2]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load model and processor once
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_image_clip_embedding(image_path):
    """Returns the CLIP embedding for an image."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs.squeeze().numpy()

def get_text_clip_embedding(text):
    """Returns the CLIP embedding for a text string."""
    inputs = processor(text=[text], return_tensors="pt")
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    return outputs.squeeze().numpy()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [3]:
import os
import pandas as pd

# Path to the dataset directory
dataset_path = "/kaggle/input/flickr8k"
images_path = os.path.join(dataset_path, "Images")
captions_path = os.path.join(dataset_path, "captions.txt")

# List all image files
image_files = [os.path.join(images_path, f) for f in os.listdir(images_path) if f.endswith('.jpg')]
print(f"Found {len(image_files)} image files.")

# Load captions
captions_df = pd.read_csv(captions_path)
print(f"Loaded {len(captions_df)} captions.")
display(captions_df.head())

Found 8091 image files.
Loaded 40455 captions.


Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [4]:
import torch
import os

# Create a directory to save image embeddings
image_embeddings_dir = "image_embeddings"
os.makedirs(image_embeddings_dir, exist_ok=True)

for image_file in image_files:
    try:
        # Get the image name without the path and extension
        image_name = os.path.basename(image_file).split('.')[0]
        embedding = get_image_clip_embedding(image_file)
        # Save the embedding as a .pt file
        torch.save(torch.tensor(embedding), os.path.join(image_embeddings_dir, f"{image_name}.pt"))
    except Exception as e:
        print(f"Error processing image {image_file}: {e}")

print(f"Saved image embeddings to {image_embeddings_dir}")

Saved image embeddings to image_embeddings


In [5]:
import torch
import os

# Create a directory to save text embeddings
text_embeddings_dir = "text_embeddings"
os.makedirs(text_embeddings_dir, exist_ok=True)

# We will group captions by image to save them together
for image_name, group in captions_df.groupby('image'):
    # Get the image name without the extension for the filename
    image_name_base = image_name.split('.')[0]
    text_embeddings = []
    for caption in group['caption']:
        try:
            embedding = get_text_clip_embedding(caption)
            text_embeddings.append(embedding)
        except Exception as e:
            print(f"Error processing caption '{caption}' for image {image_name}: {e}")
            # Append a placeholder or handle the error as appropriate
            text_embeddings.append(None) # Or np.zeros(embedding_dimension)

    # Save the list of embeddings for all captions of an image as a single .pt file
    # Filter out None values if any errors occurred
    valid_embeddings = [emb for emb in text_embeddings if emb is not None]
    if valid_embeddings:
        torch.save(torch.tensor(valid_embeddings), os.path.join(text_embeddings_dir, f"{image_name_base}.pt"))
    else:
        print(f"No valid embeddings generated for image {image_name}")


print(f"Saved text embeddings to {text_embeddings_dir}")

  torch.save(torch.tensor(valid_embeddings), os.path.join(text_embeddings_dir, f"{image_name_base}.pt"))


Saved text embeddings to text_embeddings


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Example: Copy a folder named 'my_results' from Colab's content directory to your Drive
!cp -r /content/image_embeddings/ /content/drive/MyDrive/AIML_Lab/Capstone_Project/clip_image_embeddings

In [13]:
# Example: Copy a folder named 'my_results' from Colab's content directory to your Drive
!cp -r /content/text_embeddings/ /content/drive/MyDrive/AIML_Lab/Capstone_Project/clip_text_embeddings