In [None]:
!pip install kagglehub



In [None]:
import kagglehub

# Fetch the Flickr8k dataset from Kaggle and remember where it was placed.
# force_download=True is passed explicitly (presumably to bypass any cached
# copy; confirm against the kagglehub documentation).
path = kagglehub.dataset_download(
    "adityajn105/flickr8k",
    force_download=True,
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/flickr8k


In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, add
from tensorflow.keras.layers import Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.model_selection import train_test_split
from PIL import Image
from tqdm import tqdm_notebook
from collections import Counter

In [None]:
# Build the dataset locations from the directory returned by
# kagglehub.dataset_download (`path`) rather than hard-coding the Kaggle mount
# point, so the notebook also works when the dataset lands somewhere else.
# The dataset root contains an `Images` folder and a `captions.txt` file.
images_dir = os.path.join(path, "Images")
captions_file = os.path.join(path, "captions.txt")


In [None]:
# List the dataset root to confirm its layout (expects captions.txt and Images).
!ls /kaggle/input/flickr8k/

captions.txt  Images


In [None]:

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Instantiate the pretrained CLIP model and its matching processor a single
# time; both are module-level objects reused by the embedding helpers.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)

def get_image_clip_embedding(image_path):
    """Return the CLIP embedding for the image stored at ``image_path``.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        numpy.ndarray: The output of ``model.get_image_features`` with all
        size-1 dimensions squeezed out (shape ``(512,)`` for the checkpoint
        loaded in this notebook).
    """
    # Open the file inside a context manager so its handle is released
    # deterministically; convert() returns an independent in-memory RGB copy
    # that stays valid after the file is closed.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs.squeeze().numpy()

def get_text_clip_embedding(text):
    """Return the CLIP embedding for a single text string.

    Args:
        text: The string to embed. Inputs longer than the tokenizer's maximum
            length are truncated instead of raising inside the text encoder.

    Returns:
        numpy.ndarray: The output of ``model.get_text_features`` with all
        size-1 dimensions squeezed out (shape ``(512,)`` for the checkpoint
        loaded in this notebook).
    """
    # truncation=True clips over-long inputs to the tokenizer's maximum length
    # (77 tokens for CLIP, per the model documentation); shorter inputs are
    # tokenized exactly as before.
    inputs = processor(text=[text], return_tensors="pt", truncation=True)
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    return outputs.squeeze().numpy()

In [None]:
# Entry names (not full paths) found in the image folder, in sorted order.
im_paths = os.listdir(images_dir)
im_paths.sort()

# Read every line of the captions file, then drop the first line
# (presumably the header row of the file).
with open(captions_file, 'r') as f:
    captions = f.read().splitlines()
del captions[:1]

In [None]:
# Preview the first six image file names.
im_paths[:6]

['1000268201_693b08cb0e.jpg',
 '1001773457_577c3a7d70.jpg',
 '1002674143_1b742ab4b8.jpg',
 '1003163366_44323f5815.jpg',
 '1007129816_e794419615.jpg',
 '1007320043_627395c3d8.jpg']

In [None]:
# Preview the first five raw caption lines.
captions[:5]

['1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .',
 '1000268201_693b08cb0e.jpg,A girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .',
 '1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .',
 '1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .']

In [None]:
# Sanity check: embed the first image and inspect the shape of the vector.
sample_image_path = os.path.join(images_dir, im_paths[0])
get_image_clip_embedding(sample_image_path).shape

(512,)

In [None]:
# Sanity check for the text encoder. Each entry of `captions` is a raw
# "<image file>,<caption text>" line, so embed only the part after the first
# comma rather than the file name plus caption.
get_text_clip_embedding(captions[0].partition(',')[2]).shape

(512,)

In [None]:
# Generate CLIP embeddings for every image and every caption, then save both
# collections together in a single .pt file.

import torch

# Maps image file name -> image embedding (numpy array).
image_embeddings = {}
for img_name in tqdm_notebook(im_paths, desc="Generating image embeddings"):
    img_path = os.path.join(images_dir, img_name)
    image_embeddings[img_name] = get_image_clip_embedding(img_path)

# Maps image file name -> list of caption embeddings, in file order.
caption_embeddings = {}
for caption in tqdm_notebook(captions, desc="Generating text embeddings"):
    # Each line has the form "<image file>,<caption text>". Split on the FIRST
    # comma only, because the caption text may itself contain commas.
    # NOTE(review): assumes the caption field is not CSV-quoted; confirm.
    img_name, separator, caption_text = caption.partition(',')
    caption_text = caption_text.strip()
    if not separator or not caption_text:
        # Blank or malformed line: nothing to embed.
        continue
    embedding = get_text_clip_embedding(caption_text)
    # Keys match those of image_embeddings (the image file name).
    caption_embeddings.setdefault(img_name, []).append(embedding)


# Save embeddings
embeddings_dict = {
    "image_embeddings": image_embeddings,
    "caption_embeddings": caption_embeddings
}

torch.save(embeddings_dict, 'clip_embeddings.pt')
print("Embeddings saved to clip_embeddings.pt")


Generating image embeddings:   0%|          | 0/8091 [00:00<?, ?it/s]

Generating text embeddings:   0%|          | 0/40455 [00:00<?, ?it/s]

Embeddings saved to clip_embeddings.pt
