In [5]:

import os
from dotenv import load_dotenv
from huggingface_hub import login
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForImageTextRetrieval, BlipForConditionalGeneration
import torch.nn.functional as F
from transformers import CLIPProcessor, CLIPModel

# ─── 0. CONFIGURATION & LOCAL PATHS ─────────────────────────────────────────────
# Read environment variables from a local .env file, then authenticate against
# the Hugging Face Hub with the token found there.
load_dotenv()
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN is None or HUGGINGFACE_TOKEN == "":
    raise ValueError("Hugging Face token not found in .env file!")
login(token=HUGGINGFACE_TOKEN)

# Hub identifiers of the two pretrained checkpoints used in this notebook.
CLIP_RETRIEVAL_ID = "openai/clip-vit-base-patch32"
BLIP_CAPTION_ID = "Salesforce/blip-image-captioning-base"

# Local directories the BLIP captioning and CLIP embedding artifacts are
# saved to and later loaded from.
LOCAL_CAPTION_DIR = "../models/img_caption"
LOCAL_RETRIEVAL_DIR = "../models/embedding"

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [6]:

# 2.2 BLIP captioning (image→caption): fetch each artifact from the Hub and
# write it straight to LOCAL_CAPTION_DIR so it can be loaded from disk later.
caption_processor = BlipProcessor.from_pretrained(BLIP_CAPTION_ID)
caption_processor.save_pretrained(LOCAL_CAPTION_DIR)

caption_model = BlipForConditionalGeneration.from_pretrained(BLIP_CAPTION_ID)
caption_model.save_pretrained(LOCAL_CAPTION_DIR)


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [7]:
from transformers import CLIPProcessor, CLIPModel
# ─── 2. LOAD FROM HUGGING FACE & SAVE LOCALLY ───────────────────────────────────
# 2.1 Load CLIP retrieval model + processor (for text→embed & image→embed)
# Each name is bound to the matching object type: `retrieval_model` holds the
# CLIPModel and `retrieval_processor` holds the CLIPProcessor.
retrieval_model     = CLIPModel.from_pretrained(CLIP_RETRIEVAL_ID)
retrieval_processor = CLIPProcessor.from_pretrained(CLIP_RETRIEVAL_ID)

# Save the retrieval model and processor locally (both go to the same directory;
# the processor is saved last so its return value is the cell's displayed result)
retrieval_model.save_pretrained(LOCAL_RETRIEVAL_DIR)
retrieval_processor.save_pretrained(LOCAL_RETRIEVAL_DIR)


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

[]

In [8]:
# 1.1 BLIP captioning model + processor (image→text), read from LOCAL_CAPTION_DIR
caption_processor = BlipProcessor.from_pretrained(LOCAL_CAPTION_DIR)
caption_model = BlipForConditionalGeneration.from_pretrained(LOCAL_CAPTION_DIR)
caption_model = caption_model.to(device)

# 1.2 CLIP embedding model + processor (text→embed and image→embed),
#     read from LOCAL_RETRIEVAL_DIR
emb_processor = CLIPProcessor.from_pretrained(LOCAL_RETRIEVAL_DIR)
emb_model = CLIPModel.from_pretrained(LOCAL_RETRIEVAL_DIR)
emb_model = emb_model.to(device)


In [10]:
# NOTE(review): this cell repeats the local-load cell directly above it; running
# it rebinds the same four names from the same directories.

# CLIP embedding processor + model (text→embed and image→embed)
emb_processor = CLIPProcessor.from_pretrained(LOCAL_RETRIEVAL_DIR)
emb_model = CLIPModel.from_pretrained(LOCAL_RETRIEVAL_DIR)
emb_model = emb_model.to(device)

# BLIP captioning processor + model (image→text)
caption_processor = BlipProcessor.from_pretrained(LOCAL_CAPTION_DIR)
caption_model = BlipForConditionalGeneration.from_pretrained(LOCAL_CAPTION_DIR)
caption_model = caption_model.to(device)


In [11]:
import numpy as np
# 1. Text to normalized vector (using CLIP)
def text_to_vector(text: str) -> np.ndarray:
    """
    Convert text to an L2-normalized embedding vector using CLIP.

    The text is tokenized with truncation enabled, so input that exceeds the
    tokenizer's maximum length is cut to that length instead of being passed
    to the model in full.

    Args:
        text: Input text string

    Returns:
        Normalized embedding vector (numpy array); size-1 dimensions are
        squeezed out, so a single string yields a 1-D vector.
    """
    # truncation=True: CLIP's text encoder only accepts a bounded number of
    # tokens (77 for openai/clip-vit-base-patch32); without truncation a longer
    # input fails inside the model instead of producing an embedding.
    inputs = emb_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        text_features = emb_model.get_text_features(**inputs)
    # Normalize to unit vector (L2 norm)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    return text_features.cpu().numpy().squeeze()

# 2. Image to normalized vector (using CLIP)
def image_to_vector(image: Image.Image) -> np.ndarray:
    """
    Embed a PIL image with CLIP and scale the embedding to unit L2 length.

    Args:
        image: PIL Image object

    Returns:
        Normalized embedding vector (numpy array) with size-1 dimensions
        squeezed out.
    """
    pixel_batch = emb_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        embedding = emb_model.get_image_features(**pixel_batch)
    # Divide by the per-row L2 norm so every embedding has length 1.
    unit_embedding = embedding / embedding.norm(dim=-1, keepdim=True)
    return unit_embedding.cpu().numpy().squeeze()

# 3. Image to caption (using BLIP)
def image_to_caption(image: Image.Image, max_length: int = 30) -> str:
    """
    Produce a text caption for an image with the BLIP captioning model.

    Args:
        image: PIL Image object
        max_length: Value forwarded to `generate` as `max_length` (default 30)

    Returns:
        Caption string decoded from the first generated sequence, with
        special tokens removed.
    """
    model_inputs = caption_processor(image, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = caption_model.generate(**model_inputs, max_length=max_length)
    # Only the first generated sequence is decoded.
    first_sequence = generated_ids[0]
    return caption_processor.decode(first_sequence, skip_special_tokens=True)

In [12]:
# Example 1: embed a short descriptive phrase and check the vector's shape
txt1 = "banana holding gun with sunglasses"
txt1_embed = text_to_vector(text=txt1)
print("Text embedding shape:", txt1_embed.shape)

Text embedding shape: (512,)


In [14]:
# Example 2: embed a longer sentence on a different topic (statistics)
txt2 = "t-test in hypothesis testing assumes nomality assumption of population distribution, if sample size is less than 30"
txt2_embed = text_to_vector(text=txt2)
print("Text embedding shape:", txt2_embed.shape)

Text embedding shape: (512,)


In [16]:
from PIL import Image

# Load the demo image as RGB.
img_path = "image.png"
image = Image.open(img_path).convert("RGB")

# The caption and the embedding are computed independently from the same image.
caption = image_to_caption(image)
img_embed = image_to_vector(image)

print("Generated caption:", caption)
print("Image embedding shape:", img_embed.shape)

Generated caption: a banana with sunglasses and a gun
Image embedding shape: (512,)


In [18]:
def find_similarity(emb1, emb2):
    """
    Return the dot product of two embeddings.

    When both inputs are unit-length vectors (as produced by the embedding
    helpers in this notebook) the dot product equals their cosine similarity.

    Args:
        emb1: First embedding (array-like)
        emb2: Second embedding (array-like)

    Returns:
        Result of `np.dot(emb1, emb2)`
    """
    similarity = np.dot(emb1, emb2)
    return similarity

# Show the similarity of each text embedding to the image embedding, in order.
display(
    *(find_similarity(text_embed, img_embed) for text_embed in (txt1_embed, txt2_embed))
)

0.35700822

0.17156272