In [1]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install transformers torch torchvision

Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
Collecting torchvision
  Downloading torchvision-0.23.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.34-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Using cached pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downloading MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Downloading transformers-4.55.2-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m4.0 

In [39]:
# Goal of this cell:
# Pick a compute device (a CUDA GPU when PyTorch can see one, otherwise the
# CPU), load the pretrained CLIP model and its processor, and place the model
# on that device.

import torch
from transformers import CLIPProcessor, CLIPModel

# --- 1. Device selection ---
# torch.cuda.is_available() is True when PyTorch can use a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "GPU is available. Using the GPU."
    if device.type == "cuda"
    else "No GPU found. Using the CPU."
)

# --- 2 & 3. Load the checkpoint and move the model to the device ---
# .to(device) moves the model's parameters and buffers and returns the model,
# so loading and moving can be written as a single expression.
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)

# The processor is not moved; only the tensors it produces are sent to the
# device in the cells that use it.
processor = CLIPProcessor.from_pretrained(model_name)

print(f"Model moved to {device} successfully!")

GPU is available. Using the GPU.
Model moved to cuda successfully!


In [40]:
!pip install Pillow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [41]:
# Goal of this cell:
# Load one image from disk, preprocess it with the CLIP processor, and pass it
# through the model to obtain its feature embedding.
#
# Requires the `device`, `model`, and `processor` variables created by the
# model-loading cell.

import torch
from PIL import Image  # Pillow, used to read the image file

# --- Step 1: Open the image ---
# IMPORTANT: Replace this with the actual path to one of your images.
image_path = "MewithCAt.jpg"

try:
    # Image.open() is lazy and keeps the file handle open. The `with` block
    # closes it, and convert("RGB") reads the pixel data into memory first,
    # so `raw_image` stays usable after the file is closed.
    # NOTE(review): assumes the processor converts images to RGB anyway, so
    # converting here should not change the embedding -- confirm.
    with Image.open(image_path) as image_file:
        raw_image = image_file.convert("RGB")
    print("Image loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file '{image_path}' was not found.")
    # Re-raise so the cell stops here with a traceback. Calling exit() in a
    # notebook would instead ask the kernel to shut down, discarding the
    # loaded model and every other variable.
    raise


# --- Step 2 & 3: Preprocess the image and move it to the device ---
# The processor returns a dictionary-like batch; return_tensors="pt" requests
# PyTorch tensors, and .to(device) moves them to the same device as the model.
inputs = processor(images=raw_image, return_tensors="pt").to(device)


# --- Step 4: Get the embedding ---
# No gradients are needed because the model is not being trained here.
with torch.no_grad():
    image_features = model.get_image_features(**inputs)

# Report the result: the embedding's shape and its first ten values.
print("\nEmbedding generated successfully!")
print("Shape of the embedding:", image_features.shape)
print("Here are the first few values of the embedding:")
print(image_features[0, :10])

Image loaded successfully.

Embedding generated successfully!
Shape of the embedding: torch.Size([1, 512])
Here are the first few values of the embedding:
tensor([ 0.1851,  0.2269,  0.0139,  0.1115, -0.3792,  0.1189, -0.3501,  0.1943,
         0.9519,  0.1973], device='cuda:0')


In [42]:
# Goal of this cell:
# Turn a short text description into a CLIP feature embedding: tokenize it
# with the processor, then pass the result to model.get_text_features().

import torch

# Requires the `device`, `model`, and `processor` variables from the earlier
# cells.

# --- Step 1: Choose the text ---
# Any description can be used here.
text_description = "rocket"
print(f"Processing text: '{text_description}'")


# --- Step 2 & 3: Tokenize, then move the tensors to the device ---
inputs = processor(
    text=text_description,
    return_tensors="pt",  # ask for PyTorch tensors
    padding=True,         # pad so every sequence in a batch has equal length
    truncation=True,      # cut text that is too long for the model
)
inputs = inputs.to(device)


# --- Step 4: Compute the embedding ---
# Gradients are switched off: this is inference only, not training.
with torch.no_grad():
    text_features = model.get_text_features(**inputs)

# Report the result: the embedding's shape and its first ten values.
print("\nText embedding generated successfully!")
print("Shape of the embedding:", text_features.shape)
print("Here are the first few values of the embedding:")
print(text_features[0][:10])

Processing text: 'rocket'

Text embedding generated successfully!
Shape of the embedding: torch.Size([1, 512])
Here are the first few values of the embedding:
tensor([ 0.1666, -0.0631,  0.1757,  0.2560,  0.0776,  0.2820,  0.0721, -1.2316,
        -0.2682,  0.4632], device='cuda:0')


In [43]:
# Goal of this cell:
# Score how well the image and the text match by taking the cosine similarity
# of their embeddings, then print a rough verdict for that score.

import torch
import torch.nn.functional as F

# Requires the `image_features` and `text_features` tensors produced by the
# two embedding cells.

# --- Step 1: Scale both embeddings to unit length ---
# F.normalize uses the L2 norm (p=2) by default. Once both vectors have unit
# length, their dot product is the cosine similarity.
image_features_norm = F.normalize(image_features, dim=-1)
text_features_norm = F.normalize(text_features, dim=-1)


# --- Step 2: Dot product via matrix multiplication ---
# The @ operator is matrix multiplication, the same operation as torch.matmul.
similarity_score = image_features_norm @ text_features_norm.T


# --- Step 3: Pull out the number and report it ---
# .item() returns the tensor's single element as a plain Python number.
final_score = similarity_score.item()

print(f"\nSimilarity Score: {final_score:.4f}")

# Rough interpretation of the score. The 0.25 and 0.18 cut-offs are
# hand-picked values; nothing in this notebook derives them.
if final_score > 0.25:
    print("This looks like a good match! ✅")
elif final_score > 0.18:
    print("This might be a potential match. 🤔")
else:
    print("This is likely not a good match. ❌")


Similarity Score: 0.2193
This might be a potential match. 🤔
