In [None]:
!pip install transformers sentence-transformers timm faiss-cpu


In [None]:
import torch
import numpy as np
import faiss
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


In [None]:
img_path="/content/Railway-station-Reuters.jpg.webp"
query="a man with suitcase "

In [None]:
img = Image.open(img_path).convert("RGB")

In [None]:
from torchvision import transforms
import torchvision


In [14]:
# patches = []

# # List of different patch sizes
# patch_sizes = [128, 164, 180, 224]
# width, height = img.size

# for p_size in patch_sizes:
#     stride = int(p_size * 0.6)  # 60% overlap for each size
#     for y in range(0, height - p_size + 1, stride):
#         for x in range(0, width - p_size + 1, stride):
#             crop = img.crop((x, y, x + p_size, y + p_size))
#             bbox = (x, y, x + p_size, y + p_size)  # bounding box coordinates
#             patches.append((crop, bbox))  # store tuple (patch, bbox)


In [None]:
width, height = img.size

In [None]:
import math

def generate_patch_sizes_aspect_ratios(img_width, img_height, base_unit=32, num_scales=6,
                                       min_fraction=0.1, max_fraction=0.6,
                                       aspect_ratios = [

    0.5,    # tall rectangle (1:2)
    0.75,   # tall rectangle (~3:4)

    1.33,   # wide rectangle (~4:3)
    1.5,    # wide rectangle (3:2)
    2.0,    # wide rectangle (2:1)
    3.0,    # very wide rectangle (3:1)
    0.33,   # very tall rectangle (1:3)

]):
    """
    Generate patch sizes with multiple aspect ratios.

    Args:
        img_width (int): width of the image
        img_height (int): height of the image
        base_unit (int): smallest patch size unit (e.g., 32)
        num_scales (int): number of scales per aspect ratio
        min_fraction (float): min fraction of smaller dimension for smallest patch
        max_fraction (float): max fraction of smaller dimension for largest patch
        aspect_ratios (list of float): aspect ratios (width / height) to generate patches for

    Returns:
        List of tuples [(width1, height1), (width2, height2), ...] all multiples of base_unit
    """
    min_dim = min(img_width, img_height)
    min_size = int(min_dim * min_fraction)
    max_size = int(min_dim * max_fraction)

    # Round min/max sizes to nearest multiples of base_unit
    min_size = max(base_unit, (min_size // base_unit) * base_unit)
    max_size = (max_size // base_unit) * base_unit

    # Generate base square sizes linearly spaced between min and max
    base_sizes = []
    if num_scales > 1:
        step = (max_size - min_size) / (num_scales - 1)
    else:
        step = 0
    for i in range(num_scales):
        size = min_size + int(round(step * i))
        size = max(base_unit, round(size / base_unit) * base_unit)
        base_sizes.append(size)
    base_sizes = sorted(list(set(base_sizes)))

    patches = []
    for aspect_ratio in aspect_ratios:
        for base_size in base_sizes:
            # Calculate width and height based on aspect ratio
            # aspect_ratio = width / height
            # so height = base_size / sqrt(aspect_ratio), width = base_size * sqrt(aspect_ratio)
            height = base_size / math.sqrt(aspect_ratio)
            width = base_size * math.sqrt(aspect_ratio)

            # Round width and height to multiples of base_unit
            width = max(base_unit, int(round(width / base_unit) * base_unit))
            height = max(base_unit, int(round(height / base_unit) * base_unit))

            # Avoid patches bigger than image dimensions
            if width <= img_width and height <= img_height:
                patches.append((width, height))

    # Remove duplicates and sort by area descending (optional)
    patches = list(set(patches))
    patches.sort(key=lambda x: x[0]*x[1], reverse=True)
    return patches

# Example usage:
width, height = img.size
patch_sizes = generate_patch_sizes_aspect_ratios(width, height, base_unit=32, num_scales=6)
print("Patch sizes with aspect ratios:", patch_sizes)


In [17]:
patches = []

for (patch_w, patch_h) in patch_sizes:  # patch_sizes now list of (w, h) tuples
    stride_x = int(patch_w * 0.8)  # horizontal stride
    stride_y = int(patch_h * 0.8)  # vertical stride

    for y in range(0, height - patch_h + 1, stride_y):
        for x in range(0, width - patch_w + 1, stride_x):
            crop = img.crop((x, y, x + patch_w, y + patch_h))
            bbox = (x, y, x + patch_w, y + patch_h)
            patches.append((crop, bbox))
            print(f"Patch created with size: {patch_w}x{patch_h}")


Patch created with size: 480x320
Patch created with size: 480x320
Patch created with size: 480x320
Patch created with size: 480x320
Patch created with size: 672x224
Patch created with size: 672x224
Patch created with size: 672x224
Patch created with size: 224x672
Patch created with size: 224x672
Patch created with size: 224x672
Patch created with size: 224x672
Patch created with size: 224x672
Patch created with size: 224x672
Patch created with size: 416x352
Patch created with size: 416x352
Patch created with size: 416x352
Patch created with size: 416x352
Patch created with size: 416x352
Patch created with size: 416x352
Patch created with size: 352x416
Patch created with size: 352x416
Patch created with size: 352x416
Patch created with size: 352x416
Patch created with size: 448x320
Patch created with size: 448x320
Patch created with size: 448x320
Patch created with size: 448x320
Patch created with size: 448x320
Patch created with size: 448x320
Patch created with size: 320x448
Patch crea

In [None]:
image_embeddings = []

for patch, bbox in patches:
    inputs = processor(images=patch, return_tensors="pt").to(device)
    with torch.no_grad():
        img_feat = model.get_image_features(**inputs)
        img_feat = img_feat.squeeze(0)  # remove batch dim
    image_embeddings.append(img_feat.cpu().numpy())

image_embeddings = np.vstack(image_embeddings)


In [None]:

inputs = processor(text=[query], return_tensors="pt").to(device)
with torch.no_grad():
    text_embedding = model.get_text_features(**inputs)
text_embedding = text_embedding.cpu().numpy()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(text_embedding, image_embeddings)
best_match_idx = np.argmax(similarities)
print(f"Best matching patch index: {best_match_idx}")


In [None]:
matched_patch = patches[best_match_idx][0]  # get the image part
matched_patch.show()


In [None]:
from IPython.display import display

display(matched_patch)


In [None]:
from PIL import ImageDraw

# Get matched patch and its bounding box (x, y, w, h)
matched_crop, (x1, y1, w, h) = patches[best_match_idx]

# Calculate bottom-right coordinates
x2 = min(w, width)
y2 = min(h, height)

# Draw bounding box on original image
img_with_box = img.copy()
draw = ImageDraw.Draw(img_with_box)
draw.rectangle([x1, y1, x2, y2], outline="red", width=4)

# Display bounding box coordinates
print("Bounding box coordinates:", (x1, y1, x2, y2))

# Show the image with bounding box and the matched patch crop
display(img_with_box)
display(matched_crop)
print(w-x1)
print(h-y1)