# Code-1 for chroma entries

In [None]:

#REQUIREMENTS->
#pip install scann chromadb torch torchvision


import os
import torch
import torchvision.transforms as transforms
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
from torch.utils.data import DataLoader, Dataset
import chromadb
from PIL import Image
from google.colab import drive

# Mount Google Drive only when /content/drive is not already present.
if not os.path.isdir("/content/drive"):
    drive.mount('/content/drive')

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MobileNetV2 with torchvision's default pretrained weights, moved to the
# selected device and switched to inference mode.
weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=weights).to(device)
model.eval()

# Image pipeline: resize, centre-crop to 128x128, convert to a tensor, then
# normalise each channel. The mean/std values look like the usual ImageNet
# statistics for torchvision pretrained models -- confirm if they are changed.
preprocess = transforms.Compose(
    [
        transforms.Resize(128),
        transforms.CenterCrop(128),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


class ImageDataset(Dataset):
    """Dataset that loads image files from a list of paths.

    Each item is a ``(transformed_image, file_name)`` pair, where
    ``file_name`` is the base name of the image path. When opening or
    transforming an image fails, the error is printed and the item is
    ``(None, None)`` instead, so consumers have to cope with those
    placeholder samples.
    """

    def __init__(self, image_paths, transform):
        # image_paths: indexable collection of image file paths.
        # transform: callable applied to each image after conversion to RGB.
        self.transform = transform
        self.image_paths = image_paths

    def __len__(self):
        """Return how many image paths the dataset holds."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, file_name)`` for *idx*, or ``(None, None)`` on failure."""
        path = self.image_paths[idx]
        try:
            rgb_image = Image.open(path).convert('RGB')
            return self.transform(rgb_image), os.path.basename(path)
        except Exception as e:
            # Best effort: report the unreadable file and hand back a placeholder.
            print(f"Error loading image {path}: {e}")
            return None, None


def add_image_embedding_to_chroma(collection, image_embeddings, image_ids):
    """Add embeddings to *collection*, skipping entries whose ID is ``None``.

    Args:
        collection: Object exposing ``add(embeddings=..., ids=...)``.
        image_embeddings: Iterable of vectors, each providing ``tolist()``.
        image_ids: IDs paired position-by-position with *image_embeddings*.

    Nothing is added when no embedding survives the filter.
    """
    kept_vectors = []
    for vector, image_id in zip(image_embeddings, image_ids):
        if image_id is not None:
            kept_vectors.append(vector.tolist())

    kept_ids = []
    for image_id in image_ids:
        if image_id is not None:
            kept_ids.append(image_id)

    if not kept_vectors:
        return

    collection.add(embeddings=kept_vectors, ids=kept_ids)


def _collate_skip_failed(samples):
    """Batch ``(image, image_id)`` samples, dropping those whose image is ``None``.

    Returns ``(stacked_images, image_ids)`` with ``image_ids`` as a tuple, or
    ``None`` when every sample in the batch failed to load. Kept at module
    level so DataLoader worker processes can reference it.
    """
    usable = [(image, image_id) for image, image_id in samples if image is not None]
    if not usable:
        return None
    images, image_ids = zip(*usable)
    return torch.stack(images), image_ids


def build_chroma_collection(image_dataset_folder, collection, batch_size=32):
    """Embed every image in a folder and store the vectors in *collection*.

    Args:
        image_dataset_folder: Directory whose ``.jpg``/``.jpeg``/``.png`` files
            (matched case-insensitively) are processed. Sub-directories are
            not visited.
        collection: Chroma collection that receives the embeddings, keyed by
            image file name.
        batch_size: Number of images sent through the model per step.

    Images that cannot be loaded are reported by the dataset and skipped, so a
    single unreadable file does not abort the run.
    """
    image_paths = [
        os.path.join(image_dataset_folder, fname)
        for fname in os.listdir(image_dataset_folder)
        # Lower-case first so files such as "IMG_1.JPG" are not silently missed.
        if fname.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]
    if not image_paths:
        print(f"No images found in {image_dataset_folder}")
        return

    dataset = ImageDataset(image_paths, preprocess)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=2,
        shuffle=False,
        # The dataset yields (None, None) for unreadable files, which the
        # default collate function cannot batch; filter them out instead.
        collate_fn=_collate_skip_failed,
    )

    for batch in dataloader:
        if batch is None:
            # Every image in this batch failed to load.
            continue
        batch_images, batch_ids = batch
        batch_images = batch_images.to(device)
        with torch.no_grad():
            # The model's final-layer outputs are what get stored as embeddings.
            embeddings = model(batch_images)
        add_image_embedding_to_chroma(collection, embeddings.cpu(), batch_ids)
        print(f"Processed batch with IDs: {batch_ids}")


def initialize_chroma_collection():
    """Return the ``image_embeddings`` Chroma collection, creating it if needed.

    A client is created with ``chromadb.Client()``. The existing collection is
    reused when it can be fetched; otherwise a new one is created. No
    embedding function is attached because vectors are supplied explicitly.

    Returns:
        The Chroma collection named ``image_embeddings``.
    """
    client = chromadb.Client()

    try:
        collection = client.get_collection("image_embeddings", embedding_function=None)
        print("Collection 'image_embeddings' already exists, using existing collection.")
    except Exception:
        # The exception raised for a missing collection differs between chromadb
        # releases, so catch Exception -- but not a bare ``except:``, which would
        # also swallow KeyboardInterrupt and SystemExit.
        collection = client.create_collection("image_embeddings", embedding_function=None)
        print("Created new collection 'image_embeddings'.")

    return collection


def main():
    """Embed the images of the Drive photo folder into the Chroma collection."""
    # Google Drive location of the photos; replace with your own folder path.
    image_dataset_folder = "/content/drive/My Drive/Photos"
    # The collection is fetched (or created) first, then filled from the folder.
    build_chroma_collection(image_dataset_folder, initialize_chroma_collection())


if __name__ == "__main__":
    main()


Collection 'image_embeddings' already exists, using existing collection.




Processed batch with IDs: ('000004.jpg', '000028.jpg', '000017.jpg', '000002.jpg', '000020.jpg', '000015.jpg', '000001.jpg', '000021.jpg', '000007.jpg', '000010.jpg', '000026.jpg', '000018 (1).jpg', '000012.jpg', '000014.jpg', '000009.jpg', '000006.jpg', '000011.jpg', '000008.jpg', '000022.jpg', '000024 (1).jpg', '000024.jpg', '000027.jpg', '000018.jpg', '000023.jpg', '000013.jpg', '000019.jpg', '000005.jpg', '000025.jpg', '000003.jpg', '000016.jpg', '000038.jpg', '000063.jpg')




Processed batch with IDs: ('000160.jpg', '000119.jpg', '000135.jpg', '000161.jpg', '000111.jpg', '000162.jpg', '000034.jpg', '000108.jpg', '000167.jpg', '000064.jpg', '000116.jpg', '000184.jpg', '000088.jpg', '000168 (1).jpg', '000037.jpg', '000057.jpg', '000105.jpg', '000068.jpg', '000029.jpg', '000151.jpg', '000081.jpg', '000093.jpg', '000154 (1).jpg', '000091.jpg', '000109.jpg', '000127.jpg', '000100.jpg', '000062.jpg', '000154.jpg', '000106.jpg', '000173.jpg', '000051.jpg')




Processed batch with IDs: ('000187 (1).jpg', '000134.jpg', '000183.jpg', '000158.jpg', '000112.jpg', '000166.jpg', '000110.jpg', '000067.jpg', '000099.jpg', '000052.jpg', '000153.jpg', '000117.jpg', '000102.jpg', '000073.jpg', '000150 (1).jpg', '000122.jpg', '000137.jpg', '000082.jpg', '000164.jpg', '000076.jpg', '000058.jpg', '000185.jpg', '000169.jpg', '000071.jpg', '000107.jpg', '000179.jpg', '000098.jpg', '000147.jpg', '000094.jpg', '000148.jpg', '000152.jpg', '000165.jpg')




Processed batch with IDs: ('000069.jpg', '000040.jpg', '000182.jpg', '000087.jpg', '000156.jpg', '000136.jpg', '000043.jpg', '000155.jpg', '000140 (1).jpg', '000080.jpg', '000054.jpg', '000178 (1).jpg', '000176.jpg', '000172.jpg', '000129.jpg', '000090.jpg', '000030 (1).jpg', '000092.jpg', '000120.jpg', '000033.jpg', '000189.jpg', '000047.jpg', '000104.jpg', '000086.jpg', '000186.jpg', '000101.jpg', '000083.jpg', '000046.jpg', '000121.jpg', '000133.jpg', '000097.jpg', '000118.jpg')




Processed batch with IDs: ('000143.jpg', '000170.jpg', '000078.jpg', '000150.jpg', '000131.jpg', '000059.jpg', '000123.jpg', '000060.jpg', '000132.jpg', '000187.jpg', '000174.jpg', '000178.jpg', '000031.jpg', '000163.jpg', '000157.jpg', '000141.jpg', '000096.jpg', '000128.jpg', '000146.jpg', '000113.jpg', '000192.jpg', '000168.jpg', '000056.jpg', '000139.jpg', '000039.jpg', '000125.jpg', '000079.jpg', '000074.jpg', '000049.jpg', '000138.jpg', '000103.jpg', '000077.jpg')




Processed batch with IDs: ('000045.jpg', '000070.jpg', '000089.jpg', '000114.jpg', '000180.jpg', '000142.jpg', '000075.jpg', '000115.jpg', '000055.jpg', '000126.jpg', '000048.jpg', '000190.jpg', '000159.jpg', '000065.jpg', '000032.jpg', '000149.jpg', '000171.jpg', '000050.jpg', '000177.jpg', '000053.jpg', '000072.jpg', '000084.jpg', '000061.jpg', '000066.jpg', '000130.jpg', '000144 (1).jpg', '000188.jpg', '000044.jpg', '000095.jpg', '000181.jpg', '000175.jpg', '000124.jpg')




Processed batch with IDs: ('000085.jpg', '000144.jpg', '000140.jpg', '000041.jpg', '000145.jpg', '000035.jpg', '000191.jpg', '000030.jpg', '000042.jpg', '000036.jpg', '054358.jpg', '054316.jpg', '054414.jpg', '054286.jpg', '054308.jpg', '054402.jpg', '054310.jpg', '054326.jpg', '054407.jpg', '054350.jpg', '054337.jpg', '054293.jpg', '054285.jpg', '054375.jpg', '054383.jpg', '054312.jpg', '054299.jpg', '054295.jpg', '054291.jpg', '054267.jpg', '054338.jpg', '054302.jpg')




Processed batch with IDs: ('054325.jpg', '054301.jpg', '054347.jpg', '054413.jpg', '054287.jpg', '054386.jpg', '054394.jpg', '054327.jpg', '054392.jpg', '054265.jpg', '054324.jpg', '054288.jpg', '054319.jpg', '054395.jpg', '054379.jpg', '054365.jpg', '054306.jpg', '054352.jpg', '054399.jpg', '054340.jpg', '054367.jpg', '054344.jpg', '054341.jpg', '054372.jpg', '054368.jpg', '054314.jpg', '054397.jpg', '054398.jpg', '054384.jpg', '054273.jpg', '054311.jpg', '054275.jpg')




Processed batch with IDs: ('054343.jpg', '054378.jpg', '054321.jpg', '054373.jpg', '054292.jpg', '054284.jpg', '054355.jpg', '054270.jpg', '054406.jpg', '054266.jpg', '054396.jpg', '054300.jpg', '054307.jpg', '054388.jpg', '054279.jpg', '054336.jpg', '054364.jpg', '054328.jpg', '054410.jpg', '054348.jpg', '054376.jpg', '054381.jpg', '054313.jpg', '054334.jpg', '054297.jpg', '054377.jpg', '054405.jpg', '054329.jpg', '054360.jpg', '054309.jpg', '054353.jpg', '054269.jpg')




Processed batch with IDs: ('054330.jpg', '054370.jpg', '054274.jpg', '054283.jpg', '054331.jpg', '054362.jpg', '054315.jpg', '054349.jpg', '054356.jpg', '054271.jpg', '054280.jpg', '054289.jpg', '054277.jpg', '054298.jpg', '054354.jpg', '054305.jpg', '054351.jpg', '054320.jpg', '054296.jpg', '054408.jpg', '054393.jpg', '054390.jpg', '054281.jpg', '054339.jpg', '054411.jpg', '054278.jpg', '054409.jpg', '054335.jpg', '054268.jpg', '054346.jpg', '054318.jpg', '054272.jpg')




Processed batch with IDs: ('054276.jpg', '054357.jpg', '054389.jpg', '054391.jpg', '054345.jpg', '054342.jpg', '054401.jpg', '054361.jpg', '054366.jpg', '054387.jpg', '054363.jpg', '054403.jpg', '054322.jpg', '054400.jpg', '054323.jpg', '054294.jpg', '054317.jpg', '054404.jpg', '054333.jpg', '054359.jpg', '054385.jpg', '054332.jpg', '054304.jpg', '054290.jpg', '054282.jpg', '054382.jpg', '054303.jpg', '054369.jpg', '054371.jpg', '054380.jpg', '054412.jpg', '054415.jpg')




Processed batch with IDs: ('054374.jpg', 'IMG_4891 Medium.jpeg', 'Copy of IMG-20220511-WA0006.jpg', 'WhatsApp Image 2024-09-28 at 20.18.08 (1).jpeg', 'Copy of WhatsApp Image 2024-09-28 at 20.18.07.jpeg')


# Code-2 for searching the image

In [None]:
import torch
import torchvision.transforms as transforms
from torchvision.models import mobilenet_v2
import chromadb
import scann
import numpy as np
from PIL import Image



# Load MobileNetV2 with the same weights the indexing code (Code-1) uses.
# Code-1 passes MobileNet_V2_Weights.DEFAULT, whereas the legacy
# ``pretrained=True`` flag selects the older IMAGENET1K_V1 weights, so query
# embeddings would come from a different model than the stored ones.
# The string form "DEFAULT" resolves to the same weights without a new import.
model = mobilenet_v2(weights="DEFAULT")
model.eval()

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Must stay identical to the preprocessing used when the collection was built,
# otherwise query and stored embeddings are not comparable.
preprocess = transforms.Compose([
    transforms.Resize(128),
    transforms.CenterCrop(128),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def extract_image_embedding(image_path):
    """Return the model output for one image as a NumPy array.

    The image at *image_path* is opened, converted to RGB, run through the
    module-level ``preprocess`` pipeline and passed to ``model`` as a batch of
    one. Singleton dimensions are squeezed out of the result.
    """
    rgb_image = Image.open(image_path).convert('RGB')

    # Add the batch dimension and move the tensor to the model's device.
    batch = preprocess(rgb_image).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(batch)
        vector = output.cpu().numpy()

    return vector.squeeze()


def get_all_embeddings_from_chroma():
    """Fetch stored embeddings and their IDs from the ``image_embeddings`` collection.

    Returns:
        A ``(embeddings, ids)`` pair. ``embeddings`` is a float32 NumPy array
        with one row per stored item, and ``ids`` is the list of item IDs in
        the same order.
    """
    client = chromadb.Client()
    collection = client.get_collection("image_embeddings")

    # NOTE(review): at most 100000 items are fetched; larger collections would
    # be truncated silently -- confirm whether the cap is intentional.
    items = collection.get(include=["embeddings"], limit=100000)

    # The model produces float32 vectors; keep that precision rather than
    # NumPy's float64 default for nested Python lists.
    embeddings = np.array(items["embeddings"], dtype=np.float32)

    # Image names were stored as the items' IDs (not as documents), and the
    # "ids" field is always part of the result of ``get``.
    ids = items["ids"]

    print(f"Retrieved {len(ids)} IDs and {len(embeddings)} embeddings.")
    return embeddings, ids

#ScaNN
def build_scann_index(embeddings):
    """Build a ScaNN searcher over *embeddings* using dot-product scoring.

    The configuration is fixed: 5 results per query, a tree with 100 leaves
    (10 searched per query, trained on a sample size of 360), asymmetric
    hashing with dimension-per-block 2, and reordering of the top 100
    candidates.
    """
    index_builder = scann.scann_ops_pybind.builder(embeddings, 5, "dot_product")

    # Partitioning step.
    index_builder = index_builder.tree(
        num_leaves=100,
        num_leaves_to_search=10,
        training_sample_size=360,
    )

    # Scoring step.
    index_builder = index_builder.score_ah(2, anisotropic_quantization_threshold=0.2)

    # Re-ranking step.
    index_builder = index_builder.reorder(100)

    return index_builder.build()

# Step 5: Normalize the distance values
def normalize_distances(distances):
    """Min-max scale *distances* into the range [0, 1].

    Args:
        distances: Array-like of numeric scores.

    Returns:
        A NumPy array of the same shape in which the smallest input maps to
        0 and the largest to 1. An empty input yields an empty array, and an
        input whose values are all equal (including a single value) yields
        zeros instead of NaN from a division by zero.
    """
    values = np.asarray(distances)
    if values.size == 0:
        # np.min/np.max raise on empty input; there is nothing to scale.
        return values

    min_distance = np.min(values)
    span = np.max(values) - min_distance
    if span == 0:
        # All values identical: avoid 0/0 and report them as the minimum.
        return np.zeros(values.shape, dtype=float)

    return (values - min_distance) / span


def search_similar_images(target_image_path):
    """Find the stored images most similar to the image at *target_image_path*.

    The target image is embedded, all stored embeddings are loaded from
    Chroma, a ScaNN index is built over them and queried with the target.

    Returns:
        A ``(similar_image_ids, normalized_distances)`` pair:
        ``similar_image_ids`` holds the stored ID of each neighbour in the
        order ScaNN returned them, and ``normalized_distances`` holds the
        matching scores after min-max scaling.
    """
    target_embedding = extract_image_embedding(target_image_path)

    embeddings, ids = get_all_embeddings_from_chroma()
    searcher = build_scann_index(embeddings)

    neighbors, distances = searcher.search(target_embedding)

    print(f"Neighbors (indices): {neighbors}") #for debugging
    print(f"Distances before normalization: {distances}")

    # NOTE(review): the index is built with "dot_product", so these values are
    # presumably similarity scores (higher = closer) -- confirm before ranking.
    normalized_distances = normalize_distances(distances)

    # Neighbours are row indices into the stored embeddings; translate them to
    # the IDs at the same positions rather than returning the raw vectors.
    similar_image_ids = [ids[i] for i in neighbors]

    return similar_image_ids, normalized_distances


def add_image_embedding_to_chroma(collection, image_embeddings, image_ids):
    """Add embeddings to *collection*, skipping entries whose ID is ``None``.

    Args:
        collection: Chroma collection (or any object with a compatible
            ``add(ids=..., embeddings=..., documents=...)`` method).
        image_embeddings: Iterable of vectors, each providing ``tolist()``.
        image_ids: IDs paired position-by-position with *image_embeddings*.

    The image IDs are passed both as the items' ``ids``, which ``add``
    requires, and as their ``documents``, so readers of either field find
    the image name. Nothing is added when no embedding survives the filter.
    """
    valid_embeddings = [
        embedding.tolist()
        for embedding, image_id in zip(image_embeddings, image_ids)
        if image_id is not None
    ]
    valid_ids = [image_id for image_id in image_ids if image_id is not None]

    if valid_embeddings:
        collection.add(
            ids=valid_ids,
            embeddings=valid_embeddings,
            documents=valid_ids
        )
        print(f"Added {len(valid_ids)} embeddings to the collection.")


def main():
    """Search for images similar to the target image and print the results."""
    # Google Drive location of the query image; replace with your own file path.
    target_image_path = "/content/drive/My Drive/target_image1.jpeg"
    results = search_similar_images(target_image_path)
    similar_images, normalized_distances = results

    print("Top 5 Similar Images with Normalized Distances:")
    # Results and scores are parallel sequences; report them pairwise.
    for match, score in zip(similar_images, normalized_distances):
        print(f"Image ID: {match}, Normalized Distance: {score}")


if __name__ == "__main__":
    main()
