In [1]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import torch
import time, os, shutil
from torchvision import transforms, models, datasets
from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
from torchvision.models.feature_extraction import create_feature_extractor
from PIL import Image, ImageFilter



In [2]:
def resnet():
    """Create a ResNet34 feature extractor (could be part of a class).

    Loads ImageNet-pretrained ResNet34 weights and wraps the network with
    ``create_feature_extractor`` so that a forward pass returns a dict whose
    ``'embedding'`` entry is the output of the ``avgpool`` node. The module is
    switched to eval mode and moved to the GPU when one is available.

    Returns:
        The feature-extraction module, in eval mode, on the selected device.
    """
    # Prefer the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # `pretrained=True` is deprecated since torchvision 0.13. IMAGENET1K_V1 is
    # the checkpoint that flag selected, so the loaded weights are unchanged.
    model = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1)

    # Expose the pooled features under the key 'embedding'
    return_nodes = {
        'avgpool': 'embedding'
    }
    model = create_feature_extractor(model, return_nodes=return_nodes)

    # Inference mode: eval() changes layer behaviour (e.g. batch norm) but does
    # not freeze parameters; callers disable gradients with torch.no_grad().
    model.eval()
    model.to(device)
    return model

In [3]:
# Module-level device and model; both are bound as the default arguments of
# embedding_function when it is defined.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = resnet()

def embedding_function(images, model=model, device=device, batch_size = 500):
    """Create a list of embeddings from a list of image filenames.

    Images are loaded, converted to RGB, resized to 224x224, normalized with
    the standard ImageNet channel statistics and pushed through ``model`` in
    batches of ``batch_size``.

    Args:
        images: A single path (``str`` or ``os.PathLike``) or a sequence of
            paths to image files.
        model: Callable returning a dict with an ``'embedding'`` tensor whose
            last two axes are indexed at ``[0, 0]``. Defaults to the
            module-level model.
        device: Torch device the batches are moved to before inference.
        batch_size: Maximum number of images per forward pass; must be >= 1.

    Returns:
        A single flat list with one embedding (list of floats) per input
        image, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError("batch_size must be a positive integer")

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # Accept a lone path as well as a sequence of paths
    if isinstance(images, (str, os.PathLike)):
        images = [images]

    def _load(path):
        # The context manager closes the file handle as soon as the pixels are
        # read. convert("RGB") gives every mode (grayscale, palette, alpha,
        # CMYK) exactly three channels, which Normalize requires.
        with Image.open(path) as img:
            return transform(img.convert("RGB"))

    # Process the embeddings in batches, but return everything as a single list
    embeddings = []
    for start in range(0, len(images), batch_size):
        batch = torch.stack([_load(item) for item in images[start:start + batch_size]])
        batch = batch.to(device)
        with torch.no_grad():
            features = model(batch)['embedding']
        embeddings.extend(features[:, :, 0, 0].cpu().numpy().tolist())

    return embeddings

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,


In [None]:
def create_all_vector_stores(vector_store_path="../data/vector_stores", image_dir="../data/boundingbox"):
    """(Re)build the bounding-box vector store from a directory of images.

    Any existing store at ``<vector_store_path>/boundingbox_vs`` is deleted.
    A new store with ``image``, ``embedding`` and ``filename`` tensors is then
    created, and every regular, non-hidden file in ``image_dir`` is added to
    it. Embeddings are computed by ``embedding_function`` using its own default
    model, so no model is loaded here. Progress and timing are printed.

    Args:
        vector_store_path: Directory that contains (or will contain) the
            ``boundingbox_vs`` store.
        image_dir: Directory holding the images to index.

    Returns:
        None.
    """
    start_time = time.perf_counter()

    print("Creating Vector Stores")
    # Path to the individual vector store
    store_path = f"{vector_store_path}/boundingbox_vs"

    # Start from a clean slate: remove a previously created store
    if os.path.exists(store_path):
        shutil.rmtree(store_path)

    boundingbox_vs = VectorStore(
        path=store_path,
        tensor_params=[
            {"name": "image", "htype": "image", "sample_compression": "jpg"},
            {"name": "embedding", "htype": "embedding"},
            {"name": "filename", "htype": "text"},
        ],
    )
    print("Vector Stores Created")

    # List the directory once, in a deterministic order. Hidden entries
    # (e.g. .DS_Store, .ipynb_checkpoints) and sub-directories are skipped
    # because they cannot be opened as images.
    candidates = (
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir))
        if not name.startswith(".")
    )
    sb_images = [path for path in candidates if os.path.isfile(path)]

    if not sb_images:
        # Nothing to embed; also avoids dividing by zero in the timing report.
        print(f"No images found in {image_dir}; vector store left empty")
        return

    print("Adding Images to Vector Stores")

    # Report the device (same selection rule the embedding model uses)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    # Add the images to the vector store
    print("Boundingbox Images")
    boundingbox_vs.add(
        image=sb_images,
        filename=sb_images,
        embedding_function=embedding_function,
        embedding_data=sb_images,
    )

    print("Images Successfully Added to Vector Stores")

    elapsed = time.perf_counter() - start_time

    print("Time to Create Vector Stores: ", round(elapsed, 2), " seconds")
    print("Average Time per Image: ", round(elapsed / len(sb_images), 4), " seconds")
    print("Time for all images: ", round(elapsed / 60, 2), " minutes")


# Run the build with its default arguments (executes when this cell is run)
create_all_vector_stores()