In [None]:
!pip install roboflow
import roboflow

roboflow.login()

roboflow.download_dataset(dataset_url="https://universe.roboflow.com/team-roboflow/coco-128/dataset/2", model_format="coco", location="data/coco-128")

In [1]:

# !pip install faiss-gpu
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
import glob
import json
from PIL import Image
from pathlib import Path

import clip
import faiss
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset


class ImageDataset(Dataset):
    """Map-style dataset that opens image files and applies a preprocessing callable.

    Args:
      image_paths: Sequence of image file paths; item ``idx`` is read from
        ``image_paths[idx]``.
      preprocess: Callable applied to the opened PIL image. Its return value is
        the dataset item, so it must not depend on the file staying open after
        it returns.
    """

    def __init__(self, image_paths, preprocess):
        self.image_paths = image_paths
        self.preprocess = preprocess

    def __len__(self):
        """Return the number of image paths in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Open the image at position ``idx`` and return ``preprocess(image)``."""
        image_path = self.image_paths[idx]
        # Preprocess inside the context manager so the underlying file handle is
        # released deterministically, even when preprocessing raises.
        with Image.open(image_path) as image:
            return self.preprocess(image)


def get_data_paths(dir: str | list[str], data_formats: list, prefix: str = '') -> list[str]:
    """
    Get list of files in a folder that have a file extension in the data_formats.

    Args:
      dir (str | list[str]): Dir or list of dirs containing data.
      data_formats (list): List of file extensions. Ex: ['jpg', 'png']
      prefix (str): Prefix for logging messages.

    Returns:
      A list of strings.
    """
    try:
        f = []  # data files
        for d in dir if isinstance(dir, list) else [dir]:
            p = Path(d)
            if p.is_dir():
                f += glob.glob(str(p / '**' / '*.*'), recursive=True)
            else:
                raise FileNotFoundError(f'{prefix}{p} does not exist')
        data_files = sorted(x for x in f if x.split('.')[-1].lower() in data_formats)
        return data_files
    except Exception as e:
        raise Exception(f'{prefix}Error loading data from {dir}: {e}') from e


def get_image_embeddings(data_dir, model_name="ViT-B/32", batch_size=32, device="cpu", num_workers=4):
    """
    Encode every jpg/jpeg/png image under data_dir with a CLIP model.

    Args:
      data_dir: Directory (or list of directories) searched recursively for images.
      model_name (str): Model identifier passed to clip.load.
      batch_size (int): Number of images encoded per forward pass.
      device (str): Device the model and image batches are placed on.
      num_workers (int): Worker processes used by the DataLoader.

    Returns:
      A tuple (image_embeddings, image_paths): a float32 numpy array with one
      L2-normalised embedding per row, and the list of image paths. No shuffle
      option is passed to the DataLoader, so row i is expected to correspond to
      image_paths[i].

    Raises:
      ValueError: If no images are found under data_dir.
    """
    # Load the CLIP model and its matching image preprocessing transform
    model, preprocess = clip.load(model_name, device=device)

    # Collect the image files to encode
    image_paths = get_data_paths(data_dir, data_formats=["jpg", "jpeg", "png"])
    print(len(image_paths))
    if not image_paths:
        # Fail here with a clear message instead of letting np.vstack([]) raise later.
        raise ValueError(f"No jpg/jpeg/png images found under {data_dir}")

    dataset = ImageDataset(image_paths, preprocess)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    # Per-batch embeddings, stacked into a single array at the end
    image_embeddings = []

    with torch.no_grad():
        for images in dataloader:
            images = images.to(device)
            # encode_image may return half precision on GPU; cast to float32 so the
            # normalisation and the returned array match the CPU path and can be
            # handed to FAISS, which works on float32 vectors.
            embeddings = model.encode_image(images).float()
            embeddings /= embeddings.norm(dim=-1, keepdim=True)
            image_embeddings.append(embeddings.cpu().numpy())

    return np.vstack(image_embeddings), image_paths


def create_faiss_index(embeddings):
    """
    Build a flat (exact) L2 FAISS index containing the given embeddings.

    Args:
      embeddings: 2-D array-like with one vector per row. It is converted to a
        C-contiguous float32 numpy array before being added, so float16 or
        non-contiguous input is accepted.

    Returns:
      A faiss.IndexFlatL2 holding all rows of embeddings.

    Raises:
      ValueError: If embeddings is not 2-dimensional.
    """
    # FAISS operates on contiguous float32 data; coerce once up front.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim), got shape {vectors.shape}"
        )

    # The vector dimensionality fixes the index dimensionality
    d = vectors.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(vectors)

    return index


# Use the GPU when one is available, otherwise fall back to the CPU so the cell
# still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Embed the training images and build the search index.
embeddings, image_paths = get_image_embeddings("data/coco-128/train", device=device)
index = create_faiss_index(embeddings)

# Persist the index together with the image paths, which are written in the
# same order as the embeddings that were added to the index.
faiss.write_index(index, "data/index.faiss")
with open("data/image_paths.json", "w") as f:
    json.dump(image_paths, f, indent=4)

100%|███████████████████████████████████████| 338M/338M [01:27<00:00, 4.03MiB/s]


128


  return F.conv2d(input, weight, bias, self.stride,


In [15]:
# NOTE(review): the saved output of this cell is an AttributeError saying module
# 'clip' has no attribute 'available_models'. Presumably `clip` resolved to a
# different module than the one used for clip.load above - confirm which
# package is installed/imported before relying on this call.
clip.available_models()

AttributeError: module 'clip' has no attribute 'available_models'

In [7]:
import json
import os 
from PIL import Image
# Load the image paths that were stored alongside the FAISS index.
dataset_file_path = "data/image_paths.json"
with open(dataset_file_path, 'r') as file:
    data = json.load(file)

# Re-save every image as dataset/<position>.jpg, so the file name is the
# image's position in the path list.
os.makedirs("dataset", exist_ok=True)
for i, path in enumerate(data):
    # Open via a context manager so each source file handle is closed promptly.
    with Image.open(path) as image:
        # JPEG cannot store palette or alpha modes (possible for PNG sources);
        # convert those to RGB instead of letting save() raise.
        jpeg_ready = image if image.mode in ("L", "RGB", "CMYK") else image.convert("RGB")
        jpeg_ready.save(os.path.join("dataset", f"{i}.jpg"))


FileNotFoundError: [Errno 2] No such file or directory: '/home/kieusontung/homework-week-3/dataset/0.jpg'