In [3]:
#!pip install --upgrade transformers datasets
#!pip install --upgrade huggingface_hub
#!pip install --upgrade pip
#!pip install --upgrade torch torchvision 

# Using feature embeddings to find new architectural concepts


I am going to test two models for this: DINOv2 and CLIP

#### DINOv2 Model:

In [3]:
from transformers import AutoImageProcessor, Dinov2Model
import torch, torchvision
from datasets import load_dataset

In [4]:
# Report the installed torch and torchvision versions, one per line.
print(torch.__version__, torchvision.__version__, sep="\n")

2.1.2
0.16.2


In [5]:
# Choose the compute device by priority: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Reminder: a model has to be moved to this device before it is used, e.g.
# model.to(device)

Using device: mps


In [6]:
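# SECURITY: the access token that used to be written here has been redacted. Revoke it in the
# Hugging Face account settings and supply a fresh one through the HF_TOKEN environment variable.
#!huggingface-cli login --token $HF_TOKEN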

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/ari/.cache/huggingface/token
Login successful


In [14]:
# Load the DINOv2-base preprocessor and backbone.
image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = Dinov2Model.from_pretrained("facebook/dinov2-base")

# Take the first image of the "test" split of the cats demo dataset.
dataset = load_dataset("huggingface/cats-image")
image = dataset["test"][0]["image"]

# Preprocess into PyTorch tensors and run one forward pass without gradient tracking.
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Keep the final-layer hidden states and display their shape as a plain list.
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.size())

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 2.56k/2.56k [00:00<00:00, 8.21MB/s]
Downloading data: 100%|██████████| 173k/173k [00:00<00:00, 804kB/s] 
Generating test split: 1 examples [00:00,  2.24 examples/s]
preprocessor_config.json: 100%|██████████| 436/436 [00:00<00:00, 2.73MB/s]
config.json: 100%|██████████| 548/548 [00:00<00:00, 1.66MB/s]
model.safetensors: 100%|██████████| 346M/346M [00:28<00:00, 11.9MB/s] 


[1, 257, 768]

#### last hidden state shape = feature embeddings per image patch

* 1 -> input batch size
* 257 -> number of tokens per image: the preprocessed 224×224 input is split into 16 × 16 = 256 patches of 14×14 pixels, plus one additional special token (the `[CLS]` token, which summarizes the whole image).
* 768 -> This is the dimensionality of the embeddings. Each of the 257 elements (patches or tokens) is transformed into a 768-dimensional vector. 

In [15]:
# Display the raw embedding tensor from the previous cell (large tensors are printed in abbreviated form).
last_hidden_states

tensor([[[-2.1747, -0.4729,  1.0936,  ...,  0.2041,  1.1101,  0.1363],
         [-3.2780, -0.8269, -0.9210,  ...,  1.4415, -0.5364, -0.8757],
         [-2.9129,  1.1284, -0.7306,  ...,  0.6959, -1.8791, -2.3638],
         ...,
         [-0.5463,  1.4382, -0.2563,  ...,  0.1873, -2.9950,  0.4067],
         [-3.0848,  2.0568,  1.5137,  ...,  0.9157, -2.7059,  2.2017],
         [-0.7499,  0.0903,  1.3731,  ..., -0.2961, -2.3682, -0.1329]]])

## CLIP Model:

The CLIP model outputs just one feature vector per image, because `encode_image` has already aggregated the patch embeddings into a single image-level embedding. This makes it the simpler approach. CLIP is somewhat older than DINOv2, but it also incorporates rich and complex world knowledge, because it was trained on a very large collection of image–text pairs without manually annotated labels.

In [None]:
%pip install git+https://github.com/openai/CLIP.git

In [None]:
import torch
import clip
from PIL import Image

In [None]:
# Load the model.
# NOTE: when no device is passed, clip.load() picks one itself (per the CLIP README:
# CUDA if available, otherwise CPU), so the input tensor has to follow the model.
model, preprocess = clip.load("ViT-B/32")
model_device = next(model.parameters()).device

# Preprocess the image; the context manager closes the file once the pixels are read.
with Image.open("path_to_your_image.jpg") as pil_image:
    image = preprocess(pil_image).unsqueeze(0)  # Add batch dimension
image = image.to(model_device)  # keep input and weights on the same device

# Calculate image features (no gradients needed for pure feature extraction)
with torch.no_grad():
    image_features = model.encode_image(image)

# Now you can use the image_features for your application


### OpenCLIP from LAION:

An alternative to the older OpenAI CLIP model:

In [None]:
%pip install git+https://github.com/mlfoundations/open_clip.git


In [None]:
import torch
from PIL import Image
import open_clip

# Load the OpenCLIP ViT-H/14 model with the LAION-2B weights.
# create_model_and_transforms() returns THREE values (model, train transform,
# eval transform); only the eval transform is needed for feature extraction.
# The architecture name must be spelled 'ViT-H-14' ('H/14' is not a known model).
model, preprocess_train, preprocess = open_clip.create_model_and_transforms(
    'ViT-H-14', pretrained='laion2b_s32b_b79k'
)
model.eval()  # switch to inference mode (the OpenCLIP README notes models start in train mode)

# Preprocess the image; the context manager closes the file once the pixels are read.
image_path = "path_to_your_image.jpg"
with Image.open(image_path) as pil_image:
    image = preprocess(pil_image).unsqueeze(0)  # Add batch dimension

# Generate embeddings
with torch.no_grad():
    image_features = model.encode_image(image)

# image_features now contains the embeddings for your image


## Architectural epochs / styles dataset

https://www.kaggle.com/datasets/dumitrux/architectural-styles-dataset?resource=download

#### Preprocessing for DINO:
- common size with consistent aspect ratio
- normalize


In [1]:
%pip install opencv-python

Collecting opencv-python
  Using cached opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Using cached opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl (35.4 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.9.0.80
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import numpy as np
from torchvision import transforms
import cv2
import os
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def transform_image(target_size=(224, 224)):
    """
    Create a transformation pipeline that resizes an image while keeping its
    aspect ratio, center-crops it to the target size, and converts it to a tensor.

    Args:
    - target_size: A tuple (height, width) for the target size, or a single
      int for a square output.

    Returns:
    - A composed transformation function (PIL image -> tensor whose spatial
      size is `target_size`).
    """
    if isinstance(target_size, int):
        target_size = (target_size, target_size)
    target_size = tuple(target_size)

    # Resize only preserves the aspect ratio when it is given a single int
    # (it then scales the shorter edge to that value); a (h, w) tuple stretches
    # the image to exactly that size. Scaling the shorter edge to the larger
    # target side guarantees both edges cover the crop window.
    shorter_edge = max(target_size)

    transform = transforms.Compose([
        transforms.Resize(shorter_edge),     # Aspect-preserving resize of the shorter edge
        transforms.CenterCrop(target_size),  # Crop the center to (height, width)
        transforms.ToTensor(),               # Convert to tensor
    ])

    return transform

In [None]:
# Example usage
image_folder = "path/to/your/images"
# Only files with these suffixes are treated as images (compared case-insensitively).
image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")
processed_images = []
transform = transform_image(target_size=(224, 224))

# sorted() makes the processing order reproducible across runs and machines.
for image_name in sorted(os.listdir(image_folder)):
    image_path = os.path.join(image_folder, image_name)
    # Skip sub-directories and non-image entries (e.g. ".DS_Store"), which
    # would otherwise make Image.open raise.
    if not (os.path.isfile(image_path) and image_name.lower().endswith(image_extensions)):
        continue
    # The context manager closes the file handle; convert() returns an in-memory RGB copy.
    with Image.open(image_path) as pil_image:
        image = pil_image.convert('RGB')
    processed_image = transform(image)  # Apply the transformation
    processed_images.append(processed_image)

# processed_images now contains tensors of your images, preprocessed and ready for the model.


### to DINO

In [None]:
from transformers import AutoImageProcessor, Dinov2Model
import torch, torchvision
from datasets import load_dataset

# Choose the compute device by priority: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Reminder: a model has to be moved to this device before it is used, e.g.
# model.to(device)

In [None]:
# Sanity check on a single demo image before processing the real dataset.
dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
# Move the weights to the selected device: the batch-processing cell further
# down sends its image batches to `device`, so the model must live there too.
model = Dinov2Model.from_pretrained("facebook/dinov2-base").to(device)
model.eval()  # inference only

# The processor returns CPU tensors; move each one next to the model.
inputs = image_processor(image, return_tensors="pt")
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

#### dataloading:

In [None]:
from torch.utils.data import Dataset

class CustomImageDataset(Dataset):
    """Dataset over the image files located directly inside one directory.

    Args:
        image_dir: Path of the directory that contains the images.
        transform: Optional callable applied to each RGB PIL image before it
            is returned (e.g. a torchvision transform pipeline).

    Items are returned without labels: ``dataset[i]`` is the (optionally
    transformed) i-th image in alphabetical file-name order.
    """

    # Suffixes (compared case-insensitively) of the files treated as images.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        # Keep regular image files only: hidden files such as ".DS_Store" and
        # sub-directories would make Image.open fail in __getitem__.
        # Sorting makes the index -> file mapping reproducible, since
        # os.listdir returns entries in arbitrary order.
        self.image_files = sorted(
            name
            for name in os.listdir(image_dir)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
            and os.path.isfile(os.path.join(image_dir, name))
        )

    def __len__(self):
        """Return the number of image files found in the directory."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load the idx-th image as RGB and apply the transform, if any."""
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        # Close the file right after decoding so no file handles leak while
        # iterating over the dataset; convert() returns an in-memory copy.
        with Image.open(img_path) as pil_image:
            image = pil_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image


#### 1) transformations

In [None]:
target_size = (224, 224)  # The target size for the images (height, width) for DINO input

transform = transforms.Compose([
    # An int makes Resize scale the shorter edge and keep the aspect ratio;
    # passing the (h, w) tuple would stretch the image to exactly that size.
    transforms.Resize(max(target_size)),
    transforms.CenterCrop(target_size),  # Crop the center of the image
    transforms.ToTensor(),  # Convert to tensor
    # Normalization step listed under "Preprocessing for DINO". These are the
    # usual ImageNet statistics; NOTE(review): confirm they match
    # image_processor.image_mean / image_processor.image_std for the checkpoint.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

#### 2) check number of workers available

In [12]:
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 1 so the integer division below cannot raise a TypeError.
max_num_workers = os.cpu_count() or 1
max_num_workers // 2

4

#### 3) load the data

In [None]:
from torch.utils.data import DataLoader


# `image_folder` must point at the directory holding the images.
dataset = CustomImageDataset(image_dir=image_folder, transform=transform)

# Serve shuffled batches of 32 images, using half of the detected CPU count as loader workers.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=max_num_workers // 2,
)


#### with batch processing:

In [None]:
import torch

# Running sum of the per-image mean embeddings (kept on the CPU) and a counter
# for the number of images. The accumulator is created lazily so its size
# follows the model's embedding dimension instead of a hard-coded 768.
sum_embeddings = None
total_images = 0

# Model and inputs must live on the same device; eval mode + no_grad because
# this is pure inference (no autograd graph, far less memory).
model.to(device)
model.eval()

with torch.no_grad():
    for images in dataloader:  # `dataloader` yields batches of image tensors
        images = images.to(device)  # Move images to the same device as the model

        # The model returns an output object, not a tensor; the per-token
        # embeddings of shape [batch_size, 257, 768] are in `last_hidden_state`.
        outputs = model(pixel_values=images).last_hidden_state

        # Mean over batch and patch dimensions (CLS token included)
        batch_mean = outputs.mean(dim=(0, 1))

        if sum_embeddings is None:
            sum_embeddings = torch.zeros(outputs.shape[-1])

        # Weight by the batch size so a smaller final batch is not over-counted;
        # .cpu() avoids adding a GPU/MPS tensor to the CPU accumulator.
        sum_embeddings += batch_mean.cpu() * outputs.shape[0]
        total_images += outputs.shape[0]  # Accumulate the total number of images processed

if total_images == 0:
    raise ValueError("dataloader yielded no images; cannot compute a centroid")

# Calculate the global centroid vector
global_centroid_vector = sum_embeddings / total_images


#### in case I would just input one giant batch of images:

In [None]:
# `outputs` is expected to be a tensor of shape [batch_size, 257, 768]
# (e.g. outputs = model_output)

# Reduce over the batch axis and the patch axis in a single call: the result
# is the average over every patch of every image.
centroid_vector = torch.mean(outputs, dim=(0, 1))

# `centroid_vector` has shape [768]: the global centroid of the embeddings