In [None]:
!pip install transformers
!pip install torch



In [None]:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
import numpy as np
from PIL import Image
import requests

Dep_image_processor = AutoImageProcessor.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")
Dep_model = AutoModelForDepthEstimation.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/99.2M [00:00<?, ?B/s]

In [None]:
from transformers import AutoImageProcessor, Dinov2Model
import torch
Dino_image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
Dino_model = Dinov2Model.from_pretrained("facebook/dinov2-base")

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [15]:
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Define a custom Dataset class
class ImageFolderDataset(Dataset):
    def __init__(self, folder_path, transform=None):
        self.folder_path = folder_path
        self.transform = transform
        # List all the files in the folder
        self.image_files = [f for f in os.listdir(folder_path) if f.endswith(('jpg', 'jpeg', 'png', 'bmp'))]

    def __len__(self):
        # Return the number of images
        return len(self.image_files)

    def __getitem__(self, idx):
        # Load image using PIL
        img_name = os.path.join(self.folder_path, self.image_files[idx])
        #image = Image.open(img_name)

        # Apply transformations if provided
        # if self.transform:
        #     image = self.transform(image)

        # return image, img_name
        return img_name

# Define any transformation you want to apply to the images (e.g., resizing, normalization)
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
])

# Create the dataset
dataset = ImageFolderDataset(folder_path='/content/Traffics', transform=transform)

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)


In [17]:
embeddings = []
for img_name in dataloader:
    image = Image.open(img_name[0])
    inputs = Dep_image_processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = Dep_model(**inputs)

    # interpolate to original size and visualize the prediction
    post_processed_output = Dep_image_processor.post_process_depth_estimation(
        outputs,
        target_sizes=[(image.height, image.width)],
    )

    predicted_depth = post_processed_output[0]["predicted_depth"]
    depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
    depth = depth.detach().cpu().numpy() * 255
    depth = Image.fromarray(depth.astype("uint8"))

    ### Raw Image Embedding
    inputs = Dino_image_processor(image, return_tensors="pt")
    with torch.no_grad():
        outputs = Dino_model(**inputs)
    raw_last_hidden_states = outputs.last_hidden_state

    ### Depth Image Embedding
    inputs = Dino_image_processor(image, return_tensors="pt")
    with torch.no_grad():
        outputs = Dino_model(**inputs)
    depth_last_hidden_states = outputs.last_hidden_state

    embeddings.append([raw_last_hidden_states, depth_last_hidden_states])

print(embeddings) ### You can command this

[[tensor([[[ 2.9241,  0.9375,  0.2382,  ..., -1.7826,  1.9498,  1.3011],
         [ 0.5133, -0.4215, -0.0936,  ..., -1.0115,  0.3357, -0.8127],
         [-0.5017, -2.1005,  0.7806,  ...,  0.6211, -0.0670,  0.7004],
         ...,
         [ 1.1348, -1.2153,  0.2389,  ..., -1.7963,  2.3778, -0.2875],
         [ 0.9868, -2.0592,  0.1186,  ..., -1.3238, -0.6343, -1.0445],
         [ 0.0781, -0.1208, -1.0349,  ..., -2.4403,  0.9868, -1.8155]]]), tensor([[[ 2.9241,  0.9375,  0.2382,  ..., -1.7826,  1.9498,  1.3011],
         [ 0.5133, -0.4215, -0.0936,  ..., -1.0115,  0.3357, -0.8127],
         [-0.5017, -2.1005,  0.7806,  ...,  0.6211, -0.0670,  0.7004],
         ...,
         [ 1.1348, -1.2153,  0.2389,  ..., -1.7963,  2.3778, -0.2875],
         [ 0.9868, -2.0592,  0.1186,  ..., -1.3238, -0.6343, -1.0445],
         [ 0.0781, -0.1208, -1.0349,  ..., -2.4403,  0.9868, -1.8155]]])], [tensor([[[ 1.9353, -0.1504,  1.0876,  ..., -1.4013,  1.3709, -1.5079],
         [ 0.1359, -0.6375, -0.1898,  .

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GeminiFusionTransformer(nn.Module):
    def __init__(self, d_model=768, num_heads=8, mlp_hidden_dim=512):
        super(GeminiFusionTransformer, self).__init__()

        # Define shared layers for Q, K, V
        self.qkv = nn.Linear(d_model, 3 * d_model)  # Shared Q, K, V projection

        # MLP for combining k_img and k_depth
        self.mlp = nn.Sequential(
            nn.Linear(2 * d_model, mlp_hidden_dim),  # First layer
            nn.ReLU(),  # Activation
            nn.Linear(mlp_hidden_dim, d_model)  # Output to match key dimension
        )

        # Multi-head attention layers
        self.attn = nn.MultiheadAttention(d_model, num_heads)

        # Layer normalization
        self.norm = nn.LayerNorm(d_model)

    def forward(self, image_tokens, depth_tokens):
        # Concatenate image and depth tokens
        combined_tokens = torch.cat([image_tokens, depth_tokens], dim=0)

        # Apply shared Q, K, V projection
        qkv = self.qkv(combined_tokens)  # Shape: [257+257, 3*d_model]
        q, k, v = qkv.chunk(3, dim=-1)  # Split into Q, K, V

        # Separate the Q, K, V for image and depth tokens
        q_img = q[:image_tokens.size(0)]  # Image query tokens
        k_img = k[:image_tokens.size(0)]  # Image key tokens
        v_img = v[:image_tokens.size(0)]  # Image value tokens

        q_depth = q[image_tokens.size(0):]  # Depth query tokens
        k_depth = k[image_tokens.size(0):]  # Depth key tokens
        v_depth = v[image_tokens.size(0):]  # Depth value tokens

        # Apply the MLP + Softmax to combine K_image and K_depth
        k_combined_img = self.mlp(torch.cat([k_img, k_depth], dim=-1))  # Combine keys using MLP
        k_combined_img = F.softmax(k_combined_img, dim=-1)  # Apply softmax

        k_combined_depth = self.mlp(torch.cat([k_depth, k_img], dim=-1))  # Combine keys using MLP
        k_combined_depth = F.softmax(k_combined_depth, dim=-1)  # Apply softmax

        # First embedding: Attention on Image tokens + Joint attention (Image + Depth)
        attn_img, _ = self.attn(q_img, k_img, v_img)  # Attention using image tokens
        attn_joint_img_depth, _ = self.attn(q_img, k_combined_img, v_depth)  # Joint attention using MLP combined keys

        # Second embedding: Attention on Depth tokens + Joint attention (Image + Depth)
        attn_depth, _ = self.attn(q_depth, k_depth, v_depth)  # Attention using depth tokens
        attn_joint_depth_img, _ = self.attn(q_depth, k_combined_depth, v_img)  # Joint attention using MLP combined keys

        # Add the attention outputs
        embedding_1 = attn_img + attn_joint_img_depth
        embedding_2 = attn_depth + attn_joint_depth_img

        # Return both embeddings
        return embedding_1 + embedding_2

# Example usage:
# Assume image_tokens and depth_tokens are tensors of shape [257, 768]
image_tokens = torch.randn(257, 768)  # Example image tokens
depth_tokens = torch.randn(257, 768)  # Example depth tokens

Fusion_model = GeminiFusionTransformer()
fusion_output = Fusion_model(image_tokens, depth_tokens)

print(fusion_output.shape)  # Both should have shape [257, 768]


torch.Size([257, 768])


In [22]:
Final_token = []
for embedding in embeddings:
      Final_token.append(Fusion_model(embedding[0], embedding[1]))


5
