In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import random

from transformers import PretrainedConfig
from model.visual_token_embedding import VisualTokenEmbedding
from utils.visualization import visualize_masks

import torch.nn.functional as F
import json
from visual_tokenizer import get_visual_tokenizer
from data import get_dataset


In [None]:

# Source of the images sampled by the cells below. Keep exactly one
# `get_dataset` call active; the commented calls are alternative datasets.
# dataset = get_dataset('clevr_caption', '/private/home/delong/workspace/data/clevr-caption', split='train')
# dataset = get_dataset('pixmo_cap', '/private/home/delong/workspace/data/pixmo-cap', split='train')
# dataset = get_dataset('sharegpt4v', '/private/home/delong/workspace/data/ShareGPT4V', split='share-captioner_coco_lcs_sam_1246k_1107.json')
dataset = get_dataset(
    'imagenet',
    '/datasets01/imagenet_full_size/061417',
    split='train',
)

In [None]:
# Values forwarded to the tokenizer / embedding constructors below:
# `tokenizer_input_resolution` and `max_tokens` go to `get_visual_tokenizer`,
# `embedding_input_resolution` is consumed by the embedding-config cell.
tokenizer_input_resolution = 768
embedding_input_resolution = 768
max_tokens = 256

# Pick exactly one tokenizer config file; the commented paths are alternatives.
tokenizer_config_path = 'configs/visual_tokenizer/directsam/directsam_tiny_sa1b_2ep@0.1.json'
# tokenizer_config_path = 'configs/visual_tokenizer/superpixel/superpixel_slic.json'
# tokenizer_config_path = 'configs/visual_tokenizer/panoptic/panoptic_mask2former_tiny.json'
# tokenizer_config_path = 'configs/visual_tokenizer/directsam/directsam_tiny_dsa_100ep@0.5.json'
# tokenizer_config_path = '/private/home/delong/workspace/subobjects-VLM/configs/visual_tokenizer/patch/patch_16_per_side_raster.json'

# Read the JSON inside a context manager so the file handle is closed
# deterministically (the previous `json.load(open(...))` left it open).
# NOTE: the variable name `toeknizer_config` (sic) is kept unchanged in case
# other cells refer to it.
with open(tokenizer_config_path) as config_file:
    toeknizer_config = json.load(config_file)

# The JSON keys are expanded as keyword arguments, together with the
# resolution and token cap defined above.
visual_tokenizer = get_visual_tokenizer(
    **toeknizer_config,
    image_resolution=tokenizer_input_resolution,
    max_tokens=max_tokens,
)

In [None]:
# Pick exactly one embedding config file; the commented paths are alternatives.
# embedding_config_path = 'configs/visual_embedding/rgb_pixel.json'
# embedding_config_path = 'configs/visual_embedding/in1k_mobilenetv3_all.json'
# embedding_config_path = 'configs/visual_embedding/vae.json'
# embedding_config_path = 'configs/visual_embedding/convnext_in22k_stage3.json'
# embedding_config_path = 'configs/visual_embedding/dinov2_small.json'
embedding_config_path = 'configs/visual_embedding/clip_resnet50.json'
# embedding_config_path = 'configs/visual_embedding/clip_vit_l_14_336.json'
# embedding_config_path = 'configs/visual_embedding/clip_vit_b_32.json'

# embedding_config_path = 'configs/visual_embedding/dinov2_large_reg.json'
# embedding_config_path = 'configs/visual_embedding/dinov2_gaint_reg.json'
# embedding_config_path = 'configs/visual_embedding/dinov2_small_reg.json'
# embedding_config_path = 'configs/visual_embedding/dinov2_small_timm.json'
# embedding_config_path = 'configs/visual_embedding/dinov2_small.json'
# embedding_config_path = 'configs/visual_embedding/dinov2_large.json'

# Read the JSON inside a context manager so the file handle is closed
# deterministically (the previous `json.load(open(...))` left it open).
with open(embedding_config_path) as config_file:
    embedding_config = json.load(config_file)

# Wrap the raw dict in a `PretrainedConfig` and attach the two resolutions:
# the embedding input resolution and, as `output_resolution`, the tokenizer
# input resolution.
visual_token_embedding_config = PretrainedConfig.from_dict(embedding_config)
visual_token_embedding_config.image_resolution = embedding_input_resolution
visual_token_embedding_config.output_resolution = tokenizer_input_resolution

# Build the embedding module and move it to the GPU.
visual_token_embedding = VisualTokenEmbedding(visual_token_embedding_config).cuda()

# Quick sanity report of where the module lives and how wide its features are.
print(visual_token_embedding.device, visual_token_embedding.dtype)
print(visual_token_embedding.vision_encoder.feature_channels, 'channels')

In [6]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt

def pca_projection(feature_maps, n_components=3, bg_threshold=0.0, do_thresholding=True, do_minmax_scale=True):
    """
    Project each feature map of a batch onto its own principal components.

    PCA is fitted *separately* for every image, treating each spatial
    location as one sample with C features. Optionally the first component
    is thresholded to suppress background, and each component is min-max
    scaled so the result can be shown directly as an image.

    Args:
        feature_maps: torch.Tensor or array-like of shape (B, C, H, W),
                      e.g. (5, 1152, 37, 37). Tensors are detached and moved
                      to CPU before conversion to NumPy.
        n_components: Number of PCA components to keep (default=3).
        bg_threshold: Threshold applied to the raw (unscaled) 1st PCA
                      component. Locations whose 1st component is
                      *strictly below* this value are kept as foreground;
                      all other locations are zeroed in every component.
                      The sign of a principal component carries no fixed
                      meaning, so which side is "foreground" depends on the
                      data; inspect the output and adjust the threshold
                      accordingly.
        do_thresholding: If True, apply the background threshold.
        do_minmax_scale: If True, min-max scale each component to [0, 1].
                      Scaling statistics are computed over foreground
                      locations only, so zeroed background stays exactly 0.

    Returns:
        np.ndarray of shape (B, H, W, n_components), dtype float32.

    Raises:
        ValueError: if `feature_maps` is not 4-dimensional.
    """
    # 1. Ensure we're dealing with a NumPy array on CPU.
    if hasattr(feature_maps, 'cpu'):
        # Detach first so tensors that track gradients can be converted.
        if hasattr(feature_maps, 'detach'):
            feature_maps = feature_maps.detach()
        feature_maps = feature_maps.cpu().numpy()
    feature_maps = np.asarray(feature_maps)
    if feature_maps.ndim != 4:
        raise ValueError(
            f'feature_maps must have shape (B, C, H, W), got {feature_maps.shape}'
        )
    B, C, H, W = feature_maps.shape

    # Output buffer; rows that are never written stay zero.
    pca_results = np.zeros((B, H, W, n_components), dtype=np.float32)

    # 2. Fit and apply PCA image by image.
    for i in range(B):
        # (C, H, W) -> (H*W, C): one row per spatial location.
        pixels = feature_maps[i].reshape(C, -1).T

        # 3. Project onto the leading components: (H*W, n_components).
        projected = PCA(n_components=n_components).fit_transform(pixels)

        # 4. Decide which locations count as foreground.
        if do_thresholding:
            foreground = projected[:, 0] < bg_threshold
            projected[~foreground, :] = 0.0
        else:
            foreground = np.ones(H * W, dtype=bool)

        # 5. Scale each component to [0, 1] using foreground locations only.
        #    Scaling the whole channel would remap the zeroed background to a
        #    non-zero value (e.g. to 1.0 when all kept values are negative).
        if do_minmax_scale and foreground.any():
            for c_idx in range(n_components):
                projected[foreground, c_idx] = minmax_scale(projected[foreground, c_idx])

        # 6. Back to image layout and store.
        pca_results[i] = projected.reshape(H, W, n_components).astype(np.float32)

    return pca_results


In [None]:
n_samples = 1

# Draw `n_samples` random dataset items and resize each image to the square
# tokenizer input size.
target_size = (tokenizer_input_resolution, tokenizer_input_resolution)
images = [
    dataset[random.randint(0, len(dataset) - 1)]['image'].resize(target_size)
    for _ in range(n_samples)
]

# Encoder feature maps, plus a bilinearly upsampled copy at the tokenizer resolution.
feature_maps = visual_token_embedding.vision_encoder(images)
feature_maps_upsampled = F.interpolate(feature_maps, size=target_size, mode='bilinear')

print(feature_maps.shape, feature_maps.dtype, feature_maps.device)
print(feature_maps_upsampled.shape, feature_maps_upsampled.dtype, feature_maps_upsampled.device)

# From here on both names refer to NumPy arrays on the host.
feature_maps = feature_maps.cpu().numpy()
feature_maps_upsampled = feature_maps_upsampled.cpu().numpy()

# PCA settings shared by both projections.
n_components = 3
do_thresholding = False
do_minmax_scale = True
pca_options = dict(
    n_components=n_components,
    do_thresholding=do_thresholding,
    do_minmax_scale=do_minmax_scale,
)

feature_maps_pca = pca_projection(feature_maps, **pca_options)
feature_maps_upsampled_pca = pca_projection(feature_maps_upsampled, **pca_options)

# One row per image: input | PCA of raw features | PCA of upsampled features.
for b, image in enumerate(images):
    image = image.resize(target_size)
    panels = (image, feature_maps_pca[b], feature_maps_upsampled_pca[b])

    plt.figure(figsize=(15, 5))
    for column, panel in enumerate(panels, start=1):
        plt.subplot(1, 3, column)
        plt.imshow(panel)
    plt.show()

In [None]:
# Run the visual tokenizer on the sampled images and keep a NumPy copy of its
# output for plotting.
batch_masks = visual_tokenizer(images)
batch_masks_cpu = batch_masks.cpu().numpy()

# Embed the images together with their masks; the module returns ROI boxes,
# ROI masks and embeddings.
roi_boxes, roi_masks, embeddings = visual_token_embedding(images, batch_masks)

print('embeddings', embeddings.shape)
print('roi_boxes', roi_boxes.shape)
print('roi_masks', roi_masks.shape)

# Rebind the three outputs to host-side NumPy arrays for the plotting cell.
roi_boxes, embeddings, roi_masks = (
    tensor.cpu().numpy() for tensor in (roi_boxes, embeddings, roi_masks)
)

In [None]:
# Only needed by the (currently disabled) embedding panel further down.
C = visual_token_embedding.vision_encoder.feature_channels
token_roi_resolution = visual_token_embedding.config.token_roi_resolution

# Upper bound on how many tokens are plotted per image.
max_tokens_to_show = 6

for b, image in enumerate(images):
    image = image.resize((tokenizer_input_resolution, tokenizer_input_resolution))

    # Never index past the tokens actually available for this image; a fixed
    # range(6) raises IndexError when fewer tokens are present.
    n_shown = min(
        max_tokens_to_show,
        len(batch_masks_cpu[b]),
        len(roi_boxes[b]),
        len(roi_masks[b]),
    )

    # One figure per token: image | mask overlay with ROI box | ROI mask.
    # The 1x6 grid is kept so panel sizes match the previous output.
    for i in range(n_shown):
        plt.figure(figsize=(20, 8))
        plt.subplot(1, 6, 1)
        plt.imshow(image)

        plt.subplot(1, 6, 2)
        plt.imshow(batch_masks_cpu[b, i], cmap='inferno')
        plt.imshow(image, alpha=0.2)

        # Boxes are multiplied by the resolution to get pixel coordinates
        # (presumably stored normalized to [0, 1] -- TODO confirm).
        x1, y1, x2, y2 = (roi_boxes[b][i] * tokenizer_input_resolution).astype(int)
        plt.plot([x1, x2, x2, x1, x1], [y1, y1, y2, y2, y1], 'r')
        plt.title(f'ROI [{x1}, {y1}, {x2}, {y2}]')

        plt.subplot(1, 6, 3)
        plt.title(f'Mask {np.average(roi_masks[b][i])}')
        plt.imshow(roi_masks[b][i])

        # Disabled embedding panel, kept for reference.
        # NOTE(review): `apply_pca` is not defined in the visible cells.
        # plt.subplot(1, 6, 4)
        # plt.title('Embedding')
        # embedding = embeddings[b][i]
        # embedding = embedding.reshape(C, token_roi_resolution, token_roi_resolution)

        # # unsqueeze embedding
        # embedding = np.expand_dims(embedding, axis=0)
        # plt.imshow(apply_pca(embedding)[0])
        # # plt.imshow(feature_maps_rgb * roi_masks[b][i][:, :, None])

        plt.show()
