In [None]:
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPModel

# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

## Check Vision Part

In [None]:
# Load the pretrained CLIP checkpoint onto `device`, plus the processor
# published under the same checkpoint name.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sample image used by the sanity-check cells that follow.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait with a timeout and raise on an HTTP error status, so a failed
# download surfaces here instead of as an obscure PIL decoding error.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # The raw stream skips transfer decoding (gzip/deflate) unless asked to.
    response.raw.decode_content = True
    # `Image.open` is lazy; `convert` forces the pixels to be read while the
    # connection is still open and guarantees a 3-channel RGB image.
    image = Image.open(response.raw).convert("RGB")

In [None]:
# Candidate captions scored against the image. Defined once so the joint and
# the text-only inputs below cannot drift apart.
candidate_texts = ["a photo of a cat", "a photo of a dog"]

# Joint text + image batch for the full CLIP forward pass.
inputs = processor(
    text=candidate_texts,
    images=image,
    return_tensors="pt",
    padding=True,
).to(device)

# Single-modality inputs, used later to drive the vision and text sub-models
# separately.
image_input = processor(images=image, return_tensors="pt").to(device)
text_input = processor(
    text=candidate_texts,
    return_tensors="pt",
    padding=True,
).to(device)

# Pure inference: skip autograd bookkeeping for the reference forward pass.
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
    # exp() of the model's `logit_scale` parameter; reused by the manual
    # similarity computations further down.
    logit_scale = model.logit_scale.exp()

In [None]:
# Show which fields the CLIP forward pass returned.
outputs.keys()

In [None]:
# Pull the image and text embeddings out of the joint forward-pass output.
image_embeds, text_embeds = outputs.image_embeds, outputs.text_embeds

In [None]:
# L2-normalise both sets of embeddings so that the matrix product below is a
# cosine similarity.
image_embeds_norm = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
text_embeds_norm = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
# The scale has to be applied to the cosine similarities *before* the softmax:
# scaling the softmax output instead yields rows that no longer sum to 1.
# `logit_scale` (taken from the model) replaces a hard-coded 100.0.
# The result is expected to reproduce `probs` from the model's own forward pass.
similarity = (logit_scale * (image_embeds_norm @ text_embeds_norm.T)).softmax(dim=-1)
print(similarity)

In [None]:
# Reference logits and probabilities from the model's own forward pass, for
# comparison with the manually computed similarity.
print(logits_per_image, probs)

In [None]:
# Short aliases for the vision tower and its projection head ...
clip_vm, clip_vproj = model.vision_model, model.visual_projection

# ... and for the text tower and its projection head.
clip_tm, clip_tproj = model.text_model, model.text_projection

In [None]:
# Run each tower on its single-modality input and keep the pooled output.
# The projection heads are applied separately in the next cell.
image_embed_single = clip_vm(**image_input).pooler_output
text_embed_single = clip_tm(**text_input).pooler_output

In [None]:
# Map the pooled tower outputs through the model's projection heads.
# NOTE(review): presumably this puts both modalities in the shared embedding
# space used for the similarity computation — confirm against the CLIP docs.
image_embed_single_proj = clip_vproj(image_embed_single)
text_embed_single_proj = clip_tproj(text_embed_single)

In [None]:
# The projected embeddings are not unit-length, so normalise them first;
# otherwise the matrix product is a raw dot product, not a cosine similarity.
image_embed_single_norm = image_embed_single_proj / image_embed_single_proj.norm(
    dim=-1, keepdim=True
)
text_embed_single_norm = text_embed_single_proj / text_embed_single_proj.norm(
    dim=-1, keepdim=True
)
# Scale by `logit_scale` before the softmax so that each row is a probability
# distribution over the candidate captions.
similarity2 = (logit_scale * (image_embed_single_norm @ text_embed_single_norm.T)).softmax(
    dim=-1
)
print(similarity2)

In [None]:
# Image features via the model's own convenience method, to compare with the
# manual vision-model + projection path.
image_features = model.get_image_features(**image_input)
# Displayed as the cell output.
image_features.shape

### Compare `get_image_features` with the embeddings computed above

In [None]:
# `get_image_features` returns the raw projection, while the Hugging Face CLIP
# forward pass returns `image_embeds` already L2-normalised (TODO confirm for
# the installed transformers version). Normalise both sides so like is compared
# with like, and use a tolerance instead of exact float equality.
torch.allclose(
    image_features / image_features.norm(dim=-1, keepdim=True),
    image_embeds / image_embeds.norm(dim=-1, keepdim=True),
    atol=1e-5,
)

In [None]:
# Both sides are raw (un-normalised) projections of the same image, so they
# should agree. Compare with a tolerance, because exact float equality can fail
# on harmless numerical noise.
torch.allclose(image_features, image_embed_single_proj, atol=1e-6)

由此可见，`model.get_image_features` 的输出，与先经过 `model.vision_model`、再把 `pooler_output` 送入 `model.visual_projection` 得到的结果是一致的，因此这种手动提取特征的方法是正确的。

## Test with annotation file and umap

In [None]:
import json
import os
import sys

import numpy as np
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [None]:
# Dataset locations. The defaults are the original machine-specific paths; set
# FLICKR30K_ANN_PATH / FLICKR30K_IMG_PATH to run the notebook elsewhere without
# editing it.
ann_path = os.environ.get(
    "FLICKR30K_ANN_PATH", "/project/Deep-Clustering/data/flickr30k/test.json"
)
img_path = os.environ.get("FLICKR30K_IMG_PATH", "/data/SSD/flickr30k/images")

In [None]:
class FeatureExtractionDataset(Dataset):
    """Image/caption pairs read from a JSON annotation file.

    The annotation file must contain a JSON list of records. Each record
    provides an ``"image"`` entry (a path relative to ``image_path``) and a
    ``"caption"`` entry (either a string or a sequence whose first element is
    used).

    Each item is a tuple ``(image_input, raw_text)``: the output of
    ``processor(images=..., return_tensors="pt")`` with the leading batch
    dimension of ``"pixel_values"`` removed, and the caption string.
    """

    def __init__(self, annotation_path: str, image_path: str, processor, ratio=0.1) -> None:
        """Load the annotations and keep only the leading ``ratio`` fraction.

        Args:
            annotation_path: Path to the JSON annotation file.
            image_path: Directory that the records' ``"image"`` entries are
                relative to.
            processor: Callable invoked as
                ``processor(images=img, return_tensors="pt")``.
            ratio: Fraction of the annotations to keep, counted from the
                start of the list (``int(len * ratio)`` records).
        """
        # A context manager closes the file handle deterministically.
        with open(annotation_path, encoding="utf-8") as annotation_file:
            annotations = json.load(annotation_file)
        self.annotations = annotations[: int(len(annotations) * ratio)]
        self.image_path = image_path
        self.processor = processor

    def __len__(self):
        """Return the number of retained annotation records."""
        return len(self.annotations)

    def __getitem__(self, idx: int) -> tuple:
        """Return ``(image_input, raw_text)`` for the record at ``idx``."""
        annotation = self.annotations[idx]
        img_path = os.path.join(self.image_path, annotation["image"])
        # `convert` reads the pixel data, so the file can be closed right after.
        with Image.open(img_path) as image_file:
            raw_image = image_file.convert("RGB")
        image_input = self.processor(images=raw_image, return_tensors="pt")

        if "pixel_values" in image_input:
            # Drop only the leading batch axis so the DataLoader can re-batch;
            # a bare squeeze() would also drop any other size-1 dimension.
            image_input["pixel_values"] = image_input["pixel_values"].squeeze(0)

        caption = annotation["caption"]
        # Records may carry one caption or several; use the first in that case.
        raw_text = caption if isinstance(caption, str) else caption[0]

        return image_input, raw_text

In [None]:
# Full test split (ratio=1 keeps every annotation record).
test_dataset = FeatureExtractionDataset(
    annotation_path=ann_path,
    image_path=img_path,
    processor=processor,
    ratio=1,
)
# Fixed order (no shuffling), 64 items per batch, loaded by 4 worker processes.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    num_workers=4,
    shuffle=False,
)

In [None]:
# Encode every image/caption pair of the test split, one batch at a time, and
# collect the projected embeddings.
img_embeddings, text_embeddings = [], []
with torch.no_grad():
    for batch_images, batch_texts in tqdm(test_dataloader):
        # Batch-local names keep the notebook-level `image_input` / `text_input`
        # from the sanity-check cells intact, so those cells can be re-run
        # after this one.
        # Moving tensor by tensor works whether the collate function returns a
        # processor output object or a plain dict.
        batch_image_input = {key: value.to(device) for key, value in batch_images.items()}
        batch_text_input = processor(
            text=list(batch_texts),  # collation may hand back a tuple of strings
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=77,  # presumably CLIP's text context length — TODO confirm
        ).to(device)

        # Same manual path validated earlier: tower -> pooled output -> projection.
        img_embed = clip_vproj(clip_vm(**batch_image_input).pooler_output)
        text_embed = clip_tproj(clip_tm(**batch_text_input).pooler_output)

        img_embeddings.append(img_embed)
        text_embeddings.append(text_embed)

# Stack the per-batch results into one tensor per modality.
img_embeddings = torch.cat(img_embeddings)
text_embeddings = torch.cat(text_embeddings)

print(img_embeddings.shape, text_embeddings.shape)

In [None]:
import matplotlib.pyplot as plt
from cuml.cluster import HDBSCAN, KMeans
from cuml.dask.manifold import UMAP as MNMG_UMAP
from cuml.datasets import make_blobs
from cuml.manifold import UMAP
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

In [None]:
# Start a local Dask CUDA cluster (one thread per worker) and connect a client.
# NOTE(review): no visible cell references `client`, and the UMAP fit uses the
# single-process `cuml.manifold.UMAP` — confirm whether the cluster is needed.
cluster = LocalCUDACluster(threads_per_worker=1)
client = Client(cluster)

In [None]:
# Detached copy of the text embeddings on `device`, plus a NumPy view of the
# same values on the host for UMAP.
label_embedding = text_embeddings.detach().clone().to(device)
label_embedding_np = label_embedding.cpu().numpy()

# Project the embeddings to 2-D with a fixed seed.
local_model = UMAP(n_components=2, random_state=42)
umap_features = torch.tensor(
    local_model.fit_transform(label_embedding_np),
    device="cpu",
)
umap_features_np = umap_features.numpy()

In [None]:
# Scatter plot of the 2-D UMAP projection, with a title and axis labels so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(umap_features_np[:, 0], umap_features_np[:, 1])
ax.set(
    title="UMAP projection of CLIP text embeddings",
    xlabel="UMAP dimension 1",
    ylabel="UMAP dimension 2",
)
plt.show()