In [97]:
import torch
import numpy as np
import clip
from tqdm.auto import tqdm
from PIL import Image

import torchvision.transforms as transforms
import torchvision.io as io
import math
import itertools
from types import SimpleNamespace

import jnu as J

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_printoptions(sci_mode=False)
np.set_printoptions(suppress=True)

def load_mp4(video_path, frame_rate):
    video = io.VideoReader(video_path, "video")
    metadata = video.get_metadata()
    duration = metadata['video']['duration'][0]
    fps = metadata['video']['fps'][0]
    est_total_frames = (duration * fps)
    des_total_frames = (duration * frame_rate)
    skip = math.ceil(est_total_frames / des_total_frames)
    frames = []
    for frame in tqdm(itertools.islice(video, 0, None, skip), desc="loading video"):
        frames.append(frame['data'].float() / 255.)
    return torch.stack(frames)

def locate(imgs, text):
    with torch.no_grad():
        image_features = model.encode_image(imgs)
        text_features = model.encode_text(text)
        logits_per_image, logits_per_text = model(imgs, text)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        return SimpleNamespace(similarity=similarity, 
                               image_features=image_features, text_features=text_features,
                               image_logits=logits_per_image, text_logits=logits_per_text)
    
def grid(imgs, size=224, stride=224//4):
    imgs = transforms.ToTensor()(imgs)
    l = len(imgs.shape)
    imgs = imgs.unfold(l-2, size, stride)
    imgs = imgs.unfold(l-1, size, stride)
    reshape = np.arange(len(imgs.shape)).tolist()
    c = reshape[l-3]
    del reshape[l-3]
    reshape.insert(l-1,c)
    return imgs.permute(*reshape)

resize = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((224, 224), interpolation=Image.BICUBIC, antialias=True),
        transforms.CenterCrop((224, 224)),
    ])
normalize = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])



model, _ = clip.load("ViT-B/32", device=device)
image = Image.open("img/dog.jpg")#.unsqueeze(0)
#image = normalize(image)
image_grid = grid(image)
grid_shape = image_grid.shape[:-3]

#print(image_grid.shape)
#J.images(image_grid.reshape(-1, *image_grid.shape[-3:]))

text = clip.tokenize(["dog"]).to(device)
result = locate(image_grid.reshape(-1,*image_grid.shape[-3:]), text)
logits = result.image_logits
logits = logits.reshape(*grid_shape,-1)

KeyboardInterrupt: 

In [None]:
import seaborn as sb

logits = (logits - logits.min()) / (logits.max() - logits.min())
sb.heatmap(logits.squeeze().numpy())

In [98]:
print(device)

cpu


In [19]:
file_path = "./video/chess-02.mp4"
imgs = load_mp4(file_path, 2).to(device)
J.images(imgs)

loading video: 0it [00:00, ?it/s]

interactive(children=(IntSlider(value=0, description='x', layout=Layout(width='99%'), max=31), Output()), _dom…

HBox(children=(Canvas(height=1080, width=1920),), layout=Layout(align_items='center', display='flex', flex_flo…

<jnu.image._image.Image at 0x7f9f3f894910>

In [25]:
text = clip.tokenize(["chess board", "dog", "face", "human", ""]).to(device)
probs, img_f, text_f = locate(preprocess(imgs), text)
print(probs)

tensor([[20.6530, 22.8280],
        [21.0456, 22.1278],
        [21.6541, 22.4598],
        [20.9627, 22.4379],
        [20.0259, 21.4705],
        [20.1362, 20.7447],
        [21.7284, 22.6539],
        [20.9259, 22.1536],
        [20.7730, 21.9828],
        [20.3810, 20.7559],
        [21.7437, 21.7931],
        [23.6621, 21.3526],
        [23.7533, 19.8215],
        [25.3439, 21.5343],
        [28.8745, 20.2418],
        [28.2159, 19.3399],
        [25.9313, 20.2448],
        [27.1215, 19.3355],
        [29.0941, 19.7587],
        [31.2711, 20.1906],
        [30.7352, 19.4998],
        [31.1156, 19.7632],
        [31.8747, 20.3074],
        [31.3972, 19.9737],
        [31.4849, 20.8653],
        [30.1148, 20.2362],
        [29.2274, 18.3738],
        [19.7151, 20.2330],
        [19.9522, 22.4351],
        [18.9488, 20.6751],
        [19.8379, 23.9496],
        [22.0884, 23.1437]])


In [33]:


#




with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs) 

846.9667244850218 55.99693333333333


loading video: 0it [00:00, ?it/s]

tensor([[[[[252, 252, 252,  ...,  99, 104, 106],
           [252, 252, 252,  ..., 101, 106, 107],
           [252, 252, 252,  ..., 106, 109, 112],
           ...,
           [ 13,  13,  13,  ...,  87,  86,  86],
           [ 13,  13,  13,  ...,  87,  85,  84],
           [ 13,  13,  13,  ...,  86,  84,  81]],

          [[252, 252, 252,  ...,  82,  87,  89],
           [252, 252, 252,  ...,  84,  89,  90],
           [252, 252, 252,  ...,  89,  92,  95],
           ...,
           [ 13,  13,  13,  ...,  77,  76,  76],
           [ 13,  13,  13,  ...,  77,  75,  74],
           [ 13,  13,  13,  ...,  76,  74,  71]],

          [[252, 252, 252,  ...,  85,  90,  92],
           [252, 252, 252,  ...,  87,  92,  93],
           [252, 252, 252,  ...,  92,  95,  98],
           ...,
           [  8,   8,   8,  ...,  81,  80,  80],
           [  8,   8,   8,  ...,  81,  79,  78],
           [  8,   8,   8,  ...,  80,  78,  75]]],


         [[[252, 252, 252,  ..., 101, 101, 101],
           [2

NameError: name 'image' is not defined