In [None]:
import os

# Index of the single GPU this notebook is allowed to see. The variable is exported
# before `import torch` further down so it is already in the environment when the
# CUDA runtime is first initialised.
VISIBLE_GPU = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = VISIBLE_GPU

import tqdm
import random
import torch
import matplotlib.pyplot as plt

from utils.visualization import visualize_sample
from model.utils import create_vlm
from model.utils import VisualTextualTokenization
from data import get_dataset
from visual_tokenizer import get_visual_tokenizer
import json

In [2]:

from data import (
    ShareGPT4V,
    ImageNet,
    Cambrian,
    CLEVRCaption,
    PixmoDataset
)

# Active dataset for this notebook: the 'val' split of PixmoDataset.
dataset = PixmoDataset(
    split='val',
    root='/private/home/delong/workspace/data/pixmo-cap',
)

# Alternative datasets -- uncomment exactly one assignment to switch.
#
# dataset = CLEVRCaption(root='/private/home/delong/workspace/data/clevr-caption', split='val')
#
# dataset = ImageNet(root='/datasets01/imagenet_full_size/061417', split='train')
#
# dataset = ShareGPT4V(
#     root='/private/home/delong/workspace/data/ShareGPT4V',
#     split='sharegpt4v_instruct_gpt4-vision_cap100k.json')
#
# dataset = ShareGPT4V(
#     root='/private/home/delong/workspace/data/ShareGPT4V',
#     split='share-captioner_coco_lcs_sam_1246k_1107.json')


In [None]:
# Trained VLM checkpoint to evaluate.
checkpoint = "/private/home/delong/workspace/subobjects-VLM/runs/pixmo_cap/Llama-3_2-1B-dinov2_small(768px)/superpixel/1224-1559-superpixel_slic(100t-768px)/runs/checkpoint-8283"

# create_vlm returns the model together with its text tokenizer.
# NOTE(review): llm_class is 'smollm' while the checkpoint path mentions
# 'Llama-3_2-1B' -- confirm this is the intended class for this checkpoint.
model, textual_tokenizer = create_vlm(checkpoint, llm_class='smollm')

# Move to the GPU, cast to half precision and switch to eval mode, one step at a time.
model = model.cuda()
model = model.half()
model = model.eval()

In [4]:
# Visual tokenizer settings passed to get_visual_tokenizer below.
image_resolution = 768
max_tokens = 100

# Pick exactly one visual-tokenizer config file.
# config_path = 'configs/visual_tokenizer/directsam/directsam_tiny_sa1b_2ep@0.05.json'
# config_path = 'configs/visual_tokenizer/patch/patch_16_per_side_random.json'
config_path = 'configs/visual_tokenizer/superpixel/superpixel_slic.json'

# Read the config inside a context manager so the file handle is closed right away
# (a bare json.load(open(...)) leaves it open until garbage collection).
with open(config_path) as config_file:
    config = json.load(config_file)

# The JSON entries are forwarded as keyword arguments alongside the two settings above.
visual_tokenizer = get_visual_tokenizer(**config, image_resolution=image_resolution, max_tokens=max_tokens)

# Joint tokenizer built from the text tokenizer and the visual tokenizer.
vl_tokenizer = VisualTextualTokenization(textual_tokenizer, visual_tokenizer)

In [None]:
# Estimate the average loss of the model on randomly drawn dataset samples.
n_samples = 20
loss = 0
for _ in tqdm.tqdm(range(n_samples)):
    # random.randrange excludes its upper bound. The previous
    # random.randint(0, len(dataset)) is inclusive on both ends and could yield
    # len(dataset), which is one past the last valid index.
    sample = dataset[random.randrange(len(dataset))]
    inputs = vl_tokenizer([sample], eval=True)

    # Forward pass only; no gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**inputs)
        loss += outputs['loss'].item()

print(f"Loss: {loss / n_samples}")

In [None]:
# Qualitative check: generate text for one sample and print it next to the reference.
sample = dataset[0]
# sample = dataset[random.randrange(len(dataset))]

print(sample['text'])

# Split once on the assistant marker: the part before it is the prompt, the part
# after it is the reference answer (with the EOS token stripped out).
assistant_marker = '<|assistant|>'
text_segments = sample['text'].split(assistant_marker)
label = text_segments[1].strip().replace(textual_tokenizer.eos_token, '')
# Keep only the prompt (ending with the marker) so the model has to produce the answer.
sample['text'] = text_segments[0] + assistant_marker

inputs = vl_tokenizer([sample], eval=True)

# Inference only: run under no_grad, consistent with the loss-evaluation loop, so that
# no autograd state is recorded while embedding the inputs and generating.
with torch.no_grad():
    inputs_embeds, labels = model.prepare_inputs_embeds(
        inputs['text'], inputs['image'], inputs['masks']
    )

    outputs = model.generate(
        inputs_embeds=inputs_embeds,
        do_sample=False,  # sampling disabled
        max_new_tokens=dataset.max_text_tokens,
        eos_token_id=textual_tokenizer.eos_token_id,
        pad_token_id=textual_tokenizer.pad_token_id,
    )
prediction = textual_tokenizer.decode(outputs[0], skip_special_tokens=True)

visualize_sample(sample, inputs)
print(label)
print('-' * 80)
print(prediction)