In [1]:
import gradio as gr
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
from demo.cam import generate_gradcam
from captum.attr import LayerGradCam
from PIL import Image
from einops import rearrange

import numpy as np
import matplotlib.pyplot as plt
import os
import time

  from .autonotebook import tqdm as notebook_tqdm


Python version is above 3.10, patching the collections module.




In [2]:
# --- Device / dtype selection ------------------------------------------------
# Decide on the accelerator ONCE, up front, so that the default device, the
# model weights, and the inputs built later (which are sent to `cuda_device`)
# all agree. Mixing devices surfaces as
# "torch.cat(): all input tensors must be on the same device".
# NOTE: the name `cuda_device` is historical; it may hold 'cuda', 'mps' or 'cpu'.
if torch.cuda.is_available():
    cuda_device = 'cuda'
    dtype = torch.bfloat16
elif torch.backends.mps.is_available():
    cuda_device = 'mps'
    dtype = torch.float16
else:
    # No accelerator: stay in full precision, half-precision kernels are not
    # guaranteed to exist for every op on CPU.
    cuda_device = 'cpu'
    dtype = torch.float32

# Tensors created without an explicit device now land on the chosen device.
torch.set_default_device(cuda_device)

# --- Model -------------------------------------------------------------------
model_path = "deepseek-ai/Janus-Pro-1B"
config = AutoConfig.from_pretrained(model_path)
language_config = config.language_config
# Force the plain ("eager") attention implementation for the language model.
# NOTE(review): presumably required so the attention path is hook/grad friendly
# for the CAM code — confirm before changing.
language_config._attn_implementation = 'eager'

# NOTE(security): trust_remote_code=True executes Python code shipped with the
# checkpoint repository. Only use it with a model source you trust.
vl_gpt = AutoModelForCausalLM.from_pretrained(
    model_path,
    language_config=language_config,
    trust_remote_code=True,
)

# Cast AND move in one step, on every platform. Previously the non-CUDA branch
# only changed the dtype and never moved the weights explicitly.
vl_gpt = vl_gpt.to(device=cuda_device, dtype=dtype)

# --- Processor / tokenizer ---------------------------------------------------
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Some kwargs in processor config are unused and will not have any effect: sft_format, add_special_token, image_tag, m

In [3]:
class ViTForGradCAM(torch.nn.Module):
    """Wrap a vision model so that its forward pass yields one scalar per image.

    Captum's ``LayerGradCam`` needs a scalar-per-sample output to differentiate;
    this wrapper reduces the vision model's output to such a scalar.
    """

    def __init__(self, vision_model):
        super().__init__()
        self.vision_model = vision_model

    def forward(self, images):
        # The output is indexed as a 3-D tensor below.
        # NOTE(review): assumed to be [batch, tokens, dim] — confirm.
        outputs = self.vision_model(images)

        # Take the token at index 0.
        # NOTE(review): the original author treats this as the [CLS] token;
        # verify that this vision tower actually emits one at index 0.
        cls_token = outputs[:, 0, :]

        # Reduce to a scalar by picking the first feature of that token.
        # (Alternative that was considered: cls_token.mean(dim=1).)
        return cls_token[:, 0]


def multimodal_understanding(image, question, seed, top_p, temperature):
    """Answer a question about an image and plot a Grad-CAM saliency map.

    Args:
        image: Image as a NumPy array accepted by ``PIL.Image.fromarray``.
        question: Text prompt placed after the image placeholder.
        seed: Seed applied to the torch and NumPy random generators.
        top_p: Nucleus-sampling threshold; only used when sampling is enabled.
        temperature: Sampling temperature. ``0`` selects greedy decoding.

    Returns:
        The decoded answer string. The saliency map is shown with matplotlib
        as a side effect.
    """
    # Free cached CUDA memory before generating (nothing to do without CUDA).
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Gradients are needed later for the Grad-CAM attribution.
    for param in vl_gpt.parameters():
        param.requires_grad = True

    # Seed every generator that may be involved.
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": [image],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Normalise to 3-channel RGB so PNGs with an alpha channel or grayscale
    # inputs do not reach the image processor in an unexpected mode.
    pil_images = [Image.fromarray(image).convert("RGB")]
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(cuda_device, dtype=dtype)

    # Only request sampling parameters when sampling is actually enabled;
    # passing temperature/top_p together with do_sample=False is meaningless
    # and triggers generation-config warnings.
    sampling = temperature != 0
    generation_kwargs = dict(
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=sampling,
        use_cache=True,
    )
    if sampling:
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    # Text generation does not need gradients; skipping the autograd graph
    # here keeps memory free for the Grad-CAM pass below.
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds, **generation_kwargs
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

    print("generating guided gradcam...")

    # Attribute with respect to the first norm layer of the last ViT block.
    vit_scalar_model = ViTForGradCAM(vl_gpt.vision_model)
    target_layer = vit_scalar_model.vision_model.vision_tower.blocks[-1].norm1

    # Fold the per-sample image axis into the batch axis for the vision model.
    images = rearrange(prepare_inputs.pixel_values, "b n c h w -> (b n) c h w")

    guided_gc = LayerGradCam(vit_scalar_model, layer=target_layer)
    print("generating attribute...")
    attribution = guided_gc.attribute(images)

    print("generating saliency map...")
    saliency_map = generate_gradcam(attribution, pil_images[0])

    plt.imshow(saliency_map)
    plt.show()

    # Previously the answer was computed and then silently dropped.
    return answer

In [7]:
# Load the demo image as an RGB NumPy array. The context manager closes the
# underlying file handle as soon as the pixels have been copied out, and the
# RGB conversion drops any alpha channel a PNG may carry.
with Image.open("../images/doge.png") as pil_image:
    image = np.array(pil_image.convert("RGB"))

question = "explain this meme."

# Keyword arguments make the sampling settings self-describing.
multimodal_understanding(
    image,
    question,
    seed=100,
    top_p=0.95,
    temperature=0.1,
)

RuntimeError: torch.cat(): all input tensors must be on the same device. Received mps:0 and cpu