In [1]:
import torch
from transformers import (
    LlavaForConditionalGeneration,
    AutoProcessor,
    CLIPVisionModel,
    CLIPImageProcessor,
)
from PIL import Image
import requests

# Run on the first CUDA GPU when one is visible to PyTorch; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the LLaVA-1.5 checkpoint with settings that match the selected device.
#
# FlashAttention-2 is only requested on a CUDA device, together with float16
# weights. On CPU the library's default attention implementation and float32
# weights are used instead, so the CPU fallback chosen for `device` can
# actually load the model.
model_id = "llava-hf/llava-1.5-7b-hf"
on_gpu = device.startswith("cuda")
llava_load_kwargs = {
    "low_cpu_mem_usage": True,
    # Materialise the weights directly on the target device rather than
    # building the model on CPU and moving it afterwards (the latter triggers
    # the "model not initialized on GPU" FlashAttention warning).
    "device_map": device,
    "torch_dtype": torch.float16 if on_gpu else torch.float32,
}
if on_gpu:
    llava_load_kwargs["attn_implementation"] = "flash_attention_2"

llava = LlavaForConditionalGeneration.from_pretrained(model_id, **llava_load_kwargs)

# Replace LLaVA's vision encoder with the CLIP vision model from `clip_model_id`.
clip_model_id = "RCLIP/CLIP-ViT-L-FARE2"
clip_vision = CLIPVisionModel.from_pretrained(
    clip_model_id,
    # Follow whatever precision LLaVA was loaded in instead of hard-coding fp16,
    # so the tower and the rest of the model always agree.
    torch_dtype=llava.dtype,
).to(device)

# The multimodal projector was trained against the original tower's feature
# width; fail early with a clear message if the replacement does not match.
expected_width = llava.config.vision_config.hidden_size
if clip_vision.config.hidden_size != expected_width:
    raise ValueError(
        f"{clip_model_id} has hidden_size={clip_vision.config.hidden_size}, "
        f"but the LLaVA projector expects {expected_width}."
    )

# NOTE(review): depending on the installed transformers release the vision
# tower may live directly on the wrapper or on an inner `llava.model` module
# (confirm against the installed version). Assign to whichever module really
# owns it, then verify that the swap is visible through `llava.vision_tower`.
tower_owner = getattr(llava, "model", None)
if not hasattr(tower_owner, "vision_tower"):
    tower_owner = llava
tower_owner.vision_tower = clip_vision
if llava.vision_tower is not clip_vision:
    raise RuntimeError(
        "Vision tower swap did not take effect; check the transformers model layout."
    )

# The replacement encoder is used for inference only; disable its gradients.
llava.vision_tower.requires_grad_(False)

# Build the LLaVA processor, then replace its image preprocessing with a CLIP
# image processor configured for the swapped-in vision tower.
processor = AutoProcessor.from_pretrained(model_id)
robust_img_proc = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Keep the resize target and the centre-crop window in sync with the tower's
# input resolution. Updating only `size` would leave `crop_size` at the value
# stored in the processor checkpoint, which is wrong for any tower whose
# `image_size` differs from it.
tower_resolution = clip_vision.config.image_size
robust_img_proc.size = {"height": tower_resolution, "width": tower_resolution}
robust_img_proc.crop_size = {"height": tower_resolution, "width": tower_resolution}

processor.image_processor = robust_img_proc

# Download the sample image and decode it as RGB.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout prevents the cell from hanging on a stalled connection, and the
# context manager releases the streamed connection once the image is decoded.
with requests.get(url, stream=True, timeout=30) as response:
    # Surface HTTP errors here instead of as an opaque PIL decoding failure.
    response.raise_for_status()
    # Have urllib3 undo any Content-Encoding (e.g. gzip) on the raw stream.
    response.raw.decode_content = True
    # `.convert` forces the pixel data to be read while the stream is still open.
    img = Image.open(response.raw).convert("RGB")

# Single-turn prompt; the `<image>` placeholder marks where the picture goes.
prompt = "USER: <image>\nWhat do you see in this image?\nASSISTANT:"

# Tokenize the text and preprocess the image in one call, returning PyTorch
# tensors, then move the resulting batch to the selected device.
inputs = processor(
    images=img,
    text=prompt,
    return_tensors="pt",
)
inputs = inputs.to(device)

# Generate up to 50 new tokens with sampling disabled; gradients are not
# needed for inference, so autograd tracking is turned off.
with torch.no_grad():
    out_ids = llava.generate(
        **inputs,
        do_sample=False,
        max_new_tokens=50,
    )

# Decode the first (only) sequence in the batch, dropping special tokens.
print(
    processor.tokenizer.decode(
        out_ids[0],
        skip_special_tokens=True,
    )
)

  from .autonotebook import tqdm as notebook_tqdm
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  9.09it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


USER:  
What do you see in this image?
ASSISTANT: The image features two cats lying down on a pink couch. One cat is positioned on the left side of the couch, while the other cat is on the right side. The cats appear to be relaxed and enjoying
