In [33]:
import requests
from PIL import Image
import torchvision.transforms as T

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"

# Load the checkpoint in half precision and move it to device index 1.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(1)

processor = AutoProcessor.from_pretrained(model_id)

# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt.
# Each value in "content" has to be a list of dicts with types ("text", "image").
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What are these?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the download with a timeout and fail loudly on an HTTP error status,
# so a stalled connection cannot hang the kernel and an error response is not
# handed to PIL as if it were image data.
response = requests.get(image_file, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw)

# Full processor output (token ids, attention mask, pixel values), moved to
# device index 1 with float16 requested as the dtype.
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(1, torch.float16)


Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  4.28it/s]


In [51]:
import torchvision.transforms as T
from PIL import Image

# Manual preprocessing pipeline, split in two stages: `transform` yields a
# [0, 1] tensor, and `normalize` applies the per-channel mean/std afterwards.
def _to_rgb(img):
    """Return `img` converted to RGB mode."""
    return img.convert("RGB")


transform = T.Compose(
    [
        T.Lambda(_to_rgb),  # Ensure image is in RGB.
        # Resize: shortest edge = 336.
        T.Resize(336, interpolation=T.InterpolationMode.BICUBIC),
        # Center crop to 336x336.
        T.CenterCrop((336, 336)),
        # Convert to tensor and scale pixels to [0, 1]
        # (i.e. multiply by rescale_factor 0.00392).
        T.ToTensor(),
    ]
)

normalize = T.Normalize(
    mean=[0.48145466, 0.4578275, 0.40821073],
    std=[0.26862954, 0.26130258, 0.27577711],
)


In [42]:
# Raw image as a [0, 1] tensor that tracks gradients.
image = T.ToTensor()(raw_image).requires_grad_(True)
print(image.min(), image.max())

# Reference pixel values as produced by the processor itself.
# NOTE: from here on `inputs` names only the `pixel_values` tensor, not the
# full processor output.
encoded = processor(images=raw_image, text=prompt, return_tensors='pt').to(1, torch.float16)
inputs = encoded["pixel_values"]

print(inputs)
print(inputs.min(), inputs.max())
print(inputs.shape)

tensor(0., grad_fn=<MinBackward1>) tensor(1., grad_fn=<MaxBackward1>)
tensor([[[[ 0.5435,  0.6455,  0.5581,  ...,  0.0909,  0.0033, -0.0696],
          [ 0.5435,  0.6167,  0.5435,  ...,  0.1201,  0.0179,  0.0617],
          [ 0.5581,  0.5581,  0.6602,  ...,  0.0909,  0.0764,  0.0617],
          ...,
          [ 1.8281,  1.8867,  1.8281,  ...,  1.4053,  1.4482,  1.5654],
          [ 1.8574,  1.9014,  1.8721,  ...,  1.4775,  1.4053,  1.4922],
          [ 1.8721,  1.9014,  1.9014,  ...,  1.4053,  1.2148,  1.4775]],

         [[-1.3623, -1.2715, -1.3770,  ..., -1.4219, -1.4824, -1.5117],
          [-1.3320, -1.2422, -1.3467,  ..., -1.4219, -1.4824, -1.4219],
          [-1.2422, -1.2871, -1.1973,  ..., -1.4668, -1.4668, -1.4824],
          ...,
          [ 0.0789,  0.1239,  0.0338,  ..., -0.7168, -0.6567, -0.5664],
          [ 0.1089,  0.1089,  0.0789,  ..., -0.6265, -0.7168, -0.6265],
          [ 0.1239,  0.1089,  0.0789,  ..., -0.6416, -0.8818, -0.5513]],

         [[-0.5562, -0.3853, -0.

In [52]:

# Run the manual pipeline; gradients are tracked on the un-normalized tensor
# so they can flow back through `normalize` and the dtype/device move.
image = transform(raw_image).requires_grad_(True)
normalized = normalize(image)
processed_image = normalized.to(1, torch.float16)

In [53]:
# Inspect the manually processed image: values, range and shape.
print(processed_image)
lowest, highest = processed_image.min(), processed_image.max()
print(lowest, highest)
print(processed_image.shape)

# Distance between the manual result and the processor's pixel values.
difference = processed_image - inputs
print(torch.norm(difference))

# Check whether the processed tensor is still attached to the autograd graph.
print(processed_image.requires_grad)

tensor([[[ 0.5435,  0.6455,  0.5581,  ...,  0.0909,  0.0033, -0.0696],
         [ 0.5435,  0.6167,  0.5435,  ...,  0.1201,  0.0179,  0.0617],
         [ 0.5581,  0.5581,  0.6602,  ...,  0.0909,  0.0764,  0.0617],
         ...,
         [ 1.8281,  1.8867,  1.8281,  ...,  1.4053,  1.4482,  1.5654],
         [ 1.8574,  1.9014,  1.8721,  ...,  1.4775,  1.4053,  1.4922],
         [ 1.8721,  1.9014,  1.9014,  ...,  1.4053,  1.2148,  1.4775]],

        [[-1.3623, -1.2715, -1.3770,  ..., -1.4219, -1.4824, -1.5117],
         [-1.3320, -1.2422, -1.3467,  ..., -1.4219, -1.4824, -1.4219],
         [-1.2422, -1.2871, -1.1973,  ..., -1.4668, -1.4668, -1.4824],
         ...,
         [ 0.0789,  0.1239,  0.0338,  ..., -0.7168, -0.6567, -0.5664],
         [ 0.1089,  0.1089,  0.0789,  ..., -0.6265, -0.7168, -0.6265],
         [ 0.1239,  0.1089,  0.0789,  ..., -0.6416, -0.8818, -0.5513]],

        [[-0.5562, -0.3853, -0.4138,  ..., -0.8687, -0.8545, -0.8687],
         [-0.4563, -0.4421, -0.4849,  ..., -0

In [None]:
# `inputs` is rebound to the bare `pixel_values` tensor by an earlier cell, so
# it no longer has `.keys()` and cannot be unpacked into `generate`. Build the
# full processor output here under its own name so this cell does not depend
# on execution order and does not clobber `inputs`.
generation_inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(1, torch.float16)

print(generation_inputs.keys())
print(generation_inputs['pixel_values'].shape)
print(generation_inputs["pixel_values"].min(), generation_inputs["pixel_values"].max())

# Greedy decoding (no sampling), capped at 200 new tokens.
output = model.generate(**generation_inputs, max_new_tokens=200, do_sample=False)
# The first two tokens of the returned sequence are dropped before decoding;
# the decoded text still contains the prompt ahead of the model's answer.
print(processor.decode(output[0][2:], skip_special_tokens=True))

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])
torch.Size([1, 3, 336, 336])
tensor(-1.7920, device='cuda:1', dtype=torch.float16) tensor(2.1465, device='cuda:1', dtype=torch.float16)


Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.


ER:  
What are these? ASSISTANT: These are two cats lying on a pink couch.
