In [23]:
import os
import torch
import requests
import transformers


from PIL import Image
from io import BytesIO
from torch.nn import functional as F

from transformers import AutoTokenizer, AutoModel, CLIPVisionModel,AutoProcessor
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from transformers import AutoTokenizer
from llava.mm_utils import tokenizer_image_token
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM

device = "cuda"

In [None]:
model_name = "liuhaotian/llava-v1.5-7b"
base_prompt = "What is in this image?"
image_url = "https://buffer.com/cdn-cgi/image/w=1000,fit=contain,q=90,f=auto/library/content/images/size/w600/2023/10/free-images.jpg"

# Load image from url
response = requests.get(image_url)
image_data = Image.open(BytesIO(response.content))
image_data

# Instantiate Model and its encoders

In [None]:
!huggingface-cli login --token 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

In [3]:
# Instantiate model with the simplest possible settings.
llava_model = LlavaLlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype = torch.float16, # So it can fit on my a100 better
).to(device)

# Text Encoder
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Visual Encoder
vision_tower = llava_model.get_vision_tower()
vision_tower.load_model(device_map='auto')
image_processor = vision_tower.image_processor

You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.04s/it]


# Forward Pass

In [None]:
# If you're curious about what this function does
def prepare_prompt_into_expected_format(prompt):
    conv = conv_templates["llava_v1"].copy()

    # just one turn, always prepend image token
    inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
    
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    return conv.get_prompt()

## Generate Input Embeddings

In [None]:
# Turn user prompt into conversation format that vicuna (the LLM piece of LLaVA) is expecting.
prompt = prepare_prompt_into_expected_format(base_prompt)
prompt

In [None]:
# Get the textual embeddings for the prompt. Exactly the same if this were an LLM
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
input_ids.shape

In [None]:
# Get the visual embeddings for the corresponding image
image_encodings = image_processor.preprocess(image_data, return_tensors='pt')['pixel_values'].half().cuda()
image_encodings

In [None]:
output_ids = llava_model.generate(input_ids, image_encodings)
tokenizer.batch_decode(output_ids, skip_special_tokens=True)

So this a very important step, as we have used the visual encoder to process an image into a tensor representation, which we will be able to project into a shared space, with the textual embeddings, to generate our output from. This visual encoder for LLaVA, along with many other multimodal models, is CLIP. 

## Clip



## Forward Pass

Now if you have any experience with Neural Networks, you're probably scratching your head right now, because you would expect these embeddings to share the same dimensions. For the unacclimated, deep learning is heavily based on matrix multiplication. These matrix multiplications can be heavily optimized to take advantage of the speed of GPUs. However, they require the inputs to be compatible dimensionally. Attempting to multiply matrices Tensors with mismatched dimensions will lead to the dreaded
```
RuntimeError: stack expects each tensor to be equal size, but got [3, 224, 224] at entry 0 and [3, 224, 336] at entry 3
```
So what is going on?

We'll this is where the projection matrix (the main piece of the multimodal puzzle) comes into play. 

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = "Hello, how are you doing today?"

# Convert string prompt into tokens
tokens = tokenizer.tokenize(prompt, return_tensors='pt')
print(tokens)

# Convert tokens to input_ids, by getting the index of that token in the vocabulary.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

# Single step (aka the normal way)
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
input_ids

In [None]:
# The squeeze(0) is to convert 
output_ids = llava_model.generate(input_ids)
tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# input_ids

In [None]:
image_processor.__dict__

In [None]:
llava_model.get_model().vision_tower

## CLIP

In [None]:
# openai/clip-vit-large-patch14-336
model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14-336")
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

In [None]:
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
inputs
# outputs = model(**inputs)

In [None]:
outputs

In [None]:
last_hidden_state = outputs.last_hidden_state

In [None]:
outputs.last_hidden_state.shape

In [None]:
llava_model.get_vision_tower()(images=inputs.pixel_values).shape

In [None]:
embedding_layer(input_ids)

In [None]:
embedding_layer = llava_model.get_model().embed_tokens

# Basic Inference

In [44]:
base_prompt = "Hi, how are you? \n"
tokens = tokenizer(base_prompt, return_tensors='pt').to(device)
print(tokenizer.tokenize(base_prompt))
print(tokens.input_ids)

tokenizer.batch_decode(tokens.input_ids, skip_special_tokens=False)

['▁Hi', ',', '▁how', '▁are', '▁you', '?', '▁', '<0x0A>']
tensor([[    1,  6324, 29892,   920,   526,   366, 29973, 29871,    13]],
       device='cuda:0')


['<s> Hi, how are you? \n']

In [45]:
embedding_layer = llava_model.get_model().embed_tokens
embedding_layer(tokens.input_ids)

tensor([[[ 4.5471e-03, -3.8147e-03,  1.7242e-03,  ..., -8.7891e-03,
           2.5024e-03, -2.4719e-03],
         [ 2.5024e-02, -1.0254e-02, -1.3550e-02,  ...,  4.1199e-03,
          -4.3945e-03, -2.1851e-02],
         [-3.4142e-04, -3.7537e-03, -6.9580e-03,  ...,  7.9727e-04,
          -3.5095e-03,  4.8523e-03],
         ...,
         [-1.2512e-02,  1.4709e-02, -5.4321e-03,  ...,  6.8359e-03,
          -1.6861e-03, -5.6458e-04],
         [-1.2283e-03,  1.3199e-03, -1.2695e-02,  ...,  2.5940e-03,
          -1.1902e-03, -5.3406e-03],
         [-1.6212e-04, -2.1648e-04,  7.1335e-04,  ...,  3.5667e-04,
           4.3297e-04, -7.0572e-05]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<EmbeddingBackward0>)

In [93]:
print(input_ids.shape)
layers = llava_model.forward(input_ids).logits
print(layers.shape)

final_prediction_layer = layers[:, -1, :]
print(final_prediction_layer.shape)

predicted_token_id = F.softmax(final_prediction_layer).argmax().unsqueeze(dim=0)
print(predicted_token_id)

torch.Size([1, 10])
torch.Size([1, 10, 32000])
torch.Size([1, 32000])
tensor([29915], device='cuda:0')


  predicted_token_id = F.softmax(final_prediction_layer).argmax().unsqueeze(dim=0)


In [94]:
predicted_token = F.softmax(final_prediction_layer).argmax().unsqueeze(dim=0)
predicted_token

  predicted_token = F.softmax(final_prediction_layer).argmax().unsqueeze(dim=0)


tensor([29915], device='cuda:0')

In [95]:
tokenizer.decode(predicted_token)

"'"

In [88]:
input_ids = torch.cat((iputs, predicted_token.unsqueeze(1)), dim=1)

In [90]:
tokenizer.batch_decode(input_ids)

['<s> Hi, how are you? \n I']

In [91]:
iputs

tensor([[    1,  6324, 29892,   920,   526,   366, 29973, 29871,    13]],
       device='cuda:0')

In [92]:
input_ids

tensor([[    1,  6324, 29892,   920,   526,   366, 29973, 29871,    13,   306]],
       device='cuda:0')

In [102]:
predicted_token.item()

29915

In [111]:
base_prompt = "Hi, how are you? \n"
tokens = tokenizer(base_prompt, return_tensors='pt').to(device)
print(tokenizer.tokenize(base_prompt))
input_ids = tokens.input_ids

tokenizer.batch_decode(tokens.input_ids, skip_special_tokens=False)

eos_token_id = 2
predicted_token_id = torch.tensor([[0]])

# While we haven't predicted stop token
while predicted_token_id.item() != eos_token_id:

	output = llava_model.forward(input_ids)
	layers = output.logits
	final_prediction_layer = layers[:, -1, :]
	predicted_token_id = F.softmax(final_prediction_layer).argmax().unsqueeze(dim=0)
	
	# Concate predicted_token_id to existing sequence of token_ids
	input_ids = torch.cat((input_ids, predicted_token_id.unsqueeze(1)), dim=1)
	

# Response
tokenizer.batch_decode(input_ids)

['▁Hi', ',', '▁how', '▁are', '▁you', '?', '▁', '<0x0A>']


  predicted_token_id = F.softmax(final_prediction_layer).argmax().unsqueeze(dim=0)


["<s> Hi, how are you? \n I'm good, thanks for asking.  How about you?</s>"]