In [1]:
import os
import torch

import llava
from llava.model import *
from torch.nn import functional as F
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.eval.run_llava import load_image
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from transformers.generation.streamers import TextIteratorStreamer
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from llava.serve.baristia_utils import  load_image_processor
from peft import PeftModel, get_peft_model, PeftConfig

from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import LlamaConfig
import importlib
from llava.serve.barista import LoraInferenceService


from PIL import Image

import requests
from io import BytesIO

import time
import subprocess
from threading import Thread

# Name of the target accelerator.
# NOTE(review): nothing visible in this notebook reads `device` (it is only re-assigned in a
# later cell) — confirm it is still needed.
device = "cuda"

  from .autonotebook import tqdm as notebook_tqdm


[2024-02-23 22:07:45,727] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.10s/it]


In [None]:
# CUDA debugging switches, set through the process environment.
# NOTE(review): this cell sits after the import cell (torch is already imported there) and has
# no execution count. Presumably these variables must be set before CUDA is initialised to
# have any effect — confirm, and move them above the imports if so.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [2]:
# --- Prompt, reference label and test image used throughout the notebook ---
# Alternatives tried earlier (kept for reference):
# prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
# test_image = "https://preview.redd.it/yrdedweuk3ic1.jpeg?width=960&crop=smart&auto=webp&s=0ade9b61358296bfd98c43801cfe4b6dc8d2e243"
# model_path = "liuhaotian/llava-v1.5-7b"
test_image = "https://i.redd.it/8dnekc5w4nfa1.jpg"
model_path = "../merged_checkpoints/llava-augmented-roastme-v1-MERGE"

system_prompt = (
    "A chat between a curious human and mean-spirited, clever therapist. "
    "The therapist tries to insult the user in the most relevant, hurtful and clever way possible."
)
base_prompt = "How would you insult this person?"

# Fully assembled prompt (image tag placed before the USER turn).
prompt = f'{system_prompt} <image>\nUSER: {base_prompt} ASSISTANT:'

# Reference answer for the test image.
label = (
    "I spent a couple minutes thinking of a roast, but like your baby’s dad, "
    "I lost interest and don’t plan to think about you anymore."
)

# Fetch the test image via llava's `load_image` helper.
image_data = load_image(test_image)

In [3]:
# Directory of the fine-tuned LoRA checkpoint (relative to this notebook).
lora_model_path = "../checkpoints/llava-v1.5-7b-augmented-roastme-lora-13000-4-epochs"
# Identifier of the reference (base) LLaVA checkpoint.
reference_model_path = 'liuhaotian/llava-v1.5-7b'

In [4]:
# Loader settings; the model path now points at the reference checkpoint.
model_path = reference_model_path

# Quantisation and flash-attention switches, all disabled.
load_8bit = load_4bit = use_flash_attn = False

device = 'cuda'
device_map = "auto"
torch_dtype = torch.bfloat16

# Keyword arguments bundle built from the dtype and device map above.
kwargs = dict(torch_dtype=torch_dtype, device_map=device_map)

In [12]:
def predict(inference_model, image_data: Image.Image, prompt: str, system_prompt: str, top_p: float, temperature: float, max_new_tokens: int):
    """Sample one reply from `inference_model` for a single image and user prompt.

    Relies on the notebook-level globals `image_processor` and `tokenizer`, which must
    have been created by an earlier cell.

    Args:
        inference_model: Model exposing `.device`, `.dtype`, `.config` and `.generate`.
        image_data: The image to condition on.
        prompt: The user's question; it is placed after the `<image>` tag.
        system_prompt: Text placed before the `USER:` turn.
        top_p: Nucleus-sampling threshold forwarded to `generate`.
        temperature: Sampling temperature forwarded to `generate`.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple `(augmented_prompt, decoded_text)`: the full prompt string that was
        tokenised, and the stripped decoded output of the first returned sequence.
    """
    augmented_prompt = f'{system_prompt} USER: <image> {prompt} ASSISTANT:'

    print(f'Full Prompt: {augmented_prompt}')

    # Every tensor goes to the model's own device/dtype instead of a hard-coded `.cuda()`.
    target_device = inference_model.device
    target_dtype = inference_model.dtype

    # Pixel values actually passed to `generate`.
    processed_image_input = image_processor.preprocess(image_data, return_tensors='pt')['pixel_values'].to(target_device, dtype=target_dtype)

    images = [image_data]
    image_sizes = [x.size for x in images]
    # NOTE: `images_tensor` is only used for the shape printout below; `generate`
    # receives `processed_image_input`.
    images_tensor = process_images(
        images,
        image_processor,
        inference_model.config
    ).to(target_device, dtype=target_dtype)

    print(augmented_prompt)
    print(images_tensor.shape)

    # Tokenise the prompt (the `<image>` tag becomes IMAGE_TOKEN_INDEX) and add a batch dimension.
    input_ids = tokenizer_image_token(augmented_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(target_device)

    print(input_ids.shape)
    print(processed_image_input.shape)

    with torch.inference_mode():
        output_ids = inference_model.generate(
            input_ids,
            images=processed_image_input,
            image_sizes=image_sizes,
            do_sample=True,
            temperature=temperature,
            num_beams=1,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            use_cache=True
        )

    return (augmented_prompt, tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())

def load_lora_weights(base_model, lora_path):
    """Apply a LLaVA LoRA checkpoint to `base_model` and return the merged model.

    Steps:
      1. Inspect the `lm_head` dimensions and re-allocate `lm_head` / `embed_tokens`
         if the row count disagrees.
      2. Load `non_lora_trainables.bin` from `lora_path` (a local directory, or a Hub
         repo id as fallback) and normalise its key prefixes.
      3. Load those weights into `base_model` with `strict=False`.
      4. Attach the LoRA adapter with PEFT and merge it into the base weights.

    Args:
        base_model: Model exposing `lm_head`, `model.embed_tokens`, `device`, `dtype`
            and `load_state_dict`.
        lora_path: Directory (or Hub repo id) that holds the adapter files and
            `non_lora_trainables.bin`.

    Returns:
        The model returned by `merge_and_unload()` after the adapter has been merged.
    """
    token_num, token_dim = base_model.lm_head.out_features, base_model.lm_head.in_features
    print(f'Token num: {token_num} (Vocab Size?)')
    print(f'Token dim: {token_dim} (Hidden dimension size?)')
    print(base_model.lm_head.weight.shape)
    # NOTE(review): token_num is read from lm_head itself, so this branch is presumably
    # not expected to trigger for a plain Linear head — confirm when it is needed.
    if base_model.lm_head.weight.shape[0] != token_num:
        base_model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=base_model.device, dtype=base_model.dtype))
        base_model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=base_model.device, dtype=base_model.dtype))
    print(base_model.lm_head)

    print('Loading additional LLaVA weights...')
    weights_name = 'non_lora_trainables.bin'
    local_weights = os.path.join(lora_path, weights_name)
    # NOTE: torch.load unpickles the file — only load checkpoints from trusted sources.
    if os.path.exists(local_weights):
        print("Non-trainable")
        non_lora_trainables = torch.load(local_weights, map_location='cpu')
    else:
        print("Load from the Hub")
        # Local import: resolves (and downloads if needed) a file from a Hub repo.
        from transformers.utils import cached_file

        # The extra weights live next to the adapter, so resolve them from `lora_path`.
        cache_file = cached_file(lora_path, weights_name)
        non_lora_trainables = torch.load(cache_file, map_location='cpu')

    # Strip a leading 'base_model.' from every key that has it.
    peft_prefix = 'base_model.'
    non_lora_trainables = {(k[len(peft_prefix):] if k.startswith(peft_prefix) else k): v for k, v in non_lora_trainables.items()}
    # If any key is still nested as 'model.model.', drop one leading 'model.' level.
    if any(k.startswith('model.model.') for k in non_lora_trainables):
        wrapper_prefix = 'model.'
        non_lora_trainables = {(k[len(wrapper_prefix):] if k.startswith(wrapper_prefix) else k): v for k, v in non_lora_trainables.items()}

    # Show the key names only; printing the tensors themselves floods the cell output.
    print(f'Non Lora Trainables: {list(non_lora_trainables)}')

    # strict=False: the state dict is not required to cover every model parameter.
    load_result = base_model.load_state_dict(non_lora_trainables, strict=False)
    if load_result.unexpected_keys:
        # Keys that matched no parameter were silently skipped — surface them.
        print(f'Unexpected keys (not loaded): {load_result.unexpected_keys}')
    print(base_model)

    print('Loading LoRA weights...')
    base_model = PeftModel.from_pretrained(base_model, lora_path, torch_dtype=torch.bfloat16, device_map="auto")
    print(base_model)

    print('Merging LoRA weights...')
    base_model = base_model.merge_and_unload()
    print('Model is loaded...')

    return base_model

In [None]:
# Merge the LoRA checkpoint into `model` and display the resulting module tree.
# NOTE(review): no earlier cell visible in this notebook assigns `model`; this cell appears to
# rely on a model loaded by a later cell (hidden kernel state) — confirm the intended order.
model_with_lora = load_lora_weights(model, lora_model_path)
model_with_lora

In [None]:
# Adapter configuration, kept for inspection.
lora_config = PeftConfig.from_pretrained(lora_model_path)
# `from_pretrained` already registers the adapter under "test_lora" and loads its trained
# weights. Calling `add_adapter` again with the same name would add a new, untrained adapter
# under that name, so that call is dropped.
model = PeftModel.from_pretrained(model=model, model_id=lora_model_path, adapter_name="test_lora", torch_dtype=torch.bfloat16, device_map="auto")
# Make the loaded adapter the active one.
model.set_adapter(adapter_name="test_lora")

In [None]:
# Register an additional adapter named "test-lora" built from `peft_config`.
# NOTE(review): presumably this adapter starts untrained; also `peft_config` is only assigned
# in later cells of this notebook, so this cell depends on kernel state — confirm.
model.add_adapter(adapter_name="test-lora", peft_config=peft_config)

In [None]:
# Turn adapters on for `model`.
# NOTE(review): which class provides `enable_adapters` depends on what `model` is bound to
# when this cell runs — confirm it exists on that object.
model.enable_adapters()

# Simple (This works)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftConfig, PeftModel

# Folder that contains adapter_config.json and the associated .bin files for the PEFT model.
adapters_name = lora_model_path

# Model name derived from the reference checkpoint path.
model_name = get_model_name_from_path(reference_model_path)

# The adapter's PeftConfig carries the path of the base model it was trained on.
peft_config = PeftConfig.from_pretrained(adapters_name)

# LLaVA config of that base model ("no vision tower" experiment).
lora_cfg_pretrained = LlavaConfig.from_pretrained(peft_config.base_model_name_or_path)

# Load tokenizer, model, image processor and context length for the base checkpoint,
# without quantisation.
tokenizer, base_model, image_processor, context_len = load_pretrained_model(
    peft_config.base_model_name_or_path,
    model_name=model_name,
    model_base=None,
    load_8bit=False,
    load_4bit=False,
)

# image_processor, context_len = load_image_processor(base_model, tokenizer, model_name)

# Cast to bfloat16; as the last expression this also displays the model.
base_model.to(dtype=torch.bfloat16)

In [None]:
# Sample a reply from `base_model` for the test image.
predict(
    inference_model=base_model,
    image_data=image_data,
    prompt=base_prompt,
    system_prompt=system_prompt,
    top_p=0.1,
    temperature=0.8,
    max_new_tokens=512,
)

# Load LoRA (basic approach; doesn't work — returns an empty response)

In [None]:
from peft import AutoPeftModelForCausalLM

# The LoRA checkpoint folder (adapter_config.json plus the .bin files) serves both as the
# PEFT model id and as the adapter path.
peft_model_id = adapters_name = lora_model_path

# The adapter's PeftConfig carries the path of the base model.
peft_config = PeftConfig.from_pretrained(adapters_name)

In [None]:
# Wrap the base model with the LoRA adapter.
# NOTE(review): unlike the working path further down, `non_lora_trainables.bin` is not loaded
# here; that may be why this variant misbehaves — confirm.
model = PeftModel.from_pretrained(base_model, peft_model_id)
# use_fast=False keeps the tokenizer consistent with the one used in the working LoRA path.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, use_fast=False)

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

In [None]:
# Sample a reply from the PEFT-wrapped `model` for the test image.
predict(
    inference_model=model,
    image_data=image_data,
    prompt=base_prompt,
    system_prompt=system_prompt,
    top_p=0.1,
    temperature=0.8,
    max_new_tokens=512,
)

In [None]:
# Tokenise the pre-assembled `prompt`, then add a batch dimension and move it to the GPU.
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()

# Direct forward pass with token ids only (no image tensor is supplied).
model.forward(input_ids=input_ids)

# Load Lora (THIS WORKS)

In [5]:
# model_path = lora_model_path
# model_base = peft_config.base_model_name_or_path

# Keyword arguments bundle built from the dtype and device map chosen earlier.
kwargs = dict(torch_dtype=torch_dtype, device_map=device_map)

# Model name derived from the reference checkpoint path.
model_name = get_model_name_from_path(reference_model_path)

# The LoRA checkpoint folder (adapter_config.json plus the .bin files) serves both as the
# PEFT model id and as the adapter path.
peft_model_id = adapters_name = lora_model_path

# The adapter's PeftConfig carries the path of the base model.
peft_config = PeftConfig.from_pretrained(adapters_name)

# LLaVA config read from the LoRA checkpoint folder itself.
lora_cfg_pretrained = LlavaConfig.from_pretrained(lora_model_path)

In [6]:
# Slow (non-fast) tokenizer of the base model named in the adapter config.
tokenizer = AutoTokenizer.from_pretrained(
    peft_config.base_model_name_or_path,
    use_fast=False,
)

# Base-model weights instantiated with the config read from the LoRA checkpoint folder.
model = LlavaLlamaForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    low_cpu_mem_usage=True,
    config=lora_cfg_pretrained,
    **kwargs,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.67s/it]


In [7]:
# Obtain the image processor and context length for the freshly loaded model.
image_processor, context_len = load_image_processor(
    model,
    tokenizer,
    model_name,
)

In [8]:
# Report the output head's dimensions, then display the layer itself.
token_num = model.lm_head.out_features
token_dim = model.lm_head.in_features

print(f'Token num: {token_num} (Vocab Size?)')
print(f'Token dim: {token_dim} (Hidden dimension size?)')
print(model.lm_head.weight.shape)

model.lm_head

Token num: 32000 (Vocab Size?)
Token dim: 4096 (Hidden dimension size?)
torch.Size([32000, 4096])


Linear(in_features=4096, out_features=32000, bias=False)

In [9]:
# Read the extra (non-LoRA) weights stored next to the adapter, onto the CPU.
non_lora_trainables = torch.load(
    os.path.join(lora_model_path, 'non_lora_trainables.bin'),
    map_location='cpu',
)

# Strip a leading 'base_model.' from every key that has it.
non_lora_trainables = {
    (name[len('base_model.'):] if name.startswith('base_model.') else name): tensor
    for name, tensor in non_lora_trainables.items()
}

# If any key is still nested as 'model.model.', drop one leading 'model.' level.
if any(name.startswith('model.model.') for name in non_lora_trainables):
    non_lora_trainables = {
        (name[len('model.'):] if name.startswith('model.') else name): tensor
        for name, tensor in non_lora_trainables.items()
    }

In [10]:
# Open question from the author: how does this differ from merging the adapter?
# Step 1: copy the non-LoRA weights into the model (non-strict load).
model.load_state_dict(non_lora_trainables, strict=False)

# Step 2: wrap the model with the LoRA adapter and display the wrapped module tree.
print('Loading LoRA weights...')
model = PeftModel.from_pretrained(model, model_id=lora_model_path)
model

Loading LoRA weights...


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlavaLlamaForCausalLM(
      (model): LlavaLlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_

In [19]:
# Sample a reply from the LoRA-wrapped `model` for the test image.
predict(
    inference_model=model,
    image_data=image_data,
    prompt=base_prompt,
    system_prompt=system_prompt,
    top_p=0.5,
    temperature=0.8,
    max_new_tokens=512,
)

Full Prompt: A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
torch.Size([1, 3, 336, 336])
torch.Size([1, 60])
torch.Size([1, 3, 336, 336])


('A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:',
 'You have a very unappealing sense of style.')

# What about this?

In [17]:
# model.unload()
# Sample again from the current `model` with the same settings.
predict(
    inference_model=model,
    image_data=image_data,
    prompt=base_prompt,
    system_prompt=system_prompt,
    top_p=0.5,
    temperature=0.8,
    max_new_tokens=512,
)

Full Prompt: A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
torch.Size([1, 3, 336, 336])
torch.Size([1, 60])
torch.Size([1, 3, 336, 336])


('A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:',
 'You have a very unfortunate sense of fashion.')

In [22]:
# Rebind `model` to the result of `unload()`.
# NOTE(review): this cell is not idempotent — it only works while `model` is still the
# adapter-wrapped object; re-running it after the rebinding presumably fails.
model=model.unload()

In [24]:
# Sample from `model` after `unload()` was applied, using the same settings as before.
predict(
    inference_model=model,
    image_data=image_data,
    prompt=base_prompt,
    system_prompt=system_prompt,
    top_p=0.5,
    temperature=0.8,
    max_new_tokens=512,
)

Full Prompt: A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
torch.Size([1, 3, 336, 336])
torch.Size([1, 60])
torch.Size([1, 3, 336, 336])


('A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:',
 'You have a very unappealing sense of style.')

In [25]:
# Display the module tree of the current `model` object.
model

LlavaLlamaForCausalLM(
  (model): LlavaLlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaR

In [26]:
# Open question from the author: how does this differ from merging the adapter?
# Step 1: copy the non-LoRA weights into the model (non-strict load).
model.load_state_dict(non_lora_trainables, strict=False)

# Step 2: wrap the model with the LoRA adapter again.
print('Loading LoRA weights...')
model = PeftModel.from_pretrained(model, model_id=lora_model_path)

Loading LoRA weights...


In [27]:
# Sample from `model` after the adapter was attached again, using the same settings.
predict(
    inference_model=model,
    image_data=image_data,
    prompt=base_prompt,
    system_prompt=system_prompt,
    top_p=0.5,
    temperature=0.8,
    max_new_tokens=512,
)

Full Prompt: A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:
torch.Size([1, 3, 336, 336])
torch.Size([1, 60])
torch.Size([1, 3, 336, 336])


('A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> How would you insult this person? ASSISTANT:',
 "I can't tell if you're a man or a woman.")

In [34]:
# bm = model.unload()
# Snapshot of `bm`'s module names before the state dict is loaded.
hey = [module_name for module_name, _ in bm.named_modules()]

In [40]:
# Non-strict load of the extra weights into `bm`; `x` keeps the load report.
x = bm.load_state_dict(non_lora_trainables, strict=False)
# Snapshot of `bm`'s module names after the load.
yo = [module_name for module_name, _ in bm.named_modules()]

In [36]:
# Check that the number of named modules is the same in both snapshots.
# NOTE(review): this cell's execution count (36) is lower than that of the cell above that
# assigns `yo` (40), so the recorded output may come from an earlier `yo` — confirm.
len(hey) == len(yo)

True

In [41]:
# Display the report returned by the non-strict `load_state_dict` call (missing/unexpected keys).
x

_IncompatibleKeys(missing_keys=['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gat