In [1]:
import evaluate
import json
import torch
import pandas as pd
import os
from threading import Thread

from io import BytesIO
from PIL import Image
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

from llava.conversation import conv_templates, SeparatorStyle
from transformers.generation.streamers import TextIteratorStreamer
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from llava.serve.baristia_utils import  load_image_processor
from llava.model import *
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from llava.conversation import conv_templates, SeparatorStyle
from llava.eval.run_llava import load_image
from llava.model.builder import load_pretrained_model

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

  from .autonotebook import tqdm as notebook_tqdm
2024-03-08 22:39:02.063329: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-08 22:39:02.116215: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 22:39:02.116255: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 22:39:02.117917: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-08 22:39:02.1

[2024-03-08 22:39:07,074] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# Evaluation

1. For each LoRA
    1. For each image
        2. Compute predition
        4. Record list of captions + prediction for the Image
        5. Compute evaluation metric(s) (BLUE, ROGUE, METEOR, cider?)
        6. Record evaluation metric (for each caption?)
    7. Record aggregate evaluation metric for lora?

In [None]:
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
meteor = evaluate.load('meteor')

In [None]:
model_path = "liuhaotian/llava-v1.5-7b"
lora_path = "../checkpoints/llava-v1.5-7b-augmented-roastme-lora-full-1"
model_path = model_path
attn_implementation="flash_attention_2"
load_8bit=False
load_4bit=False
device='cuda'
device_map="auto"
use_flash_attn=False
torch_dtype=torch.float16
kwargs = {
    'torch_dtype': torch_dtype,
    'device_map': device_map,
    'attn_implementation': attn_implementation
}

# TBD: Update this
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives funny, mean, clever, racist, and sexist insults whenever possible."

In [None]:
model_name = get_model_name_from_path(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
# model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
# Instantiate model
model = LlavaLlamaForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    **kwargs
)

In [None]:
lora_service = LoraInferenceService(model_path, False, False)
image_processor, context_len = load_image_processor(lora_service.model, tokenizer, model_name)

In [None]:
def generate_caption(model, image_path):
    pass

def predict(model, image_path, prompt, top_p=.5, temperature=.2, max_new_tokens=512, loss=None):
    prompt = f'{system_prompt} USER: {prompt} ASSISTANT:'
    
    # Load Image
    image_data = load_image(str(image_path))
    processed_image_input = image_processor.preprocess(image_data, return_tensors='pt')['pixel_values'].half().cuda()

    images = [image_data]
    image_sizes = [x.size for x in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    # Process prompt
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

    if loss:
        output = model.forward(input_ids=input_ids)
        return output
    else:
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=processed_image_input,
                image_sizes=image_sizes,
                do_sample=True,
                temperature=temperature,
                num_beams=1,
                top_p=top_p,
                max_new_tokens=max_new_tokens,
                use_cache=True
            )

        return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()


def evaluate(predictions, references):
    rouge_results = rouge.compute(predictions=predictions, references=references)
    bleu_results = bleu.compute(predictions=predictions, references=references)
    meteor_results = meteor.compute(predictions=predictions, references=references)

    return rouge_results, bleu_results, meteor_results

def record_scores(json_list, directory_path, file_name):
    # Ensure folder exists
    os.makedirs(f'../evaluations/{directory_path}', exist_ok=True)
    
    output_score_path = f'../evaluations/{directory_path}/{file_name}.json'
    with open(output_score_path, 'w') as json_file:
        json.dump(json_list, json_file, indent=2)    

In [None]:
loras = [
    'llava-v1.5-7b-augmented-roastme-lora-full-1',
    'llava-v1.5-7b-augmented-roastme-lora-13000-1-epoch',
    # '/home/devonperoutky/DevLLaVA/notebooks/llava-v1.5-7b-augmented-roastme-lora-13000-4-epochs',
    # '/home/devonperoutky/DevLLaVA/notebooks/llava-v1.5-7b-augmented-roastme-lora-train-4-epochs-pt2'
    None,
]

In [None]:
# Read the JSON file into a pandas DataFrame
json_file_path = '../dataset/augmented/validation_dataset.json'  # Replace with the actual path to your JSON file
df = pd.read_json(json_file_path)


for lora_path in loras:
    print(f'Evaluating {lora_path} lora')
    curr_image = None
    blue_scores = []
    meteor_scores = []
    rogue_scores = []
    references = []

    # Load lora
    if lora_path:
        print("Loading " + "/home/devonperoutky/DevLLaVA/checkpoints/" + lora_path)
        lora_service.load_lora_weights("/home/devonperoutky/DevLLaVA/checkpoints/" + lora_path)

    # Iterate through the DataFrame and perform forward pass and evaluation
    for index, row in df.iterrows():
        image_path = row['image']
        prompt = row['conversations'][0]['value']
        prompt = prompt.replace("<image>", "", 1)

        reference_caption = row['conversations'][-1]['value']

        if curr_image != image_path:
            # caption = predict(model, image_path, prompt, top_p=.1, temperature=.1, max_new_tokens=512, loss=None)

            # print(prompt)
            pil_image = Image.open(image_path)
            # TBD: DON"T UNLOAD AND RELOAD
            _, caption = lora_service.predict(
                pil_image,
                prompt,
                system_prompt,
                top_p=.8,
                temperature=.2,
                max_new_tokens=512,
                lora_model_path=None
            )
            curr_image = image_path
    
        # Evaluate the generated caption using BLUE, ROGUE, and METEOR metrics
        # blue_score, rogue_score, meteor_score = evaluate([reference_caption], [caption])
    
        # Print or store the evaluation scores
        # print(f"BLUE Score: {blue_score}\nROGUE Score: {rogue_score}\nMETEOR Score: {meteor_score}")
        # print("=" * 30)

        blue_scores.append(blue_score)
        meteor_scores.append(meteor_score)
        rogue_scores.append(rogue_score)
        break

    # Write scores to file
    record_scores(blue_scores, f'validation/{lora_path or "base-model"}', 'blue-scores')
    record_scores(meteor_scores, f'validation/{lora_path or "base-model"}', 'meteor-scores')
    record_scores(rogue_scores, f'validation/{lora_path or "base-model"}', 'rogue-scores')

    print(f'Recorded {len(blue_scores)} total scores for {lora_path or "base-model"}')

    if lora_path:
        lora_service.model = lora_service.model.unload()

In [None]:
# meteor = evaluate.load('meteor')
predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
references = [['It is a guide to action which ensures that the military always obeys the commands of the party''It is a guide to action that ensures that the military will forever heed Party commands', 'It is the guiding principle which guarantees the military forces always being under the command of the Party', 'It is the practical guide for the army always to heed the directions of the party']]
results = meteor.compute(predictions=predictions, references=references)
print(results)

In [None]:
for r in references[0]:
    results = meteor.compute(predictions=predictions, references=[r])
    print(results)

In [None]:
entry = df.loc[921]

prompt = entry['conversations'][0]['value']
prompt = prompt.replace("<image>", "", 1)
reference_caption = entry['conversations'][-1]['value']
pil_image = Image.open(entry['image'])

for lora in loras:
    _, caption = lora_service.predict(
                    pil_image,
                    prompt,
                    system_prompt,
                    top_p=.8,
                    temperature=.2,
                    max_new_tokens=512,
                    lora_model_path="/home/devonperoutky/DevLLaVA/checkpoints/" + lora if lora else None
    )
    print(caption)

In [None]:
prompt = "Hello"
stop_str = '</s>'
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=20.0)
prompt = f'{system_prompt} USER: {prompt} ASSISTANT:'
    
# Load Image
# image_data = load_image(str(image_path))
# processed_image_input = image_processor.preprocess(image_data, return_tensors='pt')['pixel_values'].half().cuda()

# images = [image_data]
# image_sizes = [x.size for x in images]
# images_tensor = process_images(
#     images,
#     image_processor,
#     model.config
# ).to(model.device, dtype=torch.float16)

# Process prompt
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

In [None]:
prompt = "I am struggling with depression"
prompt = f'{system_prompt} USER: {prompt} ASSISTANT:'

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=None,
        # image_sizes=image_sizes,
        do_sample=True,
        temperature=.2,
        num_beams=1,
        top_p=.8,
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=20.0),
        max_new_tokens=512,
        use_cache=True
    )

tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

In [None]:
def thread_predict():
    with torch.inference_mode():
        thread = Thread(target=model.generate, kwargs=dict(
            inputs=input_ids,
            # images=image_tensor,
            do_sample=True,
            temperature=.8,
            top_p=.1,
            max_new_tokens=512,
            streamer=streamer,
            use_cache=True))
        thread.start()
        # workaround: second-to-last token is always " "
        # but we want to keep it if it's not the second-to-last token
        prepend_space = False
        for new_text in streamer:
            if new_text == " ":
                prepend_space = True
                continue
            if new_text.endswith(stop_str):
                new_text = new_text[:-len(stop_str)].strip()
                prepend_space = False
            elif prepend_space:
                new_text = " " + new_text
                prepend_space = False
            if len(new_text):
                yield new_text
        if prepend_space:
            yield " "
        thread.join()

def predict():
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            # images=processed_image_input,
            # image_sizes=image_sizes,
            do_sample=True,
            temperature=.2,
            num_beams=1,
            top_p=.8,
            streamer=streamer,
            max_new_tokens=512,
            use_cache=True
        )
        
        # workaround: second-to-last token is always " "
        # but we want to keep it if it's not the second-to-last token
        prepend_space = False
        for new_text in streamer:
            if new_text == " ":
                prepend_space = True
                continue
            if new_text.endswith(stop_str):
                new_text = new_text[:-len(stop_str)].strip()
                prepend_space = False
            elif prepend_space:
                new_text = " " + new_text
                prepend_space = False
            if len(new_text):
                yield new_text
        if prepend_space:
            yield " "


for x in predict():
    print(x)

In [None]:
image_id = os.listdir('../dataset/images')[2]
image_data = Image.open(f'../dataset/images/{image_id}')

In [None]:
processed_image_input = image_processor.preprocess(image_data, return_tensors='pt')[
    'pixel_values'].half().cuda()

image_data = ""
images = [image_data]
image_sizes = [x.size for x in images]
images_tensor = process_images(
    images,
    image_processor,
    model.config
).to(model.device, dtype=torch.float16)

# Process prompt
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

# Conversational

In [None]:
tokenizer, model, image_processor, context_len = load_pretrained_model("liuhaotian/llava-v1.5-7b", model_name="llava-v1.5-7b", model_base=None, load_8bit=False, load_4bit=False)

In [None]:
image_id = os.listdir('../dataset/images')[2]
image_path = f'../dataset/images/{image_id}'
image_data = Image.open(image_path)

In [None]:
prompt = "Hello"
conv_mode = "llava_v1"
conv = conv_templates[conv_mode].copy()
print(conv)

image_data = load_image(str(image_path))
images = [image_data]
image_sizes = [x.size for x in images]
images_tensor = process_images(
    images,
    image_processor,
    model.config
).to(model.device, dtype=torch.float16)

# loop start

# just one turn, always prepend image token
inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv.append_message(conv.roles[0], inp)

conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

print("PROMPT: \n" + "-"*30)
print(prompt)
print("-"*30)

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]

In [None]:
# conv.append_message(conv.roles[0], "Fuck you. You don't know me")
# conv.append_message(conv.roles[1], None)
conv.messages[-1]

In [None]:
!nvidia-smi

In [None]:
daconv = conv_templates["v1"].copy()

In [None]:
daconv

In [None]:
daconv.system = "A chat between a curious user and an witty, clever, mean artificial intelligence assistant. The assistant gives funny, sarcastic, clever, sexist, and racist answers to the user's questions."

# LoraInferenceService

In [3]:
import os
import torch

from peft import PeftModel

from llava.conversation import conv_templates, SeparatorStyle
from llava.model import *
from transformers.generation.streamers import TextIteratorStreamer

from llava.serve.baristia_utils import load_image_processor
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from typing import Optional
from PIL import Image

system_prompt = "A chat between a curious user and an witty, clever, mean artificial intelligence assistant. The assistant gives funny, sarcastic, clever, sexist, and racist answers to the user's questions."


class LoraInferenceService:
    tokenizer = None
    model = None
    image_processor = None
    context_len = None
    streamer = None

    conversations = {}
    curr_active_images = {}
    conv = None
    conv_img = None
    conv_mode = "v1"

    # stop_str = '</s>'

    def __init__(self, model_path: str, load_8bit: bool, load_4bit: bool, device_map="auto", device="cuda",
                 use_flash_attn=False, conv_mode: str = "v1", **kwargs):
        kwargs = {"device_map": device_map, **kwargs}

        conversations = {}
        curr_active_images = {}

        if device != "cuda":
            kwargs['device_map'] = {"": device}

        if load_8bit:
            kwargs['load_in_8bit'] = True
        elif load_4bit:
            kwargs['load_in_4bit'] = True
            kwargs['quantization_config'] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            )
        else:
            kwargs['torch_dtype'] = torch.float16

        if use_flash_attn:
            kwargs['attn_implementation'] = 'flash_attention_2'

        self.conv_mode = conv_mode
        self.roles = conv_templates[self.conv_mode].roles
        self.model_name = get_model_name_from_path(model_path)

        self.lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)

        # Load the base model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
        self.model = LlavaLlamaForCausalLM.from_pretrained(model_path,
                                                           low_cpu_mem_usage=True,
                                                           config=self.lora_cfg_pretrained,
                                                           **kwargs)
        self.image_processor, self.context_len = load_image_processor(self.model, self.tokenizer, self.model_name)
        self.streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, timeout=20.0)

    def unload_lora(self, lora_path):
        print("Removing lora: ", lora_path)
        self.model = self.model.unload()

    def load_lora_weights(self, lora_path):

        token_num, token_dim = self.model.lm_head.out_features, self.model.lm_head.in_features
        if self.model.lm_head.weight.shape[0] != token_num:
            self.model.lm_head.weight = torch.nn.Parameter(
                torch.empty(token_num, token_dim, device=self.model.device, dtype=self.model.dtype))
            self.model.model.embed_tokens.weight = torch.nn.Parameter(
                torch.empty(token_num, token_dim, device=self.model.device, dtype=self.model.dtype))

        print('Loading additional LLaVA weights...')
        if os.path.exists(os.path.join(lora_path, 'non_lora_trainables.bin')):
            print("Non-trainable")
            non_lora_trainables = torch.load(os.path.join(lora_path, 'non_lora_trainables.bin'), map_location='cpu')
        else:
            raise NotImplementedError("Not supporting loading from HuggingFace currently")

        # Converts keys from base_model.model.model.mm_projector.... --> model.mm_projector
        non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in
                               non_lora_trainables.items()}
        if any(k.startswith('model.model.') for k in non_lora_trainables):
            non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}

        # Load the lora? What is the difference between this and instantiating the PEFT model??
        self.model.load_state_dict(non_lora_trainables, strict=False)

        print('Loading LoRA weights...')
        self.model = PeftModel.from_pretrained(self.model, lora_path)

    def stream_predict(self, prompt: str, system_prompt: str, top_p: float, temperature: float,
                       max_new_tokens: int, image_data: Optional[Image.Image] = None):

        try:
            augmented_prompt = f'{system_prompt} USER: <image> {prompt} ASSISTANT:' if image_data else f'{system_prompt} USER: {prompt} ASSISTANT:'
            print(f'Full Prompt: {augmented_prompt}')

            # Load Image
            processed_image_input, image_sizes = self._prepare_image_inputs(image_data=image_data)

            # Process prompt
            input_ids = tokenizer_image_token(augmented_prompt, self.tokenizer, IMAGE_TOKEN_INDEX,
                                              return_tensors='pt').unsqueeze(0).cuda()

            print("-" * 30)
            print(augmented_prompt)
            print(input_ids.shape)
            if processed_image_input is not None:
                print(processed_image_input.shape)

            with torch.inference_mode():
                output_ids = self.model.generate(
                    input_ids,
                    images=processed_image_input,
                    do_sample=True,
                    temperature=temperature,
                    num_beams=1,
                    top_p=top_p,
                    streamer=self.streamer,
                    max_new_tokens=max_new_tokens,
                    use_cache=True
                )

                # workaround: second-to-last token is always " "
                # but we want to keep it if it's not the second-to-last token
                prepend_space = False
                for new_text in self.streamer:
                    print(new_text)
                    if new_text == " ":
                        prepend_space = True
                        continue
                    if new_text.endswith(self.stop_str):
                        new_text = new_text[:-len(self.stop_str)].strip()
                        prepend_space = False
                    elif prepend_space:
                        new_text = " " + new_text
                        prepend_space = False
                    if len(new_text):
                        yield new_text
                if prepend_space:
                    yield " "
        except Exception as e:
            raise e

    def generate_response(self, user_id: str, new_prompt: str, top_p: float, temperature: float, max_new_tokens: int,
                          image: Optional[Image.Image] = None):

        # Get or existing conversation for user.
        conversation = self.conversations.get(user_id, None)
        print(f"Existing conversation:\n{conversation.get_prompt() if conversation else None}")

        # Update current active image
        if image:
            self.load_image(user_id, image)

        # update or create new conversation
        self._continue_conversation(user_id, new_prompt) if conversation else self._start_new_conversation(user_id,
                                                                                                           new_prompt)
        print(f"Conversation is now:\n{self.conversations[user_id].get_prompt()}")
        print(self.conversations)

        # Generate response
        full_prompt, response = self._generate_response(user_id, top_p, temperature, max_new_tokens)
        # full_prompt, response = self.conversations[user_id].get_prompt(), "TBD"
        print(response)

        return full_prompt, response

    '''
    Given state of current conversation and image, generate the response to the user's prompt.
    '''

    def _generate_response(self, user_id: str, top_p: float, temperature: float, max_new_tokens: int) -> (str, str):
        conversation = self.conversations.get(user_id, None)
        print(self.conversations.get(user_id, None))
        assert conversation is not None
        full_prompt = conversation.get_prompt()
        print(full_prompt)

        # Preprocess Image
        # processed_image_input, image_sizes = self._prepare_image_inputs(image_data=image)
        processed_image_input = self.curr_active_images.get(user_id)

        # Process prompt
        input_ids = tokenizer_image_token(full_prompt, self.tokenizer, IMAGE_TOKEN_INDEX,
                                          return_tensors='pt').unsqueeze(0).cuda()

        print(input_ids.shape)
        print(input_ids.device)
        if processed_image_input is not None:
            print(processed_image_input.shape)
            print(processed_image_input.device)
            processed_image_input = processed_image_input.to(self.model.device, dtype=torch.float16)
        else:
            print("No image tensor")
            print(processed_image_input)
            print(temperature)
            print(top_p)
            print(max_new_tokens)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=processed_image_input,
                do_sample=True,
                temperature=temperature,
                num_beams=1,
                top_p=top_p,
                max_new_tokens=max_new_tokens,
                use_cache=True
            )

        return full_prompt, self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    def load_image(self, user_id, image):
        # TODO: Handle existing image --> reset whole conversation

        print(f"Loading image for user_id {user_id}")
        processed_image_input, image_sizes = self._prepare_image_inputs(image_data=image)
        self.curr_active_images[user_id] = processed_image_input

    def _start_new_conversation(self, user_id, prompt):
        base_conv = conv_templates[self.conv_mode].copy()
        base_conv.system_prompt = system_prompt
        self.conversations[user_id] = base_conv
        self.roles = self.conversations[user_id].roles

        first_input = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN +
                       DEFAULT_IM_END_TOKEN + '\n' + prompt)
        self.conversations[user_id].append_message(self.roles[0], first_input)
        self.conversations[user_id].append_message(self.roles[1], None)
        if self.conversations[user_id].sep_style == SeparatorStyle.TWO:
            self.stop_key = self.conversations[user_id].sep2
        else:
            self.stop_key = self.conversations[user_id].sep

    def _continue_conversation(self, user_id, new_prompt):
        conversation = self.conversations[user_id]
        assert conversation is not None, f"No conversation found for user {user_id}"

        conversation.append_message(self.roles[0], new_prompt)
        conversation.append_message(self.roles[1], None)

        self.conversations[user_id] = conversation

    def append_agent_response(self, user_id, response):
        if self.conversations[user_id] is None:
            raise RuntimeError("No existing conversation found. Start a new"
                               "conversation using the `start_new_chat` method.")

        # Append agent response to conversation.
        self.conversations[user_id].messages[-1][-1] = response
        print(f"Conversation is now:\n {self.conversations[user_id].get_prompt()}")

    def _prepare_image_inputs(self, image_data: Optional[Image.Image] = None):
        if not image_data:
            return None, None

        images = [image_data]
        image_sizes = [x.size for x in images]
        images_tensor = process_images(
            images,
            self.image_processor,
            self.model.config
        )

        return images_tensor, image_sizes


In [4]:
image_id = os.listdir('../dataset/images')[2]
image_path = f'../dataset/images/{image_id}'
image_data = Image.open(image_path)

In [5]:
reference_model_path = "liuhaotian/llava-v1.5-7b"
lora_service = LoraInferenceService(reference_model_path, False, False)

You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.17s/it]


In [6]:
lora_service.generate_response("kevin", "Hello", .5, .5, 512)

Existing conversation:
None
Conversation is now:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <im_start><image><im_end>
Hello ASSISTANT:
{'kevin': Conversation(system="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", roles=('USER', 'ASSISTANT'), messages=[['USER', '<im_start><image><im_end>\nHello'], ['ASSISTANT', None]], offset=0, sep_style=<SeparatorStyle.TWO: 2>, sep=' ', sep2='</s>', version='v1', skip_next=False)}
Conversation(system="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", roles=('USER', 'ASSISTANT'), messages=[['USER', '<im_start><image><im_end>\nHello'], ['ASSISTANT', None]], offset=0, sep_style=<SeparatorStyle.TWO: 2>, sep=' ', sep2='</s>', v

../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [412,

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
lora_service.append_agent_response("devon", 'The woman in the image is holding a piece of paper with the word "roast" written on it. She is posing for a picture and appears to be in a playful mood. It seems like she is either trying to roast someone or is being roasted herself. The presence of a bed in the background suggests that this might be a casual or relaxed setting.')

In [None]:
lora_cfg_pretrained = LlavaConfig.from_pretrained(reference_model_path)

# Load the base model
tokenizer = AutoTokenizer.from_pretrained(reference_model_path, use_fast=False)
model2 = LlavaLlamaForCausalLM.from_pretrained(reference_model_path,
                                                   low_cpu_mem_usage=True,
                                                   config=lora_cfg_pretrained,
                                                   **kwargs)

In [23]:
input_ids = tokenizer_image_token("My dick don't work no more", tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
output_ids = model.generate(
    inputs=input_ids,
    images=None,
    do_sample=True,
    temperature=.2,
    top_p=.5,
    max_new_tokens=512,
    use_cache=True
)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())
print("-" * 30)
output_ids = model2.generate(
    inputs=input_ids,
    images=None,
    do_sample=True,
    temperature=.2,
    num_beams=1,
    top_p=.7,
    max_new_tokens=512,
    use_cache=True
)

print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())

I'm sorry to hear that. Is there anything I can help you with?
------------------------------
I'm sorry to hear that. Is there anything I can do to help?
