In [None]:
import os
import torch

import llava
from torch.nn import functional as F
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.eval.run_llava import load_image
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from transformers.generation.streamers import TextIteratorStreamer
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import LlamaConfig
import importlib
from llava.serve.barista import LoraInferenceService


from PIL import Image

import requests
from io import BytesIO

import time
import subprocess
from threading import Thread

device = "cuda"

In [None]:
# CUDA debugging switches, exported through the process environment.
# NOTE(review): these are set after `import torch`; presumably they still take effect
# because no earlier cell in this notebook issues CUDA work directly — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# NOTE(review): TORCH_USE_CUDA_DSA is presumably a PyTorch build-time option; confirm
# that setting it as a runtime environment variable has any effect.
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [None]:
# Notebook-wide prompt, label and path configuration used by the cells below.
# prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
# System text prepended to the user request when building `prompt`.
system_prompt = "A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible."
# User request on its own (also passed separately to the LoRA service further down).
base_prompt = "How would you insult this person?"
# Full prompt; note that it already embeds the `<image>` placeholder.
prompt = f'{system_prompt} <image>\nUSER: {base_prompt} ASSISTANT:'
# Reference completion that is tokenized into label ids in the loss experiments.
label = "I spent a couple minutes thinking of a roast, but like your baby’s dad, I lost interest and don’t plan to think about you anymore."
# test_image = "https://preview.redd.it/yrdedweuk3ic1.jpeg?width=960&crop=smart&auto=webp&s=0ade9b61358296bfd98c43801cfe4b6dc8d2e243"
# Image URL handed to `load_image` by the helper functions.
test_image = "https://i.redd.it/8dnekc5w4nfa1.jpg"
# model_path = "liuhaotian/llava-v1.5-7b"
# NOTE(review): only used to derive `model_name` in the next cell; the loader call there
# is given the base checkpoint id literally, not this path.
model_path = "../merged_checkpoints/llava-augmented-roastme-v1-MERGE"

In [None]:
# Base
# `model_name` is derived from `model_path`, but the loader below is pointed at the
# stock LLaVA v1.5 7B checkpoint with an explicit name.
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
    "liuhaotian/llava-v1.5-7b",
    model_base=None,
    model_name="llava-v1.5-7b",
    load_8bit=False,
    load_4bit=False,
)
# Last expression of the cell: moves the model to the device it already reports
# and displays the returned module.
model.to(model.device)

In [None]:
def prepare_inputs(image_path, prompt):
    """Build the token ids and image tensor for a single image/prompt pair.

    The prompt is placed in the ``llava_v1`` conversation template as the
    first-role message, followed by an empty second-role turn so that the
    rendered text ends where the model is expected to continue.

    Args:
        image_path: Location passed (as ``str``) to ``load_image``; the
            notebook passes a URL.
        prompt: Text of the first-role message. ``DEFAULT_IMAGE_TOKEN`` is
            prepended on its own line only when the text does not already
            contain it.

    Returns:
        ``(input_ids, images_tensor, image_sizes)`` where ``input_ids`` has a
        leading batch dimension of 1 and lives on ``model.device``,
        ``images_tensor`` is the float16 output of ``process_images`` on
        ``model.device``, and ``image_sizes`` is the list of ``.size`` values
        of the loaded images.
    """
    # Configure conversational format to be Llava V1
    conv = conv_templates["llava_v1"].copy()

    # Exactly one image is supplied below, so keep exactly one placeholder.
    # Callers in this notebook sometimes pass prompts that already embed it;
    # a second placeholder presumably has no image features to pair with.
    if DEFAULT_IMAGE_TOKEN in prompt:
        user_message = prompt
    else:
        user_message = DEFAULT_IMAGE_TOKEN + '\n' + prompt
    conv.append_message(conv.roles[0], user_message)
    conv.append_message(conv.roles[1], None)
    full_prompt = conv.get_prompt()

    # Load and preprocess the image
    images = [load_image(str(image_path))]
    image_sizes = [img.size for img in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    # Put the token ids on the same device as the image tensor instead of
    # assuming the default CUDA device.
    input_ids = tokenizer_image_token(
        full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)
    print(full_prompt)
    return input_ids, images_tensor, image_sizes
    
def prepare_label_ids(input_ids, label):
    """Tokenize ``label`` padded to the prompt's token length and move it to CUDA.

    Args:
        input_ids: Tensor whose second dimension gives the padding length.
        label: Text to encode.

    Returns:
        The encoded label tensor on the CUDA device.
    """
    target_length = input_ids.shape[1]
    encoded_label = tokenizer.encode(
        label,
        return_tensors='pt',
        padding="max_length",
        max_length=target_length,
    )
    return encoded_label.cuda()

def generate(image_path, prompt, top_p, temperature, max_new_tokens):
    """Sample a completion for one image/prompt pair and return the decoded text.

    Args:
        image_path: Image location forwarded to ``prepare_inputs``.
        prompt: User text forwarded to ``prepare_inputs``.
        top_p: Nucleus sampling threshold passed to ``model.generate``.
        temperature: Sampling temperature passed to ``model.generate``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        List of decoded strings from ``tokenizer.batch_decode`` with special
        tokens skipped.
    """
    input_ids, images_tensor, image_sizes = prepare_inputs(image_path, prompt)

    # Only tensors built in this function are reported; no label tensor exists here.
    print(input_ids.shape)
    print(images_tensor.shape)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            image_sizes=image_sizes,
            do_sample=True,
            temperature=temperature,
            num_beams=1,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            use_cache=True
        )

    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)


def forward(image_path, prompt, top_p, temperature, max_new_tokens, label):
    """Run one forward pass for an image/prompt pair, optionally with labels.

    ``top_p``, ``temperature`` and ``max_new_tokens`` are accepted but not used
    by the body. When ``label`` is truthy it is tokenized, padded to the prompt
    length and passed as ``labels``; otherwise ``labels`` is ``None``.

    Returns:
        Whatever ``model.forward`` returns for these inputs.
    """
    input_ids, images_tensor, image_sizes = prepare_inputs(image_path, prompt)

    if label:
        label_input_ids = tokenizer.encode(
            label,
            return_tensors='pt',
            padding="max_length",
            max_length=input_ids.shape[1],
        ).cuda()
    else:
        label_input_ids = None

    print(input_ids.shape)
    print(images_tensor.shape)

    forward_kwargs = dict(
        images=images_tensor,
        use_cache=True,
        image_sizes=image_sizes,
        labels=label_input_ids,
    )
    return model.forward(input_ids, **forward_kwargs)

def the_call(image_path, prompt, top_p, temperature, max_new_tokens, label):
    """Invoke the model through ``model(...)`` with labels built from ``label``.

    ``top_p``, ``temperature`` and ``max_new_tokens`` are accepted but not used
    by the body. Unlike ``forward``, the label is always tokenized.

    Returns:
        Whatever calling ``model`` returns for these inputs.
    """
    input_ids, images_tensor, image_sizes = prepare_inputs(image_path, prompt)
    prompt_length = input_ids.shape[1]
    label_input_ids = tokenizer.encode(
        label,
        return_tensors='pt',
        padding="max_length",
        max_length=prompt_length,
    ).cuda()

    call_kwargs = dict(
        input_ids=input_ids,
        images=images_tensor,
        use_cache=True,
        image_sizes=image_sizes,
        labels=label_input_ids,
    )
    return model(**call_kwargs)

In [None]:
# generate(test_image, prompt, 1.0, .2, 512)
# Keep both results under the names the following cells read
# (`forward_outputs` and `call_outputs`); a bare call would discard them.
forward_outputs = forward(test_image, prompt, 1.0, .2, 512, None)
call_outputs = the_call(test_image, prompt, 1.0, .2, 512, label)

In [None]:
# Second dimension of the logits (evaluated, but only the last expression of a cell is displayed)
forward_outputs.logits.shape[1]
# Number of entries yielded by model.named_modules()
len(list(model.named_modules()))

In [None]:
# Logits at the last sequence position, indexed as (batch, vocab)
final_logit_layer = forward_outputs.logits[:, -1, :]
# Normalize over the vocabulary axis explicitly; relying on softmax's implicit
# dimension is deprecated. The argmax is the predicted token id.
max_token_prob = F.softmax(final_logit_layer, dim=-1).argmax()
print(max_token_prob.shape)
prediction = tokenizer.batch_decode(max_token_prob.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(f'Prediction {prediction} vs. {label} has loss: {forward_outputs.loss}')

In [None]:
# Manually recompute the shifted cross-entropy for each set of logits.
# The prompt and label tensors do not depend on the loop, so build them once:
# every prepare_inputs call reloads the image and re-tokenizes the prompt.
input_ids, images_tensor, image_sizes = prepare_inputs(test_image, prompt)
label_input_ids = tokenizer.encode(label, return_tensors='pt', padding="max_length", max_length=input_ids.shape[1]).cuda()
loss_fct = CrossEntropyLoss()

print(input_ids.shape)
print(label_input_ids.shape)
print("1---")

# NOTE(review): this loop used to zip the logits with ["I", "YOU", label], but the zipped
# value was never read and the third entry could never be reached; every pass scores
# against the global `label`. Confirm whether per-entry labels were intended.
for logits in [forward_outputs.logits, call_outputs.logits]:
    print(logits.shape)

    # Shift so that tokens < n predict n
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = label_input_ids[..., 1:].contiguous()
    print(shift_logits.shape)
    print(shift_labels.shape)
    print("2-----")

    # Is 625 the sequence length?????
    # NOTE(review): presumably the logits cover the image-expanded sequence while the
    # labels are padded only to the text prompt length — confirm; if the two lengths
    # differ the loss call below cannot succeed.

    # Flatten the tokens; take the vocabulary size from the logits themselves rather
    # than from an unrelated default LlamaConfig.
    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
    shift_labels = shift_labels.view(-1)

    # Enable model parallelism
    shift_labels = shift_labels.to(shift_logits.device)
    print(shift_logits.shape)
    print(shift_labels.shape)
    loss = loss_fct(shift_logits, shift_labels)

    print(f'Loss: {loss}')

In [None]:
# Stock LlamaConfig plus notebook-level stand-ins for the optional forward() arguments.
config = LlamaConfig()
output_attentions, output_hidden_states, return_dict = (
    config.output_attentions,
    config.output_hidden_states,
    config.use_return_dict,
)
# Everything optional starts out unset
attention_mask = position_ids = past_key_values = inputs_embeds = cache_position = None
use_cache = True
# Display the config as the cell output
config

In [None]:
# Tokenize the prompt with the image placeholder mapped to IMAGE_TOKEN_INDEX,
# then decode it back to inspect the result.
test = tokenizer_image_token(
    prompt,
    tokenizer,
    IMAGE_TOKEN_INDEX,
    return_tensors='pt',
)
tokenizer.batch_decode(test, skip_special_tokens=True)

In [None]:
# Build prompt ids / image tensor and label ids padded to the prompt length
input_ids, images_tensor, image_sizes = prepare_inputs(test_image, prompt)
label_input_ids = tokenizer.encode(
    label,
    return_tensors='pt',
    padding="max_length",
    max_length=input_ids.shape[1],
).cuda()

# Run the multimodal preparation step on its own to look at what it produces;
# `labels` from this unpacking is read again by later cells.
(
    _input_ids,
    position_ids,
    attention_mask,
    past_key_values,
    inputs_embeds,
    labels,
) = model.prepare_inputs_labels_for_multimodal(
    input_ids=input_ids,
    position_ids=None,
    attention_mask=None,
    past_key_values=None,
    labels=label_input_ids,
    images=images_tensor,
    image_sizes=image_sizes,
)

# Full forward pass from the original (unprepared) ids
outputs = model.forward(
    input_ids,
    position_ids=None,
    attention_mask=None,
    past_key_values=None,
    labels=label_input_ids,
    images=images_tensor,
    image_sizes=image_sizes,
)

In [None]:
logits = outputs.logits

# Greedy token id at every position of the first (only) batch item.
# Softmax is monotonic, so taking argmax of the raw logits selects the same token
# and avoids the deprecated implicit-dimension softmax call.
predicted_ids = logits[0].argmax(dim=-1)
for i, token_id in enumerate(predicted_ids):
    prediction = tokenizer.batch_decode(token_id.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(f'Index {i} -> {prediction[0]}')

# Shift so that tokens < n predict n.
# Done once after the loop: it does not depend on the position index, and repeating
# it per position only flooded the output with identical shape prints.
shift_logits = logits[..., :-1, :].contiguous()
# shift_logits = logits.contiguous()
shift_labels = label_input_ids[..., 1:].contiguous()
print(shift_logits.shape)
print(shift_labels.shape)
print("2-----")

# # Is 625 the sequence length?????

# # Flatten the tokens
# loss_fct = CrossEntropyLoss()
# shift_logits = shift_logits.view(-1, config.vocab_size)
# shift_labels = shift_labels.view(-1)

# # Enable model parallelism
# shift_labels = shift_labels.to(shift_logits.device)
# print(shift_logits.shape)
# print(shift_labels.shape)
# loss = loss_fct(shift_logits, shift_labels)

# Generate Loop

In [None]:
# Basic generate
def generate(image_path, prompt, top_p, temperature, max_new_tokens):
    """Sample a completion for one image/prompt pair and return the decoded text.

    This redefinition replaces the earlier ``generate`` in the notebook namespace.
    """
    input_ids, images_tensor, image_sizes = prepare_inputs(image_path, prompt)

    sampling_kwargs = dict(
        images=images_tensor,
        image_sizes=image_sizes,
        do_sample=True,
        temperature=temperature,
        num_beams=1,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )
    with torch.inference_mode():
        generated_ids = model.generate(input_ids, **sampling_kwargs)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Displayed as the cell output
generate(test_image, prompt, .21, .1, 512)

In [None]:
def forward(model, image_path, prompt, top_p, temperature, max_new_tokens, label):
    """Forward pass on an explicitly supplied ``model`` for one image/prompt pair.

    This redefinition replaces the earlier ``forward`` and takes the model as
    its first argument. ``top_p``, ``temperature`` and ``max_new_tokens`` are
    accepted but not used. Labels are built from ``label`` only when it is truthy.

    Returns:
        Whatever ``model.forward`` returns for these inputs.
    """
    input_ids, images_tensor, image_sizes = prepare_inputs(image_path, prompt)

    label_input_ids = None
    if label:
        label_input_ids = tokenizer.encode(
            label,
            return_tensors='pt',
            padding="max_length",
            max_length=input_ids.shape[1],
        ).cuda()

    print(input_ids.shape)
    print(images_tensor.shape)

    forward_kwargs = dict(
        images=images_tensor,
        use_cache=True,
        image_sizes=image_sizes,
        labels=label_input_ids,
    )
    return model.forward(input_ids, **forward_kwargs)


# Variables
prompt = "How would you insult this person?"
augmented_prompt = "A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. <image>\nUSER: How would you insult this person? ASSISTANT:"
label = "I spent a couple minutes thinking of a roast, but like your baby’s dad, I lost interest and don’t plan to think about you anymore."
predicted_token = None

# Process prompt, image, and labels
input_ids, images_tensor, image_sizes = prepare_inputs(test_image, prompt)
label_input_ids = tokenizer.encode(label, return_tensors='pt', padding="max_length", max_length=input_ids.shape[1]).cuda()
# Work on a copy: the running prompt grows during decoding and must not alter
# `input_ids`, which is fed to the model again after the loop.
curr_prompt_ids = input_ids.clone()

# One step per label token (row 0 is the only batch item); iterating the tensor
# itself would walk the batch dimension instead.
for token in label_input_ids[0]:
    print(f'Current Prompt: {curr_prompt_ids}')

    # Forward (no gradients needed for a decoding step)
    with torch.no_grad():
        outputs = model.forward(curr_prompt_ids, images=images_tensor, image_sizes=image_sizes)

    # Logits at the last position are the ones that predict the next token
    final_logits_layer = outputs.logits[:, -1, :]
    print(f'Final Logits Layer: {final_logits_layer.shape}')

    # Prediction: greedy choice over the vocabulary axis, one id per batch item
    predicted_token = final_logits_layer.argmax(dim=-1)
    print(f'Predict Token: {predicted_token}')
    print(f'Prediction: {tokenizer.batch_decode(predicted_token.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=False)}')

    # Iterate: append the new token id to the running prompt. Adding it arithmetically
    # would shift every existing id, including the image placeholder.
    curr_prompt_ids = torch.cat([curr_prompt_ids, predicted_token.unsqueeze(-1)], dim=-1)

    # Calculate loss?
    del outputs
    # Single step only while this loop is being debugged
    break

# Pass the labels so that `outputs.loss` is populated for the loss cells that follow
outputs = model.forward(input_ids, images=images_tensor, image_sizes=image_sizes, labels=label_input_ids)

# Loss

In [None]:
# Loss attached to the most recent forward pass.
# NOTE(review): presumably only populated when `labels` were passed to that call — confirm.
outputs.loss

# Matrix Loss

In [None]:
logits = outputs.logits

# Rebuild the image-expanded labels for the *current* input_ids. The `labels` left in
# the namespace by an earlier cell were derived from a different prompt, so their
# length would not match these logits.
(_, _, _, _, _, labels) = model.prepare_inputs_labels_for_multimodal(
    input_ids=input_ids,
    position_ids=None,
    attention_mask=None,
    past_key_values=None,
    labels=label_input_ids,
    images=images_tensor,
    image_sizes=image_sizes,
)

# Shift logits because we need logits to predict the next character
logits_for_predictions = logits[..., :-1, :]
yb = labels[..., 1:]
assert yb.shape == logits_for_predictions.shape[:2]

# Flatten to (positions, vocab) and (positions,). The vocabulary size is read from the
# logits instead of a default LlamaConfig; reshape also copes with non-contiguous slices.
shift_logits = logits_for_predictions.reshape(-1, logits.size(-1))
shift_labels = yb.reshape(-1)
print(shift_logits.shape)
print(shift_labels.shape)

# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
# Define the criterion here rather than depending on whichever cell last created it
loss_fct = CrossEntropyLoss()
loss_fct(shift_logits, shift_labels)

# Individually Calculated Loss

In [None]:
# Average the per-position cross-entropy so it can be compared with the batched loss.
loss_fct = CrossEntropyLoss()  # built once; it holds no per-step state
total_loss = 0
count = 0
# Logits at position i score the token at position i + 1, so pair them with the
# next label (the same shift the batched computation applies).
for i in range(logits.shape[1] - 1):
    logit_layer = logits[:, i, :].contiguous()

    # Label
    label_id = labels[0][i + 1].unsqueeze(0)
    if label_id.item() == -100:
        # -100 marks positions excluded from the loss; skip them rather than
        # stopping, since scored positions may follow.
        continue

    # Prediction (explicit vocabulary axis; the implicit softmax dim is deprecated)
    max_token_prob = F.softmax(logit_layer, dim=-1).argmax()
    prediction = tokenizer.batch_decode(max_token_prob.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=False)

    y = tokenizer.batch_decode(label_id, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    loss = loss_fct(logit_layer, label_id)
    print(f'{i} --> Prediction {prediction} vs {y} w/Loss: {loss}')

    total_loss += loss
    count += 1

# Guard against a sequence with no scored positions
if count:
    print(total_loss / count)
else:
    print('No scored positions: every label was -100')

In [None]:
# Second-to-last row of the flattened logits (scratch inspection)
shift_logits[-2, :]

In [None]:
# NOTE(review): scratch cell — `einops` is not imported in this notebook and `self`,
# `seq_len` and `batch` are not defined at cell scope, so this raises NameError on a
# fresh Restart & Run All.
einops.repeat(self.W_pos[:seq_len], "seq d_model -> batch seq d_model", batch=batch)

In [None]:
# x = torch.empty((4, 64))
# NOTE(review): with the assignment above commented out, `x` is not defined by any
# cell in this notebook; this only works with leftover kernel state.
x

In [None]:
# First two entries along the leading axis of `x`.
# NOTE(review): `x` is not defined by any cell in this notebook.
x[:2]

In [None]:
# Model id handed to LoraInferenceService as `model_path` in the next cell.
reference_model_path = "liuhaotian/llava-v1.5-7b"
# Relative path passed as `lora_model_path` to `lora_service.predict` below.
lora_model_path = '../checkpoints/llava-v1.5-7b-augmented-roastme-lora-train-8-epochs'

In [None]:
# Build the inference service on the reference model with 8-bit and 4-bit loading disabled.
lora_service = LoraInferenceService(model_path=reference_model_path, load_8bit=False, load_4bit=False)
# Displayed as the cell output: the device reported by the service's model.
lora_service.model.device

# lora_service.load_lora_weights('../checkpoints/llava-v1.5-7b-augmented-roastme-lora-train-8-epochs')

In [None]:
# ???
# lora_service.tokenizer.pad_token = "[PAD]"
# lora_service.tokenizer.padding_side = "left"
# lora_service.model.half().cuda()
# lora_service.model = lora_service.unload_lora_weights()

# Load Image
image_data = load_image(str(test_image))

# Run a prediction through the service, passing the LoRA checkpoint path per call;
# the return value is displayed as the cell output.
# NOTE(review): presumably `predict` loads the weights at `lora_model_path` itself — confirm against LoraInferenceService.
lora_service.predict(image_data=image_data, prompt=base_prompt, system_prompt=system_prompt, top_p=.1, temperature=.8, max_new_tokens=512, lora_model_path=lora_model_path)

In [9]:
# Derive a model name from the LoRA path: the last path component, prefixed by
# its parent directory when that component is a `checkpoint-*` folder.
model_paths = lora_model_path.split("/")
leaf = model_paths[-1]
if not leaf.startswith('checkpoint-'):
    print('here')
    model_name = leaf
else:
    model_name = f"{model_paths[-2]}_{leaf}"

model_name

here


'llava-v1.5-7b-augmented-roastme-lora-train-8-epochs'