In [2]:
import importlib
import os
import subprocess
import time
from io import BytesIO
from threading import Thread

import requests
import torch
import transformers
from PIL import Image
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from transformers import LlamaConfig
from transformers.generation.streamers import TextIteratorStreamer

import llava
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.eval.run_llava import load_image
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
from llava.model.builder import load_pretrained_model
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from llava.utils import disable_torch_init

# Target device string for this notebook. Several cells also call `.cuda()` directly,
# so the notebook as written assumes a CUDA GPU is available.
device = "cuda"

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-15 06:54:41,851] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2024-03-15 06:54:42.496510: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-15 06:54:42.550204: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-15 06:54:42.550236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-15 06:54:42.552583: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-15 06:54:42.561834: I tensorflow/core/platform/cpu_feature_guar

#!/bin/bash

# Launches llava/train/train_mem.py through DeepSpeed (config: ./scripts/zero3.json)
# with LoRA enabled on top of liuhaotian/llava-v1.5-7b, logging to wandb.
# NOTE(review): --data_path and --image_folder are absolute paths inside one user's
# home directory; change them before running on another machine.
# NOTE(review): this run uses --lora_r 256 / --lora_alpha 512, while the notebook's
# configuration cell sets lora_r = 128 / lora_alpha = 256 — confirm which is intended.
deepspeed llava/train/train_mem.py \
    --lora_enable True --lora_r 256 --lora_alpha 512 --mm_projector_lr 2e-5 \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path liuhaotian/llava-v1.5-7b \
    --version v1 \
    --data_path /home/devonperoutky/LLaVA/dataset/augmented/train_dataset.json \
    --image_folder /home/devonperoutky/LLaVA/dataset/ \
    --vision_tower openai/clip-vit-large-patch14-336 \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --bf16 True \
    --output_dir ./checkpoints/llava-v1.5-7b-augmented-roastme-lora-train-rank256-2 \
    --num_train_epochs 2 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-4 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb


In [None]:
def find_all_linear_names(model):
    """Collect the leaf names of the ``torch.nn.Linear`` sub-modules of ``model``.

    Modules whose qualified name contains one of the multimodal keywords
    (``mm_projector``, ``vision_tower``, ``vision_resampler``) are skipped, and
    ``lm_head`` is dropped from the result.

    Args:
        model: Any object exposing ``named_modules()`` (e.g. a ``torch.nn.Module``).

    Returns:
        list[str]: De-duplicated last path components of the matching module
        names, in arbitrary (set) order.
    """
    linear_cls = torch.nn.Linear
    skipped_keywords = ('mm_projector', 'vision_tower', 'vision_resampler')

    # The last dotted component is the attribute name; for an undotted name it is
    # the name itself, so a single [-1] lookup covers both cases.
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if not any(keyword in qualified_name for keyword in skipped_keywords)
        and isinstance(submodule, linear_cls)
    }

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)


In [None]:
# Model / LoRA configuration used by the manual model-construction cells.
model_name_or_path = "liuhaotian/llava-v1.5-7b"
# Placeholder kept so cells that still reference `training_args` keep working.
training_args = None
attn_implementation = "flash_attention_2"
torch_dtype = torch.bfloat16
bits = 16
gradient_checkpointing = True
lora_enable = True
lora_r = 128
lora_alpha = 256
model_max_length = 2048
lora_dropout = .05
# PEFT expects the string "none" (not the None object) to leave biases untouched.
lora_bias = "none"
bf16 = True
fp16 = False
use_cache = False

# Quantization settings, only consulted when `bits` is 4 or 8. They used to be read
# from `training_args`, which is None in this notebook and would raise AttributeError.
# NOTE(review): the double_quant / quant_type values are assumed defaults — confirm.
double_quant = True
quant_type = "nf4"  # {'fp4', 'nf4'}
compute_dtype = torch.float16 if fp16 else (torch.bfloat16 if bf16 else torch.float32)

bnb_model_from_pretrained_args = {}
if bits in [4, 8]:
    from transformers import BitsAndBytesConfig
    bnb_model_from_pretrained_args.update(dict(
        # `device` is the module-level device string defined in the import cell.
        device_map={"": device},
        load_in_4bit=bits == 4,
        load_in_8bit=bits == 8,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=bits == 4,
            load_in_8bit=bits == 8,
            llm_int8_skip_modules=["mm_projector"],
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=double_quant,
            bnb_4bit_quant_type=quant_type,
        )
    ))

In [None]:
# Instantiate model
model = LlavaLlamaForCausalLM.from_pretrained(
    model_name_or_path,
    # No custom cache directory. This used to pass the `training_args` placeholder
    # (None), which only worked by accident and would break if it were ever set.
    cache_dir=None,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    **bnb_model_from_pretrained_args
)
# Apply the `use_cache` setting from the configuration cell; it was defined there
# but never propagated to the model.
model.config.use_cache = use_cache

In [None]:
# Make the embedding outputs require grad when gradient checkpointing is requested,
# either through the model's own helper or a forward hook on the input embeddings.
if gradient_checkpointing:
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, inputs, output):
            """Forward hook: mark the embedding output as requiring grad."""
            output.requires_grad_(True)
        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

# Wrap the model with LoRA adapters on every non-multimodal linear layer.
# NOTE: this rebinds `model`, so re-running the cell wraps an already-wrapped model.
if lora_enable:
    from peft import LoraConfig, get_peft_model
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=find_all_linear_names(model),
        lora_dropout=lora_dropout,
        # PEFT expects a string mode; fall back to "none" if the config holds None.
        bias=lora_bias if lora_bias is not None else "none",
        task_type="CAUSAL_LM",
    )
    if bits == 16:
        if bf16:
            model.to(torch.bfloat16)
        if fp16:
            model.to(torch.float16)
    # `rank0_print` is not defined in this notebook; plain print is used instead.
    print("Adding LoRA adapters...")
    model = get_peft_model(model, lora_config)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path,
    # `cache_dir` was an undefined name here; None selects the default cache.
    cache_dir=None,
    model_max_length=model_max_length,
    padding_side="right",
    use_fast=False,
)

In [None]:
# Shell escape: run the nvidia-smi CLI to inspect the GPU state at this point.
!nvidia-smi

# Simple Approach

In [None]:
# Default checkpoint identifier; it is passed to get_model_name_from_path and
# load_pretrained_model in the loading cell.
model_path = "liuhaotian/llava-v1.5-7b"
# No separate base model is supplied to load_pretrained_model.
model_base = None

In [None]:
# Resolve the model name from the checkpoint path, then load everything the
# inference cells need: tokenizer, model, image processor and context length.
model_name = get_model_name_from_path(model_path)
loaded_components = load_pretrained_model(model_path, model_base, model_name)
tokenizer, model, image_processor, context_len = loaded_components

# Forward pass

In [None]:
# Prompt, image and reference-label constants used by the inference / loss cells.
# prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
# Bare user question (no system text, no image placeholder).
base_prompt = "How would you insult this person?"
# Fully formatted prompt: custom system text, the <image> placeholder and the question.
augmented_prompt = f'A chat between a curious human and mean-spirited, clever therapist. The therapist tries to insult the user in the most relevant, hurtful and clever way possible. USER: <image> {base_prompt} ASSISTANT:'
# Image reference handed to load_image by the cells that use it.
test_image = "https://preview.redd.it/yrdedweuk3ic1.jpeg?width=960&crop=smart&auto=webp&s=0ade9b61358296bfd98c43801cfe4b6dc8d2e243"
# model_path = "liuhaotian/llava-v1.5-7b"
# NOTE(review): this reassignment comes after the cell that calls load_pretrained_model,
# so on a top-to-bottom run the model has already been loaded from the default path.
# Re-run the loading cell (or move this line above it) for it to take effect.
model_path = "../merged_checkpoints/llava-augmented-roastme-v1-MERGE"
# Reference completion that is tokenized into `labels` for the loss experiments.
label_text = "I spent a couple minutes thinking of a roast, but like your baby’s dad, I lost interest and don’t plan to think about you anymore."

In [None]:
def predict(image_path, prompt, top_p, temperature, max_new_tokens, loss):
    """Run the loaded LLaVA model on one image and one user prompt.

    The user prompt is wrapped in the ``llava_v1`` conversation template (with
    the image placeholder prepended) and that templated text is what gets
    tokenized. Relies on the notebook-level ``model``, ``tokenizer`` and
    ``image_processor``.

    Args:
        image_path: Path or URL handed to ``load_image``.
        prompt: The user's question, without image token or role markers.
        top_p: Nucleus-sampling parameter forwarded to ``model.generate``.
        temperature: Sampling temperature; sampling is disabled when it is 0.
        max_new_tokens: Generation length limit forwarded to ``model.generate``.
        loss: If truthy, run a single forward pass and return the raw model
            output instead of generating text.

    Returns:
        The model's forward output when ``loss`` is truthy, otherwise the first
        decoded generation as a stripped string.
    """
    # Configure conversational format to be Llava V1
    conv = conv_templates["llava_v1"].copy()
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + '\n' + prompt)
    conv.append_message(conv.roles[1], None)
    full_prompt = conv.get_prompt()

    # Load and preprocess the image once, via process_images so the model's own
    # image settings (model.config) are respected in both branches.
    image_data = load_image(str(image_path))
    images = [image_data]
    image_sizes = [img.size for img in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    print(full_prompt)
    print(images_tensor.shape)

    # Tokenize the prompt built from the `prompt` argument. This used to tokenize
    # the global `augmented_prompt`, silently ignoring the caller's input.
    input_ids = tokenizer_image_token(
        full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    print(input_ids.shape)

    if loss:
        # The image features must be supplied: input_ids contains the image
        # placeholder index, which is not a real vocabulary id.
        return model(input_ids=input_ids, images=images_tensor, image_sizes=image_sizes)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            image_sizes=image_sizes,
            # Sampling with temperature 0 is invalid; fall back to greedy decoding.
            do_sample=temperature > 0,
            temperature=temperature,
            num_beams=1,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            use_cache=True
        )

    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

In [None]:
# `prompt` is not defined at notebook level (only as a commented-out line);
# pass the bare user question so predict() can wrap it in the chat template.
predict(test_image, base_prompt, 1.0, .2, 512, False)

# Forward

In [1]:


# Load Image
image_data = load_image(str(test_image))
images = [image_data]
image_sizes = [x.size for x in images]
images_tensor = process_images(
    images,
    image_processor,
    model.config
).to(model.device, dtype=torch.float16)
# Kept for shape comparison only; the forward pass below uses `images_tensor`.
processed_image_input = image_processor.preprocess(image_data, return_tensors='pt')['pixel_values'].half().to(model.device)


# Process prompt
print(augmented_prompt)
input_ids = tokenizer_image_token(augmented_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
print(input_ids.shape)
print(processed_image_input.shape)


# Call the module itself (not .forward) and skip autograd bookkeeping for this
# pure inspection pass. Show only the logits shape rather than the whole output.
with torch.inference_mode():
    forward_output = model(input_ids, images=images_tensor, image_sizes=image_sizes)
forward_output.logits.shape

NameError: name 'conv_templates' is not defined

In [None]:
def compute_loss(xb, yb):
    """Next-token cross-entropy between logits ``xb`` and label ids ``yb``.

    The logits at position ``t`` are scored against the label at position
    ``t + 1``, so the last logit position and the first label are dropped.

    Args:
        xb: Logits whose last two dimensions are (sequence, vocab).
        yb: Integer label ids whose last dimension is the same sequence length.
            Entries equal to ``CrossEntropyLoss``'s default ``ignore_index``
            (-100) do not contribute to the loss.

    Returns:
        A scalar tensor holding the mean cross-entropy over non-ignored labels.
    """
    # Shift so that tokens < n predict n
    shift_logits = xb[..., :-1, :].contiguous()
    shift_labels = yb[..., 1:].contiguous()

    # Flatten the *shifted* tensors. This used to flatten the unshifted xb / yb,
    # discarding the shift above. The vocab size is read from the logits so the
    # function no longer depends on a global `model`.
    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
    # Enable model parallelism: labels must live on the logits' device.
    shift_labels = shift_labels.view(-1).to(shift_logits.device)

    loss_fct = CrossEntropyLoss()
    return loss_fct(shift_logits, shift_labels)

In [None]:
# Manual greedy decoding for a few steps, printing the loss against `label_text`.
IGNORE_LABEL = -100  # CrossEntropyLoss's default ignore_index

curr_ids = input_ids
preds = torch.empty(0, device=input_ids.device, dtype=torch.int64)
alt_preds = torch.empty(0, device=input_ids.device, dtype=torch.int64)
empty_tensor = torch.empty(0, 1)
# Labels are padded (and truncated if necessary) to the prompt length.
labels = tokenizer.encode(
    label_text,
    return_tensors='pt',
    padding="max_length",
    truncation=True,
    max_length=input_ids.shape[1],
).to(input_ids.device)

# Remove start token
# labels = labels[..., 1:].contiguous()

for i in range(5):
    # Feed back the tokens generated so far. This used to pass `input_ids` every
    # iteration, so each step predicted the same token. The labels are extended
    # with ignored entries so they keep the same length as the growing input.
    step_labels = F.pad(labels, (0, curr_ids.shape[1] - labels.shape[1]), value=IGNORE_LABEL)

    with torch.no_grad():
        (_input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels_embeds) = model.prepare_inputs_labels_for_multimodal(
            input_ids=curr_ids,
            position_ids=None,
            attention_mask=None,
            past_key_values=None,
            labels=step_labels,
            images=images_tensor,
            image_sizes=image_sizes,
        )
    print(f'Input EMbeds: {inputs_embeds.shape}')
    print(f'Labels EMbeds: {labels_embeds.shape}')

    with torch.no_grad():
        output = model(_input_ids, images=images_tensor, image_sizes=image_sizes, inputs_embeds=inputs_embeds)

    final_layer = output.logits[:, -1, :]
    idx_layer = output.logits[:, i, :]

    print(compute_loss(output.logits, labels_embeds))

    # argmax over the vocabulary axis; softmax is monotonic, so it is not needed.
    predicted_token = final_layer.argmax(dim=-1)
    alt_predicted_token = idx_layer.argmax(dim=-1)

    preds = torch.cat((preds, predicted_token), dim=0)
    alt_preds = torch.cat((alt_preds, alt_predicted_token), dim=0)
    curr_ids = torch.cat((input_ids, preds.unsqueeze(dim=0)), dim=1)

print(tokenizer.batch_decode(preds))
print(tokenizer.batch_decode(alt_preds))

In [None]:
# Tokenize the reference completion, padded to the prompt length.
# `label` was an undefined name; the reference text is stored in `label_text`.
labels = tokenizer.encode(label_text, return_tensors='pt', padding="max_length", max_length=input_ids.shape[1]).cuda()

print(labels.shape)
print(input_ids.shape)
labels

In [None]:
# Drop the first entry along the last axis of `labels` (ellipsis form).
labels[..., 1:].contiguous()

In [None]:
# Drop the first entry along the second axis of `labels` (explicit two-axis form).
labels[:, 1:].contiguous()

In [1]:
# Plain forward pass through the loaded model. Requires `model`, `input_ids`,
# `images_tensor` and `image_sizes` from the earlier cells; the recorded NameError
# for `model` comes from running this cell before they existed.
model(input_ids, images=images_tensor, image_sizes=image_sizes)

NameError: name 'model' is not defined