In [3]:
# !pip install torch==2.2.0 transformers==4.44.2 numpy==1.24.3 pillow==10.3.0
# !pip install bitsandbytes
# !pip install git+https://github.com/huggingface/peft.git
# !pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# load model
try:
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained("AIDC-AI/Ovis1.6-Gemma2-9B",
                                                 torch_dtype=torch.bfloat16,
                                                 multimodal_max_length=8192,
                                                 quantization_config=quantization_config,
                                                 low_cpu_mem_usage=True,
                                                 trust_remote_code=True,
                                                 device_map="auto")
    logger.info("Model loaded successfully")
except Exception as e:
    logger.error(f"Failed to load the model: {str(e)}")
    raise

if model is None:
    raise ValueError("Failed to load the model. Please check the model name and your internet connection.")

try:
    text_tokenizer = model.get_text_tokenizer()
    visual_tokenizer = model.get_visual_tokenizer()
    logger.info("Tokenizers loaded successfully")
except AttributeError:
    logger.error("Failed to get tokenizers. Attempting to load tokenizer separately.")
    try:
        text_tokenizer = AutoTokenizer.from_pretrained("AIDC-AI/Ovis1.6-Gemma2-9B")
        visual_tokenizer = model.get_visual_tokenizer()  # Assuming this is a custom method
        logger.info("Tokenizers loaded separately")
    except Exception as e:
        logger.error(f"Failed to load tokenizers: {str(e)}")
        raise

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:__main__:Model loaded successfully
INFO:__main__:Tokenizers loaded successfully


In [6]:
# enter image path and prompt
image_path = input("Enter image path: ")
image = Image.open(image_path)
text = input("Enter prompt: ")
query = f'<image>\n{text}'

# format conversation
prompt, input_ids, pixel_values = model.preprocess_inputs(query, [image])
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
input_ids = input_ids.unsqueeze(0).to(device=model.device, dtype=torch.long)
attention_mask = attention_mask.unsqueeze(0).to(device=model.device, dtype=torch.long)
pixel_values = [pixel_values.to(dtype=torch.bfloat16, device=model.device)]  # Change to float32

# generate output
with torch.inference_mode():
    gen_kwargs = dict(
        max_new_tokens=1024,
        do_sample=False,
        top_p=None,
        top_k=None,
        temperature=None,
        repetition_penalty=None,
        eos_token_id=model.generation_config.eos_token_id,
        pad_token_id=text_tokenizer.pad_token_id,
        use_cache=True
    )
    output_ids = model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
    output = text_tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Output:\n{output}')


Enter image path:  ./tes.png
Enter prompt:  describe


RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Byte