In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
import requests
from PIL import Image
import torch

# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "../model/Qwen/Qwen2-VL-2B-Instruct-AWQ", device_map="auto"
)
processor = AutoProcessor.from_pretrained("/data/model/Qwen/Qwen2-VL-2B-Instruct-AWQ")

# Image
url_1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image_1 = Image.open(requests.get(url_1, stream=True).raw)
url_2 = "https://pic2.zhimg.com/v2-284d76d52cc507a0637ee06913aa07bf_1440w.jpg"
image_2 = Image.open(requests.get(url_2, stream=True).raw)

image_2

ValueError: You current version of `autoawq` does not support module quantization skipping, please upgrade `autoawq` package to at least 0.1.8.

In [28]:
# Messages containing multiple images and a text query
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "1."},
            {"type": "image"},
            {"type": "text", "text": "2."},
            {"type": "image"},
            {
                "type": "text",
                "text": "Judge whether the two images are similar or not., You should say 'Yes' or 'No'.",
            },
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
text

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n1.<|vision_start|><|image_pad|><|vision_end|>2.<|vision_start|><|image_pad|><|vision_end|>Judge whether the two images are similar or not., You should say 'Yes' or 'No'.<|im_end|>\n<|im_start|>assistant\n"

In [32]:
image_inputs = [image_1, image_2]
inputs = processor(
    text=[text],
    images=image_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference
generate_kwargs = {
    "do_sample": True,
    "temperature": 0.8,
}

# Generate the next token probabilities
with torch.no_grad():
    outputs = model(**inputs)
    next_token_logits = outputs.logits[:, -1, :]
    next_token_probs = torch.softmax(next_token_logits, dim=-1)

# Get the top 5 tokens with the highest probabilities
top_k_probs, top_k_indices = torch.topk(next_token_probs, k=5, dim=-1)
print("Top 5 token probabilities:", top_k_probs)
# print("Top 5 token indices:", top_k_indices)

# decode the top 5 tokens
top_k_tokens = processor.batch_decode(top_k_indices.view(-1, 1))
print("Top 5 tokens:", top_k_tokens)

Top 5 token probabilities: tensor([[0.5830, 0.3647, 0.0048, 0.0040, 0.0031]], device='cuda:0',
       dtype=torch.float16)
Top 5 tokens: ['No', 'Yes', 'This', '根据', 'The']


In [13]:
len(inputs.input_ids[0])

1336

### VLLM to accelerate the inference

In [1]:
import torch
from transformers import AutoTokenizer, AutoProcessor
import vllm
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import os

model_path = "../model/Qwen/Qwen2-VL-2B-Instruct-AWQ"

tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

llm = vllm.LLM(
    model_path,
    quantization="awq",
    tensor_parallel_size=2,
    dtype="half",
    max_model_len=5120,
    disable_log_stats=True,
    enforce_eager=True,
)

llm

  from .autonotebook import tqdm as notebook_tqdm
2025-01-09 14:22:38,113	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-09 14:22:48 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 01-09 14:22:48 config.py:1020] Defaulting to use mp for distributed inference
INFO 01-09 14:22:48 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='../model/Qwen/Qwen2-VL-2B-Instruct-AWQ', speculative_config=None, tokenizer='../model/Qwen/Qwen2-VL-2B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5120, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_for

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.34it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.34it/s]



INFO 01-09 14:22:55 model_runner.py:1077] Loading model weights took 1.1682 GB
[1;36m(VllmWorkerProcess pid=1022144)[0;0m INFO 01-09 14:22:55 model_runner.py:1077] Loading model weights took 1.1682 GB
[1;36m(VllmWorkerProcess pid=1022144)[0;0m INFO 01-09 14:23:00 worker.py:232] Memory profiling results: total_gpu_memory=10.75GiB initial_memory_usage=1.96GiB peak_torch_memory=1.52GiB memory_usage_post_profile=2.43GiB non_torch_memory=1.25GiB kv_cache_size=6.90GiB gpu_memory_utilization=0.90
INFO 01-09 14:23:00 worker.py:232] Memory profiling results: total_gpu_memory=10.75GiB initial_memory_usage=1.96GiB peak_torch_memory=1.52GiB memory_usage_post_profile=2.43GiB non_torch_memory=1.25GiB kv_cache_size=6.90GiB gpu_memory_utilization=0.90
INFO 01-09 14:23:01 distributed_gpu_executor.py:57] # GPU blocks: 32311, # CPU blocks: 18724
INFO 01-09 14:23:01 distributed_gpu_executor.py:61] Maximum concurrency for 5120 tokens per request: 100.97x


<vllm.entrypoints.llm.LLM at 0x7f13e9f9c950>

In [6]:
from PIL import Image
import requests

# Image
url_1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
url_2 = "https://pic2.zhimg.com/v2-284d76d52cc507a0637ee06913aa07bf_1440w.jpg"

# Messages containing multiple images and a text query
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "1."},
            {
                "type": "image_url",
                "image_url": {
                    "url": url_1,
                },
            },
            {"type": "text", "text": "2."},
            {"type": "image_url", "image_url": {"url": url_2}},
            {
                "type": "text",
                "text": "Judge whether the two images are similar or not., You should say 'Yes' or 'No'.",
            },
        ],
    }
]

messages = str(messages)

# Preparation for inference
# text = processor.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )

# text
# texts = [text for _ in range(1000)]

In [8]:
messages

'[{\'role\': \'user\', \'content\': [{\'type\': \'text\', \'text\': \'1.\'}, {\'type\': \'image_url\', \'image_url\': {\'url\': \'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\'}}, {\'type\': \'text\', \'text\': \'2.\'}, {\'type\': \'image_url\', \'image_url\': {\'url\': \'https://pic2.zhimg.com/v2-284d76d52cc507a0637ee06913aa07bf_1440w.jpg\'}}, {\'type\': \'text\', \'text\': "Judge whether the two images are similar or not., You should say \'Yes\' or \'No\'."}]}]'

In [7]:
logits_processor = MultipleChoiceLogitsProcessor(tokenizer, choices=["Yes", "No"])

response = llm.generate(
    [messages],
    vllm.SamplingParams(
        n=1,
        top_k=1,
        temperature=0,
        max_tokens=1,
        logits_processors=[logits_processor],
    ),
    use_tqdm=True,
)

for resp in response:
    print(resp.outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  5.83it/s, est. speed input: 1009.08 toks/s, output: 5.87 toks/s]

No



