In [1]:
# Show the installed packages whose name contains "torch" so the environment
# this notebook was run with is recorded next to its results.
%pip list | grep torch

torch                    2.5.1
torchao                  0.6.1
torchaudio               2.5.1
torchvision              0.20.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
    
# Location of the fine-tuned checkpoint (AWQ-quantized, judging by the
# directory name). Set the QWEN_VL_MODEL_PATH environment variable to run the
# notebook on another machine; when it is unset the original path is used.
MODEL = os.environ.get(
    "QWEN_VL_MODEL_PATH", "/root/Qwen2-VL-Finetune/output/qwen_2.5_awq"
)

# Load the model on the available device(s).
# flash_attention_2 is recommended for better acceleration and memory saving,
# especially in multi-image and video scenarios. To load without it, drop
# `attn_implementation` and pass torch_dtype="auto" instead.
# NOTE(review): float16 is presumably forced because of the AWQ weights - confirm.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Default processor.
# The default range for the number of visual tokens per image in the model is
# 4-16384. To balance performance and cost, a narrower range (for example
# 256-1280 tokens) can be requested by passing
#     min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
# to AutoProcessor.from_pretrained.
processor = AutoProcessor.from_pretrained(MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:


# Single-turn chat request: four product photos followed by the text prompt
# that carries the extraction task and the (Russian) product description.
messages = [
    {
        "role": "user",
        "content": [
            # One {"type": "image", "image": <url>} entry per photo, in order.
            *(
                {"type": "image", "image": photo_url}
                for photo_url in (
                    "https://sun9-9.userapi.com/s/v1/ig2/OrdG0mXaWsClrqfY7ssI9avprWp_pRNim3jwu7PbXOeBRHXmlMC0tFKE0yoQiuqNv0jNwgDuppeymPT3XGYZQzb-.jpg?quality=95&as=32x43,48x64,72x96,108x144,160x214,240x321,360x481,480x642,540x722,640x856,718x960&from=bu&u=FmjzhCfdQkiZU4L0y8i1M_4VtRbR9dgDabBbQEEbrK0&cs=452x604",
                    "https://sun9-34.userapi.com/s/v1/ig2/J3a8Jpk6DY3RWvlfCTiAX9Bn-UzGI1j2QTQShV1zBMatTsdQ50xLH8JgiUYS3H1JxiOs9MkIwOMcT11JjB1YpwNU.jpg?quality=95&as=32x43,48x64,72x96,108x144,160x213,240x320,360x479,480x639,540x719,600x799&from=bu&u=sWW5DeUku7Lr9JjhGDczZyAHWf2FEx0u-7Rlz0tvyew&cs=454x604",
                    "https://sun9-68.userapi.com/s/v1/ig2/AQSKdoZ-0folAM0d_ak4__oaUoRAApD1YmW3QKck73OiymRonqBALcMtB8l72TeVI6BeUt6TPnLG8aQy_quRxZUH.jpg?quality=95&as=32x43,48x64,72x96,108x144,160x213,240x320,360x480,480x640,540x720,640x853,720x960,736x981&from=bu&u=nVkrNErVssolqXu30B8uzCjTfwsdo94uK3GC8Fp7Nh8&cs=453x604",
                    "https://sun9-9.userapi.com/s/v1/ig2/SZzc-DUhPGVXaXwgUZHy86EdXr3vmMCvLXSiVKXavnXtoKZ-EvWQerJ58A8QG6qF9pWs17tIfGevs3E8wrL748hL.jpg?quality=95&as=32x31,48x47,72x70,108x106,160x156,240x235,360x352,480x469,540x528,640x626,720x704,1080x1056,1280x1251,1440x1408,1600x1564&from=bu&u=kJa9BN3r-UHuva2CZnYbWVussDV-W6oOwPn5BsQWn1I&cs=604x590",
                )
            ),
            # The prompt text is reproduced verbatim, including the seller's
            # original spelling in the product description.
            {
                "type": "text",
                "text": "\n## Task\nExtract all product parameters from the given description and images, and return the data in JSON format. Maintain the original language of the product description in the output.\n\n## Product description\nРаспродажа платья\n💥️ Ткань Атлас + сетка\n💥️ Римент в комплекте Длина 123-125си\n💥️ Размер (50-52)(54-56)(58-60)\n💥️ Цена 500 рублей\n💥 Цвет без выбора",
            },
        ],
    }
]

# Preparation for inference: render the conversation to a prompt string
# (tokenize=False) that ends with the generation prompt for the model's reply.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Presumably resolves the image / video entries referenced in `messages`
# into inputs the processor accepts - see qwen_vl_utils for details.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move the batch to wherever the model was placed instead of hard-coding
# "cuda": the model is loaded with device_map="auto", so its device is not
# fixed by this notebook.
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=2048)
# Drop the first len(in_ids) tokens of every output sequence (the prompt) so
# that only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
# The batch holds a single prompt, so print its only completion.
print(output_text[0])

```json
[
    {
        "params": [
            {
                "name": "наименование",
                "values": [
                    "платье"
                ]
            },
            {
                "name": "материал",
                "values": [
                    "атлас",
                    "сетка"
                ]
            },
            {
                "name": "особенности",
                "values": [
                    "без выбора цвета",
                    "распродажа"
                ]
            },
            {
                "name": "пол",
                "values": [
                    "женский"
                ]
            },
            {
                "name": "длина изделия",
                "values": [
                    "123-125 см"
                ]
            },
            {
                "name": "категория",
                "values": [
                    "женская одежда",
                    "повседневная одежда",
                    