In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
from transformers.video_utils import VideoMetadata
import torch
from typing import Optional
import soundfile as sf
from modelscope import snapshot_download
from modelscope import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

In [2]:
model_dir = snapshot_download(
    'Qwen/Qwen2.5-Omni-3B',
    cache_dir="../../Qwen/cache/modelscope"
)

Downloading Model from https://www.modelscope.cn to directory: ../../Qwen/cache/modelscope/Qwen/Qwen2.5-Omni-3B


2025-12-17 09:06:34,886 - modelscope - INFO - Target directory already exists, skipping creation.


In [3]:
from transformers.image_utils import SizeDict

class FixedResQwen2VLVideoProcessor(Qwen2VLVideoProcessor):
    def _preprocess(
        self, videos, do_resize=True, size=None, interpolation=None, **kwargs
    ):
        # 固定分辨率
        fixed_size = SizeDict(height=224, width=224)
        for i, video in enumerate(videos):
            videos[i] = self.resize(video, size=fixed_size, interpolation=interpolation)
        return super()._preprocess(videos, do_resize=False, size=fixed_size, interpolation=interpolation, **kwargs)


In [4]:
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_dir,
    device_map="balanced",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.disable_talker()

video_processor = FixedResQwen2VLVideoProcessor.from_pretrained(model_dir)
# video_processor.do_sample_frames = True
# video_processor.fps = 2.0

processor = Qwen2_5OmniProcessor.from_pretrained(
    model_dir,
    video_processor=video_processor,
)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [5]:
conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "./test.mp4"},
            {"type": "text", "text": "Could you detail events during different time segments? Format strictly:\nFrom xx to xx, event1.\nFrom xx to xx, event2.\n..."}
        ],
    },
]

USE_AUDIO_IN_VIDEO = True

text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
for msg in conversation:
    if msg["role"] == "user":
        for ele in msg["content"]:
            if ele.get("type") == "video":
                ele["fps"] = 2.0   # 设置 1fps

audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = inputs.to(model.device).to(model.dtype)

print("video_grid_thw:", inputs["video_grid_thw"])

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.


video_grid_thw: tensor([[165,  16,  16]], device='cuda:0')


In [None]:
text_ids = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, return_audio=False)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)


['system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\nCould you detail events during different time segments? Format strictly:\nFrom xx to xx, event1.\nFrom xx to xx, event2.\n...\nassistant\nFrom 0.00 to 10.00, the video starts with a black screen and then a white logo appears.\nFrom 10.00 to 15.00, a man is shown holding a remote control.\nFrom 15.00 to 20.00, the man is seen wearing a helmet and looking down.\nFrom 20.00 to 25.00, the man is holding a remote control again.\nFrom 25.00 to 30.00, the man is holding a remote control and looking at it.\nFrom 30.00 to 35.00, the man is holding a remote control and looking at it.\nFrom 35.00 to 40.00, the man is holding a remote control and looking at it.\nFrom 40.00 to 45.00, the man is holding a remote control and looking at it.\nFrom 45.00 to 50.00, the man is holding a remote control and looking at it.\nFrom 50.

In [9]:
print(inputs["pixel_values_videos"].shape)


torch.Size([42240, 1176])
