In [8]:
import os
# Optional local proxy for model downloads; uncomment both lines to enable it.
# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

# Limit this process to the GPUs listed here. CUDA reads this variable when it
# is first initialized, so this cell should run before any CUDA work happens.
# NOTE(review): with this mask, "cuda:0" later in the notebook presumably maps
# to physical GPU 1 -- confirm on the target machine.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1,2,3"})

In [4]:
from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
from transformers.video_utils import VideoMetadata
import torch
from typing import Optional

In [5]:
class FixedFrameQwen2VLVideoProcessor(Qwen2VLVideoProcessor):
    """Qwen2-VL video processor that picks a fixed number of evenly spaced frames.

    The target frame count is set once at construction time (``num_frames``)
    and used by :meth:`sample_frames` instead of any per-call sampling options.

    NOTE(review): ``sample_frames`` only matters if the parent processor
    actually invokes it; whether it does for already-decoded video input is
    not visible here -- confirm against the transformers version in use.
    """

    def __init__(self, num_frames=100, **kwargs):
        """Create the processor.

        Args:
            num_frames: Desired number of frames to sample from each video.
            **kwargs: Forwarded unchanged to ``Qwen2VLVideoProcessor``.
        """
        super().__init__(**kwargs)
        self.fixed_num_frames = num_frames

    def sample_frames(
        self,
        metadata: VideoMetadata,
        temporal_patch_size: Optional[int] = None,
        **kwargs,
    ):
        """Return evenly spaced frame indices covering the whole video.

        Args:
            metadata: Video metadata; only ``total_num_frames`` is read.
            temporal_patch_size: Temporal patch size to align the frame count
                to. Falls back to ``self.temporal_patch_size`` when omitted.
            **kwargs: Accepted for signature compatibility and ignored.

        Returns:
            A 1-D ``torch.long`` tensor of frame indices in
            ``[0, total_num_frames - 1]`` whose length is a multiple of
            ``temporal_patch_size``.

        Raises:
            ValueError: If the video reports no frames.
        """
        temporal_patch_size = temporal_patch_size or self.temporal_patch_size
        total_num_frames = metadata.total_num_frames

        if total_num_frames < 1:
            raise ValueError(
                f"Cannot sample frames from a video with {total_num_frames} frames."
            )

        # Align the requested count to temporal_patch_size (2 by default for Qwen2.5).
        num_frames = round(self.fixed_num_frames / temporal_patch_size) * temporal_patch_size

        if num_frames > total_num_frames:
            # Clamping to the video length can break the alignment (e.g. a
            # 7-frame video), so round the clamped value down to a multiple.
            num_frames = (total_num_frames // temporal_patch_size) * temporal_patch_size

        # Always produce at least one full temporal patch. For videos shorter
        # than temporal_patch_size this repeats indices, which stay in range.
        num_frames = max(num_frames, temporal_patch_size)

        indices = torch.linspace(
            0,
            total_num_frames - 1,
            steps=num_frames,
        ).long()

        return indices

In [6]:
import soundfile as sf
from modelscope import snapshot_download
from modelscope import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

# Resolve the local snapshot directory of the Qwen2.5-Omni-3B checkpoint via
# ModelScope; files are stored under the given relative cache directory.
model_dir = snapshot_download('Qwen/Qwen2.5-Omni-3B', cache_dir="../../Qwen/cache/modelscope")

2025-12-16 23:54:21,992 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: ../../Qwen/cache/modelscope/Qwen/Qwen2.5-Omni-3B


In [9]:
# Loading options for the Omni model: single device, bf16 weights, FlashAttention-2.
model_load_options = dict(
    device_map="cuda:0",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(model_dir, **model_load_options)
# Only text output is requested later (return_audio=False), so the talker is switched off.
model.disable_talker()

# Build the processor around the custom fixed-frame video processor so that
# video preprocessing goes through FixedFrameQwen2VLVideoProcessor.
video_processor = FixedFrameQwen2VLVideoProcessor.from_pretrained(model_dir)
processor = Qwen2_5OmniProcessor.from_pretrained(model_dir, video_processor=video_processor)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [10]:
# Whether the audio track embedded in the video is fed to the model as well.
USE_AUDIO_IN_VIDEO = True

# System turn: the persona prompt for the model.
system_turn = {
    "role": "system",
    "content": [
        {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
    ],
}

# User turn: the video to analyse plus the time-segment captioning instruction.
user_turn = {
    "role": "user",
    "content": [
        {"type": "video", "video": "./test.mp4"},
        {"type": "text", "text": "Could you detail events during different time segments? Format strictly:\nFrom xx to xx, event1.\nFrom xx to xx, event2.\n..."}
    ],
}

conversation = [system_turn, user_turn]

# Render the chat prompt as a string and collect the multimodal inputs.
text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Tokenize / preprocess everything, then move the batch to the model's device and dtype.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
).to(model.device).to(model.dtype)

# NOTE(review): the recorded output shows a temporal grid of 59, which does not
# match a 100-frame sample with temporal_patch_size=2 (that would give 50).
# The frames may already be sampled by process_mm_info before the custom video
# processor sees them -- verify that the fixed-frame sampler is actually used.
print("video_grid_thw:", inputs["video_grid_thw"])

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.


video_grid_thw: tensor([[59, 26, 34]], device='cuda:0')


In [11]:
# Text-only generation: audio output is turned off via return_audio=False.
generation_options = dict(use_audio_in_video=USE_AUDIO_IN_VIDEO, return_audio=False)
text_ids = model.generate(**inputs, **generation_options)

# Decode the full sequences; as the printed result shows, this includes the
# prompt turns (system/user) followed by the assistant's answer.
text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

['system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\nCould you detail events during different time segments? Format strictly:\nFrom xx to xx, event1.\nFrom xx to xx, event2.\n...\nassistant\nFrom 0.00 to 10.00, a man is standing in front of a tree.\nFrom 10.00 to 15.00, the man climbs up the tree.\nFrom 15.00 to 20.00, the man climbs down the tree.\nFrom 20.00 to 25.00, the man climbs up the tree again.\nFrom 25.00 to 30.00, the man climbs down the tree.\nFrom 30.00 to 35.00, the man climbs up the tree once more.\nFrom 35.00 to 40.00, the man climbs down the tree.\nFrom 40.00 to 45.00, the man climbs up the tree for the third time.\nFrom 45.00 to 50.00, the man climbs down the tree.\nFrom 50.00 to 55.00, the man climbs up the tree for the fourth time.\nFrom 55.00 to 60.00, the man climbs down the tree.\nFrom 60.00 to 65.00, the man climbs up the tree for the fif