In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
import torch

from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from modelscope import snapshot_download
from modelscope import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, Qwen2_5OmniThinkerForConditionalGeneration

from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
from transformers.video_utils import VideoMetadata
from typing import Optional
from qwen_omni_utils import process_mm_info
import json
from torch.utils.data import Dataset
from transformers.image_utils import SizeDict

In [2]:
class FixedResQwen2VLVideoProcessor(Qwen2VLVideoProcessor):
    def _preprocess(
        self, videos, do_resize=True, size=None, interpolation=None, **kwargs
    ):
        # 固定分辨率
        fixed_size = SizeDict(height=224, width=224)
        for i, video in enumerate(videos):
            videos[i] = self.resize(video, size=fixed_size, interpolation=interpolation)
        return super()._preprocess(videos, do_resize=False, size=fixed_size, interpolation=interpolation, **kwargs)

In [3]:
class OmniVideoConversationDataset(Dataset):
    def __init__(
        self,
        json_path: str,
        video_root: str,
    ):
        with open(json_path, "r") as f:
            self.data = json.load(f)

        self.video_root = video_root

    def __len__(self):
        return len(self.data)

    def _build_text(self, conversations):
        messages = []
        for turn in conversations:
            if turn["from"] == "human":
                role = "user"
            elif turn["from"] == "gpt":
                role = "assistant"
            else:
                continue

            messages.append({
                "role": role,
                "content": turn["value"]
            })

        return messages

    def __getitem__(self, idx):
        sample = self.data[idx]
        messages = self._build_text(sample["conversations"])
        video_id = sample["id"]
        video_path = os.path.join(self.video_root, f"{video_id}.mp4")

        return {
            "text": messages,
            "videos": [video_path],
        }

def build_prompt(messages):
    prompt = ""
    for m in messages:
        if m["role"] == "user":
            prompt += f"<human>{m['content']}</human>"
        elif m["role"] == "assistant":
            prompt += f"<gpt>{m['content']}</gpt>"
    return prompt

class QwenOmniDataCollator:
    # def __init__(self, processor, fps=1.0):
    def __init__(self, processor):
        self.processor = processor
        # self.fps = fps

    def __call__(self, features):
        texts = [build_prompt(f["text"]) for f in features]
        print(texts[0])

        # 视频路径列表
        videos = [f["videos"][0] if f.get("videos") else None for f in features]

        
        # text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        # audios, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
        batch = self.processor(
            text=texts,
            # audio=audios,
            videos=videos,
            padding=True,
            return_tensors="pt",
            use_audio_in_video=True,
            # fps=self.fps
        )

        print(batch["video_grid_thw"].shape)
        print(batch["pixel_values_videos"].shape)
        
        for k, v in batch.items():
            if isinstance(v, torch.Tensor):
                print(k, v.shape, v.numel() * v.element_size() / 1024**3, "GB")

        return batch

train_dataset = OmniVideoConversationDataset(
    json_path="../../LongVALE/data/longvale-sft-bp-7k.json",
    video_root="../../LongVALE/raw_videos_train/video_train_7240/"
)

In [4]:
model_path = snapshot_download(
    'Qwen/Qwen2.5-Omni-3B',
    cache_dir="../../Qwen/cache/modelscope"
)

model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
    device_map="balanced",
    trust_remote_code=True,
    use_safetensors=True
)

video_processor = FixedResQwen2VLVideoProcessor.from_pretrained(model_path)
video_processor.do_sample_frames = True
video_processor.fps = 2.0

processor = Qwen2_5OmniProcessor.from_pretrained(
    model_path,
    video_processor=video_processor,
)

2025-12-17 09:01:15,578 - modelscope - INFO - Target directory already exists, skipping creation.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Downloading Model from https://www.modelscope.cn to directory: ../../Qwen/cache/modelscope/Qwen/Qwen2.5-Omni-3B


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [5]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "up_proj", "down_proj"],
    # target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, config)

for name, param in model.named_parameters():
    if (
        "audio_tower" in name
        or "visual" in name
    ):
        param.requires_grad = False
model.gradient_checkpointing_enable()
model.config.use_cache = False

model.print_trainable_parameters()

trainable params: 22,413,312 || all params: 4,734,622,720 || trainable%: 0.4734


In [6]:
batch_size = 1

args = TrainingArguments(
    output_dir="./r_models",
    remove_unused_columns=False,
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    # per_device_eval_batch_size=batch_size,
    bf16=True,
    num_train_epochs=2, # 5 -> 2
    logging_steps=5,
    load_best_model_at_end=False,
    label_names=["labels"],
)

In [7]:
data_collator = QwenOmniDataCollator(processor)

trainer = Trainer(
    model=model,
    # model=model.thinker,
    args=args,
    train_dataset=train_dataset,
    processing_class=processor,
    data_collator=data_collator,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
print(type(processor.video_processor))

<class '__main__.FixedResQwen2VLVideoProcessor'>


In [9]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'bos_token_id': None, 'pad_token_id': 151643}.


<human><video>
Could you tell me what happened from <s3> to <e3> in the video?</human><gpt>In a wooded area, people gather around a white tent, chatting and preparing for an outdoor adventure, while a man carrying a backpack walks purposefully through the woods.</gpt><human>Share what transpired from <s17> to <e17> in the video.</human><gpt>Under a large white tent, people in blue jackets gather around a dog lying on the ground. One person kneels beside it, while another lies down with their head resting on the dog's back. The dog remains calm and relaxed, enjoying the company of its human companions.</gpt><human>Between which two frames can we witness in a cozy living room, a woman with glasses and a blue sweatshirt sits on a beige sofa, smiling and sharing her thoughts and experiences with the viewer, while the soft lighting creates a warm and inviting atmosphere occurring in the video?</human><gpt>From <s21> to <e21>.</gpt><human>Explain what happened from <s8> to <e8> in the video.



torch.Size([1, 3])
torch.Size([98304, 1176])
input_ids torch.Size([1, 1469]) 1.0944902896881104e-05 GB
attention_mask torch.Size([1, 1469]) 1.0944902896881104e-05 GB
pixel_values_videos torch.Size([98304, 1176]) 0.4306640625 GB
video_grid_thw torch.Size([1, 3]) 2.2351741790771484e-08 GB
video_second_per_grid torch.Size([1]) 3.725290298461914e-09 GB
<human><video>
During which frames can we see as the green-gloved hand reaches into the well-stocked refrigerator, grabbing items like milk and cheese from the bottom shelf, a voice from off-screen casually says "Okay happening in the video?</human><gpt>From <s4> to <e4>.</gpt><human>Can you describe what occurred from <s2> to <e2> in the video?</human><gpt>Zooming out from the refrigerator control panel, the camera reveals a child's shark puppet playfully reaching for the ice dispenser. "Hey guys! Hey! Where did you go?" a surprised and happy child's voice calls out from behind the camera.  "I went to take a nap, but... Oh, well that's awes



ValueError: Videos features and image tokens do not match: tokens: 0, features 24576