In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,5,6"
import torch

from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from modelscope import snapshot_download
from modelscope import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, Qwen2_5OmniThinkerForConditionalGeneration

from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
from transformers.video_utils import VideoMetadata
from typing import Optional
from qwen_omni_utils import process_mm_info
import json
from torch.utils.data import Dataset
from transformers.image_utils import SizeDict

In [2]:
class FixedResQwen2VLVideoProcessor(Qwen2VLVideoProcessor):
    def _preprocess(
        self, videos, do_resize=True, size=None, interpolation=None, **kwargs
    ):
        # 固定分辨率
        fixed_size = SizeDict(height=224, width=224)
        for i, video in enumerate(videos):
            videos[i] = self.resize(video, size=fixed_size, interpolation=interpolation)
        return super()._preprocess(videos, do_resize=False, size=fixed_size, interpolation=interpolation, **kwargs)

In [3]:
class OmniVideoConversationDataset(Dataset):
    def __init__(
        self,
        json_path: str,
        video_root: str,
    ):
        with open(json_path, "r") as f:
            self.data = json.load(f)

        self.video_root = video_root

    def __len__(self):
        return len(self.data)

    def _build_text(self, conversations):
        messages = []
        for turn in conversations:
            if turn["from"] == "human":
                role = "user"
            elif turn["from"] == "gpt":
                role = "assistant"
            else:
                continue

            messages.append({
                "role": role,
                "content": turn["value"]
            })

        return messages

    def __getitem__(self, idx):
        sample = self.data[idx]
        video_id = sample["id"]
        video_path = os.path.join(self.video_root, f"{video_id}.mp4")

        conversation = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": video_path},
                    {"type": "text", "text": sample["conversations"][0]["value"]},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": sample["conversations"][1]["value"]},
                ],
            },
        ]

        return {"conversation": conversation}



class QwenOmniDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.tokenizer = processor.tokenizer

    def __call__(self, features):
        texts = []
        videos = []
        audios = []
        labels_list = []

        for f in features:
            conversation = f["conversation"]

            # ---------- 1. 拼完整 prompt ----------
            full_text = self.processor.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )

            # ---------- 2. 构造 labels（后缀 assistant） ----------
            assistant_text = conversation[-1]["content"][0]["text"]

            full_ids = self.tokenizer(
                full_text,
                add_special_tokens=False,
            )["input_ids"]

            assistant_ids = self.tokenizer(
                assistant_text,
                add_special_tokens=False,
            )["input_ids"]

            labels = [-100] * len(full_ids)
            labels[-len(assistant_ids):] = assistant_ids

            texts.append(full_text)
            labels_list.append(labels)

            # ---------- 3. 多模态 ----------
            for msg in conversation:
                if msg["role"] == "user":
                    for ele in msg["content"]:
                        if ele.get("type") == "video":
                            ele["fps"] = 0.5

            audios_, _, videos_ = process_mm_info(
                conversation, use_audio_in_video=True
            )

            videos.append(videos_[0] if videos_ else None)
            audios.append(audios_[0] if audios_ else None)

        # ---------- 4. 一次性 processor ----------
        batch = self.processor(
            text=texts,
            videos=videos,
            audio=audios,
            padding=True,
            return_tensors="pt",
            use_audio_in_video=True,
        )

        print(batch["video_grid_thw"].shape) 
        print(batch["pixel_values_videos"].shape) 
        for k, v in batch.items(): 
            if isinstance(v, torch.Tensor): 
                print(k, v.shape, v.numel() * v.element_size() / 1024**3, "GB")


        # ---------- 5. pad labels ----------
        max_len = batch["input_ids"].shape[1]
        padded_labels = []

        for lab in labels_list:
            padded = lab + [-100] * (max_len - len(lab))
            padded_labels.append(padded)

        batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)

        return batch


train_dataset = OmniVideoConversationDataset(
    json_path="../../LongVALE/data/longvale-sft-bp-7k.json",
    video_root="../../LongVALE/raw_videos_train/video_train_7240/"
)

In [4]:
model_path = snapshot_download(
    'Qwen/Qwen2.5-Omni-3B',
    cache_dir="../../Qwen/cache/modelscope"
)

model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
    device_map="balanced",
    trust_remote_code=True,
    use_safetensors=True
)

# model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
#     model_path,
#     dtype=torch.bfloat16,
#     device_map="balanced",
#     trust_remote_code=True,
#     use_safetensors=True
# )
# model = model.thinker

video_processor = FixedResQwen2VLVideoProcessor.from_pretrained(model_path)

processor = Qwen2_5OmniProcessor.from_pretrained(
    model_path,
    video_processor=video_processor,
)

Downloading Model from https://www.modelscope.cn to directory: ../../Qwen/cache/modelscope/Qwen/Qwen2.5-Omni-3B


2025-12-18 01:23:14,240 - modelscope - INFO - Target directory already exists, skipping creation.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [5]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "up_proj", "down_proj"],
    # target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none"
)

model = get_peft_model(model, config)

for name, param in model.named_parameters():
    if (
        "audio_tower" in name
        or "visual" in name
    ):
        param.requires_grad = False

model.gradient_checkpointing_enable()
model.config.use_cache = False


model.print_trainable_parameters()

trainable params: 22,413,312 || all params: 4,734,622,720 || trainable%: 0.4734


In [6]:
batch_size = 1

args = TrainingArguments(
    output_dir="./r_models",
    remove_unused_columns=False,
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    # per_device_eval_batch_size=batch_size,
    bf16=True,
    fp16=False,
    num_train_epochs=2, # 5 -> 2
    logging_steps=5,
    load_best_model_at_end=False,
)

In [7]:
data_collator = QwenOmniDataCollator(processor)

trainer = Trainer(
    model=model,
    # model=model.thinker,
    args=args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
print(type(processor.video_processor))

<class '__main__.FixedResQwen2VLVideoProcessor'>


In [9]:
trainer.train()

  librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using decord to read video.


torch.Size([1, 3])
torch.Size([33024, 1176])
input_ids torch.Size([1, 15866]) 0.00011821091175079346 GB
attention_mask torch.Size([1, 15866]) 0.00011821091175079346 GB
pixel_values_videos torch.Size([33024, 1176]) 0.14467620849609375 GB
video_grid_thw torch.Size([1, 3]) 2.2351741790771484e-08 GB
video_second_per_grid torch.Size([1]) 3.725290298461914e-09 GB
feature_attention_mask torch.Size([1, 30000]) 0.00011175870895385742 GB
input_features torch.Size([1, 128, 30000]) 0.01430511474609375 GB


  librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


torch.Size([1, 3])
torch.Size([2560, 1176])
input_ids torch.Size([1, 1807]) 1.3463199138641357e-05 GB
attention_mask torch.Size([1, 1807]) 1.3463199138641357e-05 GB
pixel_values_videos torch.Size([2560, 1176]) 0.0112152099609375 GB
video_grid_thw torch.Size([1, 3]) 2.2351741790771484e-08 GB
video_second_per_grid torch.Size([1]) 3.725290298461914e-09 GB
feature_attention_mask torch.Size([1, 30000]) 0.00011175870895385742 GB
input_features torch.Size([1, 128, 30000]) 0.01430511474609375 GB


  librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


torch.Size([1, 3])
torch.Size([7424, 1176])
input_ids torch.Size([1, 5794]) 4.316866397857666e-05 GB
attention_mask torch.Size([1, 5794]) 4.316866397857666e-05 GB
pixel_values_videos torch.Size([7424, 1176]) 0.03252410888671875 GB
video_grid_thw torch.Size([1, 3]) 2.2351741790771484e-08 GB
video_second_per_grid torch.Size([1]) 3.725290298461914e-09 GB
feature_attention_mask torch.Size([1, 30000]) 0.00011175870895385742 GB
input_features torch.Size([1, 128, 30000]) 0.01430511474609375 GB


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 8.98 GiB. GPU 2 has a total capacity of 23.69 GiB of which 6.64 GiB is free. Including non-PyTorch memory, this process has 17.02 GiB memory in use. Of the allocated memory 16.63 GiB is allocated by PyTorch, and 90.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from inspect import signature, getdoc

# 查看 forward 方法的签名

sig = signature(model.forward)
print("Forward method parameters:")
for name, param in sig.parameters.items():
    print(f"  {name}: {param.annotation} = {param.default}")

# 查看参数总数
print(f"\nTotal parameters in forward: {len(sig.parameters)}")
print("\nSignature:", sig)

Forward method parameters:
  args: Any = <class 'inspect._empty'>
  kwargs: Any = <class 'inspect._empty'>

Total parameters in forward: 2

Signature: (*args: 'Any', **kwargs: 'Any')
