In [1]:
import os
# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

In [2]:
import torch
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from modelscope import snapshot_download
from modelscope import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, Qwen2_5OmniThinkerForConditionalGeneration

from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
from transformers.video_utils import VideoMetadata
from typing import Optional
from qwen_omni_utils import process_mm_info

In [3]:
import json
from torch.utils.data import Dataset

class OmniVideoConversationDataset(Dataset):
    def __init__(
        self,
        json_path: str,
        video_root: str,
    ):
        with open(json_path, "r") as f:
            self.data = json.load(f)

        self.video_root = video_root

    def __len__(self):
        return len(self.data)

    def _build_text(self, conversations):
        messages = []
        for turn in conversations:
            if turn["from"] == "human":
                role = "user"
            elif turn["from"] == "gpt":
                role = "assistant"
            else:
                continue

            messages.append({
                "role": role,
                "content": turn["value"]
            })

        return messages

    def __getitem__(self, idx):
        sample = self.data[idx]
        messages = self._build_text(sample["conversations"])
        video_id = sample["id"]
        video_path = os.path.join(self.video_root, f"{video_id}.mp4")

        return {
            "text": messages,
            "videos": [video_path],
        }

def build_prompt(messages):
    prompt = ""
    for m in messages:
        if m["role"] == "user":
            prompt += f"<human>{m['content']}</human>"
        elif m["role"] == "assistant":
            prompt += f"<gpt>{m['content']}</gpt>"
    return prompt

class QwenOmniDataCollator:
    def __init__(self, processor, num_frames=50):
        self.processor = processor
        self.num_frames = num_frames

    def __call__(self, features):
        texts = [build_prompt(f["text"]) for f in features]
        print(texts[0])

        # 视频路径列表
        videos = [f["videos"][0] if f.get("videos") else None for f in features]

        
        # text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        # audios, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
        batch = self.processor(
            text=texts,
            # audio=audios,
            videos=videos,
            video_num_frames=self.num_frames,
            padding=True,
            return_tensors="pt",
            use_audio_in_video=True
        )

        print(batch["video_grid_thw"].shape)
        print(batch["pixel_values_videos"].shape)
        
        for k, v in batch.items():
            if isinstance(v, torch.Tensor):
                print(k, v.shape, v.numel() * v.element_size() / 1024**3, "GB")

        return batch


class FixedFrameQwen2VLVideoProcessor(Qwen2VLVideoProcessor):
    def __init__(self, num_frames=50, **kwargs):
        super().__init__(**kwargs)
        self.fixed_num_frames = num_frames

    def sample_frames(
        self,
        metadata: VideoMetadata,
        temporal_patch_size: Optional[int] = None,
        **kwargs,
    ):
        temporal_patch_size = temporal_patch_size or self.temporal_patch_size

        # 对齐 temporal_patch_size（Qwen2.5 默认 = 2）
        num_frames = round(self.fixed_num_frames / temporal_patch_size) * temporal_patch_size
        num_frames = min(num_frames, metadata.total_num_frames)

        indices = torch.linspace(
            0,
            metadata.total_num_frames - 1,
            steps=num_frames,
        ).long()

        return indices


In [4]:
train_dataset = OmniVideoConversationDataset(
    json_path="../../LongVALE/data/longvale-sft-bp-7k.json",
    video_root="../../LongVALE/raw_videos_train/video_train_7240/"
)

thinker

github上看到还有就是用 model= model.thinker

In [5]:
model_path = snapshot_download(
    'Qwen/Qwen2.5-Omni-3B',
    cache_dir="../../Qwen/cache/modelscope"
)

model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
    device_map="balanced",
    trust_remote_code=True,
    use_safetensors=True
)

video_processor = FixedFrameQwen2VLVideoProcessor.from_pretrained(model_path)
processor = Qwen2_5OmniProcessor.from_pretrained(
    model_path,
    video_processor=video_processor,
)

Downloading Model from https://www.modelscope.cn to directory: ../../Qwen/cache/modelscope/Qwen/Qwen2.5-Omni-3B


2025-12-17 07:22:06,253 - modelscope - INFO - Target directory already exists, skipping creation.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


```shell
           ┌─ gate_proj ── activation ──┐
x ──┬──────┤                            ⊙ ── down_proj ── out
    │      └─ up_proj   ───────────────┘
```

+ 不调 gate_proj

    trainable params: 31,158,272 || all params: 4,734,622,720 || trainable%: 0.6581

+ 调 gate_proj

    trainable params: 41,084,928 || all params: 4,744,549,376 || trainable%: 0.8659

In [6]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "up_proj", "down_proj"],
    # target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, config)

for name, param in model.named_parameters():
    if (
        "audio_tower" in name
        or "visual" in name
    ):
        param.requires_grad = False

model.gradient_checkpointing_enable()

model.config.use_cache = False

model.print_trainable_parameters()

batch_size = 1

args = TrainingArguments(
    output_dir="./r_models",
    remove_unused_columns=False,
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    # per_device_eval_batch_size=batch_size,
    bf16=True,
    num_train_epochs=2, # 5 -> 2
    logging_steps=5,
    save_steps=100,
    load_best_model_at_end=False,
    label_names=["labels"],
)

trainable params: 22,413,312 || all params: 4,734,622,720 || trainable%: 0.4734


In [7]:
for name, module in model.named_modules():
    if hasattr(module, "lora_A"):
        print(name)

base_model.model.audio_tower.layers.0.self_attn.k_proj
base_model.model.audio_tower.layers.0.self_attn.v_proj
base_model.model.audio_tower.layers.0.self_attn.q_proj
base_model.model.audio_tower.layers.1.self_attn.k_proj
base_model.model.audio_tower.layers.1.self_attn.v_proj
base_model.model.audio_tower.layers.1.self_attn.q_proj
base_model.model.audio_tower.layers.2.self_attn.k_proj
base_model.model.audio_tower.layers.2.self_attn.v_proj
base_model.model.audio_tower.layers.2.self_attn.q_proj
base_model.model.audio_tower.layers.3.self_attn.k_proj
base_model.model.audio_tower.layers.3.self_attn.v_proj
base_model.model.audio_tower.layers.3.self_attn.q_proj
base_model.model.audio_tower.layers.4.self_attn.k_proj
base_model.model.audio_tower.layers.4.self_attn.v_proj
base_model.model.audio_tower.layers.4.self_attn.q_proj
base_model.model.audio_tower.layers.5.self_attn.k_proj
base_model.model.audio_tower.layers.5.self_attn.v_proj
base_model.model.audio_tower.layers.5.self_attn.q_proj
base_model

In [8]:
data_collator = QwenOmniDataCollator(processor)

trainer = Trainer(
    model=model,
    # model=model.thinker,
    args=args,
    train_dataset=train_dataset,
    processing_class=processor,
    data_collator=data_collator,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'bos_token_id': None, 'pad_token_id': 151643}.
Keyword argument `video_num_frames` is not a valid argument for this processor and will be ignored.


<human><video>
Could you tell me what happened from <s3> to <e3> in the video?</human><gpt>In a wooded area, people gather around a white tent, chatting and preparing for an outdoor adventure, while a man carrying a backpack walks purposefully through the woods.</gpt><human>Share what transpired from <s17> to <e17> in the video.</human><gpt>Under a large white tent, people in blue jackets gather around a dog lying on the ground. One person kneels beside it, while another lies down with their head resting on the dog's back. The dog remains calm and relaxed, enjoying the company of its human companions.</gpt><human>Between which two frames can we witness in a cozy living room, a woman with glasses and a blue sweatshirt sits on a beige sofa, smiling and sharing her thoughts and experiences with the viewer, while the soft lighting creates a warm and inviting atmosphere occurring in the video?</human><gpt>From <s21> to <e21>.</gpt><human>Explain what happened from <s8> to <e8> in the video.



torch.Size([1, 3])
torch.Size([9309664, 1176])
input_ids torch.Size([1, 1469]) 1.0944902896881104e-05 GB
attention_mask torch.Size([1, 1469]) 1.0944902896881104e-05 GB
pixel_values_videos torch.Size([9309664, 1176]) 40.7850923538208 GB
video_grid_thw torch.Size([1, 3]) 2.2351741790771484e-08 GB
video_second_per_grid torch.Size([1]) 3.725290298461914e-09 GB


OutOfMemoryError: CUDA out of memory. Tried to allocate 40.79 GiB. GPU 0 has a total capacity of 23.69 GiB of which 16.96 GiB is free. Process 69705 has 1.13 GiB memory in use. Process 77552 has 1.13 GiB memory in use. Including non-PyTorch memory, this process has 4.26 GiB memory in use. Of the allocated memory 3.93 GiB is allocated by PyTorch, and 43.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

pixel_values_videos torch.Size([9309664, 1176]) 40.7850923538208 GB

均匀采样 100 帧没生效？

In [None]:
print(type(processor.video_processor))
print(hasattr(processor.video_processor, "sample_frames"))

<class '__main__.FixedFrameQwen2VLVideoProcessor'>
True
