In [None]:
# Install the video tooling: yt-dlp and ffmpeg-python are imported by the download/cut cell,
# decord is a video-decoding library.
# NOTE(review): versions are unpinned — pin them if runs need to be reproducible.
! pip install yt-dlp ffmpeg-python decord

In [None]:
# qwen-vl-utils provides `process_vision_info`, imported by the model cell.
# NOTE(review): confirm that this pinned version is compatible with every model family loaded later
# in the notebook (Qwen2-VL, Qwen2.5-VL, Qwen3-VL).
! pip install qwen-vl-utils[decord]==0.0.8

In [None]:
from google.colab import userdata
import os

# Copy the Hugging Face token from the Colab `userdata` store into the HF_TOKEN environment variable.
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

In [None]:
import yt_dlp
from yt_dlp.utils import download_range_func
# from moviepy.editor import VideoFileClip
import os
import os.path as osp
import ffmpeg

# Source videos and the fragment to cut from each one
videos = [
    # (url, start_sec, duration_sec, name)
    ("https://youtu.be/E8gmARGvPlI", 21, 20, "christmas"),
    ("https://youtu.be/9Xvp7_KETrE", 183, 20, "photocamera"),
    ("https://youtu.be/EiUy8aJJKOQ", 707, 20, "lecture"),
    # ("https://youtu.be/9bZkp7q19f0", 24, 20, "gangam_style"),
    # ("https://youtu.be/XU75_nskEMY", 29, 20, "ot_vinta"),
    ("https://youtu.be/hSK_f1L0-xE", 51, 20, "radiotapok"),
    # ("https://youtu.be/cR0ou0U3m2g", 21, 30, "ryba"),
    ("https://youtu.be/n8i53TtQ6IQ", 35, 25, "mass_effect"),
    ("https://youtu.be/sDruogVbfRg", 314, 30, "naruto"),
    # ("https://youtu.be/N03iShqFXmE", 125, 53, "zhukov"),
]

# Directory that receives the downloaded and cut clips.
OUTPUT_FOLDER = "videos"
# Clip name -> path of the cut file; filled in by download_and_cut on success.
OUTPUT_FILES = {}

def cut_video(start, end, input_file, output_file):
    """Write the span between ``start`` and ``end`` of ``input_file`` to ``output_file``.

    The streams are copied (``c='copy'``), so nothing is re-encoded, and an
    existing ``output_file`` is overwritten.

    Args:
        start: Position in the input where the cut begins (passed to ffmpeg as ``ss``).
        end: Position in the input where the cut ends (passed to ffmpeg as ``to``).
        input_file: Path of the source video.
        output_file: Path of the file to write.
    """
    source = ffmpeg.input(input_file, ss=start, to=end)
    target = source.output(output_file, c='copy')
    target.run(overwrite_output=True)

def download_and_cut(youtube_url, start_t, duration, out_file, cut_border=5):
    """Download a fragment of a video and trim it to the requested span.

    yt-dlp is asked for a range that begins up to ``cut_border`` seconds before
    ``start_t``. That lead-in is then cut off again with ``cut_video``: the
    trimming is needed to remove the black frames that appear at the start of
    the downloaded file.

    Args:
        youtube_url: URL of the video to download.
        start_t: Start of the wanted fragment, in seconds from the beginning
            of the video.
        duration: Length of the wanted fragment, in seconds.
        out_file: Clip name. The result is written to
            ``<OUTPUT_FOLDER>/<out_file>.mp4`` and registered in
            ``OUTPUT_FILES`` under this key.
        cut_border: Seconds of extra lead-in to download before ``start_t``
            and trim away afterwards.

    Raises:
        Any exception raised by yt-dlp or ffmpeg. In that case nothing is
        registered in ``OUTPUT_FILES``.
    """

    print("Downloading video")

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    download_name = osp.join(OUTPUT_FOLDER, f"full_{out_file}.mp4")
    output_file = osp.join(OUTPUT_FOLDER, f"{out_file}.mp4")

    # The lead-in cannot reach before the beginning of the video. Without the
    # clamp a fragment starting earlier than ``cut_border`` would request a
    # negative timestamp and the trim offset below would no longer match.
    lead_in = max(0, min(cut_border, start_t))

    ydl_opts = {
        'format': 'mp4/best',
        'outtmpl': download_name,
        'quiet': True,
        'noplaylist': True,
        'download_ranges': download_range_func(None, [(start_t - lead_in, start_t + duration)])
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])

        print("Cutting video")

        # Positions are relative to the downloaded range, which starts
        # ``lead_in`` seconds before the wanted fragment.
        cut_video(lead_in, lead_in + duration, download_name, output_file)
    finally:
        # Remove the temporary full download even when the download or the cut failed.
        if osp.exists(download_name):
            os.remove(download_name)

    OUTPUT_FILES[out_file] = output_file


# Download and cut every configured clip. A failure on one video is reported
# and does not stop the remaining ones.
for idx, video in enumerate(videos, 1):
    url, start, dur, name = video
    print(f"Processing video {name} ({idx}/{len(videos)})...")
    try:
        download_and_cut(url, start, dur, name)
    except Exception as e:
        print(f"Error processing {url}: {e}")
    else:
        print(f"Video {name} ({idx}/{len(videos)}) is processed")

print("All videos processed!")

In [None]:
# Sanity check: show the absolute path of the "christmas" clip.
# Raises KeyError if that clip was never registered in OUTPUT_FILES.
osp.abspath(OUTPUT_FILES['christmas'])

In [None]:
from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, Qwen3VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import gc


class ModelOutputBuilder:
    """Context manager that loads a vision-language model and answers prompts about a clip.

    The model and its processor are loaded when the ``with`` block is entered
    and released when it is left, so several checkpoints can be tried one after
    another without keeping all of them in memory.

    Usage:
        with ModelOutputBuilder(ModelClass, "org/checkpoint") as model:
            answer = model("clip_name", "Describe the video")
    """

    def __init__(self, type_model, model):
        """Remember what to load; nothing is loaded until ``__enter__``.

        Args:
            type_model: Model class exposing ``from_pretrained``.
            model: Checkpoint name passed to ``from_pretrained`` for both the
                model and the processor.
        """
        self.model_name = model
        self.type_model = type_model

    def __enter__(self):
        """Load the model and its processor and return this builder."""
        self.model = self.type_model.from_pretrained(self.model_name, torch_dtype="auto", device_map="auto")
        self.processor = AutoProcessor.from_pretrained(self.model_name)
        return self

    def __exit__(self, *args, **kwargs):
        """Drop the model and processor and give their memory back."""
        # pop() rather than ``del`` so that a builder whose attributes are
        # already gone can be exited without an AttributeError.
        self.__dict__.pop("model", None)
        self.__dict__.pop("processor", None)
        gc.collect()
        # Also return cached GPU blocks to the driver so the next checkpoint
        # starts from a clean pool.
        try:
            import torch
        except ImportError:
            return
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def __call__(self, video, prompt, max_new_tokens=4000):
        """Ask the loaded model a question about one of the prepared clips.

        Args:
            video: Key in ``OUTPUT_FILES`` naming the clip to show to the model.
            prompt: Text of the question or instruction.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded answer for the single request, without the prompt tokens.

        Raises:
            KeyError: If ``video`` is not a key of ``OUTPUT_FILES``.
        """
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": f"file://{osp.abspath(OUTPUT_FILES[video])}",
                        "max_pixels": 360 * 420,
                        "fps": 1.0,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the device chosen by ``device_map="auto"`` instead of assuming
        # that a CUDA device exists.
        inputs = inputs.to(self.model.device)

        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; keep only the newly generated part.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        return output_text[0]

In [None]:
# Smoke test on a single clip with one model.
# The Russian prompt reads: "Look at the video and answer what is shown in it".
with ModelOutputBuilder(Qwen2_5_VLForConditionalGeneration, "Qwen/Qwen2.5-VL-3B-Instruct") as model:
    print(model("christmas", "Посмотри на видео и ответь, что на нем изображено"))

In [None]:
import gc

# Drop the `model` name left bound by the `with ... as model` cell and run the garbage collector.
# Raises NameError if no `model` variable exists in the kernel.
del model
gc.collect()

In [None]:
import os, shlex, subprocess, base64
from pathlib import Path
from IPython.display import HTML, display

def ensure_ffmpeg():
    """Try to install ffmpeg through apt when no ``ffmpeg`` executable is on PATH.

    Best effort: installation failures are not raised here. If ffmpeg is still
    missing afterwards, the first command that actually needs it will fail.
    """
    from shutil import which
    if which("ffmpeg") is not None:
        return
    # Colab usually ships ffmpeg already; this is only a fallback.
    for apt_args in (["update"], ["install", "ffmpeg"]):
        try:
            subprocess.run(
                ["apt", "-y", "-qq", *apt_args],
                stdout=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # apt itself is not available on this system; nothing more to try.
            return

def reencode_to_h264(src_path: str|Path) -> Path:
    ensure_ffmpeg()
    src = Path(src_path)
    assert src.exists(), f"File not found: {src}"
    dst = src.with_name(src.stem + "_h264.mp4")
    if dst.exists() and dst.stat().st_size > 0:
        return dst
    cmd = (
        f'ffmpeg -y -i {shlex.quote(str(src))} '
        f'-vcodec libx264 -pix_fmt yuv420p -profile:v baseline -level 3.0 '
        f'-movflags +faststart -an {shlex.quote(str(dst))} -loglevel error'
    )
    subprocess.run(cmd, shell=True, check=True)
    assert dst.exists() and dst.stat().st_size > 0, "Re-encode failed: output empty"
    return dst

def show_video(path: str|Path, width: int = 560, force_reencode: bool = True):
    p = Path(path)
    assert p.exists(), f"Input path missing: {p}"
    if force_reencode:
        p = reencode_to_h264(p)
    # читаем и встраиваем как base64 — обходим капризы Video(...)
    with open(p, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    html = f"""
    <video width="{width}" controls playsinline>
      <source src="data:video/mp4;base64,{b64}" type="video/mp4">
      Your browser does not support the video tag.
    </video>
    <div style="font-size:12px;color:#666">File: {p} • {p.stat().st_size/1024:.1f} KB</div>
    """
    return HTML(html)

# Example: preview the "christmas" clip.
display(show_video(OUTPUT_FILES['christmas'], width=320, force_reencode=True))

In [None]:
from tqdm.auto import tqdm
from textwrap import fill


def print_f(text):
    """Print ``text`` re-wrapped by ``textwrap.fill`` to a width of 120 characters.

    Existing line breaks are not kept: ``fill`` treats the text as a single
    paragraph.
    """
    wrapped = fill(text, width=120)
    print(wrapped)


def compare_answers(prompt):
    """Ask every model the same question about every clip and print the answers together.

    Each model is loaded in turn through ``ModelOutputBuilder``, queried on all
    clips registered in ``OUTPUT_FILES`` and released before the next one is
    loaded. Afterwards, for each clip, the video preview, the question and one
    answer per model are shown.

    Args:
        prompt: Question or instruction sent to every model.
    """
    models = (
        (Qwen2VLForConditionalGeneration, "Qwen/Qwen2-VL-2B-Instruct"),
        (Qwen2_5_VLForConditionalGeneration, "Qwen/Qwen2.5-VL-3B-Instruct"),
        (Qwen3VLForConditionalGeneration, "Qwen/Qwen3-VL-4B-Instruct"),
    )

    # clip name -> {short model name -> answer}
    answers = {}

    for model_type, model_name in models:
        print_f(f"Started processing by model: {model_name}")
        # The part after the organisation prefix labels the answers.
        short_name = model_name.split("/")[-1]
        with ModelOutputBuilder(model_type, model_name) as model:
            for name in tqdm(list(OUTPUT_FILES)):
                answers.setdefault(name, {})[short_name] = model(name, prompt)

        gc.collect()

    for name, per_model in answers.items():
        print_f(f"Video: {name}")
        preview = show_video(OUTPUT_FILES[name], width=320, force_reencode=True)
        display(preview)

        print()
        print_f(f"Q: {prompt}")

        for model_name, model_answer in per_model.items():
            print_f(f"A {model_name}: {model_answer}")

        print()
        print_f("---------------------------------")
        print()

In [None]:
# Baseline: open-ended description requested in English.
compare_answers("Describe the video")

In [None]:
# Same request in Russian ("Describe the video"), without naming the answer language.
compare_answers("Опиши видео")

In [None]:
# Russian prompt that explicitly asks for a Russian answer ("Describe the video in Russian").
compare_answers("Опиши видео на русском языке")

In [None]:
# Counting question: "How many people are in the video?"
compare_answers("Сколько людей находятся на видео?")

In [None]:
# Open-ended classification: "Name the genre of the video".
compare_answers("Назови жанр видео")

In [None]:
# Closed-set classification: "What is the genre of this video fragment:
# 1. Film  2. Video game trailer  3. Cartoon  4. Lecture  5. Other. Answer with a single digit".
compare_answers("Какой жанр у данного видеофрагмента:\n1. Фильм\n2. Трейлер видеоигры\n3. Мультфильм\n4. Лекция\n5. Другое\nОтветь одной цифрой")