In [1]:
!nvidia-smi

import torch

# Select the compute device once: 'cuda' when PyTorch can see a GPU, otherwise 'cpu',
# and tell the user which one was picked.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(
    'Using GPU'
    if device == 'cuda'
    else 'CUDA not available. Please connect to a GPU instance if possible.'
)

/bin/bash: line 1: nvidia-smi: command not found
CUDA not available. Please connect to a GPU instance if possible.


In [2]:
# Fetch the Cutie sources, enter the checkout, and install the package in
# editable mode (the pip step also resolves the package's declared dependencies).
!git clone https://github.com/hkchengrex/Cutie.git
%cd Cutie
!pip install -e .

Cloning into 'Cutie'...
remote: Enumerating objects: 609, done.[K
remote: Counting objects: 100% (238/238), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 609 (delta 199), reused 165 (delta 165), pack-reused 371 (from 1)[K
Receiving objects: 100% (609/609), 2.81 MiB | 10.69 MiB/s, done.
Resolving deltas: 100% (308/308), done.
/content/Cutie
Obtaining file:///content/Cutie
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting thinplate@ git+https://github.com/cheind/py-thin-plate-spline (from cutie==1.0.0)
  Cloning https://github.com/cheind/py-thin-plate-spline to /tmp/pip-install-epn_ji0t/thinplate_dc715d241d094420a98e2dd00eef5901
  Running command git clone --filter=blob:none --quiet https:/

In [3]:
# Run the repository's download script from the checkout root; per its log it
# stores the pretrained checkpoints under /content/Cutie/weights.
%cd /content/Cutie
!python cutie/utils/download_models.py

/content/Cutie
Downloading coco_lvis_h18_itermask.pth to /content/Cutie/cutie/utils/../../weights...
100% 40.7M/40.7M [00:01<00:00, 26.8MiB/s]
Downloading cutie-base-mega.pth to /content/Cutie/cutie/utils/../../weights...
100% 140M/140M [00:05<00:00, 26.6MiB/s]


In [6]:
%cd /content/Cutie/

import torch
from omegaconf import open_dict
from hydra import compose, initialize_config_dir
from hydra.core.global_hydra import GlobalHydra

from cutie.model.cutie import CUTIE
from cutie.inference.utils.args_utils import get_dataset_cfg

# ---- choose device ----
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# To force CPU execution even when a GPU is available, uncomment the next line:
# DEVICE = "cpu"

# ---- clear hydra (notebook) ----
# Drop any Hydra state left over from a previous run of this cell before
# initializing it again below.
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

with torch.inference_mode():
    # Point Hydra at the config directory inside the checkout and compose the
    # evaluation config.
    initialize_config_dir(
        version_base="1.3.2",
        config_dir="/content/Cutie/cutie/config",
        job_name="eval_config",
    )
    cfg = compose(config_name="eval_config")

    # open_dict lets keys be written into the composed config.
    with open_dict(cfg):
        # Relative path: resolves against the current working directory.
        cfg["weights"] = "./weights/cutie-base-mega.pth"
        # Fill in defaults only when the composed config does not provide them.
        if cfg.get("mem_every", None) is None:
            cfg["mem_every"] = 5
        if cfg.get("stagger_updates", None) is None:
            cfg["stagger_updates"] = 0

    # Return value is discarded; presumably called for its effect on cfg -- TODO confirm.
    _ = get_dataset_cfg(cfg)

    # Build the network on DEVICE in eval mode, then load the checkpoint onto
    # the same device.
    cutie = CUTIE(cfg).to(DEVICE).eval()
    model_weights = torch.load(cfg.weights, map_location=DEVICE)
    cutie.load_weights(model_weights)

print("CUTIE loaded OK:", cfg.weights, "| device:", DEVICE)


/content/Cutie
CUTIE loaded OK: ./weights/cutie-base-mega.pth | device: cpu


In [10]:
import os, cv2, tempfile
import numpy as np
from PIL import Image
import torch
import gradio as gr
from omegaconf import open_dict

import traceback
from cutie.inference.inference_core import InferenceCore
from gui.interactive_utils import image_to_torch, torch_prob_to_numpy_mask, index_numpy_to_one_hot_torch, overlay_davis

# Initial content of the "Video path" textbox in the UI below.
DEFAULT_VIDEO = "echo[1].mp4"
# Same device selection as in the model-loading cell; run_track uses it for
# tensor placement and for enabling autocast.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def _resolve_video_path(p):
    cand = [p, os.path.join("/content", p)]
    for c in cand:
        if os.path.exists(c):
            return c
    raise gr.Error(f"Video not found: {p}")

def _get_video_info(video_path):
    """Probe a video file for its basic stream properties.

    Args:
        video_path: Path of the video to open with OpenCV.

    Returns:
        ``(fps, frame_count, width, height)``. ``fps`` is 30.0 when the
        reported value is None or not above 1e-3; each integer field is 0 when
        OpenCV reports a falsy value for it.

    Raises:
        gr.Error: If OpenCV cannot open ``video_path``.
    """
    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        raise gr.Error(f"Cannot open video: {video_path}")

    reported_fps = capture.get(cv2.CAP_PROP_FPS)
    # The generator is consumed by the unpacking, i.e. before release() runs.
    frame_count, width, height = (
        int(capture.get(prop) or 0)
        for prop in (
            cv2.CAP_PROP_FRAME_COUNT,
            cv2.CAP_PROP_FRAME_WIDTH,
            cv2.CAP_PROP_FRAME_HEIGHT,
        )
    )
    capture.release()

    if reported_fps is None or reported_fps <= 1e-3:
        fps = 30.0
    else:
        fps = float(reported_fps)
    return fps, frame_count, width, height

def _read_frame(video_path, frame_idx):
    """Decode one frame of a video by index.

    Args:
        video_path: Path of the video to open with OpenCV.
        frame_idx: Index of the frame to seek to (converted with ``int``).

    Returns:
        ``(frame, frame_pil)``: the array exactly as returned by
        ``VideoCapture.read`` and a PIL image built from its BGR->RGB
        conversion.

    Raises:
        gr.Error: If the video cannot be opened or the frame cannot be read.
    """
    reader = cv2.VideoCapture(video_path)
    if not reader.isOpened():
        raise gr.Error(f"Cannot open video: {video_path}")

    reader.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
    success, bgr = reader.read()
    reader.release()

    if bgr is None or not success:
        raise gr.Error(f"Failed to read frame {frame_idx}")
    return bgr, Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

def _editor_value_from_frame(frame_pil):
    # 关键：让你“在这一帧上画”，不是黑底
    # ImageEditor 需要 composite 字段，否则你之前会 KeyError
    return {"background": frame_pil, "layers": [], "composite": frame_pil}

def _mask_from_editor(editor_value):
    """
    从 ImageEditor 取 mask：用 composite 和 background 的像素差分得到前景区域
    你在帧上画的地方会改变 composite 像素 -> diff>阈值 -> mask=1
    """
    if editor_value is None:
        raise gr.Error("Mask editor is empty. Please paint on the frame.")
    bg = editor_value.get("background", None)
    comp = editor_value.get("composite", None) or bg
    if bg is None or comp is None:
        raise gr.Error("ImageEditor returned no background/composite.")

    bg = bg.convert("RGB")
    comp = comp.convert("RGB")
    bg_arr = np.array(bg).astype(np.int16)
    cp_arr = np.array(comp).astype(np.int16)

    if bg_arr.shape != cp_arr.shape:
        raise gr.Error("Editor output size mismatch. Try reloading the frame.")

    diff = np.abs(cp_arr - bg_arr).sum(axis=-1)  # H,W
    mask = (diff > 25).astype(np.uint8)          # 阈值可调：越大越不敏感
    return mask

def _save_overlay_video(frames_bgr, fps, out_mp4):
    """Write a sequence of frames to a video file using the "mp4v" codec.

    Args:
        frames_bgr: Non-empty sequence of image arrays; the output size is
            taken from the first one (presumably all frames share it).
        fps: Frame rate passed to the writer.
        out_mp4: Destination file path.
    """
    height, width = frames_bgr[0].shape[:2]
    # OpenCV expects the frame size as (width, height).
    writer = cv2.VideoWriter(out_mp4, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
    for frame in frames_bgr:
        writer.write(frame)
    writer.release()


def load_video(video_path_str):
    """Gradio callback: open a video and prime the UI with its first frame.

    Args:
        video_path_str: Path typed into the "Video path" textbox.

    Returns:
        A 7-tuple: resolved video path, first frame (PIL), initial editor
        value, and slider updates for the frame index, frames_to_propagate
        and max_internal_size, followed by an info string.

    Raises:
        gr.Error: Propagated from the path/video helpers.
    """
    resolved = _resolve_video_path(video_path_str)
    fps, n, w, h = _get_video_info(resolved)
    first_frame = _read_frame(resolved, 0)[1]

    # Frame picker covers indices 0..n-1.
    frame_slider = gr.update(minimum=0, maximum=max(0, n - 1), value=0, step=1)

    # Cap for frames_to_propagate: n-1 minus the current frame (0 here),
    # never below 1.
    prop_cap = max(1, n - 1)
    prop_slider = gr.update(minimum=1, maximum=prop_cap, value=min(200, prop_cap), step=1)

    # max_internal_size: the slider tops out at the longer image side and
    # defaults to 720 (or the longer side if smaller). Smaller values are
    # faster but blurrier; the default can be changed to e.g. 640 or 800.
    longer_side = max(w, h)
    size_slider = gr.update(
        minimum=256,
        maximum=max(256, longer_side),
        value=min(longer_side, 720),
        step=32,
    )

    summary = f"Loaded: {resolved} | frames={n} | fps={fps:.2f} | size={w}x{h}"
    return (
        resolved,
        first_frame,
        _editor_value_from_frame(first_frame),
        frame_slider,
        prop_slider,
        size_slider,
        summary,
    )


def show_frame(video_path_str, frame_idx):
    """Gradio callback: load the selected frame for annotation.

    Args:
        video_path_str: Path typed into the "Video path" textbox.
        frame_idx: Value of the frame-index slider.

    Returns:
        A 3-tuple: the frame (PIL), a fresh editor value built from it, and a
        slider update for frames_to_propagate.

    Raises:
        gr.Error: Propagated from the path/video helpers.
    """
    resolved = _resolve_video_path(video_path_str)
    _fps, n, _w, _h = _get_video_info(resolved)

    frame_idx = int(frame_idx)
    frame_pil = _read_frame(resolved, frame_idx)[1]

    # Switching frames changes how many frames follow the current one
    # (n-1-frame_idx), so the propagation cap is recomputed; never below 1.
    prop_cap = max(1, n - 1 - frame_idx)
    return (
        frame_pil,
        _editor_value_from_frame(frame_pil),
        gr.update(minimum=1, maximum=prop_cap, value=min(200, prop_cap), step=1),
    )

def run_track(video_path_str, start_frame_idx, editor_value, frames_to_propagate, max_internal_size):
    """Propagate a painted mask through a video with CUTIE and render an overlay clip.

    Args:
        video_path_str: Path typed into the "Video path" textbox.
        start_frame_idx: Index of the annotated frame; processing starts there.
        editor_value: ImageEditor value holding the painted frame.
        frames_to_propagate: Number of frames to process, counting the start
            frame itself; clamped to ``[1, (n - 1) - start_frame_idx]``.
        max_internal_size: Value written to ``cfg["max_internal_size"]``
            before the inference processor is created.

    Returns:
        ``(overlay_mp4, status)``: path of the rendered overlay video inside a
        fresh temporary directory, and a one-line summary string.

    Raises:
        gr.Error: If the video cannot be found/opened, no frames remain after
            the start frame, the mask is (nearly) empty, or no frame could be
            processed.

    Note:
        Relies on the notebook globals ``cfg``, ``cutie`` and ``DEVICE`` and
        mutates ``cfg`` in place.
    """
    vp = _resolve_video_path(video_path_str)
    fps, n, w, h = _get_video_info(vp)

    start_frame_idx = int(start_frame_idx)
    frames_to_propagate = int(frames_to_propagate)
    max_internal_size = int(max_internal_size)

    remaining = (n - 1) - start_frame_idx
    if remaining <= 0:
        raise gr.Error(f"No remaining frames from start_frame={start_frame_idx}. video_frames={n}")
    frames_to_propagate = max(1, min(frames_to_propagate, remaining))

    mask_index = _mask_from_editor(editor_value)
    if mask_index.sum() < 10:
        raise gr.Error("Mask too small / empty. Please paint a larger region on the frame.")

    # Important: bring the mask back to the video's resolution; nearest
    # neighbour keeps it strictly 0/1.
    if mask_index.shape[0] != h or mask_index.shape[1] != w:
        mask_index = cv2.resize(mask_index, (w, h), interpolation=cv2.INTER_NEAREST)

    # Store the requested size in the shared config before building the processor.
    with open_dict(cfg):
        cfg["max_internal_size"] = max_internal_size

    processor = InferenceCore(cutie, cfg=cfg)

    cap = cv2.VideoCapture(vp)
    if not cap.isOpened():
        raise gr.Error(f"Cannot open video: {vp}")

    overlay_frames_bgr = []
    current = 0

    # try/finally guarantees the capture is released even when inference
    # raises part-way through the clip.
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame_idx)
        torch.cuda.empty_cache()

        with torch.inference_mode():
            # Autocast is only enabled on CUDA; on CPU the context is a no-op.
            amp_device = "cuda" if DEVICE == "cuda" else "cpu"
            with torch.amp.autocast(device_type=amp_device, enabled=(DEVICE == "cuda")):
                while cap.isOpened() and current < frames_to_propagate:
                    ok, frame = cap.read()
                    if (not ok) or frame is None:
                        break

                    # NOTE(review): `frame` is passed on exactly as OpenCV decoded it
                    # (BGR order) -- confirm that image_to_torch / the model expect that.
                    frame_torch = image_to_torch(frame, device=DEVICE)

                    if current == 0:
                        # Print to confirm max_internal_size really made it into cfg.
                        print("cfg.max_internal_size =", cfg.get("max_internal_size", None))

                        # One object => two classes (background + object); only the
                        # object channel(s) are handed to the processor.
                        mask_torch = index_numpy_to_one_hot_torch(mask_index, 2).to(DEVICE)
                        pred = processor.step(frame_torch, mask_torch[1:], idx_mask=False)
                    else:
                        pred = processor.step(frame_torch)

                    pred_index = torch_prob_to_numpy_mask(pred)  # 0/1

                    vis = overlay_davis(frame, pred_index)

                    # Defensive normalisation: accept a PIL image or a non-uint8
                    # array from overlay_davis and always write uint8 BGR.
                    if isinstance(vis, Image.Image):
                        vis = np.array(vis)
                    if vis.dtype != np.uint8:
                        vis = vis.astype(np.uint8)

                    # Channel-order heuristic: keep `vis` as-is if it is at least as
                    # close to the original BGR frame as its RGB->BGR swapped version;
                    # otherwise use the swapped one.
                    if vis.ndim == 3 and vis.shape[2] == 3:
                        diff_as_bgr = np.mean(np.abs(vis.astype(np.int16) - frame.astype(np.int16)))
                        vis_as_bgr_from_rgb = cv2.cvtColor(vis, cv2.COLOR_RGB2BGR)
                        diff_as_rgb = np.mean(np.abs(vis_as_bgr_from_rgb.astype(np.int16) - frame.astype(np.int16)))
                        vis_bgr = vis if diff_as_bgr <= diff_as_rgb else vis_as_bgr_from_rgb
                    else:
                        vis_bgr = frame  # unexpected layout: fall back to the raw frame

                    overlay_frames_bgr.append(vis_bgr)
                    current += 1
    finally:
        cap.release()

    if len(overlay_frames_bgr) == 0:
        raise gr.Error("No frames processed. Check video path / start_frame.")

    out_dir = tempfile.mkdtemp(prefix="cutie_ui_")
    overlay_mp4 = os.path.join(out_dir, "overlay.mp4")
    _save_overlay_video(overlay_frames_bgr, fps, overlay_mp4)

    status = (
        f"Done. video={os.path.basename(vp)} | size={w}x{h} | fps={fps:.2f} | "
        f"start={start_frame_idx} | processed={len(overlay_frames_bgr)} | "
        f"frames_to_propagate(clamped)={frames_to_propagate} | max_internal_size={max_internal_size}"
    )
    return overlay_mp4, status


def run_track_safe(video_path_str, start_frame_idx, editor_value, frames_to_propagate, max_internal_size):
    """Run ``run_track`` and surface any failure to the UI as a ``gr.Error``.

    Takes the same arguments as ``run_track`` and returns its result
    unchanged. The traceback of every failure is printed to the cell output.

    Raises:
        gr.Error: The original ``gr.Error`` if ``run_track`` raised one,
            otherwise a new one carrying ``str(e)`` and chained to the cause.
    """
    try:
        return run_track(video_path_str, start_frame_idx, editor_value, frames_to_propagate, max_internal_size)
    except Exception as e:
        traceback.print_exc()
        if isinstance(e, gr.Error):
            # Already a user-facing error: forward it untouched instead of
            # re-stringifying its message into a second gr.Error.
            raise
        raise gr.Error(str(e)) from e



# Gradio UI: load a video, pick a frame, paint a mask on it, then run CUTIE
# from that frame and preview the overlay result.
with gr.Blocks() as demo:
    gr.Markdown("## CUTIE (Preview video in Gradio + draw mask on a selected frame)")

    video_path = gr.Textbox(label="Video path (in /content)", value=DEFAULT_VIDEO)
    with gr.Row():
        load_btn = gr.Button("Load video")
        info = gr.Textbox(label="Info", interactive=False)

    with gr.Row():
        orig_video = gr.Video(label="Original Video (preview here)")
        overlay_video = gr.Video(label="Overlay Video (result preview)")

    gr.Markdown("### 1) Use the video player to preview (pause/seek).  2) Choose a frame index below to annotate (Gradio can't read the paused timestamp).")

    with gr.Row():
        # Range is a placeholder; load_video sets the real 0..n-1 bounds.
        frame_idx = gr.Slider(0, 0, value=0, step=1, label="Frame index to annotate (acts like pause point)")
        show_btn = gr.Button("Load this frame for annotation")

    with gr.Row():
        frame_view = gr.Image(label="Selected Frame", type="pil", interactive=False)
        mask_editor = gr.ImageEditor(label="Paint directly ON the frame (your strokes define the mask)", type="pil")

    # Placeholder range [1, 1] with an in-range initial value; load_video and
    # show_frame replace both with the number of remaining frames.
    frames_to_prop = gr.Slider(1, 1, value=1, step=1, label="frames_to_propagate (auto max = remaining frames)")

    max_internal_size = gr.Slider(
        256, 1024, value=720, step=32,
        label="max_internal_size (max internal side; smaller=faster, lower quality)",
    )

    run_btn = gr.Button("Run CUTIE from this frame")
    status = gr.Textbox(label="Status", interactive=False)

    # The order of `outputs` must match the tuple returned by each callback.
    load_btn.click(
        load_video,
        inputs=[video_path],
        outputs=[orig_video, frame_view, mask_editor, frame_idx, frames_to_prop, max_internal_size, info],
        queue=False,
    )

    show_btn.click(
        show_frame,
        inputs=[video_path, frame_idx],
        outputs=[frame_view, mask_editor, frames_to_prop],
        queue=False,
    )

    run_btn.click(
        run_track_safe,
        inputs=[video_path, frame_idx, mask_editor, frames_to_prop, max_internal_size],
        outputs=[overlay_video, status],
        # Queued on purpose: the original author found this more stable and
        # that it avoids "Unexpected token '<'" errors.
        queue=True,
    )

# debug=True keeps the cell running so errors and logs stay visible in the output.
demo.launch(debug=True)


  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://eae439bf074393f571.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  with torch.cuda.amp.autocast(enabled=False):


  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):


cfg.max_internal_size = 720


  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://eae439bf074393f571.gradio.live




In [11]:
# Locate where max_internal_size is defined and used inside the cutie package
# (head keeps only the first lines of the match list).
!grep -R "max_internal_size" -n /content/Cutie/cutie | head


grep: /content/Cutie/cutie/inference/__pycache__/inference_core.cpython-312.pyc: binary file matches
/content/Cutie/cutie/config/video_config.yaml:20:max_internal_size: 480
/content/Cutie/cutie/config/gui_config.yaml:17:max_internal_size: 480
/content/Cutie/cutie/config/eval_config.yaml:23:max_internal_size: -1
/content/Cutie/cutie/inference/inference_core.py:31:        self.max_internal_size = cfg.max_internal_size
/content/Cutie/cutie/inference/inference_core.py:208:        if self.max_internal_size > 0:
/content/Cutie/cutie/inference/inference_core.py:211:            if min_side > self.max_internal_size:
/content/Cutie/cutie/inference/inference_core.py:213:                new_h = int(h / min_side * self.max_internal_size)
/content/Cutie/cutie/inference/inference_core.py:214:                new_w = int(w / min_side * self.max_internal_size)
