In [1]:
from transformers import AutoProcessor
# Load the processor for the local Qwen2-VL-2B-Instruct checkpoint.
# min_pixels / max_pixels are forwarded to the processor; they are expressed here
# as multiples of 28*28 (256 and 1024 of them respectively).
# NOTE(review): a later cell builds the same processor again from MODEL_ID with
# identical limits, so this cell looks redundant - confirm before removing.
processor = AutoProcessor.from_pretrained(
    r"A:\hf_models\Qwen2-VL-2B-Instruct",
    min_pixels=256*28*28,
    max_pixels=1024*28*28,
)


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [None]:
import os
import cv2
import time
import json
import re
from collections import deque
import numpy as np
from PIL import Image

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


# =========================
# USER CONFIG
# =========================
# Video file to analyse. NOTE(review): absolute, machine-specific path - edit
# before running on another machine.
VIDEO_PATH = r"A:\Context Aware CCTV Surviellance system\input.mp4"

# Model location passed to from_pretrained(); here a local checkpoint directory.
# You can switch to quantized variants if you choose to download them instead:
# MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct-AWQ"
# MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4"
MODEL_ID = r"A:\hf_models\Qwen2-VL-2B-Instruct"


# Sampling and gating
SAMPLE_EVERY_SEC = 1.0          # seconds of video between sampled frames (~1 frame per second)
MOTION_MEANABS_THRESH = 6.0     # motion_score() below this skips the VLM; lower -> more sensitive, higher -> fewer VLM calls

# Alert smoothing
WINDOW_N = 5                    # number of most recent VLM decisions kept in the sliding window
TRIGGER_K = 2                   # raise an alert when at least K of those decisions are positive
ALERT_COOLDOWN_SEC = 15         # minimum gap, in seconds, between two ALERT events (avoids spamming)

# Output
OUT_DIR = "outputs"                                     # directory for generated files (relative to the working directory)
LOG_JSONL = os.path.join(OUT_DIR, "cctv_events.jsonl")  # event log, one JSON object per line


# =========================
# HELPERS
# =========================
def ensure_dir(p):
    """Create directory ``p`` together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    os.makedirs(name=p, exist_ok=True)

def fmt_hhmmss(seconds: float) -> str:
    """Render a duration in seconds as zero-padded ``HH:MM:SS``.

    Negative inputs are clamped to zero and fractional seconds are truncated.
    Hours are not wrapped at 24 (or 99); longer durations just use more digits.
    """
    total = int(max(0, seconds))
    whole_minutes, secs = divmod(total, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

def extract_json(text: str):
    """
    Best-effort extraction of a JSON value from model output.

    Strategy:
      1. If the whole (stripped) text is valid JSON, return the parsed value
         (whatever its type).
      2. Otherwise scan left to right and return the first ``{...}`` object
         that parses on its own. Text before and after that object - chatty
         preambles, markdown fences, stray braces, further objects - is ignored.

    Args:
        text: raw model output, or None.

    Returns:
        The parsed JSON value, or None when ``text`` is None or nothing parses.
    """
    if text is None:
        return None
    text = text.strip()

    # common case: valid JSON already
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    # A greedy "first '{' to last '}'" match breaks as soon as the text holds a
    # stray brace or a second object. raw_decode() instead parses exactly one
    # JSON value starting at the given offset and stops at its matching brace,
    # so we try every '{' in turn until one decodes.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
            return obj
        except (ValueError, RecursionError):
            start = text.find("{", start + 1)
    return None

def write_jsonl(path, obj):
    """Append ``obj`` to the file at ``path`` as a single JSON line.

    The file is opened in append mode as UTF-8 and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        record = json.dumps(obj, ensure_ascii=False)
        sink.write(f"{record}\n")

def motion_score(prev_bgr, curr_bgr):
    """
    Cheap change detector used to gate the expensive VLM call.

    Each frame is converted from BGR to grayscale and resized to 160x90; the
    score is the mean absolute difference between the two thumbnails, returned
    as a plain Python float.
    """
    thumb_size = (160, 90)
    prev_small, curr_small = (
        cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), thumb_size)
        for frame in (prev_bgr, curr_bgr)
    )
    diff = cv2.absdiff(prev_small, curr_small)
    return float(np.mean(diff))


# =========================
# LOAD MODEL
# =========================
# Make sure the log directory exists before anything tries to write to it.
ensure_dir(OUT_DIR)

# Reduce VRAM/time by limiting visual tokens during preprocessing.
# These bounds are a practical starting point; raise max_pixels if the model
# needs more image detail.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    max_pixels=1024 * 28 * 28,
    min_pixels=256 * 28 * 28,
)

# dtype and device placement are both left to the library ("auto").
# nn.Module.eval() returns the module itself, so it can be chained here.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
).eval()

print("Model device:", next(model.parameters()).device)


def vlm_analyze(frames_pil, ts_text):
    """
    Ask the VLM whether a short burst of CCTV frames shows a critical event.

    Args:
        frames_pil: list of 3 PIL images in chronological order (oldest first).
        ts_text: timestamp string that is embedded in the prompt.

    Returns:
        (result, txt) where
          - result is a dict with the keys "critical", "type", "confidence"
            and "description". Keys missing from the model's JSON are filled
            with defaults; if no JSON object could be parsed a fixed
            non-critical fallback dict is returned.
          - txt is the decoded model reply only (the prompt is not included).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"You are a CCTV safety monitor. Timestamp {ts_text}. Frames are in time order."},
                {"type": "image", "image": frames_pil[0]},
                {"type": "image", "image": frames_pil[1]},
                {"type": "image", "image": frames_pil[2]},
                {"type": "text", "text":
                    "Task: describe what is happening across these 3 frames and decide if it is critical/suspicious.\n"
                    "Critical/suspicious examples: robbery/theft, assault/fight, weapon visible, accident/crash, fire/smoke, vandalism/destruction, forced entry/break-in.\n"
                    "If it's normal activity, mark it non-critical.\n"
                    "Return ONLY JSON exactly in this schema:\n"
                    "{"
                    "\"critical\": true/false,"
                    "\"type\": \"robbery_theft|assault_fight|weapon_visible|accident_crash|fire_smoke|vandalism_destruction|forced_entry|normal|unknown\","
                    "\"confidence\": 0.0-1.0,"
                    "\"description\": \"short\""
                    "}\n"
                    "If unsure, set critical=false, type=\"unknown\", confidence<=0.4."
                },
            ],
        }
    ]

    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=True,
    )

    images, videos = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        gen_ids = model.generate(**inputs, max_new_tokens=160)

    # generate() returns prompt tokens followed by the new tokens. Decode only
    # the newly generated part: the prompt itself contains the brace-delimited
    # schema, which would otherwise be fed to the JSON extractor and end up in
    # the returned raw text instead of the model's actual answer.
    prompt_len = inputs["input_ids"].shape[1]
    reply_ids = gen_ids[:, prompt_len:]

    txt = processor.batch_decode(reply_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    j = extract_json(txt)

    if isinstance(j, dict):
        # normalize fields: make sure every expected key is present
        j.setdefault("critical", False)
        j.setdefault("type", "unknown")
        j.setdefault("confidence", 0.0)
        j.setdefault("description", "")
        return j, txt

    # fallback: the reply was not a JSON object; report a safe non-critical result
    return {
        "critical": False,
        "type": "unknown",
        "confidence": 0.0,
        "description": "Model output was not valid JSON",
    }, txt


# =========================
# VIDEO LOOP
# =========================
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {VIDEO_PATH}")

# Everything below runs inside try/finally so the capture handle is released
# even when the VLM call or a log write raises part-way through the video.
try:
    # A reported FPS of 0 is falsy, in which case 25.0 is assumed.
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    sample_every_frames = max(1, int(fps * SAMPLE_EVERY_SEC))

    print("FPS:", fps, "sample_every_frames:", sample_every_frames)

    frame_idx = 0            # number of frames read so far
    last_sample_bgr = None   # previous sampled frame, reference for the motion gate

    # The three most recent sampled frames (oldest first) given to the VLM.
    sample_buf_bgr = deque(maxlen=3)

    # for smoothing alerts
    decision_window = deque(maxlen=WINDOW_N)
    # Video time (seconds) of the last alert. The cooldown is measured on the
    # video timeline, like every other timestamp in the log, so it does not
    # depend on how fast this machine happens to process the file.
    last_alert_time = float("-inf")

    while True:
        ok, frame_bgr = cap.read()
        if not ok:
            break

        cur_idx = frame_idx
        frame_idx += 1

        # Only every sample_every_frames-th frame is analysed.
        if cur_idx % sample_every_frames != 0:
            continue

        t_sec = cur_idx / fps
        ts = fmt_hhmmss(t_sec)

        # motion gate: the first sample has nothing to compare against, so it
        # gets a score far above any threshold
        if last_sample_bgr is None:
            score = 999.0
        else:
            score = motion_score(last_sample_bgr, frame_bgr)

        last_sample_bgr = frame_bgr.copy()

        # keep buffer of sampled frames
        sample_buf_bgr.append(frame_bgr)

        # Lightweight LOG entry (no VLM call) when there is not yet enough
        # temporal context, or when nothing much changed since the last sample.
        skip_note = None
        if len(sample_buf_bgr) < 3:
            skip_note = "warming_up_buffer"
        elif score < MOTION_MEANABS_THRESH:
            skip_note = "no_significant_change"

        if skip_note is not None:
            evt = {
                "ts": ts,
                "t_sec": round(t_sec, 3),
                "tag": "LOG",
                "motion_score": round(score, 3),
                "note": skip_note,
            }
            print(f"{ts} | LOG   | motion={evt['motion_score']} | {skip_note}")
            write_jsonl(LOG_JSONL, evt)
            continue

        # Prepare 3 frames for the VLM (OpenCV frames are BGR, PIL expects RGB)
        frames_pil = [
            Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
            for bgr in sample_buf_bgr
        ]

        result, raw_text = vlm_analyze(frames_pil, ts_text=ts)

        # The values come from model-written JSON, so coerce defensively:
        # bool("false") would be True, and float("high") / float(None) would
        # raise and abort the whole run.
        raw_critical = result.get("critical", False)
        if isinstance(raw_critical, str):
            critical = raw_critical.strip().lower() in ("true", "yes", "1")
        else:
            critical = bool(raw_critical)

        try:
            conf = float(result.get("confidence", 0.0))
        except (TypeError, ValueError):
            conf = 0.0

        typ = str(result.get("type", "unknown"))
        desc = str(result.get("description", ""))

        # Decide alert vs log using smoothing
        positive = critical and conf >= 0.70
        decision_window.append(1 if positive else 0)

        cooled_down = (t_sec - last_alert_time) > ALERT_COOLDOWN_SEC
        should_alert = (sum(decision_window) >= TRIGGER_K) and cooled_down

        tag = "ALERT" if should_alert else "LOG"
        if should_alert:
            last_alert_time = t_sec

        # "unethical" tag requirement: mark unethical when critical
        unethical = bool(critical)

        evt = {
            "ts": ts,
            "t_sec": round(t_sec, 3),
            "tag": tag,
            "unethical": unethical,
            "motion_score": round(score, 3),
            "critical": critical,
            "type": typ,
            "confidence": round(conf, 3),
            "description": desc[:240],
            "model_raw_preview": raw_text[:240],
            "window_sum": int(sum(decision_window)),
        }

        print(f"{ts} | {tag:<5} | unethical={unethical} critical={critical} conf={evt['confidence']} type={typ} | {desc[:120]}")
        write_jsonl(LOG_JSONL, evt)
finally:
    cap.release()

print("Done. Logs:", LOG_JSONL)


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch
from PIL import Image
from transformers import pipeline, BitsAndBytesConfig

# 4-bit NF4 quantization with float16 compute to cut the model's memory footprint.
quant = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# MODEL_ID comes from the configuration defined earlier in the notebook.
pipe = pipeline(
    "image-text-to-text",
    model=MODEL_ID,
    model_kwargs={"quantization_config": quant, "device_map": "auto"},
)

# Confirm device (should say cuda:0 if GPU)
print(next(pipe.model.parameters()).device)

# convert() returns a new in-memory image, so the file handle can be closed
# right away instead of being left open until garbage collection.
with Image.open("test.png") as src_img:
    img = src_img.convert("RGB")

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": img},
        {"type": "text", "text":
            "Traffic CCTV task: decide if there is a road accident/collision/crash in this frame. "
            "Reply with exactly one sentence."
        },
    ],
}]

# The prompt asks for a full sentence; a 5-token budget would cut the reply off
# after a few words, so leave room for a complete one-sentence answer.
out = pipe(text=messages, max_new_tokens=64, max_length=None, return_full_text=False)
print(out[0]["generated_text"])


No model was supplied, defaulted to llava-hf/llava-onevision-qwen2-0.5b-ov-hf and revision 2c9ba3b.
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.79G [00:00<?, ?B/s]