In [2]:
from huggingface_hub import hf_hub_download

repo_id = "Qwen/Qwen2-VL-2B-Instruct"
local_dir = r"A:\hf_models\Qwen2-VL-2B-Instruct"

for fname in [
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
]:
    path = hf_hub_download(
        repo_id=repo_id,
        filename=fname,
        local_dir=local_dir,
        force_download=True,
    )
    print("Downloaded:", path)


model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Downloaded: A:\hf_models\Qwen2-VL-2B-Instruct\model-00001-of-00002.safetensors


model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Downloaded: A:\hf_models\Qwen2-VL-2B-Instruct\model-00002-of-00002.safetensors


In [1]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(
    r"A:\hf_models\Qwen2-VL-2B-Instruct",
    min_pixels=256*28*28,
    max_pixels=1024*28*28,
)


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [5]:
import os
import cv2
import time
from collections import deque

import numpy as np
from PIL import Image

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


# =========================
# USER CONFIG
# =========================
VIDEO_PATH = r"A:\Context Aware CCTV Surviellance system\accident.mp4"
MODEL_ID = r"A:\hf_models\Qwen2-VL-2B-Instruct"

# Video sampling
SAMPLE_EVERY_SEC = 1.0          # sample ~1 frame per second
MOTION_MEANABS_THRESH = 6.0     # lower -> more sensitive; higher -> fewer VLM calls
HEARTBEAT_SEC = 10.0            # force a description at least every N seconds (even if low motion)

# Alert smoothing (reduce one-off false alerts)
WINDOW_N = 5
TRIGGER_K = 2
ALERT_COOLDOWN_SEC = 15

# Output
OUT_DIR = "outputs"
LOG_TXT = os.path.join(OUT_DIR, "cctv_events.txt")   # simple readable log
LOG_JSONL = os.path.join(OUT_DIR, "cctv_events.jsonl")  # optional structured log


# =========================
# HELPERS
# =========================
def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def fmt_hhmmss(seconds: float) -> str:
    seconds = int(max(0, seconds))
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

def write_line(path, line: str):
    with open(path, "a", encoding="utf-8") as f:
        f.write(line.rstrip() + "\n")

def write_jsonl(path, obj: dict):
    import json
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def motion_score(prev_bgr, curr_bgr) -> float:
    prev = cv2.cvtColor(prev_bgr, cv2.COLOR_BGR2GRAY)
    curr = cv2.cvtColor(curr_bgr, cv2.COLOR_BGR2GRAY)
    prev = cv2.resize(prev, (160, 90))
    curr = cv2.resize(curr, (160, 90))
    return float(np.mean(cv2.absdiff(prev, curr)))


# =========================
# LOAD MODEL
# =========================
ensure_dir(OUT_DIR)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=1024 * 28 * 28,
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
model.eval()

print("Model device:", next(model.parameters()).device)


def vlm_one_line(frames_pil, ts_text: str):
    """
    Returns: (model_tag, description, raw_line)
    model_tag: "ALERT" or "LOG"
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": f"You are monitoring a home CCTV camera. Timestamp {ts_text}. Frames are in time order."},
            {"type": "image", "image": frames_pil[0]},
            {"type": "image", "image": frames_pil[1]},
            {"type": "image", "image": frames_pil[2]},
            {"type": "text", "text":
                "Write exactly ONE LINE.\n"
                "If anything suspicious/unsafe/unethical is happening (robbery/theft, assault/fight, weapon visible, accident/crash, fire/smoke, vandalism/destruction, forced entry), output:\n"
                "ALERT: <short description>\n"
                "Otherwise output:\n"
                "LOG: <short description>\n"
                "Also mention useful non-alert info if visible (e.g., a stray dog/animal entered, delivery person at door).\n"
                "No JSON. No extra lines."
            },
        ],
    }]

    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=True,
    )

    images, videos = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=False,
            temperature=0.0,
        )

    # Decode ONLY the newly generated tokens (avoid prompt echo that breaks parsing/format).
    # This is the key fix for your “output not valid JSON / messy output” issue.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)]
    text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0].strip()

    # Force single line + enforce prefix
    text = " ".join(text.splitlines()).strip()
    up = text.upper()
    if up.startswith("ALERT:"):
        tag = "ALERT"
        desc = text.split(":", 1)[1].strip()
    elif up.startswith("LOG:"):
        tag = "LOG"
        desc = text.split(":", 1)[1].strip()
    else:
        tag = "LOG"
        desc = text

    # Keep it short for logs
    desc = desc[:200].strip()
    raw_line = f"{tag}: {desc}"
    return tag, desc, raw_line


# =========================
# VIDEO LOOP
# =========================
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {VIDEO_PATH}")

fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
sample_every_frames = max(1, int(fps * SAMPLE_EVERY_SEC))
heartbeat_frames = max(1, int(fps * HEARTBEAT_SEC))

print("FPS:", fps, "sample_every_frames:", sample_every_frames)

frame_idx = 0
last_sample_bgr = None

# 3 sampled frames: t-1, t, t+1
sample_buf_bgr = deque(maxlen=3)

# alert smoothing
decision_window = deque(maxlen=WINDOW_N)
last_alert_time = 0

# for better “what is happening” logging
last_desc = "Starting up."
last_model_tag = "LOG"
last_vlm_frame_idx = -10**9

while True:
    ok, frame_bgr = cap.read()
    if not ok:
        break

    if frame_idx % sample_every_frames != 0:
        frame_idx += 1
        continue

    t_sec = frame_idx / fps
    ts = fmt_hhmmss(t_sec)

    # motion score on sampled frames
    if last_sample_bgr is None:
        score = 999.0
    else:
        score = motion_score(last_sample_bgr, frame_bgr)
    last_sample_bgr = frame_bgr.copy()

    sample_buf_bgr.append(frame_bgr)

    # Need 3 frames before calling VLM
    if len(sample_buf_bgr) < 3:
        line = f"{ts} | LOG   | warming_up | {last_desc}"
        print(line)
        write_line(LOG_TXT, line)
        write_jsonl(LOG_JSONL, {"ts": ts, "t_sec": round(t_sec, 3), "tag": "LOG", "note": "warming_up", "desc": last_desc})
        frame_idx += 1
        continue

    # Decide whether to call VLM now
    due_heartbeat = (frame_idx - last_vlm_frame_idx) >= heartbeat_frames
    due_motion = score >= MOTION_MEANABS_THRESH
    should_call_vlm = due_motion or due_heartbeat

    if should_call_vlm:
        frames_pil = []
        for bgr in list(sample_buf_bgr):
            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
            frames_pil.append(Image.fromarray(rgb))

        model_tag, desc, raw_line = vlm_one_line(frames_pil, ts_text=ts)
        last_desc = desc
        last_model_tag = model_tag
        last_vlm_frame_idx = frame_idx

        # Smooth alerts
        positive = (model_tag == "ALERT")
        decision_window.append(1 if positive else 0)

        should_alert = (sum(decision_window) >= TRIGGER_K) and ((time.time() - last_alert_time) > ALERT_COOLDOWN_SEC)
        system_tag = "ALERT" if should_alert else "LOG"
        if should_alert:
            last_alert_time = time.time()

        line = f"{ts} | {system_tag:<5} | motion={score:.3f} | {raw_line}"
        print(line)
        write_line(LOG_TXT, line)
        write_jsonl(LOG_JSONL, {
            "ts": ts,
            "t_sec": round(t_sec, 3),
            "system_tag": system_tag,
            "model_tag": model_tag,
            "motion_score": round(score, 3),
            "description": desc,
            "raw_line": raw_line,
            "window_sum": int(sum(decision_window)),
        })
    else:
        # Even when skipping VLM, keep logs readable by repeating last known description
        line = f"{ts} | LOG   | motion={score:.3f} | last={last_model_tag}: {last_desc}"
        print(line)
        write_line(LOG_TXT, line)
        write_jsonl(LOG_JSONL, {
            "ts": ts,
            "t_sec": round(t_sec, 3),
            "system_tag": "LOG",
            "note": "skipped_vlm_low_motion",
            "motion_score": round(score, 3),
            "last_model_tag": last_model_tag,
            "last_description": last_desc,
        })

    frame_idx += 1

cap.release()
print("Done.")
print("Readable log:", LOG_TXT)
print("JSONL log:", LOG_JSONL)


Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

Model device: cuda:0
FPS: 60.00287306574532 sample_every_frames: 60
00:00:00 | LOG   | warming_up | Starting up.
00:00:00 | LOG   | warming_up | Starting up.
00:00:01 | LOG   | motion=8.672 | LOG: A car has hit a pedestrian on the sidewalk.
00:00:02 | LOG   | motion=9.055 | LOG: A car has crashed into another car on the road.
Done.
Readable log: outputs\cctv_events.txt
JSONL log: outputs\cctv_events.jsonl


In [None]:
import os
import cv2
import time
from collections import deque

import numpy as np
from PIL import Image

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


# =========================
# USER CONFIG
# =========================
VIDEO_PATH = r"A:\Context Aware CCTV Surviellance system\input.mp4"
MODEL_ID = r"A:\hf_models\Qwen2-VL-2B-Instruct"

# SPEED OPTIMIZATIONS:
CLIP_DURATION_SEC = 4.0
FRAMES_PER_CLIP = 4              # Reduced from 5 → 40% faster preprocessing
MOTION_THRESH_CLIP = 0.5         # Skip boring clips (lower = more sensitive)

# Image resolution (lower = faster, but less detail)
MIN_PIXELS = 128 * 28 * 28       # Reduced from 256 → faster image encoding
MAX_PIXELS = 512 * 28 * 28       # Reduced from 1024 → faster image encoding

# Generation
MAX_NEW_TOKENS = 80              # Reduced from 150 → faster generation

# Alert smoothing
WINDOW_N = 3
TRIGGER_K = 2
ALERT_COOLDOWN_SEC = 10

# Output
OUT_DIR = "outputs"
LOG_TXT = os.path.join(OUT_DIR, "cctv_clips.txt")
LOG_JSONL = os.path.join(OUT_DIR, "cctv_clips.jsonl")


# =========================
# HELPERS
# =========================
def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def fmt_hhmmss(seconds: float) -> str:
    seconds = int(max(0, seconds))
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

def fmt_time_range(start_sec: float, end_sec: float) -> str:
    return f"{fmt_hhmmss(start_sec)} - {fmt_hhmmss(end_sec)}"

def write_line(path, line: str):
    with open(path, "a", encoding="utf-8") as f:
        f.write(line.rstrip() + "\n")

def write_jsonl(path, obj: dict):
    import json
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def motion_score(prev_bgr, curr_bgr) -> float:
    prev = cv2.cvtColor(prev_bgr, cv2.COLOR_BGR2GRAY)
    curr = cv2.cvtColor(curr_bgr, cv2.COLOR_BGR2GRAY)
    prev = cv2.resize(prev, (160, 90))
    curr = cv2.resize(curr, (160, 90))
    return float(np.mean(cv2.absdiff(prev, curr)))


# =========================
# LOAD MODEL
# =========================
ensure_dir(OUT_DIR)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,    # Use FP16 for faster inference
    device_map="auto",
)
model.eval()

print("Model device:", next(model.parameters()).device)
print(f"Speed optimizations: {FRAMES_PER_CLIP} frames/clip, {MAX_NEW_TOKENS} max tokens, resolution {MIN_PIXELS//784}-{MAX_PIXELS//784} visual tokens")


def vlm_describe_clip(frames_pil: list, time_range: str):
    """
    frames_pil: list of PIL images sampled from the clip
    Returns: (model_tag, detailed_description, raw_line)
    """
    # Shorter prompt = faster processing
    content = [
        {"type": "text", "text": f"CCTV {time_range}. {len(frames_pil)} frames in order."}
    ]
    for pil_img in frames_pil:
        content.append({"type": "image", "image": pil_img})
    
    content.append({"type": "text", "text":
        "In maximum 2 sentences: what's happening? "
        "If suspicious (lethals, robbery, assault, weapon, accident, fire, vandalism, forced entry, running), start with ALERT:. "
        "Otherwise start with LOG:"
    })

    messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=True,
    )

    images, videos = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        # Mixed precision inference for speed
        with torch.amp.autocast('cuda', dtype=torch.float16):
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                num_beams=1,
                use_cache=True,
            )


    # Decode only new tokens (trim prompt)
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)]
    text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0].strip()

    # Parse tag
    up = text.upper()
    if up.startswith("ALERT:"):
        tag = "ALERT"
        desc = text.split(":", 1)[1].strip() if ":" in text else text
    elif up.startswith("LOG:"):
        tag = "LOG"
        desc = text.split(":", 1)[1].strip() if ":" in text else text
    else:
        tag = "LOG"
        desc = text

    raw_line = f"{tag}: {desc}"
    return tag, desc, raw_line


# =========================
# VIDEO LOOP (CLIP-BASED)
# =========================
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {VIDEO_PATH}")

fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
clip_frames_count = int(fps * CLIP_DURATION_SEC)

print(f"FPS: {fps}, Total frames: {total_frames}, Clip size: {clip_frames_count} frames ({CLIP_DURATION_SEC}s)")
print(f"Estimated {total_frames // clip_frames_count} clips to process\n")

# Alert smoothing
decision_window = deque(maxlen=WINDOW_N)
last_alert_time = 0

clip_num = 0
skipped_low_motion = 0
start_time = time.time()

while True:
    # Calculate clip boundaries
    clip_start_frame = clip_num * clip_frames_count
    clip_end_frame = clip_start_frame + clip_frames_count
    
    if clip_start_frame >= total_frames:
        break
    
    # Set video position to clip start
    cap.set(cv2.CAP_PROP_POS_FRAMES, clip_start_frame)
    
    clip_start_sec = clip_start_frame / fps
    clip_end_sec = min(clip_end_frame, total_frames) / fps
    time_range = fmt_time_range(clip_start_sec, clip_end_sec)
    
    # Read all frames in this clip
    clip_frames_bgr = []
    motion_scores = []
    prev_bgr = None
    
    for local_idx in range(clip_frames_count):
        ok, frame_bgr = cap.read()
        if not ok:
            break
        clip_frames_bgr.append(frame_bgr)
        
        # Track motion
        if prev_bgr is not None:
            motion_scores.append(motion_score(prev_bgr, frame_bgr))
        prev_bgr = frame_bgr
    
    if len(clip_frames_bgr) < FRAMES_PER_CLIP:
        clip_num += 1
        continue
    
    avg_motion = np.mean(motion_scores) if motion_scores else 0.0
    
    # SPEED GATE: skip low-motion clips (static scene)
    if avg_motion < MOTION_THRESH_CLIP:
        line = f"Clip {clip_num:03d} | {time_range} | SKIP  | (no significant activity)"
        print(line)
        write_line(LOG_TXT, line)
        write_jsonl(LOG_JSONL, {
            "clip_num": clip_num,
            "time_range": time_range,
            "start_sec": round(clip_start_sec, 3),
            "end_sec": round(clip_end_sec, 3),
            "system_tag": "SKIP",
            "avg_motion": round(avg_motion, 3),
            "note": "low_motion_skip",
        })
        skipped_low_motion += 1
        clip_num += 1
        continue
    
    # Sample representative frames evenly from the clip
    indices = np.linspace(0, len(clip_frames_bgr) - 1, FRAMES_PER_CLIP, dtype=int)
    sampled_bgr = [clip_frames_bgr[i] for i in indices]
    
    # Convert to PIL
    frames_pil = []
    for bgr in sampled_bgr:
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        frames_pil.append(Image.fromarray(rgb))
    
    # Get VLM description for this clip
    clip_inference_start = time.time()
    model_tag, desc, raw_line = vlm_describe_clip(frames_pil, time_range)
    clip_inference_time = time.time() - clip_inference_start
    
    # Smooth alerts
    positive = (model_tag == "ALERT")
    decision_window.append(1 if positive else 0)
    
    should_alert = (sum(decision_window) >= TRIGGER_K) and ((time.time() - last_alert_time) > ALERT_COOLDOWN_SEC)
    system_tag = "ALERT" if should_alert else "LOG"
    if should_alert:
        last_alert_time = time.time()
    
    # Log output with timing
    line = f"Clip {clip_num:03d} | {time_range} | {system_tag:<5} | {clip_inference_time:.1f}s | {raw_line}"
    print(line)
    write_line(LOG_TXT, line)
    
    write_jsonl(LOG_JSONL, {
        "clip_num": clip_num,
        "time_range": time_range,
        "start_sec": round(clip_start_sec, 3),
        "end_sec": round(clip_end_sec, 3),
        "system_tag": system_tag,
        "model_tag": model_tag,
        "avg_motion": round(avg_motion, 3),
        "inference_time_sec": round(clip_inference_time, 3),
        "description": desc,
        "raw_line": raw_line,
        "window_sum": int(sum(decision_window)),
    })
    
    clip_num += 1

cap.release()
elapsed = time.time() - start_time
print(f"\nDone in {elapsed/60:.1f} minutes.")
print(f"Processed {clip_num} clips ({skipped_low_motion} skipped for low motion)")
print(f"Average: {elapsed/max(1, clip_num-skipped_low_motion):.1f}s per analyzed clip")
print("Readable log:", LOG_TXT)
print("JSONL log:", LOG_JSONL)


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

Model device: cuda:0
Speed optimizations: 4 frames/clip, 80 max tokens, resolution 128-512 visual tokens
FPS: 25.0, Total frames: 906, Clip size: 100 frames (4.0s)
Estimated 9 clips to process



The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Clip 000 | 00:00:00 - 00:00:04 | LOG   | motion=2.68 | 3.6s | ALERT: The CCTV footage shows two individuals entering a jewelry store. They are seen walking towards the display case and appear to be looking at the items.
Clip 001 | 00:00:04 - 00:00:08 | ALERT | motion=1.82 | 5.9s | ALERT: The scene shows a long table in a jewelry store with several display cases. Two individuals are present, one of whom is holding a gun. The other individual is standing nearby. The setting appears to be a well-lit, modern store with a polished floor and a dark-colored wall. The individuals are dressed in dark clothing, and the gun is visible in their hands. The atmosphere suggests a
Clip 002 | 00:00:08 - 00:00:12 | LOG   | motion=1.05 | 5.6s | LOG: A person wearing a black hoodie and black pants is walking towards a long table in a jewelry store. The table is covered with various jewelry displays. The person appears to be carrying a black bag. The camera angle is from the security camera's perspective. 