In [2]:
from huggingface_hub import hf_hub_download

repo_id = "Qwen/Qwen2-VL-2B-Instruct"
local_dir = r"A:\hf_models\Qwen2-VL-2B-Instruct"

for fname in [
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
]:
    path = hf_hub_download(
        repo_id=repo_id,
        filename=fname,
        local_dir=local_dir,
        force_download=True,
    )
    print("Downloaded:", path)


model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Downloaded: A:\hf_models\Qwen2-VL-2B-Instruct\model-00001-of-00002.safetensors


model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Downloaded: A:\hf_models\Qwen2-VL-2B-Instruct\model-00002-of-00002.safetensors


In [1]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(
    r"A:\hf_models\Qwen2-VL-2B-Instruct",
    min_pixels=256*28*28,
    max_pixels=1024*28*28,
)


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [3]:
import os
import cv2
import time
from collections import deque

import numpy as np
from PIL import Image

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


# =========================
# USER CONFIG
# =========================
VIDEO_PATH = r"A:\Context Aware CCTV Surviellance system\accident.mp4"
MODEL_ID = r"A:\hf_models\Qwen2-VL-2B-Instruct"

# SPEED OPTIMIZATIONS:
CLIP_DURATION_SEC = 2.0
FRAMES_PER_CLIP = 4              # Reduced from 5 → 40% faster preprocessing
MOTION_THRESH_CLIP = 0.5         # Skip boring clips (lower = more sensitive)

# Image resolution (lower = faster, but less detail)
MIN_PIXELS = 128 * 28 * 28       # Reduced from 256 → faster image encoding
MAX_PIXELS = 512 * 28 * 28       # Reduced from 1024 → faster image encoding

# Generation
MAX_NEW_TOKENS = 80              # Reduced from 150 → faster generation

# Alert smoothing
WINDOW_N = 3
TRIGGER_K = 2
ALERT_COOLDOWN_SEC = 10

# Output
OUT_DIR = "outputs"
LOG_TXT = os.path.join(OUT_DIR, "cctv_clips.txt")
LOG_JSONL = os.path.join(OUT_DIR, "cctv_clips.jsonl")


# =========================
# HELPERS
# =========================
def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def fmt_hhmmss(seconds: float) -> str:
    seconds = int(max(0, seconds))
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

def fmt_time_range(start_sec: float, end_sec: float) -> str:
    return f"{fmt_hhmmss(start_sec)} - {fmt_hhmmss(end_sec)}"

def write_line(path, line: str):
    with open(path, "a", encoding="utf-8") as f:
        f.write(line.rstrip() + "\n")

def write_jsonl(path, obj: dict):
    import json
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def motion_score(prev_bgr, curr_bgr) -> float:
    prev = cv2.cvtColor(prev_bgr, cv2.COLOR_BGR2GRAY)
    curr = cv2.cvtColor(curr_bgr, cv2.COLOR_BGR2GRAY)
    prev = cv2.resize(prev, (160, 90))
    curr = cv2.resize(curr, (160, 90))
    return float(np.mean(cv2.absdiff(prev, curr)))


# =========================
# LOAD MODEL
# =========================
ensure_dir(OUT_DIR)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,    # Use FP16 for faster inference
    device_map="auto",
)
model.eval()

print("Model device:", next(model.parameters()).device)
print(f"Speed optimizations: {FRAMES_PER_CLIP} frames/clip, {MAX_NEW_TOKENS} max tokens, resolution {MIN_PIXELS//784}-{MAX_PIXELS//784} visual tokens")


def vlm_describe_clip(frames_pil: list, time_range: str):
    """
    frames_pil: list of PIL images sampled from the clip
    Returns: (model_tag, detailed_description, raw_line)
    """
    # Shorter prompt = faster processing
    content = [
        {"type": "text", "text": f"CCTV {time_range}. {len(frames_pil)} frames in order."}
    ]
    for pil_img in frames_pil:
        content.append({"type": "image", "image": pil_img})
    
    content.append({"type": "text", "text":
        "In maximum 2 sentences: what's happening? "
        "If suspicious (lethals, robbery, assault, weapon, accident, fire, vandalism, forced entry, running), start with ALERT:. "
        "Otherwise start with LOG:"
    })

    messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=True,
    )

    images, videos = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        # Mixed precision inference for speed
        with torch.amp.autocast('cuda', dtype=torch.float16):
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                num_beams=1,
                use_cache=True,
            )


    # Decode only new tokens (trim prompt)
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)]
    text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0].strip()

    # Parse tag
    up = text.upper()
    if up.startswith("ALERT:"):
        tag = "ALERT"
        desc = text.split(":", 1)[1].strip() if ":" in text else text
    elif up.startswith("LOG:"):
        tag = "LOG"
        desc = text.split(":", 1)[1].strip() if ":" in text else text
    else:
        tag = "LOG"
        desc = text

    raw_line = f"{tag}: {desc}"
    return tag, desc, raw_line


# =========================
# VIDEO LOOP (CLIP-BASED)
# =========================
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {VIDEO_PATH}")

fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
clip_frames_count = int(fps * CLIP_DURATION_SEC)

print(f"FPS: {fps}, Total frames: {total_frames}, Clip size: {clip_frames_count} frames ({CLIP_DURATION_SEC}s)")
print(f"Estimated {total_frames // clip_frames_count} clips to process\n")

# Alert smoothing
decision_window = deque(maxlen=WINDOW_N)
last_alert_time = 0

clip_num = 0
skipped_low_motion = 0
start_time = time.time()

while True:
    # Calculate clip boundaries
    clip_start_frame = clip_num * clip_frames_count
    clip_end_frame = clip_start_frame + clip_frames_count
    
    if clip_start_frame >= total_frames:
        break
    
    # Set video position to clip start
    cap.set(cv2.CAP_PROP_POS_FRAMES, clip_start_frame)
    
    clip_start_sec = clip_start_frame / fps
    clip_end_sec = min(clip_end_frame, total_frames) / fps
    time_range = fmt_time_range(clip_start_sec, clip_end_sec)
    
    # Read all frames in this clip
    clip_frames_bgr = []
    motion_scores = []
    prev_bgr = None
    
    for local_idx in range(clip_frames_count):
        ok, frame_bgr = cap.read()
        if not ok:
            break
        clip_frames_bgr.append(frame_bgr)
        
        # Track motion
        if prev_bgr is not None:
            motion_scores.append(motion_score(prev_bgr, frame_bgr))
        prev_bgr = frame_bgr
    
    if len(clip_frames_bgr) < FRAMES_PER_CLIP:
        clip_num += 1
        continue
    
    avg_motion = np.mean(motion_scores) if motion_scores else 0.0
    
    # SPEED GATE: skip low-motion clips (static scene)
    if avg_motion < MOTION_THRESH_CLIP:
        line = f"Clip {clip_num:03d} | {time_range} | SKIP  | (no significant activity)"
        print(line)
        write_line(LOG_TXT, line)
        write_jsonl(LOG_JSONL, {
            "clip_num": clip_num,
            "time_range": time_range,
            "start_sec": round(clip_start_sec, 3),
            "end_sec": round(clip_end_sec, 3),
            "system_tag": "SKIP",
            "avg_motion": round(avg_motion, 3),
            "note": "low_motion_skip",
        })
        skipped_low_motion += 1
        clip_num += 1
        continue
    
    # Sample representative frames evenly from the clip
    indices = np.linspace(0, len(clip_frames_bgr) - 1, FRAMES_PER_CLIP, dtype=int)
    sampled_bgr = [clip_frames_bgr[i] for i in indices]
    
    # Convert to PIL
    frames_pil = []
    for bgr in sampled_bgr:
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        frames_pil.append(Image.fromarray(rgb))
    
    # Get VLM description for this clip
    clip_inference_start = time.time()
    model_tag, desc, raw_line = vlm_describe_clip(frames_pil, time_range)
    clip_inference_time = time.time() - clip_inference_start
    
    # Smooth alerts
    positive = (model_tag == "ALERT")
    decision_window.append(1 if positive else 0)
    
    should_alert = (sum(decision_window) >= TRIGGER_K) and ((time.time() - last_alert_time) > ALERT_COOLDOWN_SEC)
    system_tag = "ALERT" if should_alert else "LOG"
    if should_alert:
        last_alert_time = time.time()
    
    # Log output with timing
    line = f"Clip {clip_num:03d} | {time_range} | {system_tag:<5} | {clip_inference_time:.1f}s | {raw_line}"
    print(line)
    write_line(LOG_TXT, line)
    
    write_jsonl(LOG_JSONL, {
        "clip_num": clip_num,
        "time_range": time_range,
        "start_sec": round(clip_start_sec, 3),
        "end_sec": round(clip_end_sec, 3),
        "system_tag": system_tag,
        "model_tag": model_tag,
        "avg_motion": round(avg_motion, 3),
        "inference_time_sec": round(clip_inference_time, 3),
        "description": desc,
        "raw_line": raw_line,
        "window_sum": int(sum(decision_window)),
    })
    
    clip_num += 1

cap.release()
elapsed = time.time() - start_time
print(f"\nDone in {elapsed/60:.1f} minutes.")
print(f"Processed {clip_num} clips ({skipped_low_motion} skipped for low motion)")
print(f"Average: {elapsed/max(1, clip_num-skipped_low_motion):.1f}s per analyzed clip")
print("Readable log:", LOG_TXT)
print("JSONL log:", LOG_JSONL)


Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

Model device: cuda:0
Speed optimizations: 4 frames/clip, 80 max tokens, resolution 128-512 visual tokens
FPS: 60.00287306574532, Total frames: 181, Clip size: 120 frames (2.0s)
Estimated 1 clips to process

Clip 000 | 00:00:00 - 00:00:01 | SKIP  | (no significant activity)
Clip 001 | 00:00:01 - 00:00:03 | LOG   | 30.6s | ALERT: Accident

Done in 0.6 minutes.
Processed 2 clips (1 skipped for low motion)
Average: 34.8s per analyzed clip
Readable log: outputs\cctv_clips.txt
JSONL log: outputs\cctv_clips.jsonl


In [None]:
import os
import cv2
import time
from collections import deque

import numpy as np
from PIL import Image

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


# =========================
# USER CONFIG
# =========================
VIDEO_PATH = r"A:\Context Aware CCTV Surviellance system\parking.mp4"
MODEL_ID = r"A:\hf_models\Qwen2-VL-2B-Instruct"

# Video processing
TARGET_FPS = 30.0                # Downsample high-FPS videos to this (speeds up reading)
CLIP_DURATION_SEC = 2.0          # Analyze every 2-second clip
FRAMES_PER_CLIP = 3              # Sample 3 representative frames from each clip
MOTION_THRESH_CLIP = 1.0         # Skip clip if avg motion < threshold

# Image resolution (balanced: good detail + reasonable speed)
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28

# Generation
MAX_NEW_TOKENS = 80

# Alert smoothing
WINDOW_N = 3
TRIGGER_K = 2
ALERT_COOLDOWN_SEC = 10

# Output
OUT_DIR = "outputs"
LOG_TXT = os.path.join(OUT_DIR, "cctv_clips.txt")
LOG_JSONL = os.path.join(OUT_DIR, "cctv_clips.jsonl")


# =========================
# HELPERS
# =========================
def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def fmt_hhmmss(seconds: float) -> str:
    seconds = int(max(0, seconds))
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

def fmt_time_range(start_sec: float, end_sec: float) -> str:
    return f"{fmt_hhmmss(start_sec)} - {fmt_hhmmss(end_sec)}"

def write_line(path, line: str):
    with open(path, "a", encoding="utf-8") as f:
        f.write(line.rstrip() + "\n")

def write_jsonl(path, obj: dict):
    import json
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def motion_score(prev_bgr, curr_bgr) -> float:
    prev = cv2.cvtColor(prev_bgr, cv2.COLOR_BGR2GRAY)
    curr = cv2.cvtColor(curr_bgr, cv2.COLOR_BGR2GRAY)
    prev = cv2.resize(prev, (160, 90))
    curr = cv2.resize(curr, (160, 90))
    return float(np.mean(cv2.absdiff(prev, curr)))


# =========================
# LOAD MODEL
# =========================
ensure_dir(OUT_DIR)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()

print("Model device:", next(model.parameters()).device)
print(f"Config: {FRAMES_PER_CLIP} frames/clip, {MAX_NEW_TOKENS} tokens, resolution {MIN_PIXELS//784}-{MAX_PIXELS//784} visual tokens\n")


def vlm_describe_clip(frames_pil: list, time_range: str):
    """
    frames_pil: list of PIL images sampled from the clip
    Returns: (model_tag, detailed_description, raw_line)
    """
    content = [
        {"type": "text", "text": f"CCTV {time_range}. {len(frames_pil)} frames in order."}
    ]
    for pil_img in frames_pil:
        content.append({"type": "image", "image": pil_img})
    
    content.append({"type": "text", "text":
        "In 2 sentences: what's happening? "
        "If suspicious (robbery, assault, weapon, accident, destruction, fire, vandalism, forced entry), start with ALERT:. "
        "Else start with LOG and describe the scene."
    })

    messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=True,
    )

    images, videos = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        with torch.amp.autocast('cuda', dtype=torch.float16):
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                num_beams=1,
                use_cache=True,
            )

    # Decode only new tokens
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)]
    text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0].strip()

    # Parse tag
    up = text.upper()
    if up.startswith("ALERT:"):
        tag = "ALERT"
        desc = text.split(":", 1)[1].strip() if ":" in text else text
    elif up.startswith("LOG:"):
        tag = "LOG"
        desc = text.split(":", 1)[1].strip() if ":" in text else text
    else:
        tag = "LOG"
        desc = text

    raw_line = f"{tag}: {desc}"
    return tag, desc, raw_line


# =========================
# VIDEO LOOP (CLIP-BASED)
# =========================
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {VIDEO_PATH}")

# FPS downsampling for speed
original_fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
fps = min(TARGET_FPS, original_fps)
frame_skip = max(1, int(original_fps / fps))

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
clip_frames_count = int(fps * CLIP_DURATION_SEC)

print(f"Original FPS: {original_fps:.1f}, Target FPS: {fps:.1f}, Frame skip: {frame_skip}")
print(f"Total frames: {total_frames}, Clip size: {clip_frames_count} frames ({CLIP_DURATION_SEC}s)")
print(f"Estimated {total_frames // (clip_frames_count * frame_skip)} clips to process\n")

# Alert smoothing
decision_window = deque(maxlen=WINDOW_N)
last_alert_time = 0

clip_num = 0
skipped_low_motion = 0
start_time = time.time()

# Frame reading state
frame_idx = 0

while True:
    # Calculate clip boundaries in original video frames
    clip_start_frame = clip_num * clip_frames_count * frame_skip
    clip_end_frame = clip_start_frame + (clip_frames_count * frame_skip)
    
    if clip_start_frame >= total_frames:
        break
    
    # Set video position to clip start
    cap.set(cv2.CAP_PROP_POS_FRAMES, clip_start_frame)
    
    clip_start_sec = clip_start_frame / original_fps
    clip_end_sec = min(clip_end_frame, total_frames) / original_fps
    time_range = fmt_time_range(clip_start_sec, clip_end_sec)
    
    # Read frames in this clip (with frame skipping for FPS reduction)
    clip_frames_bgr = []
    motion_scores = []
    prev_bgr = None
    
    local_frame_idx = 0
    frames_read = 0
    
    while frames_read < clip_frames_count:
        ok, frame_bgr = cap.read()
        if not ok:
            break
        
        # Skip frames to achieve target FPS
        if local_frame_idx % frame_skip == 0:
            clip_frames_bgr.append(frame_bgr)
            
            # Track motion
            if prev_bgr is not None:
                motion_scores.append(motion_score(prev_bgr, frame_bgr))
            prev_bgr = frame_bgr
            frames_read += 1
        
        local_frame_idx += 1
    
    if len(clip_frames_bgr) < FRAMES_PER_CLIP:
        clip_num += 1
        continue
    
    avg_motion = np.mean(motion_scores) if motion_scores else 0.0
    
    # Skip low-motion clips (static scene)
    if avg_motion < MOTION_THRESH_CLIP:
        line = f"Clip {clip_num:03d} | {time_range} | SKIP  | motion={avg_motion:.2f} | (no significant activity)"
        print(line)
        write_line(LOG_TXT, line)
        write_jsonl(LOG_JSONL, {
            "clip_num": clip_num,
            "time_range": time_range,
            "start_sec": round(clip_start_sec, 3),
            "end_sec": round(clip_end_sec, 3),
            "system_tag": "SKIP",
            "avg_motion": round(avg_motion, 3),
            "note": "low_motion_skip",
        })
        skipped_low_motion += 1
        clip_num += 1
        continue
    
    # Sample representative frames evenly from the clip
    indices = np.linspace(0, len(clip_frames_bgr) - 1, FRAMES_PER_CLIP, dtype=int)
    sampled_bgr = [clip_frames_bgr[i] for i in indices]
    
    # Convert to PIL
    frames_pil = []
    for bgr in sampled_bgr:
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        frames_pil.append(Image.fromarray(rgb))
    
    # Get VLM description for this clip
    clip_inference_start = time.time()
    model_tag, desc, raw_line = vlm_describe_clip(frames_pil, time_range)
    clip_inference_time = time.time() - clip_inference_start
    
    # Smooth alerts
    positive = (model_tag == "ALERT")
    decision_window.append(1 if positive else 0)
    
    should_alert = (sum(decision_window) >= TRIGGER_K) and ((time.time() - last_alert_time) > ALERT_COOLDOWN_SEC)
    system_tag = "ALERT" if should_alert else "LOG"
    if should_alert:
        last_alert_time = time.time()
    
    # Log output with timing
    line = f"Clip {clip_num:03d} | {time_range} | {system_tag:<5} | motion={avg_motion:.2f} | {clip_inference_time:.1f}s | {raw_line}"
    print(line)
    write_line(LOG_TXT, line)
    
    write_jsonl(LOG_JSONL, {
        "clip_num": clip_num,
        "time_range": time_range,
        "start_sec": round(clip_start_sec, 3),
        "end_sec": round(clip_end_sec, 3),
        "system_tag": system_tag,
        "model_tag": model_tag,
        "avg_motion": round(avg_motion, 3),
        "inference_time_sec": round(clip_inference_time, 3),
        "description": desc,
        "raw_line": raw_line,
        "window_sum": int(sum(decision_window)),
    })
    
    clip_num += 1

cap.release()
elapsed = time.time() - start_time
processed_clips = clip_num - skipped_low_motion

print(f"\nDone in {elapsed/60:.1f} minutes.")
print(f"Processed {clip_num} clips ({skipped_low_motion} skipped for low motion)")
if processed_clips > 0:
    print(f"Average: {elapsed/processed_clips:.1f}s per analyzed clip")
print("Readable log:", LOG_TXT)
print("JSONL log:", LOG_JSONL)


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

Model device: cuda:0
Config: 3 frames/clip, 80 tokens, resolution 256-1024 visual tokens

Original FPS: 8.0, Target FPS: 8.0, Frame skip: 1
Total frames: 701, Clip size: 16 frames (2.0s)
Estimated 43 clips to process

Clip 000 | 00:00:00 - 00:00:02 | SKIP  | motion=0.15 | (no significant activity)
Clip 001 | 00:00:02 - 00:00:04 | SKIP  | motion=0.24 | (no significant activity)
Clip 002 | 00:00:04 - 00:00:06 | SKIP  | motion=0.19 | (no significant activity)
Clip 003 | 00:00:06 - 00:00:08 | SKIP  | motion=0.05 | (no significant activity)
Clip 004 | 00:00:08 - 00:00:10 | SKIP  | motion=0.09 | (no significant activity)
Clip 005 | 00:00:10 - 00:00:12 | SKIP  | motion=0.04 | (no significant activity)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Clip 006 | 00:00:12 - 00:00:14 | SKIP  | motion=0.02 | (no significant activity)
Clip 007 | 00:00:14 - 00:00:16 | SKIP  | motion=0.09 | (no significant activity)
Clip 008 | 00:00:16 - 00:00:18 | SKIP  | motion=0.36 | (no significant activity)
Clip 009 | 00:00:18 - 00:00:20 | LOG   | motion=1.71 | 2.1s | ALERT: No suspicious activity detected.
Clip 010 | 00:00:20 - 00:00:22 | ALERT | motion=3.90 | 1.1s | ALERT: No suspicious activity detected.
Clip 011 | 00:00:22 - 00:00:24 | SKIP  | motion=0.27 | (no significant activity)
Clip 012 | 00:00:24 - 00:00:26 | SKIP  | motion=0.71 | (no significant activity)
Clip 013 | 00:00:26 - 00:00:28 | LOG   | motion=1.87 | 2.2s | ALERT: The scene appears to be a typical parking lot with various vehicles parked. There are no suspicious activities or incidents visible in the image.
Clip 014 | 00:00:28 - 00:00:30 | LOG   | motion=1.81 | 1.1s | ALERT: No suspicious activity detected.
Clip 015 | 00:00:30 - 00:00:32 | SKIP  | motion=0.05 | (no significant act