In [1]:
import os
import signal
import subprocess
import time
import urllib.parse
from pathlib import Path

# NOTE: the relative order of the third-party imports below is kept as in the
# original notebook on purpose (torch before cv2).
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
from IPython.display import display, clear_output
from dotenv import load_dotenv


In [2]:
# Fail fast when the configuration file is missing instead of silently running
# with whatever happens to be in the process environment.
if not os.path.exists(os.path.join(os.getcwd(), ".env")):
    raise FileNotFoundError("'.env' file not found at current directory.")

load_dotenv()

# Every setting below is mandatory. Report all missing/empty ones at once with a
# clear message rather than failing later with a TypeError on `None + "_AI"` or
# `int(None)`.
_REQUIRED_ENV_VARS = ("HOST", "INPUT_STREAM_NAME", "MAX_FRAMES")
_missing_env_vars = [key for key in _REQUIRED_ENV_VARS if not os.getenv(key)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

HOST = os.environ["HOST"]
INPUT_STREAM_NAME = os.environ["INPUT_STREAM_NAME"]
# The annotated stream is published next to the input one, with an "_AI" suffix.
OUT_STREAM_NAME = INPUT_STREAM_NAME + "_AI"

# Upper bound on the number of frames processed per annotation run.
# int() raises ValueError if the value is not a valid integer literal.
MAX_FRAMES = int(os.environ["MAX_FRAMES"])


In [3]:
# Echo the resolved settings so a wrong .env is visible right away.
config_summary = (HOST, INPUT_STREAM_NAME, OUT_STREAM_NAME, MAX_FRAMES)
print(*config_summary)


127.0.0.1 cam_09_стол cam_09_стол_AI 100


In [4]:
# Checkpoint location, resolved against the notebook's current working directory.
MODEL_PATH = Path.cwd().joinpath(
    "from_GitHub/dmmmit_smoking_detection/models/final_model.pt"
).resolve()

model_path_is_file = MODEL_PATH.is_file()

print("Model path:", MODEL_PATH)
print("Exists:", MODEL_PATH.exists(), "Is file:", model_path_is_file)

# Stop here if the checkpoint is missing (or is a directory).
assert model_path_is_file, f"Expected a file, got: {MODEL_PATH}"


Model path: /home/jetson/GitHub/AI_Vision_Smoking/from_GitHub/dmmmit_smoking_detection/models/final_model.pt
Exists: True Is file: True


In [5]:
# Label describing which loader produced `model`.
model_type = "ultralytics.YOLO"

# Load the checkpoint located at MODEL_PATH.
model = YOLO(str(MODEL_PATH))


In [6]:
# Print what the loaded model reports about itself; attributes that are absent
# are shown as None instead of raising.
for label, attr_name in (("Task:", "task"), ("Class names:", "names")):
    print(label, getattr(model, attr_name, None))

# The wrapped `model` attribute (None if absent); only used by the optional
# deep-inspection snippet that is kept commented out below.
core = getattr(model, "model", None)

# if core is not None:

#     print("Core type:", type(core))
#     if hasattr(core, "yaml"):
#         print("YAML:", core.yaml)

#     if hasattr(core, "stride"):
#         print("Stride:", core.stride)

#     if hasattr(core, "args"):
#         print("Args:", core.args)

#     n_params = sum(p.numel() for p in core.parameters())
#     print("Param count:", n_params)

#     print("Core module:", core)


Task: detect
Class names: {0: 'Person', 1: 'cell phone', 2: 'cigarette', 3: 'hands_with_cigarettes'}


In [7]:
# Input/output probe: run the model once on an all-black frame and print the
# shapes of what comes back.

DUMMY_H = 640
DUMMY_W = 640

dummy = np.zeros((DUMMY_H, DUMMY_W, 3), dtype=np.uint8)
print("Input dummy shape:", dummy.shape, "dtype:", dummy.dtype)

results = model.predict(dummy, verbose=False)
r = results[0]

# One line per box attribute: coordinates, confidences, class ids.
for box_field in ("xyxy", "conf", "cls"):
    print(f"Output: boxes.{box_field}", getattr(r.boxes, box_field).shape)

# Masks are only reported when the result actually carries them.
if r.masks is not None:
    print("Output: masks", r.masks.data.shape)


Input dummy shape: (640, 640, 3) dtype: uint8
Output: boxes.xyxy torch.Size([1, 4])
Output: boxes.conf torch.Size([1])
Output: boxes.cls torch.Size([1])


In [8]:
# go2rtc serves RTSP on port 8554. The stream name is percent-encoded before it
# is placed in the URL.
encoded_stream_name = urllib.parse.quote(INPUT_STREAM_NAME)
STREAM_URL = f"rtsp://{HOST}:8554/{encoded_stream_name}?video"
print("STREAM_URL:", STREAM_URL)

# Sometimes helps with RTSP in OpenCV/FFMPEG. The variable holds "key;value"
# pairs joined with "|".
# NOTE(review): "stimeout" may not be recognised by newer FFmpeg builds — confirm.
ffmpeg_capture_options = (
    ("rtsp_transport", "tcp"),
    ("max_delay", "500000"),
    ("stimeout", "5000000"),
)
os.environ["OPENCV_FFMPEG_CAPTURE_OPTIONS"] = "|".join(
    f"{key};{value}" for key, value in ffmpeg_capture_options
)


STREAM_URL: rtsp://127.0.0.1:8554/cam_09_%D1%81%D1%82%D0%BE%D0%BB?video


In [9]:
# RTSP URL the annotated stream is published to (same host/port as the input).
# NOTE(review): unlike STREAM_URL, the stream name is not percent-encoded here —
# confirm the RTSP server accepts the raw (possibly non-ASCII) name.
OUT_STREAM_URL = "rtsp://{}:8554/{}".format(HOST, OUT_STREAM_NAME)


In [None]:
# Module-level handles for the input capture and the ffmpeg publisher process;
# both start out empty.
video_capture = ffmpeg_proc = None


In [10]:
def open_video_capture():
    """Open the RTSP input stream at ``STREAM_URL`` with OpenCV's FFmpeg backend.

    Returns:
        The opened ``cv2.VideoCapture`` object. The caller owns it and must
        call ``release()`` when done.

    Raises:
        RuntimeError: if the stream could not be opened. The capture object is
            released before raising so no handle is leaked.
    """
    capture = cv2.VideoCapture(STREAM_URL, cv2.CAP_FFMPEG)

    if not capture.isOpened():
        # Free the handle before reporting the failure.
        capture.release()
        raise RuntimeError(f"RTSP stream not opened: {STREAM_URL}")

    # Best effort: ask for a one-frame buffer. A failure here must not prevent
    # the capture from being used.
    try:
        capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
    except Exception:
        pass

    return capture


[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] decode_slice_header error
[h264 @ 0xaaaaef2444c0] no frame!
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] decode_slice_header error
[h264 @ 0xaaaaef2444c0] no frame!
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] decode_slice_header error
[h264 @ 0xaaaaef2444c0] no frame!
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] decode_slice_header error
[h264 @ 0xaaaaef2444c0] no frame!
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] non-existing PPS 0 referenced
[h264 @ 0xaaaaef2444c0] decode_slice_header error
[h264 @ 0xaaaaef2444c0] no frame!
[h264 @ 0xaaaaef2444c0] non-existing PPS

In [11]:
def iou(boxA, boxB):
    """Intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``(x1, y1, x2, y2)``. Returns ``0.0`` when the
    boxes do not overlap; otherwise the intersection area divided by the union
    area (a 1e-9 term in the denominator guards against division by zero).
    """

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    # Corners of the overlap rectangle.
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter = overlap_w * overlap_h

    if inter == 0:
        return 0.0

    union = _area(boxA) + _area(boxB) - inter + 1e-9
    return inter / union


In [12]:
def _box_overlap_fraction(inner, outer):
    """Return the fraction of ``inner``'s area that lies inside ``outer`` (boxes are x1, y1, x2, y2)."""
    inter_w = max(0.0, min(inner[2], outer[2]) - max(inner[0], outer[0]))
    inter_h = max(0.0, min(inner[3], outer[3]) - max(inner[1], outer[1]))
    inner_area = max(0.0, (inner[2] - inner[0]) * (inner[3] - inner[1]))
    # The epsilon keeps a degenerate (zero-area) box from dividing by zero.
    return (inter_w * inter_h) / (inner_area + 1e-9)


def smoking_score(results, min_overlap=0.01):
    """Heuristic "someone is smoking" score for one prediction.

    The score is the maximum of:
      - the confidence of any ``hands_with_cigarettes`` detection;
      - the confidence of any ``cigarette`` detection that overlaps a
        ``Person`` detection.

    Overlap is measured as the share of the *cigarette* box covered by the
    person box. Plain IoU is unsuitable here: a small cigarette box fully
    inside a large person box has an IoU far below 0.01 and would be ignored.
    For well-formed boxes, every pair with IoU > 0.01 also has a covered share
    > 0.01, so the default threshold only adds matches.

    Args:
        results: sequence whose first element exposes ``names`` (class id ->
            name mapping) and ``boxes`` (iterable of detections with ``cls``,
            ``conf`` and ``xyxy``).
        min_overlap: minimum covered share of the cigarette box for it to count
            as overlapping a person.

    Returns:
        float: the score, ``0.0`` when nothing relevant was detected.
    """
    first = results[0]
    names = first.names

    person_boxes = []
    cigarettes = []
    score = 0.0

    for b in first.boxes:
        cls_id = int(b.cls.item())
        conf = float(b.conf.item())
        name = names.get(cls_id, str(cls_id))

        if name == "hands_with_cigarettes":
            score = max(score, conf)
        elif name == "Person":
            person_boxes.append(b.xyxy[0].tolist())
        elif name == "cigarette":
            cigarettes.append((b.xyxy[0].tolist(), conf))

    for c_box, c_conf in cigarettes:
        if c_conf <= score:
            continue  # cannot raise the score, skip the overlap test
        if any(_box_overlap_fraction(c_box, p_box) > min_overlap for p_box in person_boxes):
            score = c_conf

    return score


In [13]:
def downscale_if_needed(frame, max_w=1920, max_h=1080):
    """Shrink ``frame`` so it fits inside ``max_w`` x ``max_h``, keeping aspect ratio.

    Args:
        frame: image array whose ``shape`` starts with ``(height, width)``.
        max_w: maximum allowed width in pixels (must be >= 1).
        max_h: maximum allowed height in pixels (must be >= 1).

    Returns:
        ``(image, scale)``: the original frame and ``1.0`` when it already
        fits, otherwise the resized frame and the scale factor that was applied.

    Raises:
        ValueError: if ``max_w`` or ``max_h`` is smaller than 1.
    """
    if max_w < 1 or max_h < 1:
        raise ValueError(f"max_w and max_h must be >= 1, got {max_w}x{max_h}")

    h, w = frame.shape[:2]
    if w <= max_w and h <= max_h:
        return frame, 1.0  # already small enough, unchanged

    scale = min(max_w / w, max_h / h)
    # Truncation can yield 0 for extreme aspect ratios, which cv2.resize
    # rejects; clamp each side to at least one pixel.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return resized, scale


In [14]:
def _ffmpeg_rtsp_command(width, height, fps, out_url):
    """Build the ffmpeg argv that reads raw BGR frames from stdin, encodes H.264 and publishes to ``out_url`` over RTSP/TCP."""
    return [
        "ffmpeg", "-hide_banner",
        "-loglevel", "warning",
        "-f", "rawvideo",
        "-pix_fmt", "bgr24",
        "-s", f"{width}x{height}",
        "-r", str(fps),
        "-i", "-",
        "-an",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        "-preset", "veryfast",
        "-tune", "zerolatency",
        # GOP length of about one second; must be a positive integer.
        "-g", str(max(1, int(round(fps)))),
        "-bf", "0",
        "-f", "rtsp",
        "-rtsp_transport", "tcp",
        out_url,
    ]


def _draw_detections(frame, result):
    """Draw every detection's box and "<name> <conf>" label onto ``frame`` in place."""
    names = result.names
    for b in result.boxes:
        x1, y1, x2, y2 = map(int, b.xyxy[0].tolist())
        cls_id = int(b.cls.item())
        confv = float(b.conf.item())
        name = names.get(cls_id, str(cls_id))

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"{name} {confv:.2f}",
                    (x1, max(20, y1 - 5)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2, cv2.LINE_AA)


def _stop_ffmpeg(proc):
    """Shut an ffmpeg publisher down: close stdin, send SIGINT, kill if it has not exited within 2 s."""
    if proc is None:
        return

    try:
        if proc.stdin is not None:
            proc.stdin.close()
    except OSError:
        pass  # the pipe is already broken; nothing left to flush

    if proc.poll() is not None:
        return  # already exited

    try:
        proc.send_signal(signal.SIGINT)
        proc.wait(timeout=2)
    except (OSError, subprocess.TimeoutExpired):
        proc.kill()
        proc.wait()


def start_ai_annotation(TARGET_FPS, conf=0.25, imgsz=640, drop=2):
    """Read frames from the open capture, annotate them and publish them over RTSP.

    Uses these module-level names: ``video_capture`` (an opened capture),
    ``model``, ``MAX_FRAMES``, ``OUT_STREAM_URL``, ``downscale_if_needed`` and
    ``smoking_score``. Stores the running ffmpeg process in the module-level
    ``ffmpeg_proc`` and resets it to ``None`` when finished.

    Each iteration grabs one frame plus up to ``drop`` extra ones (only the
    last is decoded), downscales it to at most 1280x720, runs the model, draws
    the detections and the heuristic smoking score, writes the raw frame to
    ffmpeg's stdin and then sleeps so that frames go out at ``TARGET_FPS``.

    The capture is released and ffmpeg is stopped when the function exits, even
    on error, so the caller must reopen the capture before calling this again.

    Args:
        TARGET_FPS: output frame rate to pace to; must be > 0.
        conf: confidence threshold passed to ``model.predict``.
        imgsz: inference image size passed to ``model.predict``.
        drop: maximum number of extra frames discarded per iteration.

    Returns:
        dict with keys ``"target"`` (the requested FPS), ``"out_fps"`` (frames
        sent divided by wall-clock seconds), ``"misses"`` (iterations that
        finished after their pacing deadline) and ``"frames"`` (frames sent).

    Raises:
        ValueError: if ``TARGET_FPS`` is not positive.
        RuntimeError: if ``video_capture`` has not been opened.
    """
    global ffmpeg_proc

    if TARGET_FPS <= 0:
        raise ValueError(f"TARGET_FPS must be > 0, got {TARGET_FPS!r}")

    capture = globals().get("video_capture")
    if capture is None:
        raise RuntimeError("video_capture is not open; call open_video_capture() first.")

    # Kill a publisher left over from a previous (possibly interrupted) run.
    stale_proc = globals().get("ffmpeg_proc")
    if stale_proc is not None:
        try:
            stale_proc.kill()
        except Exception:
            pass
    ffmpeg_proc = None

    period = 1.0 / TARGET_FPS
    frames = 0       # frames successfully sent to ffmpeg
    misses = 0       # iterations that overran their pacing deadline
    frame_size = None  # (W, H) announced to ffmpeg; fixed after the first frame

    t_start = time.perf_counter()
    next_t = t_start

    try:
        while frames < MAX_FRAMES:
            t_loop = time.perf_counter()

            # 1) grab at least once
            if not capture.grab():
                print("Stream ended or grab failed.")
                break

            # 2) discard a few more frames to catch up with real time
            for _ in range(drop):
                if not capture.grab():
                    break

            is_video_capture_read, frame = capture.retrieve()
            if not is_video_capture_read:
                print("Stream ended or read failed.")
                break

            frame_in, _ = downscale_if_needed(frame, 1280, 720)
            results = model.predict(frame_in, conf=conf, imgsz=imgsz, verbose=False)

            # Annotations are drawn directly on the (possibly downscaled) frame.
            annotated = frame_in
            _draw_detections(annotated, results[0])

            smoke_prob = smoking_score(results)
            text = f"smoking (heuristic): {smoke_prob:.2f}"
            cv2.putText(
                annotated, text, (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 0), 2, cv2.LINE_AA
            )

            if ffmpeg_proc is None:
                # Frame dimensions are only known once the first frame arrives.
                # NOTE(review): libx264 with yuv420p needs even width/height —
                # confirm the (downscaled) camera resolution is even.
                H, W = annotated.shape[:2]
                frame_size = (W, H)

                ffmpeg_proc = subprocess.Popen(
                    _ffmpeg_rtsp_command(W, H, TARGET_FPS, OUT_STREAM_URL),
                    stdin=subprocess.PIPE,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    bufsize=0,
                )

                time.sleep(0.2)  # give the RTSP session time to establish
            elif (annotated.shape[1], annotated.shape[0]) != frame_size:
                # rawvideo input has a fixed frame size; a differently sized
                # frame would corrupt the stream, so bring it back to size.
                annotated = cv2.resize(annotated, frame_size, interpolation=cv2.INTER_AREA)

            # send the frame to RTSP
            try:
                ffmpeg_proc.stdin.write(annotated.tobytes())
                ffmpeg_proc.stdin.flush()
            except BrokenPipeError:
                print("ffmpeg died, returncode:", ffmpeg_proc.poll())
                break

            frames += 1

            # pacing: sleep until the next deadline, or resynchronise if late
            next_t += period
            sleep_for = next_t - time.perf_counter()
            if sleep_for > 0:
                time.sleep(sleep_for)
            else:
                misses += 1
                next_t = time.perf_counter()

            if frames % 30 == 0:
                dt = time.perf_counter() - t_loop
                fps_eff = 1.0 / dt if dt > 0 else 0
                print(f"effective_fps≈{fps_eff:.2f}")

        wall = time.perf_counter() - t_start
    finally:
        # Always free the capture and the encoder, even on error/interrupt.
        capture.release()
        _stop_ffmpeg(ffmpeg_proc)
        ffmpeg_proc = None
        print("Done.")

    out_fps = frames / wall if wall > 0 else 0

    return {
        "target": TARGET_FPS,
        "out_fps": out_fps,
        "misses": misses,
        "frames": frames,
    }


effective_fps≈4.00
effective_fps≈4.00
effective_fps≈4.00
Done.


In [None]:
# Step the target FPS upwards until the pipeline can no longer keep up.
for fps in [4, 5, 6, 7, 8, 10]:
    # start_ai_annotation() releases the capture when it finishes, so a fresh
    # one is opened for every run and stored in the global it reads from.
    video_capture = open_video_capture()
    r = start_ai_annotation(fps)
    print(r)
    # criterion: if there are many misses or out_fps is noticeably below the
    # target, there is no point in going further
    if r["out_fps"] < fps * 0.90 or r["misses"] > r["frames"] * 0.10:
        print("stop: capacity reached")
        break

In [17]:
# !ffmpeg -re -f lavfi -i testsrc=size=640x360:rate=10 \
# -an -c:v libx264 -pix_fmt yuv420p -tune zerolatency -preset veryfast \
# -f rtsp -rtsp_transport tcp rtsp://127.0.0.1:8554/cam_11_annotated
