In [11]:
import cv2
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from ai_edge_litert.interpreter import Interpreter

# ==== Config ====
TFLITE_MODEL_PATH = "build/csrnet_mobile_B_float16.tflite"
VIDEO_PATH = "inference/test_videos/crowd_video_test5.mp4"
QUEUE_SIZE = 10  # number of frames for temporal smoothing
FRAME_SKIP = 1   # skip every N frames
VISUALIZE = False

# ==== Load TFLite Model ====
interpreter = Interpreter(model_path=TFLITE_MODEL_PATH)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_shape = input_details[0]['shape']
is_channels_first = input_shape[1] == 3

# ==== Preprocessing ====
def preprocess(frame):
    img = cv2.resize(frame, (512, 512))
    img = img.astype(np.float32) / 255.0
    img = (img - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    if not is_channels_first:
        img = img  # NHWC
    else:
        img = np.transpose(img, (2, 0, 1))  # CHW
    return np.expand_dims(img, axis=0).astype(np.float32)

# ==== Spatio-temporal smoothing ====
density_queue = deque(maxlen=QUEUE_SIZE)

cap = cv2.VideoCapture(VIDEO_PATH)
frame_count = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    if frame_count % FRAME_SKIP != 0:
        continue

    input_tensor = preprocess(frame)
    interpreter.set_tensor(input_details[0]['index'], input_tensor)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])
    density_map = output.squeeze()  # shape: (H, W)
    density_queue.append(density_map)

    # Temporal smoothing
    smoothed_density = np.mean(np.stack(density_queue), axis=0)
    predicted_count = smoothed_density.sum()

    print(f"Frame {frame_count}: Estimated Count = {predicted_count:.2f}")

    if VISUALIZE:
        vis = (smoothed_density / smoothed_density.max() * 255).astype(np.uint8)
        vis = cv2.applyColorMap(vis, cv2.COLORMAP_JET)
        vis = cv2.resize(vis, (frame.shape[1], frame.shape[0]))
        overlay = cv2.addWeighted(frame, 0.6, vis, 0.4, 0)
        overlay_rgb = cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(8, 6))
        plt.imshow(overlay_rgb)
        plt.title(f"Frame {frame_count}: Count = {predicted_count:.1f}")
        plt.axis('off')
        plt.show()

cap.release()

Frame 1: Estimated Count = 121.92
Frame 2: Estimated Count = 124.57
Frame 3: Estimated Count = 125.17
Frame 4: Estimated Count = 124.51
Frame 5: Estimated Count = 123.02
Frame 6: Estimated Count = 122.69
Frame 7: Estimated Count = 122.57
Frame 8: Estimated Count = 121.91
Frame 9: Estimated Count = 121.94
Frame 10: Estimated Count = 121.40
Frame 11: Estimated Count = 120.98
Frame 12: Estimated Count = 119.66
Frame 13: Estimated Count = 117.94
Frame 14: Estimated Count = 116.71
Frame 15: Estimated Count = 115.89
Frame 16: Estimated Count = 114.83
Frame 17: Estimated Count = 113.70
Frame 18: Estimated Count = 112.35
Frame 19: Estimated Count = 110.28
Frame 20: Estimated Count = 109.18
Frame 21: Estimated Count = 107.54
Frame 22: Estimated Count = 106.09
Frame 23: Estimated Count = 105.66
Frame 24: Estimated Count = 104.61
Frame 25: Estimated Count = 104.10
Frame 26: Estimated Count = 103.34
Frame 27: Estimated Count = 102.88
Frame 28: Estimated Count = 103.21
Frame 29: Estimated Count = 1

KeyboardInterrupt: 

In [10]:
from ultralytics import YOLO

model = YOLO("build/yolo11n_float16.tflite")  # atau yolov8s.pt

results = model("inference/test_videos/crowd_video_test5.mp4", stream=True)
for r in results:
    count = sum([1 for c in r.boxes.cls if int(c) == 0])  # class 0 = person
    print(f"Detected person count: {count}")



E0000 00:00:1749913713.890139  399977 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749913713.952488  399977 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749913714.348896  399977 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749913714.348960  399977 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749913714.348965  399977 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749913714.348968  399977 computation_placer.cc:177] computation placer already registered. Please check linka

Loading build/yolo11n_float16.tflite for TensorFlow Lite inference...



    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


video 1/1 (frame 1/513) /home/caxz/Documents/Git/crowd-counting-model/inference/test_videos/crowd_video_test5.mp4: 640x640 14 persons, 1 backpack, 228.6ms
Detected person count: 14
video 1/1 (frame 2/513) /home/caxz/Documents/Git/crowd-counting-model/inference/test_videos/crowd_video_test5.mp4: 640x640 12 persons, 1 handbag, 183.2ms
Detected person count: 12
video 1/1 (frame 3/513) /home/caxz/Documents/Git/crowd-counting-model/inference/test_videos/crowd_video_test5.mp4: 640x640 12 persons, 2 handbags, 184.6ms
Detected person count: 12
video 1/1 (frame 4/513) /home/caxz/Documents/Git/crowd-counting-model/inference/test_videos/crowd_video_test5.mp4: 640x640 14 persons, 183.6ms
Detected person count: 14
video 1/1 (frame 5/513) /home/caxz/Documents/Git/crowd-counting-model/inference/test_videos/crowd_video_test5.mp4: 640x640 15 persons, 1 handbag, 178.8ms
Detected person count: 15
video 1/1 (frame 6/513) /home/caxz/Documents/Git/crowd-counting-model/inference/test_videos/crowd_video_test5

KeyboardInterrupt: 