## Sliced object detection from a video file with optional motion detection
This notebook is an example of how to use DeGirum PySDK to do sliced object detection of a video stream from a video file.
Each video frame is divided into overlapping slices (tiles), each the size of the AI model input, so that no resizing is needed.
Object detection is performed for each tile, then results from different tiles are combined.
The annotated video is saved to the file specified by the `ann_path` option.
If motion detection mode is turned on, areas with motion are detected in each frame, and only the tiles in which
motion is detected are processed.

This script works with the following inference options:

1. Run inference on DeGirum Cloud Platform;
2. Run inference on DeGirum AI Server deployed on a localhost or on some computer in your LAN or VPN;
3. Run inference on DeGirum ORCA accelerator directly installed on your computer.

To try different options, you need to specify the appropriate `hw_location` option.

When running this notebook locally, you need to specify your cloud API access token in the [env.ini](../../env.ini) file, located in the same directory as this notebook.

When running this notebook in Google Colab, the cloud API access token should be stored in a user secret named `DEGIRUM_CLOUD_TOKEN`.

In [None]:
# make sure degirum-tools package is installed
!pip show degirum-tools || pip install degirum-tools

#### Specify video file name, model name, and other options here

In [None]:
# hw_location: where you want to run inference
#     "@cloud" to use DeGirum cloud
#     "@local" to run on local machine
#     IP address for AI server inference
hw_location = "@cloud"

# video_source: video source for inference
#     camera index for local camera
#     URL of RTSP stream
#     URL of YouTube Video
#     path to video file (mp4 etc)
video_source = "https://raw.githubusercontent.com/DeGirum/PySDKExamples/main/images/TrafficHD.mp4"

# model_name: name of the model for running AI inference
model_name = "yolo_v5s_coco--512x512_quant_n2x_orca1_1"

# min_overlap_percent: minimum tile overlap (in percent of tile dimensions)
min_overlap_percent = [20, 20]

# classes: list of classes to show
classes = ["car"]

# do_motion_detection: Boolean to specify if motion detection is enabled
do_motion_detection = True

# ann_path: path to save annotated video
ann_path = "temp/sliced_object_detection.mp4"

#### The rest of the cells below should run without any modifications

In [None]:
import degirum as dg, degirum_tools
import cv2, math, threading, queue, numpy as np

# open a connection to the inference engine selected by `hw_location`,
# passing the cloud zoo URL and the access token
zoo = dg.connect(
    hw_location,
    degirum_tools.get_cloud_zoo_url(),
    degirum_tools.get_token(),
)

# overlay drawing options passed to the model: no labels, no probabilities,
# line width 1, alpha 1
overlay_options = {
    "overlay_show_labels": False,
    "overlay_show_probabilities": False,
    "overlay_line_width": 1,
    "overlay_alpha": 1,
}

# load the object detection model from the zoo
model = zoo.load_model(model_name, **overlay_options)

In [None]:
# Find the areas of `img` that changed in respect to the base image.
# `base_img` is the preprocessed (grayscale, blurred) previous frame, or None
# when there is no previous frame yet.
# Returns a tuple (motion image, new base image). The motion image is a black
# image with white pixels where motion is detected; it is None when `base_img`
# is None. The new base image is the preprocessed version of `img`.
def detectMotion(base_img, img):
    # preprocess current frame: grayscale, then 5x5 Gaussian blur
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(src=gray, ksize=(5, 5), sigmaX=0)

    # nothing to compare with: report no motion image, start tracking from here
    if base_img is None:
        return None, blurred

    # pixels whose absolute difference exceeds 50 become white (255)
    _, motion_mask = cv2.threshold(
        cv2.absdiff(base_img, blurred), 50, 255, cv2.THRESH_BINARY
    )

    # dilate the mask; the current frame becomes the next base image
    return cv2.dilate(motion_mask, None), blurred

In [None]:
# Source of tile frames to be used in batch predict.
#
# Generator: reads frames from `stream` until it is exhausted, cuts every frame
# into overlapping tiles of the model input size, and yields (tile, info)
# tuples, where `info` is a dict with the keys:
#     "first_tile": True for the first tile yielded for a frame
#     "frame":      the full frame the tile was cut from
#     "topleft":    (x, y) of the tile top-left corner in the full frame
#     "tilesize":   (tile_w, tile_h)
#
# Parameters:
#     stream: opened video stream (must provide `get()` and `read()`)
#     model: model object; tile size is taken from `model.model_info`
#     min_overlap_precent: [x, y] minimum tile overlap in percent of tile size
#     progress: progress indicator; `step()` is called once per frame read
#
# When the module-level `do_motion_detection` flag is set, the very first frame
# yields no tiles (there is no base image to compare with yet), and for other
# frames only tiles containing motion pixels are yielded, so a frame without
# motion yields no tiles at all.
def source(stream, model, min_overlap_precent, progress):
    tile_w, tile_h = model.model_info.InputW[0], model.model_info.InputH[0]
    image_w, image_h = int(stream.get(cv2.CAP_PROP_FRAME_WIDTH)), int(
        stream.get(cv2.CAP_PROP_FRAME_HEIGHT)
    )

    # function to calculate optimal overlap (0..1) and number of tiles
    def calc_overlap(tile_dim, image_dim, min_overlap_precent):
        if image_dim <= tile_dim:
            # image fits into a single tile along this dimension: one tile,
            # no overlap (also avoids division by zero / zero tile count below)
            return 0.0, 1

        # smallest number of tile steps which still gives the requested overlap
        tiles_less_one = math.ceil(
            (image_dim - tile_dim) / (tile_dim * (1.0 - 0.01 * min_overlap_precent))
        )
        # spread tiles evenly so the last tile ends at the image border
        return (
            1.0 - (image_dim - tile_dim) / (tiles_less_one * tile_dim),
            tiles_less_one + 1,
        )

    x_overlap, x_tiles = calc_overlap(tile_w, image_w, min_overlap_precent[0])
    y_overlap, y_tiles = calc_overlap(tile_h, image_h, min_overlap_precent[1])
    print(
        f"Full frame: {image_w}x{image_h}, tile: {tile_w}x{tile_h}, overlap: {round(x_overlap*100)}x{round(y_overlap*100)}%, tiles: {x_tiles}x{y_tiles}={x_tiles*y_tiles}"
    )

    base_img = None  # base image for motion detection

    while True:
        ret, frame = stream.read()
        if not ret:
            break

        progress.step()

        # marks the first tile yielded for the current frame
        first_tile = True

        if do_motion_detection:
            motion_img, base_img = detectMotion(base_img, frame)
            if motion_img is None:
                # no base image yet: nothing to compare with, skip this frame
                continue

        # loop over tiles
        for xi in range(x_tiles):
            for yi in range(y_tiles):
                x, y = math.floor(xi * tile_w * (1 - x_overlap)), math.floor(
                    yi * tile_h * (1 - y_overlap)
                )

                if do_motion_detection:
                    # skip tiles which have no motion pixels
                    if (
                        cv2.countNonZero(motion_img[y : y + tile_h, x : x + tile_w])
                        == 0
                    ):
                        continue

                tile = frame[y : y + tile_h, x : x + tile_w]
                info = {
                    "first_tile": first_tile,
                    "frame": frame,
                    "topleft": (x, y),
                    "tilesize": (tile_w, tile_h),
                }
                first_tile = False
                yield (tile, info)

In [None]:
# Merge the detection result of one tile into the result accumulated so far
# for the current frame.
#     combined_result: accumulated result, or a falsy value for the first tile
#     new_result: result of the tile; it is modified in place
#     iou_threshold: two boxes are considered the same object when their
#         intersection area divided by the smaller box area reaches this value
# Returns the updated accumulated result.
def combine(combined_result, new_result, iou_threshold=0.5):
    # keep only detections of the classes of interest
    wanted = []
    for detection in new_result._inference_results:
        if detection.get("label") in classes:
            wanted.append(detection)
    new_result._inference_results = wanted

    # shift bboxes from tile coordinates to full image coordinates:
    # the tile origin is added to both bbox corners
    tile_origin = new_result.info["topleft"]
    shift = tile_origin + tile_origin
    for detection in new_result._inference_results:
        detection["bbox"] = list(np.array(detection["bbox"]) + shift)

    # first tile of the frame: its result becomes the accumulated result,
    # with the full frame attached as the input image
    if not combined_result:
        new_result._input_image = new_result.info["frame"]
        return new_result

    # subsequent tile: merge its boxes into the accumulated ones
    for candidate in new_result._inference_results:
        is_duplicate = False
        for known in combined_result._inference_results:
            pair = np.array([candidate["bbox"], known["bbox"]])
            pair_areas = degirum_tools.area(pair)
            overlap = degirum_tools.intersection(pair[0], pair[1])
            if overlap / min(pair_areas) >= iou_threshold:
                # same object seen in two tiles: keep the bigger box
                if pair_areas[0] > pair_areas[1]:
                    known["bbox"] = candidate["bbox"]
                is_duplicate = True
                break
        if not is_duplicate:
            # no sufficiently overlapping box found: this is a new object
            combined_result._inference_results.append(candidate)

    return combined_result

In [None]:
# set to True by the worker thread to request stopping the inference loop
abort = False

# AI prediction loop
# Press 'x' or 'q' to stop
with degirum_tools.Display(
    "Tiled Detection", not do_motion_detection
) as display, degirum_tools.open_video_stream(
    video_source
) as stream, degirum_tools.open_video_writer(
    str(ann_path),
    stream.get(cv2.CAP_PROP_FRAME_WIDTH),
    stream.get(cv2.CAP_PROP_FRAME_HEIGHT),
) as writer:
    # do image processing in separate thread to improve performance
    result_queue = queue.Queue()

    # Worker: takes combined per-frame results from the queue, writes the
    # overlay image to the video file and shows it on the display.
    # A None item in the queue tells the worker to stop.
    def worker():
        global abort
        try:
            while True:
                result = result_queue.get()
                if result is None:
                    break
                img = result.image_overlay
                writer.write(img)

                # the tile counter text is drawn after the frame is written,
                # so it is visible on the display only
                if do_motion_detection:
                    degirum_tools.put_text(
                        img,
                        f"Motion tiles: {result.info['tiles_cnt']:2d}",
                        (0, 0),
                        font_color=(0, 0, 0),
                        bg_color=(255, 255, 255),
                    )
                display.show(img)
        except KeyboardInterrupt:
            # ask the inference loop to stop
            abort = True

    worker_thread = threading.Thread(target=worker)
    worker_thread.start()

    try:
        progress = degirum_tools.Progress(int(stream.get(cv2.CAP_PROP_FRAME_COUNT)))
        combined_result = None  # result accumulated over tiles of current frame
        tiles_cnt = 0  # number of tiles processed for current frame

        # inference loop
        for inference_result in model.predict_batch(
            source(stream, model, min_overlap_percent, progress)
        ):
            if (
                inference_result.info["first_tile"] and combined_result
            ):  # new frame started
                combined_result.info["tiles_cnt"] = tiles_cnt
                result_queue.put(combined_result)
                combined_result = None
                tiles_cnt = 0

            combined_result = combine(combined_result, inference_result)
            tiles_cnt += 1
            if abort:
                break

        # the result of the last frame is still pending at this point
        # (no further "first tile" arrives to flush it): send it as well
        if combined_result and not abort:
            combined_result.info["tiles_cnt"] = tiles_cnt
            result_queue.put(combined_result)
    finally:
        # always stop the worker thread, even when the inference loop fails,
        # otherwise it would block forever waiting on the queue
        result_queue.put(None)
        worker_thread.join()

In [None]:
# display result: the annotated video stored at `ann_path`
degirum_tools.ipython_display(ann_path)

In [None]:
# display original video (the `video_source` used for inference), for comparison
degirum_tools.ipython_display(video_source)