## This notebook is an example of how to pipeline two models. 
A video stream from a local camera is processed by the person detection model. The person detection results are then processed by the pose detection model, one person bbox at a time.
The combined result is then displayed.
OpenCV is required to run this sample.

This script works with the following inference options:

0. DeGirum Cloud Server,
1. AI server equipped with DeGirum ORCA accelerator shared via Peer-to-Peer VPN,
2. AI server equipped with DeGirum ORCA accelerator running in the local network, and
3. AI server equipped with DeGirum ORCA accelerator running on the same machine as this code.

To try different options, you just need to change the __model_zoo_option_index__ in the code below. 

The script needs a web camera connected to the machine running this code. The __camera_index__ also needs to be specified in the code below.

### Specify your model zoo option and camera index here

In [None]:
# Supported inference targets; model_zoo_option_index below is a position in this list
model_zoo_option_list = [
    'DEGIRUM_CLOUD_SERVER',
    'P2P_VPN_SERVER',
    'LOCAL_NETWORK_SERVER',
    'LOCALHOST',
]
model_zoo_option_index = 1  # which entry of model_zoo_option_list to use
camera_index = 1  # camera index handed to cv2.VideoCapture

### The rest of the cells below should run without any modifications

In [None]:
import degirum as dg # import DeGirum PySDK
import cv2 # OpenCV
import os
from dotenv import load_dotenv

# Load settings from a .env file (python-dotenv); the server addresses and the
# cloud token are read from the environment with os.getenv further below.
envs_loaded=load_dotenv()


In [None]:
# Some helper functions

def show(img, capt = "<image>"):
    """Display an OpenCV image in the window titled `capt`.

    After drawing, the keyboard is polled once; pressing 'x' or 'q'
    raises KeyboardInterrupt so the caller can leave its processing loop.
    """
    cv2.imshow(capt, img)
    pressed = cv2.waitKey(1) & 0xFF  # keep only the low byte of the key code
    if pressed in (ord('x'), ord('q')):
        raise KeyboardInterrupt
    
def crop(img, bbox):
    """Crop an OpenCV image to the given bounding box.

    Args:
        img: image array indexed as img[row, column], exposing `.shape`.
        bbox: box as (x1, y1, x2, y2); each coordinate is truncated to int.

    Returns:
        The slice of `img` covered by the box, limited to the image area.
        The result is empty when the box lies entirely outside the image.
    """
    height, width = img.shape[:2]

    # Clamp to the image bounds. Without this, a negative coordinate would be
    # interpreted by the slice as an offset from the end of the axis and would
    # produce an empty or wrong crop.
    left = min(max(int(bbox[0]), 0), width)
    top = min(max(int(bbox[1]), 0), height)
    right = min(max(int(bbox[2]), 0), width)
    bottom = min(max(int(bbox[3]), 0), height)

    return img[top:bottom, left:right]

In [None]:
# Connect to the model zoo selected by model_zoo_option_index.
# Server addresses (and the cloud token) are taken from environment variables.
model_zoo_option = model_zoo_option_list[model_zoo_option_index]
if model_zoo_option == 'DEGIRUM_CLOUD_SERVER':
    # the cloud server is the only option that is given a token
    zoo = dg.connect_model_zoo(os.getenv('DEGIRUM_CLOUD_SERVER_ADDRESS'), token=os.getenv('DEGIRUM_CLOUD_TOKEN'))
elif model_zoo_option == 'P2P_VPN_SERVER':
    zoo = dg.connect_model_zoo(os.getenv('P2P_VPN_SERVER_ADDRESS'))
elif model_zoo_option == 'LOCAL_NETWORK_SERVER':
    zoo = dg.connect_model_zoo(os.getenv('LOCAL_NETWORK_SERVER_ADDRESS'))
elif model_zoo_option == 'LOCALHOST':
    zoo = dg.connect_model_zoo(os.getenv('LOCAL_HOST_ADDRESS'))
else:
    # Fail here instead of only printing: otherwise `zoo` stays undefined (or
    # stale from an earlier run) and the following cells fail confusingly.
    raise ValueError(f"Model zoo option not supported: {model_zoo_option!r}")

In [None]:
# Load the person detector and the pose estimator from the connected zoo.
# Both model names target the DeGirum Orca AI accelerator
# (change model name to "...n2x_cpu_1" to run it on CPU)
people_det_model = zoo.load_model("yolo_v5s_person_det--512x512_quant_n2x_orca_1")
pose_model = zoo.load_model("mobilenet_v1_posenet_coco_keypoints--353x481_quant_n2x_orca_1")

# person detector: OpenCV image backend, JPEG input image format
people_det_model.image_backend = 'opencv'
people_det_model._model_parameters.InputImgFmt = ['JPEG']

# pose estimator: result threshold and overlay appearance
pose_model.output_pose_threshold = 0.2  # lower threshold
pose_model.overlay_line_width = 1
pose_model.overlay_alpha = 1
pose_model.overlay_show_probabilities = False
pose_model.overlay_show_labels = False

# pose estimator: OpenCV image backend, BGR numpy input, JPEG input image format
pose_model.image_backend = 'opencv'
pose_model.input_numpy_colorspace = 'BGR'
pose_model._model_parameters.InputImgFmt = ['JPEG']


In [None]:
# open video stream from local camera and report whether it could be opened
stream = cv2.VideoCapture(camera_index)
if stream.isOpened():
    print("Successfully opened video stream")
else:
    print("Error opening video stream")

In [None]:
# define iterator function, which returns frames from camera 
def source():
    """Yield frames read from the module-level camera `stream`.

    Iteration ends as soon as a read fails, so consumers never receive
    an invalid frame.
    """
    while True:
        ret, frame = stream.read()
        if not ret:
            # the read did not deliver a frame: stop instead of yielding it
            return
        yield frame

In [None]:
# AI prediction loop
# Press 'x' or 'q' to stop
try:
    # run person detection model on a camera stream
    for people in people_det_model.predict_batch(source()):
        # prepare list of bboxes of detected persons
        person_boxes = [person['bbox'] for person in people.results]
        if not person_boxes:
            # Nobody detected: still refresh the window, otherwise the display
            # freezes on the previous frame and the quit keys are not polled.
            show(people.image_overlay, "Poses")
            continue

        # prepare list of images cropped around each detected person
        person_crops = [crop(people.image, box) for box in person_boxes]

        # for each detected person detect the pose
        all_poses = None  # accumulated result
        for poses, box in zip(pose_model.predict_batch(person_crops), person_boxes):

            # pose coordinates are relative to the crop: shift them by the
            # crop origin to convert them back to the original image
            for r in poses.results:
                for p in r['landmarks']:
                    p['landmark'][0] += box[0]
                    p['landmark'][1] += box[1]

            if all_poses is None:
                # first result becomes the accumulator; it is drawn on top of
                # the person detection overlay of the full frame
                all_poses = poses
                all_poses._input_image = people.image_overlay
            else:
                # append poses of the remaining persons to the accumulator
                all_poses._inference_results += poses.results

        show(all_poses.image_overlay, "Poses")

except KeyboardInterrupt:
    pass  # raised by show() on 'x'/'q': normal way to stop the loop
finally:
    # clean up on every exit path, including unexpected exceptions
    cv2.destroyAllWindows()  # close OpenCV windows
    stream.release()  # release camera stream