Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 7 additions & 29 deletions slvideotools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from typing import Tuple


def clamp(x, lo, hi):
    """Constrain *x* to the closed interval [lo, hi].

    Values below ``lo`` are raised to ``lo``; values above ``hi`` are
    lowered to ``hi``; anything already inside the range passes through
    unchanged. The lower bound is checked first, matching the original
    conditional-expression form.
    """
    if x < lo:
        return lo
    if x > hi:
        return hi
    return x


def video_info(video_path: str) -> Tuple[int, int, int]:
"""
Uses the ffmpeg.probe function to retrieve information about a video file.
Expand Down Expand Up @@ -33,7 +38,7 @@ def video_info(video_path: str) -> Tuple[int, int, int]:


# https://en.wikipedia.org/wiki/Point_reflection
def reflect(c, P):
def reflect(c: Tuple[float, float], P: Tuple[float, float]) -> Tuple[float, float]:
"""
Make a reflection of a point P according to the center c

Expand All @@ -48,34 +53,7 @@ def reflect(c, P):
P_x_prime = 2*c[0] - P[0]
P_y_prime = 2*c[1] - P[1]

return np.array([P_x_prime, P_y_prime]).astype(int)


def get_bbox_pts(nose, rshoulder):
    """Compute the two corner points of the head region-of-interest box.

    One corner is the given shoulder point; the other is that point's
    reflection through the nose (point reflection), so the nose sits at
    the centre of the returned pair.

    Args:
        nose : 2D nose coordinates, used as the centre of reflection
        rshoulder : 2D coordinates of one shoulder, can be either left or right

    Returns:
        bbox : a numpy array of two 2D points: [reflected corner, shoulder corner].
            NOTE(review): which point is "upper left" vs "lower right" depends on
            image orientation and on which shoulder was passed — verify against callers.

    """

    # Mirror the shoulder through the nose to obtain the opposite ROI corner
    pt1 = reflect(nose, rshoulder)
    pt2 = rshoulder

    return np.array([pt1, pt2])


"""def expand_bbox(x,y,w,h):
pt1 = np.array([x,y])
pt2 = np.array([x,y+h])
pt3 = np.array([x+w,y])
pt4 = np.array([x+w,y+h])
return np.array([pt1,pt2,pt3,pt4])
"""
return P_x_prime, P_y_prime


def bbox_to_dict(x: Tuple[int, int, int, int]) -> dict:
Expand Down
110 changes: 66 additions & 44 deletions slvideotools/extract_face_bounds.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,53 @@
import cv2
import mediapipe as mp
from .common import *
import numpy as np
from .common import reflect
from .common import clamp

from .datagen import FrameProducer
from .datagen import create_frame_producer

import json
from typing import Tuple
from typing import Union

mp_face_detection = mp.solutions.face_detection
mp_pose = mp.solutions.pose
mp_holistic = mp.solutions.holistic


def _get_head_region_info_cv2(image):
def _get_head_region_info(image: np.ndarray, pose_detector: mp_pose.Pose)\
-> Union[None, Tuple[Tuple[float, float], Tuple[float, float], Tuple[float, float]]]:
"""
Find the ROI containing the face from an input image

Args:
image : input image

Returns: List [nose,rshoulder,lshoulder]
nose: nose coordinates
rshoulder: right shoulder coordinates
lshoulder: left shoulder coordinates
:param image: input BGR (cv2 format) image of type np.ndarray and shape [height, width, 3]
:param pose_detector: the MediaPipe pose detector

:returns List[nose, rshoulder, lshoulder]. Each element is a 2-size ndarray with 2D landmark coordinates in pixel space.
Or returns None if no body could be detected
"""
with mp_pose.Pose(
static_image_mode=True,
model_complexity=2,
min_detection_confidence=0.5) as pose:
image_height, image_width, _ = image.shape
# Convert the BGR image to RGB before processing.
results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

# Get the pose keypoint of the whole body
if not results.pose_landmarks:
raise RuntimeError("no body roi detected")

# Selection of the necessary part from landmarks
nose = np.array([results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].y * image_height])
rshoulder = np.array([results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].y * image_height])
lshoulder = np.array([results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].y * image_height])

return [nose, rshoulder, lshoulder]


def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False)\
-> Tuple[int, int, int, int]:

image_height, image_width, _ = image.shape
# Convert the BGR image to RGB before processing.
results = pose_detector.process(image)

# Get the pose keypoint of the whole body
if not results.pose_landmarks:
return None

# Selection of the necessary part from landmarks
nose = (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].y * image_height)
rshoulder = (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].y * image_height)
lshoulder = (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].y * image_height)

return nose, rshoulder, lshoulder


def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False) -> Tuple[int, int, int, int]:
"""
Get the global face boundingbox throughout the video

Expand All @@ -61,24 +58,48 @@ def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False)\
as a 4-tuple of int elements, in order: x, y, width, height
"""

with mp_face_detection.FaceDetection(min_detection_confidence=0.5) as face_detection:
with mp_face_detection.FaceDetection(min_detection_confidence=0.5) as face_detection, \
mp_pose.Pose(static_image_mode=True, model_complexity=2, min_detection_confidence=0.5) as pose_detector:

xs = []
ys = []

for image in frames_in.frames():

# MediaPipe wants images in OpenCV BGR format
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

h, w, _ = image.shape
x, y = 0, 0
if head_focus:
nose, rshoulder, _ = _get_head_region_info_cv2(cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
# print("nose,rshoulder",nose,rshoulder)
pts = get_bbox_pts(nose, rshoulder)
# print(pts)
x, y, w, h = cv2.boundingRect(pts.astype(int))
# crop the bbox
image = image[y:y+h, x:x+w]

# Convert the BGR image to RGB and process it with MediaPipe Face Detection.

# Try to find information about the shoulder/nose area
head_info = _get_head_region_info(image=image, pose_detector=pose_detector)

if head_info is not None:
nose, _, lshoulder = head_info
# print("nose,rshoulder",nose,rshoulder)
# lshoulder is the bottom-right point
x2, y2 = lshoulder
# Compute the top-left
x, y = reflect(c=nose, P=lshoulder)
# Ensure that the reflected point is in the image boundaries
x = clamp(x, 0, w)
y = clamp(y, 0, h)

# Compute bbox and round to integers
w = int(x2 - x)
h = int(y2 - y)
x = int(x)
y = int(y)

# crop the bbox, if a body was visible from the front
if w > 0 and h > 0:
image = image[y:y+h, x:x+w]
# the `face_detection.process()` requires a contiguous array
image = np.ascontiguousarray(image)

# Process the image with MediaPipe Face Detection.
results = face_detection.process(image)

if not results.detections: # if nothing detected
Expand Down Expand Up @@ -107,6 +128,7 @@ def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False)\

if __name__ == '__main__':
import argparse
from .common import bbox_to_dict

parser = argparse.ArgumentParser(description='Get the bounding box of the face throughout a video')
parser.add_argument('--inframes', '--invideo',
Expand Down