Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 7 additions & 29 deletions slvideotools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from typing import Tuple


def clamp(x, lo, hi):
    """Constrain *x* to the closed interval [lo, hi].

    Values below ``lo`` are raised to ``lo``; values above ``hi`` are
    lowered to ``hi``; anything already inside the range passes through
    unchanged. The lower bound is checked first, matching the original
    conditional-expression form.
    """
    if x < lo:
        return lo
    if x > hi:
        return hi
    return x


def video_info(video_path: str) -> Tuple[int, int, int]:
"""
Uses the ffmpeg.probe function to retrieve information about a video file.
Expand Down Expand Up @@ -33,7 +38,7 @@ def video_info(video_path: str) -> Tuple[int, int, int]:


# https://en.wikipedia.org/wiki/Point_reflection
def reflect(c, P):
def reflect(c: Tuple[float, float], P: Tuple[float, float]) -> Tuple[float, float]:
"""
Make a reflection of a point P according to the center c

Expand All @@ -48,34 +53,7 @@ def reflect(c, P):
P_x_prime = 2*c[0] - P[0]
P_y_prime = 2*c[1] - P[1]

return np.array([P_x_prime, P_y_prime]).astype(int)


def get_bbox_pts(nose, rshoulder):
    """Compute the two corner points of the head region-of-interest box.

    One corner is the given shoulder point; the other is that point's
    reflection through the nose (point reflection), so the nose sits at
    the centre of the returned pair.

    Args:
        nose : 2D nose coordinates, used as the centre of reflection
        rshoulder : 2D coordinates of one shoulder, can be either left or right

    Returns:
        bbox : a numpy array of two 2D points: [reflected corner, shoulder corner].
            NOTE(review): which point is "upper left" vs "lower right" depends on
            image orientation and on which shoulder was passed — verify against callers.

    """

    # Mirror the shoulder through the nose to obtain the opposite ROI corner
    pt1 = reflect(nose, rshoulder)
    pt2 = rshoulder

    return np.array([pt1, pt2])


"""def expand_bbox(x,y,w,h):
pt1 = np.array([x,y])
pt2 = np.array([x,y+h])
pt3 = np.array([x+w,y])
pt4 = np.array([x+w,y+h])
return np.array([pt1,pt2,pt3,pt4])
"""
return P_x_prime, P_y_prime


def bbox_to_dict(x: Tuple[int, int, int, int]) -> dict:
Expand Down
110 changes: 66 additions & 44 deletions slvideotools/extract_face_bounds.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,53 @@
import cv2
import mediapipe as mp
from .common import *
import numpy as np
from .common import reflect
from .common import clamp

from .datagen import FrameProducer
from .datagen import create_frame_producer

import json
from typing import Tuple
from typing import Union

mp_face_detection = mp.solutions.face_detection
mp_pose = mp.solutions.pose
mp_holistic = mp.solutions.holistic


def _get_head_region_info_cv2(image):
def _get_head_region_info(image: np.ndarray, pose_detector: mp_pose.Pose)\
-> Union[None, Tuple[Tuple[float, float], Tuple[float, float], Tuple[float, float]]]:
"""
Find the ROI containing the face from an input image

Args:
image : input image

Returns: List [nose,rshoulder,lshoulder]
nose: nose coordinates
rshoulder: right shoulder coordinates
lshoulder: left shoulder coordinates
:param image: input BGR (cv2 format) image of type np.ndarray and shape [height, width, 3]
:param pose_detector: the MediaPipe pose detector

:returns List[nose, rshoulder, lshoulder]. Each element is a 2-size ndarray with 2D landmark coordinates in pixel space.
Or returns None if no body could be detected
"""
with mp_pose.Pose(
static_image_mode=True,
model_complexity=2,
min_detection_confidence=0.5) as pose:
image_height, image_width, _ = image.shape
# Convert the BGR image to RGB before processing.
results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

# Get the pose keypoint of the whole body
if not results.pose_landmarks:
raise RuntimeError("no body roi detected")

# Selection of the necessary part from landmarks
nose = np.array([results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].y * image_height])
rshoulder = np.array([results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].y * image_height])
lshoulder = np.array([results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].y * image_height])

return [nose, rshoulder, lshoulder]


def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False)\
-> Tuple[int, int, int, int]:

image_height, image_width, _ = image.shape
# Convert the BGR image to RGB before processing.
results = pose_detector.process(image)

# Get the pose keypoint of the whole body
if not results.pose_landmarks:
return None

# Selection of the necessary part from landmarks
nose = (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].y * image_height)
rshoulder = (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.RIGHT_SHOULDER].y * image_height)
lshoulder = (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].x * image_width,
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_SHOULDER].y * image_height)

return nose, rshoulder, lshoulder


def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False) -> Tuple[int, int, int, int]:
"""
Get the global face boundingbox throughout the video

Expand All @@ -61,24 +58,48 @@ def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False)\
as a 4-tuple of int elements, in order: x, y, width, height
"""

with mp_face_detection.FaceDetection(min_detection_confidence=0.5) as face_detection:
with mp_face_detection.FaceDetection(min_detection_confidence=0.5) as face_detection, \
mp_pose.Pose(static_image_mode=True, model_complexity=2, min_detection_confidence=0.5) as pose_detector:

xs = []
ys = []

for image in frames_in.frames():

# MediaPipe wants images in OpenCV BGR format
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

h, w, _ = image.shape
x, y = 0, 0
if head_focus:
nose, rshoulder, _ = _get_head_region_info_cv2(cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
# print("nose,rshoulder",nose,rshoulder)
pts = get_bbox_pts(nose, rshoulder)
# print(pts)
x, y, w, h = cv2.boundingRect(pts.astype(int))
# crop the bbox
image = image[y:y+h, x:x+w]

# Convert the BGR image to RGB and process it with MediaPipe Face Detection.

# Try to find information about the shoulder/nose area
head_info = _get_head_region_info(image=image, pose_detector=pose_detector)

if head_info is not None:
nose, _, lshoulder = head_info
# print("nose,rshoulder",nose,rshoulder)
# lshoulder is the bottom-right point
x2, y2 = lshoulder
# Compute the top-left
x, y = reflect(c=nose, P=lshoulder)
# Ensure that the reflected point is in the image boundaries
x = clamp(x, 0, w)
y = clamp(y, 0, h)

# Compute bbox and round to integers
w = int(x2 - x)
h = int(y2 - y)
x = int(x)
y = int(y)

# crop the bbox, if a body was visible from the front
if w > 0 and h > 0:
image = image[y:y+h, x:x+w]
# the `face_detection.process()` requires a contiguous array
image = np.ascontiguousarray(image)

# Process the image with MediaPipe Face Detection.
results = face_detection.process(image)

if not results.detections: # if nothing detected
Expand Down Expand Up @@ -107,6 +128,7 @@ def extract_face_bounds(frames_in: FrameProducer, head_focus: bool = False)\

if __name__ == '__main__':
import argparse
from .common import bbox_to_dict

parser = argparse.ArgumentParser(description='Get the bounding box of the face throughout a video')
parser.add_argument('--inframes', '--invideo',
Expand Down