Daniil-Osokin · leonelhs · Nov 17, 2023
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,13 @@
 *.pyc
 __pycache__
-.idea/
+.idea/
+dist/
+human-pose-estimation.bin
+human-pose-estimation.onnx
+human-pose-estimation.xml
+checkpoint_iter_370000.pth
+human_pose_estimator.egg-info
+build
+playground.py
+task_build
+task_upload
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # Real-time 2D Multi-Person Pose Estimation on CPU: Lightweight OpenPose
-
+# [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97-Try%20in%20Huggingface-yellow)](https://huggingface.co/spaces/leonelhs/poser-torch) [![PyPI version](https://badge.fury.io/py/human-pose-estimator.svg)](https://badge.fury.io/py/human-pose-estimator)  
 This repository contains training code for the paper [Real-time 2D Multi-Person Pose Estimation on CPU: Lightweight OpenPose](https://arxiv.org/pdf/1811.12004.pdf). This work heavily optimizes the [OpenPose](https://github.com/CMU-Perceptual-Computing-Lab/openpose) approach to reach real-time inference on CPU with negligible accuracy drop. It detects a skeleton (which consists of keypoints and connections between them) to identify human poses for every person inside the image. The pose may contain up to 18 keypoints: ears, eyes, nose, neck, shoulders, elbows, wrists, hips, knees, and ankles. On COCO 2017 Keypoint Detection validation set this code achieves 40% AP for the single scale inference (no flip or any post-processing done). The result can be reproduced using this repository. *This repo significantly overlaps with https://github.com/opencv/openvino_training_extensions, however contains just the necessary code for human pose estimation.*
 
 <p align="center">
@@ -17,8 +17,9 @@ This repository contains training code for the paper [Real-time 2D Multi-Person
 * [Training](#training)
 * [Validation](#validation)
 * [Pre-trained model](#pre-trained-model)
-* [C++ demo](#cpp-demo)
+* [C++ demo](#cpp-demo)
 * [Python demo](#python-demo)
+* [Python API](#api-usage)
 * [Citation](#citation)
 
 ### Other Implementations
@@ -28,9 +29,9 @@ This repository contains training code for the paper [Real-time 2D Multi-Person
 
 ## Requirements
 
-* Ubuntu 16.04
-* Python 3.6
-* PyTorch 0.4.1 (should also work with 1.0, but not tested)
+* Ubuntu >= 16.04 <= 23.04
+* Python >= 3.6 <= 3.11.6
+* PyTorch >= 0.4.1 <= 2.1.0
 
 ## Prerequisites
 
@@ -107,7 +108,33 @@ To run the demo download Intel&reg; OpenVINO&trade; Toolkit [https://software.in
 ## Python Demo <a name="python-demo"/>
 
 We provide a Python demo just for a quick preview of the results. Please consider the C++ demo for the best performance. To run the Python demo from a webcam:
-* `python demo.py --checkpoint-path <path_to>/checkpoint_iter_370000.pth --video 0`
+
+
+```console
+foo@bar:~$ pip install human-pose-estimator
+
+foo@bar:~$ poseestimator --cpu --video /dev/video0
+foo@bar:~$ poseestimator --cpu --images /home/poses
+foo@bar:~$ poseestimator --cpu --images /home/poses/pose01.jpg
+```
+## Python API usage <a name="api-usage"/>
+```python
+import cv2
+from human_pose_estimator import PoseEstimator
+
+img = cv2.imread("/home/leonel/poses/pose02.jpg")
+
+pose_estimator = PoseEstimator("cpu")
+
+poses, _, _ = pose_estimator.get_poses(img, height_size=256)
+
+for pose in poses:
+    pose.draw(img)
+
+cv2.imshow('Human Pose Estimation', img)
+cv2.waitKey(0)
+cv2.destroyAllWindows()
+```
 
 ## Citation:
 

diff --git a/demo.py b/demo.py
@@ -1,166 +1,15 @@
-import argparse
-
 import cv2
-import numpy as np
-import torch
-
-from models.with_mobilenet import PoseEstimationWithMobileNet
-from modules.keypoints import extract_keypoints, group_keypoints
-from modules.load_state import load_state
-from modules.pose import Pose, track_poses
-from val import normalize, pad_width
-
-
-class ImageReader(object):
-    def __init__(self, file_names):
-        self.file_names = file_names
-        self.max_idx = len(file_names)
-
-    def __iter__(self):
-        self.idx = 0
-        return self
-
-    def __next__(self):
-        if self.idx == self.max_idx:
-            raise StopIteration
-        img = cv2.imread(self.file_names[self.idx], cv2.IMREAD_COLOR)
-        if img.size == 0:
-            raise IOError('Image {} cannot be read'.format(self.file_names[self.idx]))
-        self.idx = self.idx + 1
-        return img
-
-
-class VideoReader(object):
-    def __init__(self, file_name):
-        self.file_name = file_name
-        try:  # OpenCV needs int to read from webcam
-            self.file_name = int(file_name)
-        except ValueError:
-            pass
-
-    def __iter__(self):
-        self.cap = cv2.VideoCapture(self.file_name)
-        if not self.cap.isOpened():
-            raise IOError('Video {} cannot be opened'.format(self.file_name))
-        return self
-
-    def __next__(self):
-        was_read, img = self.cap.read()
-        if not was_read:
-            raise StopIteration
-        return img
-
-
-def infer_fast(net, img, net_input_height_size, stride, upsample_ratio, cpu,
-               pad_value=(0, 0, 0), img_mean=np.array([128, 128, 128], np.float32), img_scale=np.float32(1/256)):
-    height, width, _ = img.shape
-    scale = net_input_height_size / height
-
-    scaled_img = cv2.resize(img, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
-    scaled_img = normalize(scaled_img, img_mean, img_scale)
-    min_dims = [net_input_height_size, max(scaled_img.shape[1], net_input_height_size)]
-    padded_img, pad = pad_width(scaled_img, stride, pad_value, min_dims)
-
-    tensor_img = torch.from_numpy(padded_img).permute(2, 0, 1).unsqueeze(0).float()
-    if not cpu:
-        tensor_img = tensor_img.cuda()
-
-    stages_output = net(tensor_img)
-
-    stage2_heatmaps = stages_output[-2]
-    heatmaps = np.transpose(stage2_heatmaps.squeeze().cpu().data.numpy(), (1, 2, 0))
-    heatmaps = cv2.resize(heatmaps, (0, 0), fx=upsample_ratio, fy=upsample_ratio, interpolation=cv2.INTER_CUBIC)
-
-    stage2_pafs = stages_output[-1]
-    pafs = np.transpose(stage2_pafs.squeeze().cpu().data.numpy(), (1, 2, 0))
-    pafs = cv2.resize(pafs, (0, 0), fx=upsample_ratio, fy=upsample_ratio, interpolation=cv2.INTER_CUBIC)
-
-    return heatmaps, pafs, scale, pad
-
-
-def run_demo(net, image_provider, height_size, cpu, track, smooth):
-    net = net.eval()
-    if not cpu:
-        net = net.cuda()
-
-    stride = 8
-    upsample_ratio = 4
-    num_keypoints = Pose.num_kpts
-    previous_poses = []
-    delay = 1
-    for img in image_provider:
-        orig_img = img.copy()
-        heatmaps, pafs, scale, pad = infer_fast(net, img, height_size, stride, upsample_ratio, cpu)
-
-        total_keypoints_num = 0
-        all_keypoints_by_type = []
-        for kpt_idx in range(num_keypoints):  # 19th for bg
-            total_keypoints_num += extract_keypoints(heatmaps[:, :, kpt_idx], all_keypoints_by_type, total_keypoints_num)
-
-        pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, pafs)
-        for kpt_id in range(all_keypoints.shape[0]):
-            all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * stride / upsample_ratio - pad[1]) / scale
-            all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * stride / upsample_ratio - pad[0]) / scale
-        current_poses = []
-        for n in range(len(pose_entries)):
-            if len(pose_entries[n]) == 0:
-                continue
-            pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1
-            for kpt_id in range(num_keypoints):
-                if pose_entries[n][kpt_id] != -1.0:  # keypoint was found
-                    pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0])
-                    pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1])
-            pose = Pose(pose_keypoints, pose_entries[n][18])
-            current_poses.append(pose)
-
-        if track:
-            track_poses(previous_poses, current_poses, smooth=smooth)
-            previous_poses = current_poses
-        for pose in current_poses:
-            pose.draw(img)
-        img = cv2.addWeighted(orig_img, 0.6, img, 0.4, 0)
-        for pose in current_poses:
-            cv2.rectangle(img, (pose.bbox[0], pose.bbox[1]),
-                          (pose.bbox[0] + pose.bbox[2], pose.bbox[1] + pose.bbox[3]), (0, 255, 0))
-            if track:
-                cv2.putText(img, 'id: {}'.format(pose.id), (pose.bbox[0], pose.bbox[1] - 16),
-                            cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 255))
-        cv2.imshow('Lightweight Human Pose Estimation Python Demo', img)
-        key = cv2.waitKey(delay)
-        if key == 27:  # esc
-            return
-        elif key == 112:  # 'p'
-            if delay == 1:
-                delay = 0
-            else:
-                delay = 1
-
+from human_pose_estimator import PoseEstimator
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='''Lightweight human pose estimation python demo.
-                       This is just for quick results preview.
-                       Please, consider c++ demo for the best performance.''')
-    parser.add_argument('--checkpoint-path', type=str, required=True, help='path to the checkpoint')
-    parser.add_argument('--height-size', type=int, default=256, help='network input layer height size')
-    parser.add_argument('--video', type=str, default='', help='path to video file or camera id')
-    parser.add_argument('--images', nargs='+', default='', help='path to input image(s)')
-    parser.add_argument('--cpu', action='store_true', help='run network inference on cpu')
-    parser.add_argument('--track', type=int, default=1, help='track pose id in video')
-    parser.add_argument('--smooth', type=int, default=1, help='smooth pose keypoints')
-    args = parser.parse_args()
+img = cv2.imread("/home/leonel/poses/pose02.jpg")
 
-    if args.video == '' and args.images == '':
-        raise ValueError('Either --video or --image has to be provided')
+pose_estimator = PoseEstimator("cpu")
 
-    net = PoseEstimationWithMobileNet()
-    checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
-    load_state(net, checkpoint)
+poses, _, _ = pose_estimator.get_poses(img, height_size=256)
 
-    frame_provider = ImageReader(args.images)
-    if args.video != '':
-        frame_provider = VideoReader(args.video)
-    else:
-        args.track = 0
+for pose in poses:
+    pose.draw(img)
 
-    run_demo(net, frame_provider, args.height_size, args.cpu, args.track, args.smooth)
+cv2.imshow('Human Pose Estimation', img)
+cv2.waitKey(0)
+cv2.destroyAllWindows()
diff --git a/human_pose_estimator/__init__.py b/human_pose_estimator/__init__.py
@@ -0,0 +1,8 @@
+# Display name and version string of the package.
+__appname__ = "Human Pose Estimator"
+__version__ = "1.0.0"
+
+from .image_reader import ImageReader
+from .video_reader import VideoReader
+from .pose_estimator import PoseEstimator
+
+
diff --git a/human_pose_estimator/__main__.py b/human_pose_estimator/__main__.py
@@ -0,0 +1,86 @@
+import argparse
+import os.path
+
+import cv2
+import filetype
+
+from human_pose_estimator import PoseEstimator, VideoReader, ImageReader
+from human_pose_estimator.modules.pose import track_poses
+
+
+def make_estimation(estimator, image_provider, delay, height_size, track, smooth):
+    """Estimate poses on each frame of ``image_provider`` and show the result.
+
+    Each frame is passed to ``estimator.get_poses``; the returned poses are
+    drawn, blended with the untouched frame, boxed and displayed in an OpenCV
+    window. Pressing Esc stops the loop; pressing 'p' switches ``delay``
+    between 1 and 0 for the following ``cv2.waitKey`` calls.
+
+    :param estimator: object exposing ``get_poses(img, height_size)``.
+    :param image_provider: iterable of frames.
+    :param delay: initial ``cv2.waitKey`` delay.
+    :param height_size: forwarded to ``estimator.get_poses``.
+    :param track: when truthy, ``track_poses`` is called with the previous
+        and current poses and each pose id is written next to its box.
+    :param smooth: forwarded to ``track_poses`` as ``smooth``.
+    """
+    esc_key = 27
+    pause_key = 112  # 'p'
+    window_title = 'Lightweight Human Pose Estimation Python Demo'
+    tracked_poses = []
+
+    for frame in image_provider:
+        pristine = frame.copy()
+        poses, _, _ = estimator.get_poses(frame, height_size)
+
+        if track:
+            track_poses(tracked_poses, poses, smooth=smooth)
+            tracked_poses = poses
+
+        for pose in poses:
+            pose.draw(frame)
+        # Mix 60% of the untouched frame with 40% of the annotated one.
+        frame = cv2.addWeighted(pristine, 0.6, frame, 0.4, 0)
+
+        for pose in poses:
+            top_left = (pose.bbox[0], pose.bbox[1])
+            bottom_right = (pose.bbox[0] + pose.bbox[2], pose.bbox[1] + pose.bbox[3])
+            cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0))
+            if track:
+                cv2.putText(frame, 'id: {}'.format(pose.id), (pose.bbox[0], pose.bbox[1] - 16),
+                            cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 255))
+
+        cv2.imshow(window_title, frame)
+        pressed = cv2.waitKey(delay)
+        if pressed == esc_key:
+            return
+        if pressed == pause_key:
+            delay = 0 if delay == 1 else 1
+
+
+def main():
+    """Parse the command-line arguments and run the pose estimation demo.
+
+    Frames come from ``--video`` when it is given; otherwise from the image
+    file, or the image files of the directory, passed with ``--images``.
+    Tracking is switched off for still images.
+
+    :raises ValueError: if neither ``--video`` nor ``--images`` is given, if
+        the ``--images`` path does not exist, or if no video is given and the
+        ``--images`` path yields no image file.
+    """
+    parser = argparse.ArgumentParser(
+        description='''Lightweight human pose estimation python demo.
+                           This is just for quick results preview.
+                           Please, consider c++ demo for the best performance.''')
+    parser.add_argument('--height-size', type=int, default=256, help='network input layer height size')
+    parser.add_argument('--video', type=str, default='', help='path to video file or camera id')
+    parser.add_argument('--images', type=str, default='', help='path to input image(s)')
+    parser.add_argument('--cpu', action='store_true', help='run network inference on cpu')
+    parser.add_argument('--track', type=int, default=1, help='track pose id in video')
+    parser.add_argument('--smooth', type=int, default=1, help='smooth pose keypoints')
+    args = parser.parse_args()
+
+    if args.video == '' and args.images == '':
+        raise ValueError('Either --video or --images has to be provided')
+
+    output_delay = 0
+    frame_provider = None
+
+    if args.images != '':
+        if os.path.isdir(args.images):
+            # os.listdir returns entries in arbitrary order; sort them so the
+            # images are shown in a repeatable sequence.
+            candidates = [os.path.join(args.images, name)
+                          for name in sorted(os.listdir(args.images))]
+        elif os.path.isfile(args.images):
+            candidates = [args.images]
+        else:
+            raise ValueError('No valid images were found.')
+        # Skip sub-directories and other non-files before sniffing the type.
+        images = [path for path in candidates
+                  if os.path.isfile(path) and filetype.is_image(path)]
+        if images:
+            frame_provider = ImageReader(images)
+
+    if args.video != '':
+        # A video source takes precedence over any images.
+        frame_provider = VideoReader(args.video)
+        output_delay = 1
+    else:
+        args.track = 0
+
+    if frame_provider is None:
+        # --images pointed at a non-image file or a directory without images;
+        # fail here instead of crashing later while iterating over None.
+        raise ValueError('No valid images were found.')
+
+    # NOTE(review): the README constructs PoseEstimator("cpu") with a device
+    # string, while this passes the --cpu boolean - confirm which form the
+    # constructor expects.
+    pose_estimator = PoseEstimator(args.cpu)
+
+    make_estimation(pose_estimator, frame_provider, output_delay, args.height_size, args.track, args.smooth)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/datasets/__init__.py → human_pose_estimator/datasets/__init__.py b/datasets/__init__.py → human_pose_estimator/datasets/__init__.py
diff --git a/datasets/coco.py → human_pose_estimator/datasets/coco.py b/datasets/coco.py → human_pose_estimator/datasets/coco.py
diff --git a/datasets/transformations.py → ...ose_estimator/datasets/transformations.py b/datasets/transformations.py → ...ose_estimator/datasets/transformations.py
diff --git a/human_pose_estimator/image_reader.py b/human_pose_estimator/image_reader.py
@@ -0,0 +1,21 @@
+import cv2
+
+
+class ImageReader(object):
+    """Iterable that loads the images named in ``file_names`` one at a time.
+
+    Every ``iter()`` call rewinds to the first file, so the same reader can
+    be looped over more than once.
+    """
+
+    def __init__(self, file_names):
+        """Store the image paths to read.
+
+        :param file_names: sized, indexable collection of image file paths.
+        """
+        self.file_names = file_names
+        self.max_idx = len(file_names)
+
+    def __iter__(self):
+        """Rewind to the first file and return the reader as its own iterator."""
+        self.idx = 0
+        return self
+
+    def __next__(self):
+        """Load and return the next image with ``cv2.imread``.
+
+        :raises StopIteration: once every file has been returned.
+        :raises IOError: if the current file cannot be read as an image.
+        """
+        if self.idx == self.max_idx:
+            raise StopIteration
+        file_name = self.file_names[self.idx]
+        img = cv2.imread(file_name, cv2.IMREAD_COLOR)
+        # cv2.imread reports failure by returning None instead of raising, so
+        # check for None before touching ``img.size``.
+        if img is None or img.size == 0:
+            raise IOError('Image {} cannot be read'.format(file_name))
+        self.idx = self.idx + 1
+        return img
+
diff --git a/models/__init__.py → human_pose_estimator/models/__init__.py b/models/__init__.py → human_pose_estimator/models/__init__.py
diff --git a/models/with_mobilenet.py → ...n_pose_estimator/models/with_mobilenet.py b/models/with_mobilenet.py → ...n_pose_estimator/models/with_mobilenet.py
@@ -1,7 +1,7 @@
 import torch
 from torch import nn
 
-from modules.conv import conv, conv_dw, conv_dw_no_bn
+from human_pose_estimator.modules.conv import conv, conv_dw, conv_dw_no_bn
 
 
 class Cpm(nn.Module):

diff --git a/modules/__init__.py → human_pose_estimator/modules/__init__.py b/modules/__init__.py → human_pose_estimator/modules/__init__.py
diff --git a/modules/conv.py → human_pose_estimator/modules/conv.py b/modules/conv.py → human_pose_estimator/modules/conv.py
diff --git a/modules/get_parameters.py → ..._pose_estimator/modules/get_parameters.py b/modules/get_parameters.py → ..._pose_estimator/modules/get_parameters.py
diff --git a/modules/keypoints.py → human_pose_estimator/modules/keypoints.py b/modules/keypoints.py → human_pose_estimator/modules/keypoints.py
diff --git a/modules/load_state.py → human_pose_estimator/modules/load_state.py b/modules/load_state.py → human_pose_estimator/modules/load_state.py
diff --git a/modules/loss.py → human_pose_estimator/modules/loss.py b/modules/loss.py → human_pose_estimator/modules/loss.py
diff --git a/modules/one_euro_filter.py → ...pose_estimator/modules/one_euro_filter.py b/modules/one_euro_filter.py → ...pose_estimator/modules/one_euro_filter.py
diff --git a/modules/pose.py → human_pose_estimator/modules/pose.py b/modules/pose.py → human_pose_estimator/modules/pose.py
@@ -1,8 +1,8 @@
 import cv2
 import numpy as np
 
-from modules.keypoints import BODY_PARTS_KPT_IDS, BODY_PARTS_PAF_IDS
-from modules.one_euro_filter import OneEuroFilter
+from human_pose_estimator.modules.keypoints import BODY_PARTS_PAF_IDS, BODY_PARTS_KPT_IDS
+from human_pose_estimator.modules.one_euro_filter import OneEuroFilter
 
 
 class Pose: