In [1]:
# Environment: Apple M1 Max, macOS 15.0, Python 3.10
!pip install imageio
!pip install ipython
!pip install matplotlib
!pip install opencv-python
!pip install tensorflow
!pip install tensorflow-docs
!pip install tensorflow-hub
!pip install tensorflow-metal



In [2]:
import os
import time

import cv2
# Some modules to display an animation using imageio
import imageio
import matplotlib.patches as patches
import numpy as np
import tensorflow as tf
from IPython.display import HTML
# Import matplotlib libraries
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
from tensorflow.python.framework.ops import EagerTensor
from tensorflow_docs.vis import embed

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
# Load the TFLite model using the TFLite Interpreter
model_path = "./posenet_mobilenet_v1_100_257x257_multi_kpt_stripped.tflite"
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [5]:
# Get input and output tensor details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [6]:
input_details

[{'name': 'sub_2',
  'index': 93,
  'shape': array([  1, 257, 257,   3], dtype=int32),
  'shape_signature': array([  1, 257, 257,   3], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [7]:
output_details

[{'name': 'MobilenetV1/heatmap_2/BiasAdd',
  'index': 87,
  'shape': array([ 1,  9,  9, 17], dtype=int32),
  'shape_signature': array([ 1,  9,  9, 17], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}},
 {'name': 'MobilenetV1/offset_2/BiasAdd',
  'index': 90,
  'shape': array([ 1,  9,  9, 34], dtype=int32),
  'shape_signature': array([ 1,  9,  9, 34], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}},
 {'name': 'MobilenetV1/displacement_fwd_2/BiasAdd',
  'index': 84,
  'shape': array([ 1,  9,  9, 32], dtype=int32),
  'shape_signature': array([ 1,  9,  9, 32], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 

In [8]:
input_size = input_details[0]['shape'][1]

In [9]:
#@title Helper functions for visualization

# Dictionary that maps from joint names to keypoint indices.
KEYPOINT_DICT = {
    'nose': 0,
    'left_eye': 1,
    'right_eye': 2,
    'left_ear': 3,
    'right_ear': 4,
    'left_shoulder': 5,
    'right_shoulder': 6,
    'left_elbow': 7,
    'right_elbow': 8,
    'left_wrist': 9,
    'right_wrist': 10,
    'left_hip': 11,
    'right_hip': 12,
    'left_knee': 13,
    'right_knee': 14,
    'left_ankle': 15,
    'right_ankle': 16
}

# Maps bones to a matplotlib color name.
KEYPOINT_EDGE_INDS_TO_COLOR = {
    (0, 1): 'm',
    (0, 2): 'c',
    (1, 3): 'm',
    (2, 4): 'c',
    (0, 5): 'm',
    (0, 6): 'c',
    (5, 7): 'm',
    (7, 9): 'm',
    (6, 8): 'c',
    (8, 10): 'c',
    (5, 6): 'y',
    (5, 11): 'm',
    (6, 12): 'c',
    (11, 12): 'y',
    (11, 13): 'm',
    (13, 15): 'm',
    (12, 14): 'c',
    (14, 16): 'c'
}


def _keypoints_and_edges_for_display(keypoints_with_scores,
                                     height,
                                     width,
                                     keypoint_threshold=0.11):
    """Returns high confidence keypoints and edges for visualization.

    Args:
      keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] representing
        the keypoint coordinates and scores returned from the MoveNet model.
      height: height of the image in pixels.
      width: width of the image in pixels.
      keypoint_threshold: minimum confidence score for a keypoint to be
        visualized.

    Returns:
      A (keypoints_xy, edges_xy, edge_colors) containing:
        * the coordinates of all keypoints of all detected entities;
        * the coordinates of all skeleton edges of all detected entities;
        * the colors in which the edges should be plotted.
    """
    keypoints_all = []
    keypoint_edges_all = []
    edge_colors = []
    num_instances, _, _, _ = keypoints_with_scores.shape  # No. of entities
    for idx in range(num_instances):
        # Retrieve values from the output
        kpts_x = keypoints_with_scores[0, idx, :, 1]
        kpts_y = keypoints_with_scores[0, idx, :, 0]
        kpts_scores = keypoints_with_scores[0, idx, :, 2]

        kpts_absolute_xy = np.stack([width * np.array(kpts_x), height * np.array(kpts_y)], axis=-1)
        kpts_above_thresh_absolute = kpts_absolute_xy[kpts_scores > keypoint_threshold, :]
        keypoints_all.append(kpts_above_thresh_absolute)

        # Pair up keypoints to form edges
        for edge_pair, color in KEYPOINT_EDGE_INDS_TO_COLOR.items():
            if (kpts_scores[edge_pair[0]] > keypoint_threshold and
                    kpts_scores[edge_pair[1]] > keypoint_threshold):
                x_start = kpts_absolute_xy[edge_pair[0], 0]
                y_start = kpts_absolute_xy[edge_pair[0], 1]
                x_end = kpts_absolute_xy[edge_pair[1], 0]
                y_end = kpts_absolute_xy[edge_pair[1], 1]
                line_seg = np.array([[x_start, y_start], [x_end, y_end]])
                keypoint_edges_all.append(line_seg)
                edge_colors.append(color)

    if keypoints_all:
        keypoints_xy = np.concatenate(keypoints_all, axis=0)
    else:
        keypoints_xy = np.zeros((0, 17, 2))  # Empty array with shape

    if keypoint_edges_all:
        edges_xy = np.stack(keypoint_edges_all, axis=0)
    else:
        edges_xy = np.zeros((0, 2, 2))  # Empty array with shape

    return keypoints_xy, edges_xy, edge_colors


def draw_prediction_on_image(
        image, keypoints_with_scores, crop_region=None, close_figure=False,
        output_image_height=None):
    """Draws the keypoint predictions on image.

    Args:
      image: A numpy array with shape [height, width, channel] representing the
        pixel values of the input image.
      keypoints_with_scores: A numpy array with shape [1, 1, 17, 3] representing
        the keypoint coordinates and scores returned from the MoveNet model.
      crop_region: A dictionary that defines the coordinates of the bounding box
        of the crop region in normalized coordinates (see the init_crop_region
        function below for more detail). If provided, this function will also
        draw the bounding box on the image.
      output_image_height: An integer indicating the height of the output image.
        Note that the image aspect ratio will be the same as the input image.

    Returns:
      A numpy array with shape [out_height, out_width, channel] representing the
      image overlaid with keypoint predictions.
    """
    height, width, channel = image.shape
    aspect_ratio = float(width) / height
    fig, ax = plt.subplots(figsize=(12 * aspect_ratio, 12))

    # To remove the huge white borders
    fig.tight_layout(pad=0)
    ax.margins(0)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    plt.axis('off')

    im = ax.imshow(image)
    line_segments = LineCollection([], linewidths=4, linestyle='solid')
    ax.add_collection(line_segments)
    # Turn off tick labels
    scat = ax.scatter([], [], s=60, color='#FF1493', zorder=3)

    (keypoint_locs, keypoint_edges,
     edge_colors) = _keypoints_and_edges_for_display(
        keypoints_with_scores, height, width)

    line_segments.set_segments(keypoint_edges)
    line_segments.set_color(edge_colors)
    if keypoint_edges.shape[0]:
        line_segments.set_segments(keypoint_edges)
        line_segments.set_color(edge_colors)
    if keypoint_locs.shape[0]:
        scat.set_offsets(keypoint_locs)

    if crop_region is not None:
        xmin = max(crop_region['x_min'] * width, 0.0)
        ymin = max(crop_region['y_min'] * height, 0.0)
        rec_width = min(crop_region['x_max'], 0.99) * width - xmin
        rec_height = min(crop_region['y_max'], 0.99) * height - ymin
        rect = patches.Rectangle(
            (xmin, ymin), rec_width, rec_height,
            linewidth=1, edgecolor='b', facecolor='none')
        ax.add_patch(rect)

    fig.canvas.draw()
    image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    image_from_plot = image_from_plot.reshape(
        fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)
    if output_image_height is not None:
        output_image_width = int(output_image_height / height * width)
        image_from_plot = cv2.resize(
            image_from_plot, dsize=(output_image_width, output_image_height),
            interpolation=cv2.INTER_CUBIC)
    return image_from_plot


def to_gif(images, duration):
    """Converts image sequence (4D numpy array) to gif."""
    imageio.mimsave('./animation.gif', images, duration=duration)
    return embed.embed_file('./animation.gif')


def progress(value, max=100):
    return HTML("""
      <progress
          value='{value}'
          max='{max}',
          style='width: 100%'
      >
          {value}
      </progress>
  """.format(value=value, max=max))

### Load Dataset

In [10]:
dataset_root_dir = "./dataset"
images = []
for dirpath, dirnames, filenames in os.walk(dataset_root_dir):
    dirnames.sort()
    filenames.sort()

    for filename in filenames:
        filepath = os.path.join(dirpath, filename)
        file_extension = os.path.splitext(filepath)[1].lower()

        image = tf.io.read_file(filepath)
        if file_extension in ('.jpg', '.jpeg'):
            image = tf.image.decode_jpeg(image)
        elif file_extension == '.png':
            image = tf.image.decode_png(image)
        else:
            continue

        # Ensure image is 3-channel
        image = image[..., :3]
        images.append(image)

2024-10-12 23:46:15.089445: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-10-12 23:46:15.089468: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-10-12 23:46:15.089474: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-10-12 23:46:15.089487: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-12 23:46:15.089497: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Run Inference

In [11]:
def run_inference(image: EagerTensor):
    # Resize and pad the image to keep the aspect ratio and fit the expected size.
    input_image = tf.expand_dims(image, axis=0)
    input_image = tf.image.resize_with_pad(input_image, input_size, input_size)
    input_image = input_image.numpy() / 255

    # Run model inference.
    interpreter.set_tensor(input_details[0]['index'], input_image)
    interpreter.invoke()

    heatmaps = interpreter.get_tensor(output_details[0]['index'])  # (1, 9, 9, 17)
    offsets = interpreter.get_tensor(output_details[1]['index'])  # (1, 9, 9, 34)
    forward_displacements = interpreter.get_tensor(output_details[2]['index'])  # (1, 9, 9, 32)
    backward_displacements = interpreter.get_tensor(output_details[3]['index'])  # (1, 9, 9, 32)

    return heatmaps, offsets, forward_displacements, backward_displacements

In [12]:
def raw_output_to_coords(heatmaps, offsets, forward_displacements, backward_displacements):
    # Reference: https://raw.githubusercontent.com/joonb14/TFLitePoseEstimation/refs/heads/main/pose%20estimation.ipynb

    def sigmoid(x):
        return 1 / (1 + np.exp(x))

    _, height, width, num_keypoints = heatmaps.shape

    keypoint_positions = []
    for keypoint in range(num_keypoints):
        # Get the heatmap for the current keypoint
        heatmap = heatmaps[0, :, :, keypoint]

        # Find the index of the maximum value in the heatmap
        max_index = np.unravel_index(np.argmax(heatmap), heatmap.shape)

        # Append the row and column of the max value
        keypoint_positions.append(list(max_index))

    confidence_scores = []
    y_coords = []
    x_coords = []
    for idx, (position_y, position_x) in enumerate(keypoint_positions):
        # Normalize the coordinates and add the offset
        y_normalized = position_y / (height - 1) * input_size + offsets[0, position_y, position_x, idx]
        x_normalized = position_x / (width - 1) * input_size + offsets[0, position_y, position_x, idx + num_keypoints]

        y_coords.append(y_normalized)
        x_coords.append(x_normalized)

        # Calculate and append the confidence score using the sigmoid of the heatmap value
        confidence_score = sigmoid(heatmaps[0, position_y, position_x, idx])
        confidence_scores.append(confidence_score)

    y_relative_coords = np.array(y_coords) / input_size
    x_relative_coords = np.array(x_coords) / input_size

    return np.stack([y_relative_coords, x_relative_coords, confidence_scores], axis=1)

In [13]:
start_time = time.time()
results = [raw_output_to_coords(*run_inference(image)) for image in images]
end_time = time.time()

print("Total time spent:", end_time - start_time)

Total time spent: 32.843241930007935


In [14]:
def show_and_save(image_idx):
    keypoints_with_scores = np.array([[results[image_idx]]])
    keypoints_with_scores[..., 2] = 1

    # Visualize the predictions with image.
    display_image = tf.expand_dims(images[image_idx], axis=0)  # uint8
    image_shape = np.array(images[image_idx].shape)[:2]
    longest_side = max(image_shape)
    display_image = tf.cast(tf.image.resize_with_pad(
        display_image, longest_side, longest_side), dtype=tf.int32)
    output_overlay = draw_prediction_on_image(
        np.squeeze(display_image.numpy(), axis=0), keypoints_with_scores, output_image_height=longest_side)

    plt.figure(figsize=(5, 5))
    plt.imshow(output_overlay)
    _ = plt.axis('off')

    # Crop the image
    center = longest_side // 2
    y_min, x_min = center - image_shape // 2
    y_max, x_max = center + image_shape // 2
    plt.xlim([x_min, x_max])
    plt.ylim([y_max, y_min])

    # Save the result in png
    plt.savefig(f"./output/{image_idx:08d}.png", dpi=180, bbox_inches='tight', pad_inches=0)

In [15]:
os.makedirs("./output", exist_ok=True)
for i in range(len(images)):
    show_and_save(i)
plt.close('all')

  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
  image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)