In [1]:
import cv2
import time
import mediapipe
import numpy as np
from collections import deque
from filterpy.kalman import KalmanFilter

In [2]:
# Per-hand gesture state, keyed by the MediaPipe handedness label ('Left' / 'Right').

# True once a raised index finger has been held long enough to start a stroke.
gesture_locked = dict(Left=False, Right=False)
# Timestamp at which the index finger was first seen raised (0 = not raised).
gesture_start_time = dict(Left=0, Right=0)
# Timestamp at which the finger was first seen lowered (0 = debounce timer idle).
buffer_start_time = dict(Left=0, Right=0)
# Timestamp at which the current stroke started (0 = no stroke in progress).
start_drag_time = dict(Left=0, Right=0)
# True while the hand is drawing a stroke.
dragging = dict(Left=False, Right=False)
# Latest raw fingertip position in pixels.
drag_point = dict(Left=(0, 0), Right=(0, 0))
# Seconds the finger may stay lowered before the gesture is considered finished.
buffer_duration = dict(Left=0.25, Right=0.25)
# Result of the "index finger raised" test for the current frame.
is_index_finger_up = dict(Left=False, Right=False)
# Landmark 7 and landmark 8 of the hand (placeholders until the first detection).
index_finger_second = dict(Left=0, Right=0)
index_finger_tip = dict(Left=0, Right=0)
# Smoothed stroke points; each hand owns its own list.
trajectory = dict(Left=[], Right=[])
# Not used by the cells visible in this notebook.
square_queue = deque()

# Seconds the finger must stay raised before a stroke starts.
wait_time = 1.5
# Seconds of Kalman output discarded at the start of every stroke.
kalman_wait_time = 0.5
# Seconds the fitted rectangle stays on screen before it is cropped and saved.
wait_box = 2

# Timestamp at which the last rectangle was fitted.
rect_draw_time = dict(Left=0, Right=0)
# Corner points of the last fitted rectangle (None = nothing pending).
last_drawn_box = dict(Left=None, Right=None)
# Seconds elapsed since the rectangle was fitted.
elapsed_time = dict(Left=0, Right=0)

In [3]:
def clear_hand_states(detected_hand='Both'):
    """Reset the per-hand gesture state of every hand that is NOT currently detected.

    Args:
        detected_hand: Label of the hand that is still visible ('Left' or
            'Right'); that hand's state is preserved and the other hand is
            reset. The default 'Both' resets both hands.
    """
    # Note the inversion: the argument names the hand to KEEP, not the one to clear.
    stale_hands = [
        side for side in ('Left', 'Right')
        if detected_hand == 'Both' or side != detected_hand
    ]

    # (state dictionary, value it is reset to)
    reset_table = (
        (gesture_locked, False),
        (gesture_start_time, 0),
        (buffer_start_time, 0),
        (dragging, False),
        (drag_point, (0, 0)),
        (buffer_duration, 0.25),
        (is_index_finger_up, False),
        (start_drag_time, 0),
        (rect_draw_time, 0),
        (last_drawn_box, None),
        (elapsed_time, 0),
    )

    for side in stale_hands:
        for state, default in reset_table:
            state[side] = default
        # Emptied in place so the list object itself is kept.
        trajectory[side].clear()

In [4]:
# One Kalman filter per hand, used to smooth the fingertip position while drawing.
# State vector is [x, y, vx, vy]; only the position (x, y) is measured.
kalman_filters = {
    'Left': KalmanFilter(dim_x=4, dim_z=2),
    'Right': KalmanFilter(dim_x=4, dim_z=2)
}

for key in kalman_filters:
    kalman_filters[key].x = np.array([0., 0., 0., 0.])
    # State transition matrix: position advances by one velocity step per predict().
    kalman_filters[key].F = np.array([[1, 0, 1, 0],
                                      [0, 1, 0, 1],
                                      [0, 0, 1, 0],
                                      [0, 0, 0, 1]])
    # Observation matrix: picks x and y out of the state.
    kalman_filters[key].H = np.array([[1, 0, 0, 0],
                                      [0, 1, 0, 0]])
    # Large initial covariance: the starting state is essentially unknown.
    kalman_filters[key].P *= 1000.
    # Measurement noise must be a (dim_z, dim_z) matrix. A bare scalar is
    # broadcast onto every entry of the innovation covariance, off-diagonals
    # included, which models the x and y noise as perfectly correlated.
    kalman_filters[key].R = np.eye(2) * 3.
    # Process noise.
    kalman_filters[key].Q = np.eye(4) * 0.01

def kalman_filter_point(hand_label, x, y):
    """Feed one measured point to a hand's Kalman filter and return the estimate.

    Args:
        hand_label: Key into `kalman_filters` selecting which filter to advance.
        x: Measured x coordinate.
        y: Measured y coordinate.

    Returns:
        Tuple of the first two entries of the filter state after the
        predict/update step, i.e. the smoothed (x, y).
    """
    hand_filter = kalman_filters[hand_label]

    # Advance the model one step, then correct it with the new measurement.
    hand_filter.predict()
    hand_filter.update([x, y])

    smoothed_x = hand_filter.x[0]
    smoothed_y = hand_filter.x[1]
    return smoothed_x, smoothed_y

def reset_kalman_filter(hand_label, x, y):
    """Re-initialise a hand's Kalman filter at a known starting position.

    Args:
        hand_label: Key into `kalman_filters` selecting which filter to reset.
        x: Starting x coordinate.
        y: Starting y coordinate.
    """
    kf = kalman_filters[hand_label]
    # Start at the given point with zero velocity.
    kf.x = np.array([x, y, 0., 0.])
    # Assign a fresh covariance instead of scaling the old one: `P *= 1000`
    # compounds over successive resets and keeps the previous stroke's
    # position/velocity cross-correlations, so it never truly resets.
    kf.P = np.eye(kf.x.shape[0]) * 1000.

In [5]:
# Shortcuts to the MediaPipe hand-tracking solution and its drawing helpers.
mp_hands = mediapipe.solutions.hands
mp_drawing = mediapipe.solutions.drawing_utils

# Hand tracker configured for a video stream with up to two hands.
# Original author's note: tracking a single hand is more stable.
hands = mp_hands.Hands(
    min_tracking_confidence=0.5,
    min_detection_confidence=0.5,
    max_num_hands=2,
    static_image_mode=False
)

# Begin with both hands' gesture state reset.
clear_hand_states()

In [6]:
# Fill colour for each of the 21 hand landmarks, keyed by landmark index.
_LANDMARK_COLORS = {
    0: (255, 255, 0),
    8: (255, 165, 0),
    **dict.fromkeys((1, 5, 9, 13, 17), (0, 0, 255)),
    **dict.fromkeys((2, 6, 10, 14, 18), (75, 0, 130)),
    **dict.fromkeys((3, 7, 11, 15, 19), (238, 130, 238)),
    **dict.fromkeys((4, 12, 16, 20), (0, 255, 255)),
}


def _is_pointing(hand_landmarks):
    """Return True when landmark 7 is above every other landmark and landmark 8 is above it."""
    second_joint = hand_landmarks.landmark[7]
    tip = hand_landmarks.landmark[8]
    # Image y grows downwards, so "above" means a smaller y.
    joint_is_highest = all(
        second_joint.y < hand_landmarks.landmark[i].y
        for i in range(21) if i not in (7, 8)
    )
    return joint_is_highest and tip.y < second_joint.y


def _save_box_crop(image, box, hand_label):
    """Write the axis-aligned bounds of `box`, clipped to `image`, to ../image/ as a JPEG."""
    x_min = max(0, int(box[:, 0].min()))
    y_min = max(0, int(box[:, 1].min()))
    x_max = min(image.shape[1], int(box[:, 0].max()))
    y_max = min(image.shape[0], int(box[:, 1].max()))

    # A box that lies outside the frame (or has no area) would give an empty
    # or wrapped-around slice; cv2.imwrite raises on an empty image.
    if x_max <= x_min or y_max <= y_min:
        return

    cropped_image = image[y_min:y_max, x_min:x_max]
    filename = f"../image/cropped_{hand_label}_{int(time.time())}.jpg"
    if not cv2.imwrite(filename, cropped_image):
        print(f"Could not save crop to {filename}")


def process_image(image):
    """Detect hands in one camera frame, update the gesture state and draw the overlay.

    A hand whose index finger stays raised for `wait_time` seconds starts a
    stroke. The Kalman-smoothed fingertip is recorded while the stroke lasts.
    When the finger is lowered, the minimum-area rectangle of the stroke is
    shown for `wait_box` seconds and then cropped out of the frame and saved.

    Args:
        image: BGR frame as returned by `cv2.VideoCapture.read`.

    Returns:
        The horizontally mirrored BGR frame with the overlay drawn on it.
    """
    start_time = time.time()
    height, width = image.shape[:2]

    # Mirror the frame. It stays BGR because cv2.imshow / cv2.imwrite expect
    # BGR; only MediaPipe receives an RGB copy.
    image = cv2.flip(image, 1)
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    if not results.multi_hand_landmarks:
        # No hand in view: forget all gesture state.
        clear_hand_states()
        return image

    handness_str = ''
    index_finger_tip_str = ''

    if len(results.multi_hand_landmarks) == 1:
        # Only one hand is visible: reset the other one so stale state cannot interfere.
        clear_hand_states(detected_hand=results.multi_handedness[0].classification[0].label)

    for hand_idx, hand_21 in enumerate(results.multi_hand_landmarks):
        mp_drawing.draw_landmarks(image, hand_21, mp_hands.HAND_CONNECTIONS)

        temp_handness = results.multi_handedness[hand_idx].classification[0].label
        handness_str += '{}:{}, '.format(hand_idx, temp_handness)

        # Wrist depth is the reference for every depth-scaled radius below.
        cz0 = hand_21.landmark[0].z
        index_finger_second[temp_handness] = hand_21.landmark[7]
        index_finger_tip[temp_handness] = hand_21.landmark[8]

        index_x = int(index_finger_tip[temp_handness].x * width)
        index_y = int(index_finger_tip[temp_handness].y * height)

        is_index_finger_up[temp_handness] = _is_pointing(hand_21)
        now = time.time()

        if is_index_finger_up[temp_handness]:
            # Restart the debounce timer on every raised-finger frame, also
            # while a stroke is locked. Otherwise one missed frame arms the
            # timer for good and a later missed frame ends the stroke at once.
            buffer_start_time[temp_handness] = 0
            if not gesture_locked[temp_handness]:
                if gesture_start_time[temp_handness] == 0:
                    # First frame with the finger raised.
                    gesture_start_time[temp_handness] = now
                elif now - gesture_start_time[temp_handness] > wait_time:
                    # Held long enough: start a stroke.
                    dragging[temp_handness] = True
                    gesture_locked[temp_handness] = True
                    drag_point[temp_handness] = (index_x, index_y)
        else:
            if buffer_start_time[temp_handness] == 0:
                buffer_start_time[temp_handness] = now
            elif now - buffer_start_time[temp_handness] > buffer_duration[temp_handness]:
                # Lowered for longer than the grace period: the gesture is over.
                gesture_start_time[temp_handness] = 0
                gesture_locked[temp_handness] = False
                dragging[temp_handness] = False

        if dragging[temp_handness]:
            if start_drag_time[temp_handness] == 0:
                # New stroke: restart the filter at the current fingertip.
                start_drag_time[temp_handness] = now
                reset_kalman_filter(temp_handness, index_x, index_y)

            smooth_x, smooth_y = kalman_filter_point(temp_handness, index_x, index_y)
            drag_point[temp_handness] = (index_x, index_y)

            # Marker at the raw fingertip; radius scales with depth relative to the wrist.
            index_finger_radius = max(int(10 * (1 + (cz0 - index_finger_tip[temp_handness].z) * 5)), 0)
            cv2.circle(image, drag_point[temp_handness], index_finger_radius, (0, 0, 255), -1)

            # Discard the filter output during its warm-up period.
            if now - start_drag_time[temp_handness] > kalman_wait_time:
                trajectory[temp_handness].append((smooth_x, smooth_y))
        else:
            if len(trajectory[temp_handness]) > 4:
                # Stroke finished with more than four points: fit the minimum-area rectangle.
                contour = np.array(trajectory[temp_handness], dtype=np.int32)
                # np.int0 was removed in NumPy 2.0; int32 is what OpenCV draws with.
                box = cv2.boxPoints(cv2.minAreaRect(contour)).astype(np.int32)
                rect_draw_time[temp_handness] = now
                last_drawn_box[temp_handness] = box

            start_drag_time[temp_handness] = 0
            trajectory[temp_handness].clear()

        # Draw the stroke recorded so far as a polyline.
        stroke = trajectory[temp_handness]
        for prev_pt, next_pt in zip(stroke, stroke[1:]):
            pt1 = (int(prev_pt[0]), int(prev_pt[1]))
            pt2 = (int(next_pt[0]), int(next_pt[1]))
            cv2.line(image, pt1, pt2, (0, 0, 255), 2)

        if last_drawn_box[temp_handness] is not None:
            elapsed_time[temp_handness] = time.time() - rect_draw_time[temp_handness]

            if elapsed_time[temp_handness] < wait_box:
                # Keep the rectangle visible for a while so it can be seen.
                cv2.drawContours(image, [last_drawn_box[temp_handness]], 0, (0, 255, 0), 2)
            else:
                # Crop only after the delay, to give the hand time to leave the region.
                _save_box_crop(image, last_drawn_box[temp_handness], temp_handness)
                last_drawn_box[temp_handness] = None

        # Depth-scaled, colour-coded dot on every landmark.
        for i in range(21):
            cx = int(hand_21.landmark[i].x * width)
            cy = int(hand_21.landmark[i].y * height)
            depth_z = cz0 - hand_21.landmark[i].z
            radius = max(int(6 * (1 + depth_z * 5)), 0)

            image = cv2.circle(image, (cx, cy), radius, _LANDMARK_COLORS[i], thickness=-1)
            if i == 8:
                index_finger_tip_str += '{}:{:.2f}, '.format(hand_idx, depth_z)

    # Text overlay: detected hands, fingertip depth relative to the wrist, and FPS.
    scaler = 1
    image = cv2.putText(image, handness_str, (25 * scaler, 100 * scaler), cv2.FONT_HERSHEY_SIMPLEX, 1.25 * scaler, (0, 0, 255), 2)
    image = cv2.putText(image, index_finger_tip_str, (25 * scaler, 150 * scaler), cv2.FONT_HERSHEY_SIMPLEX, 1.25 * scaler, (0, 0, 255), 2)

    spend_time = time.time() - start_time
    FPS = 1.0 / spend_time if spend_time > 0 else 0
    image = cv2.putText(image, 'FPS ' + str(int(FPS)), (25 * scaler, 50 * scaler), cv2.FONT_HERSHEY_SIMPLEX, 1.25 * scaler, (0, 0, 255), 2)

    return image

In [7]:
# Open camera 0 directly. Constructing the capture on device 1 and then calling
# open(0) ends up on device 0 anyway, after needlessly touching device 1.
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    # Without this the loop below is skipped with no indication of why.
    print("Camera Error")

try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Camera Error")
            break

        frame = process_image(frame)
        cv2.imshow('Video', frame)

        # Press 'q' in the video window to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if processing raised or
    # the kernel was interrupted; otherwise the device stays locked.
    cap.release()
    cv2.destroyAllWindows()