**Tried Using Green Boxes**

Perfect for recognition but fails in identification with position

In [5]:
import cv2
import numpy as np

# Initialize variables for face tracking
face_trackers = {}  # Dictionary to track each face and its identifier
next_face_id = 0    # ID counter for assigning IDs to new faces

# Load the pre-trained Haar Cascade classifier for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Function to assign face ID or retrieve existing ID based on face location
def assign_or_return_face_id(x, y, w, h):
    global next_face_id, face_trackers
    
    # Calculate the center of the detected face
    face_center_x = x + w // 2
    face_center_y = y + h // 2
    
    # Initialize variables for face matching
    matched_fid = None
    min_distance = float('inf')
    
    # Iterate through existing trackers to find matching face or assign new ID
    for fid, (prev_x, prev_y, prev_w, prev_h) in face_trackers.items():
        # Calculate Euclidean distance between face centers
        distance = np.sqrt((face_center_x - (prev_x + prev_w // 2))**2 + (face_center_y - (prev_y + prev_h // 2))**2)
        
        # If the distance is small enough, consider it the same face
        if distance < 50:  # Adjust this threshold based on your camera setup
            matched_fid = fid
            break
    
    # If no matching face is found, assign a new ID
    if matched_fid is None:
        face_trackers[next_face_id] = (x, y, w, h)
        matched_fid = next_face_id
        next_face_id += 1
    
    return matched_fid

# Open video capture from default camera (usually index 0)
cap = cv2.VideoCapture(0)

while True:
    # Capture frame-by-frame
    ret, frame = cap.read()

    if not ret:
        print("Failed to capture frame from camera. Check camera connection.")
        break

    # Convert frame to grayscale for face detection
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces in the grayscale frame
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    # Iterate over detected faces
    for (x, y, w, h) in faces:
        # Assign or retrieve face ID based on face location
        fid = assign_or_return_face_id(x, y, w, h)

        # Draw a green rectangle around the face
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # Display face ID on the frame
        cv2.putText(frame, f'Person {fid}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Display the frame with rectangles and face IDs
    cv2.imshow('Face Tracking', frame)

    # Exit the loop if the 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the camera and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()


**Pose detection using the YOLO model**

 For detecting keypoints and MediaPipe for calculating angles and poses. This will solve the previous problem of unique identification

In [26]:
from ultralytics import YOLO as yolo
import cv2 as cv
import numpy as np

## Function to detect the difference between two frames.
def diff(prev, frame):
    prev = cv.cvtColor(prev, cv.COLOR_BGR2GRAY)
    frame = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    # Compute the Mean Squared Error (MSE)
    mse = ((prev - frame) ** 2).mean()
    return mse

## Function for angle calculation
def angle_calc(p1: list, p2: list, p3: list):
    if (p1.all() == 0) or (p2.all() == 0) or (p3.all() == 0):
        return -1
    v1 = p1 - p2
    v2 = p3 - p2
    cos_theta = (np.dot(v1, v2)) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    cos_theta = np.clip(cos_theta, -1.0, 1.0)
    # print("p1",p1,"p2",p2,"p3",p3,"v1",v1,"v2",v2,"Cos_theta",cos_theta)
    theta = np.arccos(cos_theta)  # angle in radians
    angle = abs(theta * 180.0 / np.pi)  # angle in degree
    if angle > 180:
        angle = 360 - angle
    return int(angle)

## Calculating 8 important angles for pose detection.
### This function is specific to tis code application.
def joint_angles(joint: list):
    joints = np.zeros((17, 2))
    joints[: joint.shape[0], : joint.shape[1]] = joint
    out = []
    out.append(angle_calc(joints[9], joints[7], joints[5]))  ### Angle 1 on right side
    out.append(angle_calc(joints[10], joints[8], joints[6]))  ### Angle 2 on left
    out.append(angle_calc(joints[7], joints[5], joints[11]))  ### Angle 3 on right
    out.append(angle_calc(joints[8], joints[6], joints[12]))  ### Angle 4 on left
    out.append(angle_calc(joints[5], joints[11], joints[13]))  ### Angle 5 on right
    out.append(angle_calc(joints[6], joints[12], joints[14]))  ### Angle 6 on left
    out.append(angle_calc(joints[11], joints[13], joints[15]))  ### Angle 7 on right
    out.append(angle_calc(joints[12], joints[14], joints[16]))  ### Angle 8 on left
    return out

## Special Cases for this code to determine the pose names.
def pose(flexion_angles: list):
    l = len(flexion_angles)
    up = flexion_angles[: int(l / 2)]
    down = flexion_angles[int(l / 2) :]
    final = []
    text = 0

    def is_between(value, min_val, max_val):
        return all(min_val[i] <= value[i] <= max_val[i] for i in range(len(value)))

    match up:  ## Hands Up condition
        case _ if is_between(up, [0, 0, 160, 160], [0, 0, 180, 180]):
            text = "Both Hands Up"
        case _ if is_between(up, [0, 0, 160, 0], [0, 0, 180, 0]):
            text = "Right Hand up"
        case _ if is_between(up, [0, 0, 0, 160], [0, 0, 0, 180]):
            text = "Left Hand Up"

    if text != 0:
        final.append(text)
        text = 0

    match up:  ## Hands Raised condition
        case _ if is_between(up, [80, 80, 80, 80], [100, 100, 100, 100]):
            text = "Both Hands Raised Up"
        case _ if is_between(up, [80, 0, 80, 0], [100, 0, 100, 0]):
            text = "Right Hand Raised up"
        case _ if is_between(up, [0, 80, 0, 80], [0, 100, 0, 100]):
            text = "Left Hand Raised Up"

    if text != 0:
        final.append(text)
        text = 0

    match up:  ## Hands Horizontal condition
        case _ if is_between(up, [160, 160, 80, 80], [180, 180, 100, 100]):
            text = "Both hands are horizontal"
        case _ if is_between(up, [160, 160, 0, 80], [180, 180, 20, 100]):
            text = "Right hand is horizontal"
        case _ if is_between(up, [160, 160, 80, 0], [180, 180, 100, 20]):
            text = "Left hand is horizontal"

    if text != 0:
        final.append(text)
        text = 0

    match up:  ## Hands Down condition
        case _ if is_between(up, [130, 130, 0, 0], [180, 180, 30, 30]):
            text = "Both Hands Down"
        case _ if is_between(up, [0, 130, 0, 0], [0, 180, 30, 30]):
            text = "Left Hand Down"
        case _ if is_between(up, [130, 0, 0, 0], [180, 30, 30, 30]):
            text = "Right Hand Down"

    if text != 0:
        final.append(text)
        text = 0

    match down:  ## Legs position condition
        case _ if is_between(down, [80, 80, -1, -1], [100, 100, 180, 180]):
            text = "Sitting Down"
        case _ if is_between(down, [160, 160, -1, -1], [180, 180, 180, 180]):
            text = "Standing pose"
        case _ if is_between(down, [80, 160, -1, 160], [100, 180, 180, 180]):
            text = "Standing on left leg"
        case _ if is_between(down, [160, 80, 160, -1], [180, 100, 180, 180]):
            text = "Standing on Right leg"

    if text != 0:
        final.append(text)
        text = 0
    return final

# List of common colors in OpenCV (BGR format)
colors = [
    (0, 0, 255),      # Red
    (0, 255, 0),      # Green
    # (255, 0, 0),      # Blue
    (0, 255, 255),    # Yellow
    (255, 255, 0),    # Cyan
    (255, 0, 255),    # Magenta
    (128, 128, 128),  # Gray
    (50, 50, 50),     # Dark Gray
    (200, 200, 200),  # Light Gray
    (0, 0, 128),      # Maroon
    (0, 128, 128),    # Olive
    (128, 0, 128),    # Purple
    (128, 128, 0),    # Teal
    (128, 0, 0),      # Navy
    (0, 165, 255),    # Orange
    (19, 69, 139),    # Brown
    (203, 192, 255),  # Pink
    (230, 216, 173)   # Light Blue
]

"""
Main function starts here to determine the Pose of the subject.
"""

model = yolo("yolov8n-pose.pt", task="pose")  ## defining the model

# cap = cv.VideoCapture("test_video3.mp4")  ## to use a video from the device
cap = cv.VideoCapture(0)  ## To capture from the webcam

# Check if camera opened successfully
if cap.isOpened() == False:
    print("Error opening video file")

ret, prev = cap.read()

thresh = 5
while cap.isOpened():
    ret, frame = cap.read()
    if ret:
        if diff(prev, frame) > thresh:
            print(diff(prev, frame))
            results = model(source=frame)
            frame1 = results[0].plot()
            # thresh += 1
        else:
            frame1 = results[0].plot(
                img=frame
            )  ## to print the annotations on the original image without feeding it into the model.
            print(diff(prev, frame))
            # thresh -= 0.5
        prev = frame
        joints = np.array(results[0].keypoints.xy).astype(
            int
        )  # saving the coordinates in joints variable as int

        img_text = np.zeros((frame1.shape[0], frame1.shape[1] + 200, frame1.shape[2]), dtype=np.uint8)
        img_text[: frame1.shape[0], : frame1.shape[1], : frame1.shape[2]] = frame1
        cv.putText(
            img_text,
            text="Pose:",
            org=(frame1.shape[1], 20),
            fontFace=cv.FONT_HERSHEY_SIMPLEX,
            fontScale=0.5,
            color=(255, 255, 255),
            thickness=1,
            lineType=cv.LINE_AA,
        )
        var = 1
        for j in range(len(joints)): ### This loop ensures to detect pose for more than one persons.
            out = joint_angles(joints[j])
            final = pose(out) ### text variable containg the pose names.
            cv.putText(
            img_text,
            text="Person "+str(j+1)+" :",
            org=(frame1.shape[1], (var+1)*20),
            fontFace=cv.FONT_HERSHEY_SIMPLEX,
            fontScale=0.5,
            color=colors[var],
            thickness=1,
            lineType=cv.LINE_AA,
        )
            var += 1
            for i in range(len(final)):
                cv.putText(
                    img_text,
                    text=final[i],
                    org=(frame1.shape[1], 20 * (var + 1)),
                    fontFace=cv.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.5,
                    color=colors[i+j],
                    thickness=1,
                    lineType=cv.LINE_AA,
                )
                var += 1
        cv.imshow("Frame", img_text)
        # Break the loop on 'q' key press
        if cv.waitKey(20) & 0xFF == ord("q"):
            break
    else:
        break
cap.release()
cv.destroyAllWindows()

16.093291015625

0: 480x640 1 person, 327.9ms
Speed: 15.7ms preprocess, 327.9ms inference, 9.2ms postprocess per image at shape (1, 3, 480, 640)
28.1089453125

0: 480x640 1 person, 310.6ms
Speed: 0.0ms preprocess, 310.6ms inference, 5.7ms postprocess per image at shape (1, 3, 480, 640)
32.376842447916665

0: 480x640 1 person, 376.9ms
Speed: 7.3ms preprocess, 376.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
30.217418619791665

0: 480x640 1 person, 242.5ms
Speed: 7.1ms preprocess, 242.5ms inference, 14.1ms postprocess per image at shape (1, 3, 480, 640)
27.228860677083333

0: 480x640 1 person, 219.8ms
Speed: 2.5ms preprocess, 219.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
27.219895833333332

0: 480x640 1 person, 231.0ms
Speed: 0.0ms preprocess, 231.0ms inference, 16.2ms postprocess per image at shape (1, 3, 480, 640)
16.355755208333335

0: 480x640 1 person, 227.7ms
Speed: 5.0ms preprocess, 227.7ms inference, 4.1ms postprocess per image at

**Now we use it for two cameras for person detection and identification**

In [27]:
from ultralytics import YOLO as yolo
import cv2 as cv
import numpy as np
import time

# Function to detect the difference between two frames.
def diff(prev, frame):
    prev = cv.cvtColor(prev, cv.COLOR_BGR2GRAY)
    frame = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    mse = ((prev - frame) ** 2).mean()
    return mse

# Function for angle calculation
def angle_calc(p1: list, p2: list, p3: list):
    if (p1.all() == 0) or (p2.all() == 0) or (p3.all() == 0):
        return -1
    v1 = p1 - p2
    v2 = p3 - p2
    cos_theta = (np.dot(v1, v2)) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    cos_theta = np.clip(cos_theta, -1.0, 1.0)
    theta = np.arccos(cos_theta)
    angle = abs(theta * 180.0 / np.pi)
    if angle > 180:
        angle = 360 - angle
    return int(angle)

# Calculating 8 important angles for pose detection.
def joint_angles(joint: list):
    joints = np.zeros((17, 2))
    joints[: joint.shape[0], : joint.shape[1]] = joint
    out = []
    out.append(angle_calc(joints[9], joints[7], joints[5]))  # Angle 1 on right side
    out.append(angle_calc(joints[10], joints[8], joints[6]))  # Angle 2 on left
    out.append(angle_calc(joints[7], joints[5], joints[11]))  # Angle 3 on right
    out.append(angle_calc(joints[8], joints[6], joints[12]))  # Angle 4 on left
    out.append(angle_calc(joints[5], joints[11], joints[13]))  # Angle 5 on right
    out.append(angle_calc(joints[6], joints[12], joints[14]))  # Angle 6 on left
    out.append(angle_calc(joints[11], joints[13], joints[15]))  # Angle 7 on right
    out.append(angle_calc(joints[12], joints[14], joints[16]))  # Angle 8 on left
    return out

# Special Cases to determine the pose names.
def pose(flexion_angles: list):
    l = len(flexion_angles)
    up = flexion_angles[: int(l / 2)]
    down = flexion_angles[int(l / 2) :]
    final = []
    text = 0

    def is_between(value, min_val, max_val):
        return all(min_val[i] <= value[i] <= max_val[i] for i in range(len(value)))

    match up:
        case _ if is_between(up, [0, 0, 160, 160], [0, 0, 180, 180]):
            text = "Both Hands Up"
        case _ if is_between(up, [0, 0, 160, 0], [0, 0, 180, 0]):
            text = "Right Hand up"
        case _ if is_between(up, [0, 0, 0, 160], [0, 0, 0, 180]):
            text = "Left Hand Up"

    if text != 0:
        final.append(text)
        text = 0

    match up:
        case _ if is_between(up, [80, 80, 80, 80], [100, 100, 100, 100]):
            text = "Both Hands Raised Up"
        case _ if is_between(up, [80, 0, 80, 0], [100, 0, 100, 0]):
            text = "Right Hand Raised up"
        case _ if is_between(up, [0, 80, 0, 80], [0, 100, 0, 100]):
            text = "Left Hand Raised Up"

    if text != 0:
        final.append(text)
        text = 0

    match up:
        case _ if is_between(up, [160, 160, 80, 80], [180, 180, 100, 100]):
            text = "Both hands are horizontal"
        case _ if is_between(up, [160, 160, 0, 80], [180, 180, 20, 100]):
            text = "Right hand is horizontal"
        case _ if is_between(up, [160, 160, 80, 0], [180, 180, 100, 20]):
            text = "Left hand is horizontal"

    if text != 0:
        final.append(text)
        text = 0

    match up:
        case _ if is_between(up, [130, 130, 0, 0], [180, 180, 30, 30]):
            text = "Both Hands Down"
        case _ if is_between(up, [0, 130, 0, 0], [0, 180, 30, 30]):
            text = "Left Hand Down"
        case _ if is_between(up, [130, 0, 0, 0], [180, 30, 30, 30]):
            text = "Right Hand Down"

    if text != 0:
        final.append(text)
        text = 0

    match down:
        case _ if is_between(down, [80, 80, -1, -1], [100, 100, 180, 180]):
            text = "Sitting Down"
        case _ if is_between(down, [160, 160, -1, -1], [180, 180, 180, 180]):
            text = "Standing pose"
        case _ if is_between(down, [80, 160, -1, 160], [100, 180, 180, 180]):
            text = "Standing on left leg"
        case _ if is_between(down, [160, 80, 160, -1], [180, 100, 180, 180]):
            text = "Standing on Right leg"

    if text != 0:
        final.append(text)
        text = 0
    return final

# List of common colors in OpenCV (BGR format)
colors = [
    (0, 0, 255),      # Red
    (0, 255, 0),      # Green
    (0, 255, 255),    # Yellow
    (255, 255, 0),    # Cyan
    (255, 0, 255),    # Magenta
    (128, 128, 128),  # Gray
    (50, 50, 50),     # Dark Gray
    (200, 200, 200),  # Light Gray
    (0, 0, 128),      # Maroon
    (0, 128, 128),    # Olive
    (128, 0, 128),    # Purple
    (128, 128, 0),    # Teal
    (128, 0, 0),      # Navy
    (0, 165, 255),    # Orange
    (19, 69, 139),    # Brown
    (203, 192, 255),  # Pink
    (230, 216, 173)   # Light Blue
]

# Main function starts here to determine the Pose of the subject.
model = yolo("yolov8n-pose.pt", task="pose")

cap = cv.VideoCapture(0)

# Check if camera opened successfully
if cap.isOpened() == False:
    print("Error opening video file")

ret, prev = cap.read()

thresh = 5
frame_count = 0
skip_frames = 2  # Process every 2nd frame

while cap.isOpened():
    ret, frame1 = cap.read()

    if ret == True:
        frame_count += 1

        # Skip frames for faster processing
        if frame_count % skip_frames != 0:
            continue

        output_frame1 = None
        output_frame2 = None

        results1 = model(frame1)
        output_frame1 = results1[0].plot(img=frame1)

        ret, frame2 = cap.read()

        if ret == True:
            results2 = model(frame2)
            output_frame2 = results2[0].plot(img=frame2)
            print(diff(prev, frame1), diff(prev, frame2))
            prev = frame2

            joints1 = np.array(results1[0].keypoints.xy).astype(int)
            joints2 = np.array(results2[0].keypoints.xy).astype(int)

            # Resize frames to fit better on the screen
            output_frame1 = cv.resize(output_frame1, (640, 420))
            output_frame2 = cv.resize(output_frame2, (640, 420))

            img_text1 = np.zeros((output_frame1.shape[0], output_frame1.shape[1] + 200, output_frame1.shape[2]), dtype=np.uint8)
            img_text2 = np.zeros((output_frame2.shape[0], output_frame2.shape[1] + 200, output_frame2.shape[2]), dtype=np.uint8)

            img_text1[: output_frame1.shape[0], : output_frame1.shape[1], : output_frame1.shape[2]] = output_frame1
            img_text2[: output_frame2.shape[0], : output_frame2.shape[1], : output_frame2.shape[2]] = output_frame2

            cv.putText(
                img_text1,
                text="Pose from Camera 1:",
                org=(output_frame1.shape[1], 20),
                fontFace=cv.FONT_HERSHEY_SIMPLEX,
                fontScale=0.5,
                color=(255, 255, 255),
                thickness=1,
                lineType=cv.LINE_AA,
            )
            cv.putText(
                img_text2,
                text="Pose from Camera 2:",
                org=(output_frame2.shape[1], 20),
                fontFace=cv.FONT_HERSHEY_SIMPLEX,
                fontScale=0.5,
                color=(255, 255, 255),
                thickness=1,
                lineType=cv.LINE_AA,
            )

            var = 1
            for j in range(len(joints1)):  ### This loop ensures to detect pose for more than one person.
                out = joint_angles(joints1[j])
                final = pose(out)  ### text variable containing the pose names.
                cv.putText(
                    img_text1,
                    text="Person " + str(j + 1) + " :",
                    org=(output_frame1.shape[1], (var + 1) * 20),
                    fontFace=cv.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.5,
                    color=colors[var],
                    thickness=1,
                    lineType=cv.LINE_AA,
                )
                var += 1
                for i in range(len(final)):
                    cv.putText(
                        img_text1,
                        text=final[i],
                        org=(output_frame1.shape[1], 20 * (var + 1)),
                        fontFace=cv.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.5,
                        color=colors[i + j],
                        thickness=1,
                        lineType=cv.LINE_AA,
                    )
                    var += 1

            var = 1
            for j in range(len(joints2)):  ### This loop ensures to detect pose for more than one person.
                out = joint_angles(joints2[j])
                final = pose(out)  ### text variable containing the pose names.
                cv.putText(
                    img_text2,
                    text="Person " + str(j + 1) + " :",
                    org=(output_frame2.shape[1], (var + 1) * 20),
                    fontFace=cv.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.5,
                    color=colors[var],
                    thickness=1,
                    lineType=cv.LINE_AA,
                )
                var += 1
                for i in range(len(final)):
                    cv.putText(
                        img_text2,
                        text=final[i],
                        org=(output_frame2.shape[1], 20 * (var + 1)),
                        fontFace=cv.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.5,
                        color=colors[i + j],
                        thickness=1,
                        lineType=cv.LINE_AA,
                    )
                    var += 1

            combined_frame = np.vstack((img_text1, img_text2))
            cv.imshow("Frames", combined_frame)
            # Break the loop on 'q' key press
            if cv.waitKey(20) & 0xFF == ord("q"):
                break
        else:
            break
    else:
        break
cap.release()
cv.destroyAllWindows()


0: 480x640 1 person, 315.9ms
Speed: 0.0ms preprocess, 315.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 244.9ms
Speed: 6.7ms preprocess, 244.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
15.860406901041667 107.36931315104167

0: 480x640 1 person, 360.1ms
Speed: 6.5ms preprocess, 360.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 246.6ms
Speed: 0.0ms preprocess, 246.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
29.985231119791667 31.803304036458332

0: 480x640 1 person, 317.6ms
Speed: 0.0ms preprocess, 317.6ms inference, 8.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 231.8ms
Speed: 5.6ms preprocess, 231.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
24.578938802083332 31.28986328125


**Problem:**

Though detection was possible identifying an individual uniquely was the issue. If person is recognized as person 1 he comes out of the frame and comes after another person the another person is identified as person 1 and he is identified as person 2

**Suggestions:**

Implement Object Tracking Algorithms:

Use algorithms like Deep SORT or CSRT to maintain individual identities over time.
Assign Unique Identifiers:

Use Kalman Filters and feature matching (e.g., color histograms) to assign and maintain unique identifiers for each person.
Combine Pose Information with Tracking:

Integrate pose keypoints with tracking to better distinguish and track individuals.
Improve Detection Consistency:

Increase frame rate and apply temporal smoothing to stabilize tracking and reduce identity switching.
Handle Occlusions:

Implement occlusion handling techniques and consider using multiple cameras for different angles.
Robust Data Association:

Use the Hungarian algorithm and IoU calculations to match detections with tracked individuals accurately.