In [1]:
import os
import cv2
import mediapipe as mp
from ultralytics import YOLO

# Purpose of this cell: for every image in `image_folder`, detect persons with
# YOLO, run MediaPipe Pose on each person crop, and write one label row per
# person ("0 cx cy w h  x y v ...") to a .txt file named after the image.

# Initialize YOLO and Mediapipe pose models
yolo_model = YOLO('yolov8n.pt')  # YOLOv8n for lightweight detection
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Folder paths
image_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\images'
output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'

# Ensure output folder exists (exist_ok avoids the check-then-create race)
os.makedirs(output_folder, exist_ok=True)

# Landmark indices observed across all images. A set keeps this bounded
# instead of appending 33 entries per person and de-duplicating at the end.
all_flip_idx = set()

# Initialize Mediapipe pose estimation model
with mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5, model_complexity=1) as pose:
    # Sorted so the processing order is the same on every run
    for image_name in sorted(os.listdir(image_folder)):
        # Case-insensitive match so files like 'IMG_01.JPG' are not silently skipped
        if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        image_path = os.path.join(image_folder, image_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be read
            print(f"Skipping unreadable image: {image_name}")
            continue
        img_h, img_w = image.shape[:2]

        # Use YOLO to detect objects in the image; [0] is the result for this single image
        detections = yolo_model(image)[0]

        # One label row per person for which a pose was found
        txt_lines = []

        for det in detections.boxes:
            if int(det.cls[0]) != 0:  # class 0 is 'person'
                continue

            # Pixel-space box corners, clamped to the image so slicing cannot wrap around
            x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
            x1, x2 = max(0, x1), min(img_w, x2)
            y1, y2 = max(0, y1), min(img_h, y2)
            crop_w, crop_h = x2 - x1, y2 - y1
            if crop_w <= 0 or crop_h <= 0:
                # A zero-area crop would make cv2.cvtColor raise
                continue

            # Box in normalized center/size form, relative to the full image
            box_w = crop_w / img_w
            box_h = crop_h / img_h
            box_x_center = (x1 + x2) / 2 / img_w
            box_y_center = (y1 + y2) / 2 / img_h

            # Crop detected person for pose estimation (MediaPipe expects RGB)
            person_rgb = cv2.cvtColor(image[y1:y2, x1:x2], cv2.COLOR_BGR2RGB)
            results = pose.process(person_rgb)
            if not results.pose_landmarks:
                continue

            keypoints = []
            for idx, landmark in enumerate(results.pose_landmarks.landmark):
                # landmark.x / landmark.y are normalized to the crop that was
                # passed to pose.process; convert them to full-image normalized
                # coordinates so they live in the same frame as the box above.
                kp_x = min(max((x1 + landmark.x * crop_w) / img_w, 0.0), 1.0)
                kp_y = min(max((y1 + landmark.y * crop_h) / img_h, 0.0), 1.0)
                keypoints.append(f'{kp_x} {kp_y} {landmark.visibility}')
                all_flip_idx.add(idx)

            # Combine bounding box and keypoints into a single line
            txt_lines.append(
                f'0 {box_x_center} {box_y_center} {box_w} {box_h} ' + ' '.join(keypoints)
            )

        # Write results to a .txt file with the same name as the image
        # (an empty file is written when no person/pose was found)
        output_txt_path = os.path.join(output_folder, os.path.splitext(image_name)[0] + '.txt')
        with open(output_txt_path, 'w') as f:
            f.write('\n'.join(txt_lines))

# NOTE(review): this is only the sorted list of landmark indices that were
# observed. If it is meant to be used as a left/right swap mapping for
# horizontal-flip augmentation, it would need to be a permutation pairing
# left and right landmarks instead - confirm the intended use.
flip_idx = sorted(all_flip_idx)
print("flip_idx:", flip_idx)




0: 480x640 1 person, 253.8ms
Speed: 15.6ms preprocess, 253.8ms inference, 15.0ms postprocess per image at shape (1, 3, 480, 640)

0: 640x512 1 person, 289.3ms
Speed: 5.0ms preprocess, 289.3ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 512)

0: 480x640 (no detections), 251.7ms
Speed: 0.0ms preprocess, 251.7ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 640x544 1 person, 368.7ms
Speed: 0.0ms preprocess, 368.7ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 544)
flip_idx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]


In [1]:
import os
import cv2
import mediapipe as mp
from ultralytics import YOLO

# Purpose of this cell: same label generation as above, plus saving a copy of
# each image with the person boxes and pose skeletons drawn on it.

# Initialize YOLO and Mediapipe pose models
yolo_model = YOLO('yolov8n.pt')  # YOLOv8n for lightweight detection
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Folder paths
image_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img'
output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'
annotated_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl'

# Ensure output and annotated image folders exist
os.makedirs(output_folder, exist_ok=True)
os.makedirs(annotated_folder, exist_ok=True)

# Landmark indices observed across all images. A set keeps this bounded
# instead of appending 33 entries per person and de-duplicating at the end.
all_flip_idx = set()

# Initialize Mediapipe pose estimation model
with mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5, model_complexity=1) as pose:
    # Sorted so the processing order is the same on every run
    for image_name in sorted(os.listdir(image_folder)):
        # Case-insensitive match so files like 'IMG_01.JPG' are not silently skipped
        if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        image_path = os.path.join(image_folder, image_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be read
            print(f"Skipping unreadable image: {image_name}")
            continue
        img_h, img_w = image.shape[:2]

        # All drawing happens on this copy. `image` stays pristine so that the
        # pixels handed to MediaPipe never contain a box border or the skeleton
        # of a previously processed person.
        annotated = image.copy()

        # Use YOLO to detect objects in the image; [0] is the result for this single image
        detections = yolo_model(image)[0]

        # One label row per person for which a pose was found
        txt_lines = []

        for det in detections.boxes:
            if int(det.cls[0]) != 0:  # class 0 is 'person'
                continue

            # Pixel-space box corners, clamped to the image so slicing cannot wrap around
            x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
            x1, x2 = max(0, x1), min(img_w, x2)
            y1, y2 = max(0, y1), min(img_h, y2)
            crop_w, crop_h = x2 - x1, y2 - y1
            if crop_w <= 0 or crop_h <= 0:
                # A zero-area crop would make cv2.cvtColor raise
                continue

            # Box in normalized center/size form, relative to the full image
            box_w = crop_w / img_w
            box_h = crop_h / img_h
            box_x_center = (x1 + x2) / 2 / img_w
            box_y_center = (y1 + y2) / 2 / img_h

            # Draw YOLO bounding box on the annotated copy only
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Crop detected person from the untouched image (MediaPipe expects RGB)
            person_rgb = cv2.cvtColor(image[y1:y2, x1:x2], cv2.COLOR_BGR2RGB)
            results = pose.process(person_rgb)
            if not results.pose_landmarks:
                continue

            # Landmarks are relative to the crop, so draw them into the matching
            # region of the annotated copy (the slice is a view, drawn in place).
            mp_drawing.draw_landmarks(
                annotated[y1:y2, x1:x2], results.pose_landmarks, mp_pose.POSE_CONNECTIONS
            )

            keypoints = []
            for idx, landmark in enumerate(results.pose_landmarks.landmark):
                # landmark.x / landmark.y are normalized to the crop that was
                # passed to pose.process; convert them to full-image normalized
                # coordinates so they live in the same frame as the box above.
                kp_x = min(max((x1 + landmark.x * crop_w) / img_w, 0.0), 1.0)
                kp_y = min(max((y1 + landmark.y * crop_h) / img_h, 0.0), 1.0)
                keypoints.append(f'{kp_x} {kp_y} {landmark.visibility}')
                all_flip_idx.add(idx)

            # Combine bounding box and keypoints into a single line
            txt_lines.append(
                f'0 {box_x_center} {box_y_center} {box_w} {box_h} ' + ' '.join(keypoints)
            )

        # Write results to a .txt file with the same name as the image
        # (an empty file is written when no person/pose was found)
        output_txt_path = os.path.join(output_folder, os.path.splitext(image_name)[0] + '.txt')
        with open(output_txt_path, 'w') as f:
            f.write('\n'.join(txt_lines))

        # Save the annotated image; cv2.imwrite reports failure via its return value
        annotated_image_path = os.path.join(annotated_folder, image_name)
        if not cv2.imwrite(annotated_image_path, annotated):
            print(f"Could not write annotated image: {annotated_image_path}")

# NOTE(review): this is only the sorted list of landmark indices that were
# observed. If it is meant to be used as a left/right swap mapping for
# horizontal-flip augmentation, it would need to be a permutation pairing
# left and right landmarks instead - confirm the intended use.
flip_idx = sorted(all_flip_idx)
print("flip_idx:", flip_idx)



0: 384x640 1 person, 357.9ms
Speed: 18.8ms preprocess, 357.9ms inference, 27.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 272.7ms
Speed: 3.5ms preprocess, 272.7ms inference, 6.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 283.2ms
Speed: 3.0ms preprocess, 283.2ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 353.2ms
Speed: 4.9ms preprocess, 353.2ms inference, 4.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 459.7ms
Speed: 5.0ms preprocess, 459.7ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 489.3ms
Speed: 6.8ms preprocess, 489.3ms inference, 6.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 433.5ms
Speed: 5.1ms preprocess, 433.5ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 389.2ms
Speed: 50.8ms preprocess, 389.2ms inference, 7.0ms postprocess per image


0: 384x640 1 person, 146.6ms
Speed: 10.4ms preprocess, 146.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 135.9ms
Speed: 15.8ms preprocess, 135.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 151.2ms
Speed: 0.0ms preprocess, 151.2ms inference, 3.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 130.9ms
Speed: 1.3ms preprocess, 130.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 146.9ms
Speed: 0.0ms preprocess, 146.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 146.9ms
Speed: 3.0ms preprocess, 146.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 197.7ms
Speed: 5.4ms preprocess, 197.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 193.4ms
Speed: 7.7ms preprocess, 193.4ms inference, 0.0ms postprocess per image 


0: 384x640 1 person, 195.1ms
Speed: 16.9ms preprocess, 195.1ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 218.7ms
Speed: 4.0ms preprocess, 218.7ms inference, 4.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 248.6ms
Speed: 0.0ms preprocess, 248.6ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 193.4ms
Speed: 0.0ms preprocess, 193.4ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 245.4ms
Speed: 5.2ms preprocess, 245.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 203.6ms
Speed: 0.0ms preprocess, 203.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 239.0ms
Speed: 0.0ms preprocess, 239.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 239.6ms
Speed: 0.1ms preprocess, 239.6ms inference, 3.3ms postprocess per image a

In [19]:
import os
import cv2
import mediapipe as mp
from ultralytics import YOLO

# Purpose of this cell: for every image, detect persons with YOLO, estimate a
# pose per person crop with MediaPipe, and write one label row per detected
# person ("0 cx cy w h" followed by 33 "x y visibility" triplets, all
# normalized to the full image). Persons without a pose get all-zero
# keypoints. An annotated copy of each image is saved alongside.

# Initialize YOLO and Mediapipe pose models
yolo_model = YOLO('yolov8n.pt')  # YOLOv8n for lightweight detection
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Folder paths
image_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img'
output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'
annotated_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl'

# Ensure output and annotated image folders exist
os.makedirs(output_folder, exist_ok=True)
os.makedirs(annotated_folder, exist_ok=True)

# Number of "x y visibility" triplets written per person row
num_keypoints = 33

# Initialize Mediapipe pose estimation model
with mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5, model_complexity=1) as pose:
    # Sorted so the processing order is the same on every run
    for image_name in sorted(os.listdir(image_folder)):
        # Case-insensitive match so files like 'IMG_01.JPG' are not silently skipped
        if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        image_path = os.path.join(image_folder, image_name)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Skipping corrupt image: {image_name}")
            continue
        height, width = image.shape[:2]

        # All drawing happens on this copy. `image` stays pristine so a skeleton
        # drawn for one person never ends up in the crop fed to MediaPipe for
        # another (overlapping) person in the same image.
        annotated = image.copy()

        # Use YOLO to detect objects in the image; [0] is the result for this single image
        detections = yolo_model(image)[0]

        # One label row per detected person
        txt_lines = []

        for det in detections.boxes:
            if int(det.cls[0]) != 0:  # class 0 is 'person'
                continue

            # Pixel-space box corners, clamped to the image so slicing cannot
            # wrap around and the normalized values below stay within [0, 1]
            x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
            x1, x2 = max(0, x1), min(width, x2)
            y1, y2 = max(0, y1), min(height, y2)
            crop_w, crop_h = x2 - x1, y2 - y1
            if crop_w <= 0 or crop_h <= 0:
                # A zero-area crop would make cv2.cvtColor raise
                continue

            # Normalize bounding box to center/size form relative to the full image
            box_w = crop_w / width
            box_h = crop_h / height
            box_x_center = (x1 + x2) / 2 / width
            box_y_center = (y1 + y2) / 2 / height

            # Crop detected person from the untouched image (MediaPipe expects RGB)
            person_rgb = cv2.cvtColor(image[y1:y2, x1:x2], cv2.COLOR_BGR2RGB)
            results = pose.process(person_rgb)

            # Start with zeros so every row has the same number of columns even
            # when no pose was found for this person
            keypoint_labels = ['0 0 0'] * num_keypoints

            if results.pose_landmarks:
                for idx, landmark in enumerate(results.pose_landmarks.landmark):
                    # landmark.x / landmark.y are normalized to the crop; map
                    # them to full-image normalized coordinates and clamp to
                    # [0, 1] because landmarks may fall outside the crop.
                    orig_x = min(max((x1 + landmark.x * crop_w) / width, 0), 1)
                    orig_y = min(max((y1 + landmark.y * crop_h) / height, 0), 1)
                    keypoint_labels[idx] = f'{orig_x} {orig_y} {landmark.visibility}'

                # Landmarks are relative to the crop, so draw them into the
                # matching region of the annotated copy (the slice is a view).
                mp_drawing.draw_landmarks(
                    annotated[y1:y2, x1:x2], results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                )

            # Combine bounding box and keypoints into a single line
            txt_lines.append(
                f'0 {box_x_center} {box_y_center} {box_w} {box_h} ' + ' '.join(keypoint_labels)
            )

        # Write results to a .txt file with the same name as the image
        # (an empty file is written when no person was detected)
        output_txt_path = os.path.join(output_folder, os.path.splitext(image_name)[0] + '.txt')
        with open(output_txt_path, 'w') as f:
            f.write('\n'.join(txt_lines))

        # Save the annotated image; cv2.imwrite reports failure via its return value
        annotated_image_path = os.path.join(annotated_folder, image_name)
        if not cv2.imwrite(annotated_image_path, annotated):
            print(f"Could not write annotated image: {annotated_image_path}")



0: 384x640 1 person, 164.4ms
Speed: 19.0ms preprocess, 164.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 174.0ms
Speed: 3.8ms preprocess, 174.0ms inference, 15.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 175.4ms
Speed: 0.0ms preprocess, 175.4ms inference, 16.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 222.7ms
Speed: 0.0ms preprocess, 222.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 298.3ms
Speed: 0.0ms preprocess, 298.3ms inference, 5.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 228.1ms
Speed: 4.0ms preprocess, 228.1ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 257.2ms
Speed: 15.6ms preprocess, 257.2ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 286.6ms
Speed: 6.4ms preprocess, 286.6ms inference, 0.0ms postprocess per imag