In [1]:
import os
import cv2
import mediapipe as mp
from ultralytics import YOLO

# Initialize YOLO and Mediapipe pose models
yolo_model = YOLO('yolov8n.pt')  # YOLOv8n for lightweight detection
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Folder paths
image_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\images' 
output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels' 

# Ensure output folder exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# List to store all flip_idx (landmarks)
all_flip_idx = []

# Initialize Mediapipe pose estimation model
with mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5, model_complexity=1) as pose:
    # Iterate through all images in the folder
    for image_name in os.listdir(image_folder):
        if image_name.endswith('.jpg') or image_name.endswith('.png'): 
            # Load the image
            image_path = os.path.join(image_folder, image_name)
            image = cv2.imread(image_path)
            
            # Use YOLO to detect persons in the image
            yolo_results = yolo_model(image)
            detections = yolo_results[0]  # YOLO detections for the image
            
            # Prepare to store detection results
            txt_lines = []
            
            # Process each detected person
            for det in detections.boxes:
                if det.cls == 0:  # '0' class for person detection
                    # Get YOLO bounding box in YOLO format
                    x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
                    box_w = (x2 - x1) / image.shape[1]
                    box_h = (y2 - y1) / image.shape[0]
                    box_x_center = (x1 + x2) / 2 / image.shape[1]
                    box_y_center = (y1 + y2) / 2 / image.shape[0]
                    
                    # Crop detected person for pose estimation
                    person_crop = image[y1:y2, x1:x2]
                    person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                    
                    # Perform pose estimation with Mediapipe
                    results = pose.process(person_rgb)
                    
                    if results.pose_landmarks:
                        # Prepare keypoints for the .txt file
                        keypoints = []
                        for idx, landmark in enumerate(results.pose_landmarks.landmark):
                            keypoints.append(f'{landmark.x} {landmark.y} {landmark.visibility}')
                            all_flip_idx.append(idx)  # Add to flip_idx list

                        # Combine bounding box and keypoints into a single line
                        txt_line = f'0 {box_x_center} {box_y_center} {box_w} {box_h} ' + ' '.join(keypoints)
                        txt_lines.append(txt_line)
            
            # Write results to a .txt file with the same name as the image
            output_txt_path = os.path.join(output_folder, os.path.splitext(image_name)[0] + '.txt')
            with open(output_txt_path, 'w') as f:
                f.write('\n'.join(txt_lines))

# Remove duplicates from flip_idx and return the list
flip_idx = list(set(all_flip_idx))
print("flip_idx:", flip_idx)




0: 480x640 1 person, 253.8ms
Speed: 15.6ms preprocess, 253.8ms inference, 15.0ms postprocess per image at shape (1, 3, 480, 640)

0: 640x512 1 person, 289.3ms
Speed: 5.0ms preprocess, 289.3ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 512)

0: 480x640 (no detections), 251.7ms
Speed: 0.0ms preprocess, 251.7ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 640x544 1 person, 368.7ms
Speed: 0.0ms preprocess, 368.7ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 544)
flip_idx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]


In [2]:
import os
import cv2
import mediapipe as mp
from ultralytics import YOLO

# Initialize YOLO and Mediapipe pose models
yolo_model = YOLO('yolov8n.pt')  # YOLOv8n for lightweight detection
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Folder paths
image_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\images' 
output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'
annotated_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\images1'

# Ensure output and annotated image folders exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
if not os.path.exists(annotated_folder):
    os.makedirs(annotated_folder)

# List to store all flip_idx (landmarks)
all_flip_idx = []

# Initialize Mediapipe pose estimation model
with mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5, model_complexity=1) as pose:
    # Iterate through all images in the folder
    for image_name in os.listdir(image_folder):
        if image_name.endswith('.jpg') or image_name.endswith('.png'): 
            # Load the image
            image_path = os.path.join(image_folder, image_name)
            image = cv2.imread(image_path)
            
            # Use YOLO to detect persons in the image
            yolo_results = yolo_model(image)
            detections = yolo_results[0]  # YOLO detections for the image
            
            # Prepare to store detection results
            txt_lines = []
            
            # Process each detected person
            for det in detections.boxes:
                if det.cls == 0:  # '0' class for person detection
                    # Get YOLO bounding box in YOLO format
                    x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
                    box_w = (x2 - x1) / image.shape[1]
                    box_h = (y2 - y1) / image.shape[0]
                    box_x_center = (x1 + x2) / 2 / image.shape[1]
                    box_y_center = (y1 + y2) / 2 / image.shape[0]
                    
                    # Draw YOLO bounding box on the image
                    cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    
                    # Crop detected person for pose estimation
                    person_crop = image[y1:y2, x1:x2]
                    person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                    
                    # Perform pose estimation with Mediapipe
                    results = pose.process(person_rgb)
                    
                    if results.pose_landmarks:
                        # Draw pose landmarks on the cropped person and overlay it back on the original image
                        mp_drawing.draw_landmarks(image[y1:y2, x1:x2], results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

                        # Prepare keypoints for the .txt file
                        keypoints = []
                        for idx, landmark in enumerate(results.pose_landmarks.landmark):
                            keypoints.append(f'{landmark.x} {landmark.y} {landmark.visibility}')
                            all_flip_idx.append(idx)  # Add to flip_idx list

                        # Combine bounding box and keypoints into a single line
                        txt_line = f'0 {box_x_center} {box_y_center} {box_w} {box_h} ' + ' '.join(keypoints)
                        txt_lines.append(txt_line)
            
            # Write results to a .txt file with the same name as the image
            output_txt_path = os.path.join(output_folder, os.path.splitext(image_name)[0] + '.txt')
            with open(output_txt_path, 'w') as f:
                f.write('\n'.join(txt_lines))
            
            # Save the annotated image
            annotated_image_path = os.path.join(annotated_folder, image_name)
            cv2.imwrite(annotated_image_path, image)

# Remove duplicates from flip_idx and return the list
flip_idx = list(set(all_flip_idx))
print("flip_idx:", flip_idx)



0: 480x640 1 person, 202.4ms
Speed: 2.6ms preprocess, 202.4ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 640x512 1 person, 247.7ms
Speed: 0.0ms preprocess, 247.7ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 512)

0: 480x640 (no detections), 194.0ms
Speed: 0.0ms preprocess, 194.0ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 640x544 1 person, 188.1ms
Speed: 0.0ms preprocess, 188.1ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 544)
flip_idx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
