# 0. Install and Import Dependencies

In [30]:
!pip install mediapipe opencv-python pandas scikit-learn



In [1]:
import mediapipe as mp # Import mediapipe
import cv2 # Import opencv

In [2]:
mp_drawing = mp.solutions.drawing_utils # Drawing helpers
mp_holistic = mp.solutions.holistic # Mediapipe Solutions

# 1. Make Some Detections

In [3]:
import cv2
import mediapipe as mp

# Initialize mediapipe holistic model and drawing utilities
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Start capturing video from the webcam
cap = cv2.VideoCapture(0)

# Initiate holistic model
with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        
        # Recolor Feed to RGB for Mediapipe processing
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False        
        
        # Make Detections
        results = holistic.process(image)
        
        # Recolor image back to BGR for rendering
        image.flags.writeable = True   
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # 1. Draw face landmarks (no FACE_CONNECTIONS, just landmarks)
        if results.face_landmarks:
            mp_drawing.draw_landmarks(image, results.face_landmarks, 
                                      mp.solutions.holistic.FACEMESH_TESSELATION,  # Use FACEMESH_TESSELATION for face mesh connections
                                      mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                      mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
        
        # 2. Right hand
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))
        
        # 3. Left Hand
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))
        
        # 4. Pose Detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
        
        # Show the processed image
        cv2.imshow('Raw Webcam Feed', image)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release the video capture object and close windows
cap.release()
cv2.destroyAllWindows()


In [102]:
import cv2
import mediapipe as mp
import os

# Initialize mediapipe holistic model and drawing utilities
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Path to input video
video_path = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\output_videos\Vid3.mp4'

# Specify output folder and file
output_folder = 'output_videos'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_video_path = os.path.join(output_folder, r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\output_videos\Vid3Processed.mp4')

# Start capturing video from the file
cap = cv2.VideoCapture(video_path)

# Get frame width, height, and FPS for saving the output video
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object to save the video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# Initiate holistic model
with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Recolor Feed to RGB for Mediapipe processing
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False        
        
        # Make Detections
        results = holistic.process(image)
        
        # Recolor image back to BGR for rendering
        image.flags.writeable = True   
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # 1. Draw face landmarks
        if results.face_landmarks:
            mp_drawing.draw_landmarks(image, results.face_landmarks, 
                                      mp.solutions.holistic.FACEMESH_TESSELATION, 
                                      mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                      mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
        
        # 2. Right hand
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))
        
        # 3. Left Hand
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))
        
        # 4. Pose Detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
        
        # Write the processed frame to the output video
        out.write(image)
        
        # Show the processed image (optional)
        cv2.imshow('Processed Video Feed', image)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release the video capture and writer objects, and close windows
cap.release()
out.release()
cv2.destroyAllWindows()


In [104]:
import cv2
import mediapipe as mp
import os

# Initialize mediapipe pose model and drawing utilities
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Path to input video
video_path = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\output_videos\Vid3.mp4'

# Specify output folder and file
output_folder = 'output_videos'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_video_path = os.path.join(output_folder, r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\output_videos\MainProcessed.mp4')

# Start capturing video from the file
cap = cv2.VideoCapture(video_path)

# Get frame width, height, and FPS for saving the output video
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object to save the video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# Initiate pose model
with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=1) as pose:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Recolor Feed to RGB for Mediapipe processing
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False        
        
        # Make Detections
        results = pose.process(image)
        
        # Recolor image back to BGR for rendering
        image.flags.writeable = True   
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # Draw Pose landmarks for the detected person
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS, 
                                      mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                      mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
        
        # Write the processed frame to the output video
        out.write(image)
        
        # Show the processed image (optional)
        cv2.imshow('Processed Video Feed', image)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release the video capture and writer objects, and close windows
cap.release()
out.release()
cv2.destroyAllWindows()


In [4]:
results.pose_landmarks.landmark[0].visibility

0.995158314704895

In [5]:
results.face_landmarks.landmark[0].visibility

0.0

In [6]:
print(results.face_landmarks.landmark[0])
results.face_landmarks.landmark[0].x

x: 0.6766873002052307
y: 0.38457825779914856
z: -0.061855755746364594



0.6766873002052307

# 2. Capture Landmarks & Export to CSV
<!--<img src="https://i.imgur.com/8bForKY.png">-->
<!--<img src="https://i.imgur.com/AzKNp7A.png">-->
<!--<img src="https://i.imgur.com/8bForKY.png">-->
<img src="https://i.imgur.com/AzKNp7A.png">

In [7]:
import csv
import os
import numpy as np

In [55]:
num_coords = len(results.pose_landmarks.landmark)+len(results.face_landmarks.landmark)
num_coords

501

In [56]:
landmarks = ['class']
for val in range(1, num_coords+1):
    landmarks += ['x{}'.format(val), 'y{}'.format(val), 'z{}'.format(val), 'v{}'.format(val)]

In [57]:
landmarks

['class',
 'x1',
 'y1',
 'z1',
 'v1',
 'x2',
 'y2',
 'z2',
 'v2',
 'x3',
 'y3',
 'z3',
 'v3',
 'x4',
 'y4',
 'z4',
 'v4',
 'x5',
 'y5',
 'z5',
 'v5',
 'x6',
 'y6',
 'z6',
 'v6',
 'x7',
 'y7',
 'z7',
 'v7',
 'x8',
 'y8',
 'z8',
 'v8',
 'x9',
 'y9',
 'z9',
 'v9',
 'x10',
 'y10',
 'z10',
 'v10',
 'x11',
 'y11',
 'z11',
 'v11',
 'x12',
 'y12',
 'z12',
 'v12',
 'x13',
 'y13',
 'z13',
 'v13',
 'x14',
 'y14',
 'z14',
 'v14',
 'x15',
 'y15',
 'z15',
 'v15',
 'x16',
 'y16',
 'z16',
 'v16',
 'x17',
 'y17',
 'z17',
 'v17',
 'x18',
 'y18',
 'z18',
 'v18',
 'x19',
 'y19',
 'z19',
 'v19',
 'x20',
 'y20',
 'z20',
 'v20',
 'x21',
 'y21',
 'z21',
 'v21',
 'x22',
 'y22',
 'z22',
 'v22',
 'x23',
 'y23',
 'z23',
 'v23',
 'x24',
 'y24',
 'z24',
 'v24',
 'x25',
 'y25',
 'z25',
 'v25',
 'x26',
 'y26',
 'z26',
 'v26',
 'x27',
 'y27',
 'z27',
 'v27',
 'x28',
 'y28',
 'z28',
 'v28',
 'x29',
 'y29',
 'z29',
 'v29',
 'x30',
 'y30',
 'z30',
 'v30',
 'x31',
 'y31',
 'z31',
 'v31',
 'x32',
 'y32',
 'z32',
 'v32',
 '

In [58]:
with open('coords.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(landmarks)

In [64]:
class_name = "sad"

In [65]:
cap = cv2.VideoCapture(0)
# Initiate holistic model
with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:
    
    while cap.isOpened():
        ret, frame = cap.read()
        
        # Recolor Feed
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False        
        
        # Make Detections
        results = holistic.process(image)
        # print(results.face_landmarks)
        
        # Recolor image back to BGR for rendering
        image.flags.writeable = True   
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # 1. Draw face landmarks (use FACEMESH_TESSELATION)
        if results.face_landmarks:
            mp_drawing.draw_landmarks(image, results.face_landmarks, mp.solutions.holistic.FACEMESH_TESSELATION, 
                                      mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                      mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
        
        # 2. Right hand
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))

        # 3. Left Hand
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))

        # 4. Pose Detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))

        # Export coordinates
        try:
            # Extract Pose landmarks
            pose = results.pose_landmarks.landmark
            pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten())
            
            # Extract Face landmarks
            face = results.face_landmarks.landmark
            face_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in face]).flatten())
            
            # Concatenate rows
            row = pose_row + face_row
            
            # Append class name 
            row.insert(0, class_name)
            
            # Export to CSV
            with open('coords.csv', mode='a', newline='') as f:
                csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow(row) 
            
        except:
            pass
                        
        cv2.imshow('Raw Webcam Feed', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


# 3. Train Custom Model Using Scikit Learn

## 3.1 Read in Collected Data and Process

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [67]:
df = pd.read_csv('coords.csv')

In [68]:
df.head()

Unnamed: 0,class,x1,y1,z1,v1,x2,y2,z2,v2,x3,...,z499,v499,x500,y500,z500,v500,x501,y501,z501,v501
0,bored,0.62902,0.568639,-1.385486,0.99991,0.658297,0.489352,-1.307458,0.99989,0.676192,...,-0.016321,0.0,0.713972,0.468259,0.015721,0.0,0.720104,0.459737,0.016592,0.0
1,bored,0.628555,0.568766,-1.359292,0.999915,0.658376,0.489075,-1.280529,0.999895,0.676367,...,-0.015541,0.0,0.717927,0.469455,0.017143,0.0,0.724083,0.461155,0.018116,0.0
2,bored,0.628529,0.56895,-1.306886,0.99992,0.658665,0.489102,-1.224599,0.999901,0.676657,...,-0.015379,0.0,0.718667,0.468024,0.017193,0.0,0.724801,0.460089,0.018118,0.0
3,bored,0.628614,0.569653,-1.30228,0.999924,0.65916,0.489213,-1.221179,0.999906,0.677106,...,-0.012318,0.0,0.718511,0.46837,0.022802,0.0,0.72458,0.460204,0.024055,0.0
4,bored,0.628584,0.570388,-1.246793,0.999929,0.65929,0.489301,-1.169416,0.999911,0.67728,...,-0.01387,0.0,0.719794,0.464793,0.0202,0.0,0.725901,0.456046,0.021383,0.0


In [69]:
df.tail()

Unnamed: 0,class,x1,y1,z1,v1,x2,y2,z2,v2,x3,...,z499,v499,x500,y500,z500,v500,x501,y501,z501,v501
297,sad,0.547834,0.554001,-1.346296,0.999237,0.577068,0.491668,-1.270525,0.998378,0.593076,...,-0.0012,0.0,0.620441,0.49137,0.023442,0.0,0.626484,0.484749,0.024353,0.0
298,sad,0.547324,0.555652,-1.306115,0.999271,0.577012,0.492059,-1.237636,0.998449,0.593033,...,-0.004037,0.0,0.617775,0.487227,0.017349,0.0,0.623953,0.480202,0.017936,0.0
299,sad,0.546696,0.557833,-1.306317,0.999268,0.576753,0.492594,-1.237203,0.998476,0.592998,...,-0.005649,0.0,0.617015,0.486855,0.014468,0.0,0.623224,0.479893,0.014918,0.0
300,sad,0.545958,0.558138,-1.305457,0.999269,0.576405,0.492615,-1.236949,0.998491,0.592972,...,-0.006536,0.0,0.619035,0.487489,0.01371,0.0,0.625227,0.480185,0.014181,0.0
301,sad,0.545848,0.558433,-1.313607,0.999252,0.576298,0.492713,-1.243711,0.998481,0.593066,...,-0.006081,0.0,0.61915,0.487931,0.014544,0.0,0.625306,0.480583,0.015057,0.0


In [70]:
df[df['class']=='wave']

Unnamed: 0,class,x1,y1,z1,v1,x2,y2,z2,v2,x3,...,z499,v499,x500,y500,z500,v500,x501,y501,z501,v501


In [71]:
X = df.drop('class', axis=1) # features
y = df['class'] # target value

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [73]:
X_test

Unnamed: 0,x1,y1,z1,v1,x2,y2,z2,v2,x3,y3,...,z499,v499,x500,y500,z500,v500,x501,y501,z501,v501
263,0.529031,0.539838,-1.208879,0.999475,0.559137,0.480468,-1.138925,0.998923,0.582513,0.481862,...,-0.001608,0.0,0.590233,0.468628,0.020492,0.0,0.595983,0.461490,0.021075,0.0
77,0.392653,0.559100,-1.430887,0.999548,0.422583,0.490989,-1.331480,0.999476,0.440784,0.495545,...,-0.016099,0.0,0.474671,0.503379,0.003644,0.0,0.481827,0.494505,0.003946,0.0
163,0.559856,0.604692,-1.537517,0.998916,0.585246,0.524771,-1.457976,0.997477,0.600971,0.523674,...,-0.013425,0.0,0.635077,0.517787,0.017497,0.0,0.639931,0.511390,0.018394,0.0
206,0.520391,0.588952,-1.338330,0.999192,0.553459,0.508157,-1.278191,0.998464,0.576750,0.505051,...,-0.014673,0.0,0.594599,0.517553,0.000154,0.0,0.600545,0.508007,0.000044,0.0
167,0.561019,0.605435,-1.474053,0.998663,0.585978,0.525987,-1.401680,0.996999,0.601071,0.525276,...,-0.015614,0.0,0.635474,0.519326,0.012322,0.0,0.640690,0.512325,0.012918,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,0.371020,0.545866,-1.415979,0.999313,0.410275,0.486595,-1.311429,0.999225,0.430361,0.492007,...,-0.014271,0.0,0.457727,0.509061,0.008053,0.0,0.464680,0.501833,0.008377,0.0
284,0.518099,0.543335,-1.290829,0.999371,0.547564,0.482061,-1.218027,0.998729,0.570402,0.482558,...,0.002969,0.0,0.604905,0.480979,0.030421,0.0,0.610917,0.475190,0.031543,0.0
205,0.522656,0.580285,-1.336037,0.999226,0.555130,0.502012,-1.273325,0.998561,0.578338,0.499466,...,-0.015013,0.0,0.594865,0.516136,-0.000913,0.0,0.600881,0.505847,-0.000949,0.0
9,0.628595,0.573134,-1.218545,0.999945,0.660316,0.490308,-1.145461,0.999928,0.678244,0.490302,...,-0.014698,0.0,0.720123,0.471514,0.018277,0.0,0.726254,0.463033,0.019299,0.0


## 3.2 Train Machine Learning Classification Model

In [74]:
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [75]:
pipelines = {
    'lr':make_pipeline(StandardScaler(), LogisticRegression()),
    'rc':make_pipeline(StandardScaler(), RidgeClassifier()),
    'rf':make_pipeline(StandardScaler(), RandomForestClassifier()),
    'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier()),
}

In [76]:
fit_models = {}
for algo, pipeline in pipelines.items():
    model = pipeline.fit(X_train, y_train)
    fit_models[algo] = model

In [77]:
fit_models

{'lr': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('logisticregression', LogisticRegression())]),
 'rc': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('ridgeclassifier', RidgeClassifier())]),
 'rf': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('randomforestclassifier', RandomForestClassifier())]),
 'gb': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('gradientboostingclassifier', GradientBoostingClassifier())])}

In [78]:
fit_models['rc'].predict(X_test)

array(['sad', 'bored', 'sad', 'sad', 'sad', 'bored', 'sad', 'sad', 'sad',
       'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad',
       'bored', 'bored', 'bored', 'bored', 'bored', 'sad', 'bored',
       'bored', 'sad', 'sad', 'sad', 'bored', 'bored', 'bored', 'bored',
       'sad', 'bored', 'sad', 'sad', 'sad', 'sad', 'bored', 'sad', 'sad',
       'bored', 'sad', 'sad', 'bored', 'bored', 'sad', 'sad', 'sad',
       'bored', 'sad', 'sad', 'sad', 'sad', 'bored', 'sad', 'bored',
       'sad', 'bored', 'bored', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad',
       'bored', 'bored', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad',
       'sad', 'bored', 'bored', 'bored', 'sad', 'sad', 'sad', 'bored',
       'bored', 'sad', 'bored', 'sad', 'sad', 'bored', 'bored'],
      dtype='<U5')

## 3.3 Evaluate and Serialize Model 

In [8]:
from sklearn.metrics import accuracy_score # Accuracy metrics 
import pickle 

In [80]:
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    print(algo, accuracy_score(y_test, yhat))

lr 1.0
rc 1.0
rf 1.0
gb 1.0


In [81]:
fit_models['rc'].predict(X_test)

array(['sad', 'bored', 'sad', 'sad', 'sad', 'bored', 'sad', 'sad', 'sad',
       'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad',
       'bored', 'bored', 'bored', 'bored', 'bored', 'sad', 'bored',
       'bored', 'sad', 'sad', 'sad', 'bored', 'bored', 'bored', 'bored',
       'sad', 'bored', 'sad', 'sad', 'sad', 'sad', 'bored', 'sad', 'sad',
       'bored', 'sad', 'sad', 'bored', 'bored', 'sad', 'sad', 'sad',
       'bored', 'sad', 'sad', 'sad', 'sad', 'bored', 'sad', 'bored',
       'sad', 'bored', 'bored', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad',
       'bored', 'bored', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad', 'sad',
       'sad', 'bored', 'bored', 'bored', 'sad', 'sad', 'sad', 'bored',
       'bored', 'sad', 'bored', 'sad', 'sad', 'bored', 'bored'],
      dtype='<U5')

In [82]:
y_test

263      sad
77     bored
163      sad
206      sad
167      sad
       ...  
51     bored
284      sad
205      sad
9      bored
83     bored
Name: class, Length: 91, dtype: object

In [83]:
with open('model.pkl', 'wb') as f:
    pickle.dump(fit_models['rf'], f)

# 4. Make Detections with Model

In [9]:
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [10]:
model

In [11]:
cap = cv2.VideoCapture(0)
# Initiate holistic model
with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:
    
    while cap.isOpened():
        ret, frame = cap.read()
        
        # Recolor Feed
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False        
        
        # Make Detections
        results = holistic.process(image)
        
        # Recolor image back to BGR for rendering
        image.flags.writeable = True   
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # 1. Draw face landmarks (use FACEMESH_TESSELATION)
        if results.face_landmarks:
            mp_drawing.draw_landmarks(image, results.face_landmarks, mp.solutions.holistic.FACEMESH_TESSELATION, 
                                      mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                      mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
        
        # 2. Right hand
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))

        # 3. Left Hand
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))

        # 4. Pose Detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
        
        # Export coordinates
        try:
            # Extract Pose landmarks
            pose = results.pose_landmarks.landmark
            pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten())
            
            # Extract Face landmarks
            face = results.face_landmarks.landmark
            face_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in face]).flatten())
            
            # Concatenate rows
            row = pose_row + face_row
            
            # Make Detections
            X = pd.DataFrame([row])
            body_language_class = model.predict(X)[0]
            body_language_prob = model.predict_proba(X)[0]
            print(body_language_class, body_language_prob)
            
            # Grab ear coords
            coords = tuple(np.multiply(
                            np.array(
                                (results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_EAR].x, 
                                 results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_EAR].y))
                        , [640,480]).astype(int))
            
            cv2.rectangle(image, 
                          (coords[0], coords[1]+5), 
                          (coords[0]+len(body_language_class)*20, coords[1]-30), 
                          (245, 117, 16), -1)
            cv2.putText(image, body_language_class, coords, 
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
            # Get status box
            cv2.rectangle(image, (0,0), (250, 60), (245, 117, 16), -1)
            
            # Display Class
            cv2.putText(image, 'CLASS', (95,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
            cv2.putText(image, body_language_class.split(' ')[0], (90,40), 
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
            # Display Probability
            cv2.putText(image, 'PROB', (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
            cv2.putText(image, str(round(body_language_prob[np.argmax(body_language_prob)],2)), 
                        (10,40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
        except:
            pass
                        
        cv2.imshow('Raw Webcam Feed', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


In [12]:
tuple(np.multiply(np.array((results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_EAR].x, 
results.pose_landmarks.landmark[mp_holistic.PoseLandmark.LEFT_EAR].y)), [640,480]).astype(int))

(408, 132)

In [1]:
import cv2
import mediapipe as mp
import torch
import os
from ultralytics import YOLO

# Initialize Mediapipe pose model and drawing utilities
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# Load YOLOv8 model for person detection
yolo_model = YOLO('yolov8n.pt')  # Use 'yolov8n.pt' for lightweight model

# Path to input video
video_path = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\output_videos\Vid2.mp4'

# Specify output folder and file
output_folder = 'output_videos'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_video_path = os.path.join(output_folder, r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\output_videos\Vnnbnhg.mp4')

# Start capturing video from the file
cap = cv2.VideoCapture(video_path)

# Get frame width, height, and FPS for saving the output video
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object to save the video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# Initiate pose model
with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=1) as pose:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Use YOLO to detect people in the frame
        results = yolo_model(frame)
        detections = results[0]  # Get detections from YOLO
        
        # Process each person detected
        for det in detections.boxes:
            if det.cls == 0:  # Class 0 is for 'person' in YOLO
                # Get bounding box coordinates for each person
                x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())

                # Extract the person from the frame using the bounding box
                person = frame[y1:y2, x1:x2]

                # Convert to RGB for Mediapipe processing
                person_rgb = cv2.cvtColor(person, cv2.COLOR_BGR2RGB)
                person_rgb.flags.writeable = False

                # Apply Mediapipe Pose estimation
                pose_results = pose.process(person_rgb)
                
                # Recolor the image back to BGR for display
                person_rgb.flags.writeable = True
                person_bgr = cv2.cvtColor(person_rgb, cv2.COLOR_RGB2BGR)

                # Draw Pose landmarks on the person
                if pose_results.pose_landmarks:
                    mp_drawing.draw_landmarks(person_bgr, pose_results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                              mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))

                # Replace the processed person back into the original frame
                frame[y1:y2, x1:x2] = person_bgr
        
        # Write the processed frame to the output video
        out.write(frame)

        # Show the processed frame (optional)
        cv2.imshow('Processed Video Feed', frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release the video capture and writer objects, and close windows
cap.release()
out.release()
cv2.destroyAllWindows()



0: 640x352 2 persons, 221.4ms
Speed: 15.9ms preprocess, 221.4ms inference, 15.1ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 2 persons, 189.7ms
Speed: 4.9ms preprocess, 189.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 2 persons, 163.8ms
Speed: 8.0ms preprocess, 163.8ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 2 persons, 149.4ms
Speed: 0.0ms preprocess, 149.4ms inference, 3.2ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 3 persons, 133.0ms
Speed: 0.0ms preprocess, 133.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 2 persons, 121.2ms
Speed: 5.9ms preprocess, 121.2ms inference, 15.6ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 2 persons, 132.7ms
Speed: 0.0ms preprocess, 132.7ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 3 persons, 145.3ms
Speed: 4.1ms preprocess, 145.3ms inference, 0.0ms postprocess p

In [27]:
import cv2
import mediapipe as mp
import os

# Initialize mediapipe holistic model and drawing utilities
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Path to input video
video_path = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img\1frame_120.jpg'

# Specify output folder and file
output_folder = 'output_videos'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_video_path = os.path.join(output_folder, r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\Vid3Processed.mp4')

# Start capturing video from the file
cap = cv2.VideoCapture(video_path)

# Get frame width, height, and FPS for saving the output video
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object to save the video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

# Initiate holistic model
with mp_holistic.Holistic(min_detection_confidence=0.1, min_tracking_confidence=0.1) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Recolor Feed to RGB for Mediapipe processing
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False        
        
        # Make Detections
        results = holistic.process(image)
        
        # Recolor image back to BGR for rendering
        image.flags.writeable = True   
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # 1. Draw face landmarks
        if results.face_landmarks:
            mp_drawing.draw_landmarks(image, results.face_landmarks, 
                                      mp.solutions.holistic.FACEMESH_TESSELATION, 
                                      mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                      mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
        
        # 2. Right hand
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))
        
        # 3. Left Hand
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))
        
        # 4. Pose Detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, 
                                 mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
        
        # Write the processed frame to the output video
        out.write(image)
        
        # Show the processed image (optional)
        cv2.imshow('Processed Video Feed', image)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release the video capture and writer objects, and close windows
cap.release()
out.release()
cv2.destroyAllWindows()


In [28]:
results.pose_world_landmarks

landmark {
  x: 0.07610340416431427
  y: -0.5598576068878174
  z: -0.23969310522079468
  visibility: 0.9999986886978149
}
landmark {
  x: 0.08138136565685272
  y: -0.6002929210662842
  z: -0.22783206403255463
  visibility: 0.9999979734420776
}
landmark {
  x: 0.08076263964176178
  y: -0.6028578877449036
  z: -0.22877252101898193
  visibility: 0.999998927116394
}
landmark {
  x: 0.08007578551769257
  y: -0.603441596031189
  z: -0.22994594275951385
  visibility: 0.9999978542327881
}
landmark {
  x: 0.044503096491098404
  y: -0.6005644798278809
  z: -0.229008749127388
  visibility: 0.9999973773956299
}
landmark {
  x: 0.045958511531353
  y: -0.5998915433883667
  z: -0.2325519174337387
  visibility: 0.9999980926513672
}
landmark {
  x: 0.045850515365600586
  y: -0.601276695728302
  z: -0.23145847022533417
  visibility: 0.999996542930603
}
landmark {
  x: 0.13288669288158417
  y: -0.6102220416069031
  z: -0.12780320644378662
  visibility: 0.9999990463256836
}
landmark {
  x: -0.014122351072

In [29]:
results.face_landmarks

landmark {
  x: 0.5237988233566284
  y: 0.3085106909275055
  z: -0.009163235314190388
}
landmark {
  x: 0.5225407481193542
  y: 0.2887094020843506
  z: -0.016820957884192467
}
landmark {
  x: 0.5218188762664795
  y: 0.29591765999794006
  z: -0.00927744060754776
}
landmark {
  x: 0.5148445963859558
  y: 0.2752120792865753
  z: -0.01321321353316307
}
landmark {
  x: 0.5214449167251587
  y: 0.2831488847732544
  z: -0.01779015362262726
}
landmark {
  x: 0.5192997455596924
  y: 0.276962548494339
  z: -0.016528351232409477
}
landmark {
  x: 0.5133754014968872
  y: 0.26295819878578186
  z: -0.008182582445442677
}
landmark {
  x: 0.4890238642692566
  y: 0.2814463973045349
  z: -0.0008227388025261462
}
landmark {
  x: 0.5092921257019043
  y: 0.24980776011943817
  z: -0.005937241017818451
}
landmark {
  x: 0.5074960589408875
  y: 0.24228627979755402
  z: -0.00631817989051342
}
landmark {
  x: 0.5002334117889404
  y: 0.21654316782951355
  z: -0.0030911562498658895
}
landmark {
  x: 0.524182379245

In [30]:
results.pose_landmarks.landmark[0].x

0.5183186531066895

In [31]:
results.pose_landmarks

landmark {
  x: 0.5183186531066895
  y: 0.2888590097427368
  z: -0.431506872177124
  visibility: 0.9999986886978149
}
landmark {
  x: 0.5230835676193237
  y: 0.26147329807281494
  z: -0.4057560861110687
  visibility: 0.9999979734420776
}
landmark {
  x: 0.5283770561218262
  y: 0.25966089963912964
  z: -0.40576207637786865
  visibility: 0.999998927116394
}
landmark {
  x: 0.5332298874855042
  y: 0.2575591802597046
  z: -0.4057518541812897
  visibility: 0.9999978542327881
}
landmark {
  x: 0.5042418241500854
  y: 0.26721012592315674
  z: -0.4060806930065155
  visibility: 0.9999973773956299
}
landmark {
  x: 0.497038871049881
  y: 0.27003908157348633
  z: -0.40612009167671204
  visibility: 0.9999980926513672
}
landmark {
  x: 0.4911555051803589
  y: 0.27277565002441406
  z: -0.4062861204147339
  visibility: 0.999996542930603
}
landmark {
  x: 0.5402431488037109
  y: 0.26838093996047974
  z: -0.23066753149032593
  visibility: 0.9999990463256836
}
landmark {
  x: 0.48231562972068787
  y: 0.

In [32]:
import cv2
import mediapipe as mp
import numpy as np
import os
from ultralytics import YOLO

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True)
mp_drawing = mp.solutions.drawing_utils

# Load YOLOv8 model from Ultralytics
yolo_model = YOLO('yolov8n.pt')  # Replace with your YOLOv8 model

# Function to normalize landmark points
def normalize_landmarks(landmarks, image_shape):
    h, w = image_shape[:2]
    normalized = []
    for lm in landmarks:
        x = lm.x * w
        y = lm.y * h
        # Ensure the coordinates are within [0, 1]
        normalized_x = np.clip(x / w, 0, 1)
        normalized_y = np.clip(y / h, 0, 1)
        normalized.append((normalized_x, normalized_y, lm.z, lm.visibility))
    return normalized

# Function to write YOLO format
def write_yolo_format(yolo_detections, landmarks, output_file):
    with open(output_file, 'w') as f:
        # YOLO detection line
        for det in yolo_detections:
            # Normalize coordinates (ensure all within bounds [0, 1])
            x_center = np.clip(det[0], 0, 1)
            y_center = np.clip(det[1], 0, 1)
            width = np.clip(det[2], 0, 1)
            height = np.clip(det[3], 0, 1)
            f.write(f"0 {x_center} {y_center} {width} {height}\n")

        # Pose landmarks line (assuming all keypoints belong to person class 2)
        for i, lm in enumerate(landmarks):
            f.write(f"2 {lm[0]} {lm[1]} {lm[2]} {lm[3]}\n")

# Main function to process images
def process_images(input_folder, output_txt_folder, output_img_folder):
    # Ensure output folders exist
    os.makedirs(output_txt_folder, exist_ok=True)
    os.makedirs(output_img_folder, exist_ok=True)
    
    # Iterate through all files in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith(('.jpg', '.png', '.jpeg')):  # Filter for image files
            image_path = os.path.join(input_folder, file_name)
            output_txt_path = os.path.join(output_txt_folder, f"{os.path.splitext(file_name)[0]}.txt")
            output_img_path = os.path.join(output_img_folder, file_name)
            
            # Load input image
            image = cv2.imread(image_path)
            
            # Pose detection using MediaPipe
            results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            
            # Initialize pose landmarks
            normalized_landmarks = []
            if results.pose_landmarks:
                normalized_landmarks = normalize_landmarks(results.pose_landmarks.landmark, image.shape)
            
            # Run YOLOv8 for person detection
            yolo_results = yolo_model(image)
            
            # Collect bounding box and confidence for person detection
            yolo_detections = []
            for result in yolo_results:
                boxes = result.boxes
                for box in boxes:
                    # Only consider the 'person' class (usually class_id 0)
                    if int(box.cls) == 0:
                        # Extract and normalize bounding box coordinates
                        x_center, y_center, width, height = box.xywh[0] / np.array([image.shape[1], image.shape[0], image.shape[1], image.shape[0]])
                        yolo_detections.append((x_center, y_center, width, height))
            
            # Write the data to YOLO format .txt file
            write_yolo_format(yolo_detections, normalized_landmarks, output_txt_path)
            
            # Save the output image with YOLO detections (optional: draw bounding boxes)
            for box in yolo_results[0].boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])  # Get box coordinates
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Draw bounding box

            # Save the image with detections
            cv2.imwrite(output_img_path, image)
            print(f"Processed {file_name}, saved txt and image.")

# Specify folders
input_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img'
output_txt_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'
output_img_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl'

# Run the processing function
process_images(input_folder, output_txt_folder, output_img_folder)



0: 384x640 1 person, 167.2ms
Speed: 11.0ms preprocess, 167.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Processed 1frame_0.jpg, saved txt and image.

0: 384x640 1 person, 182.0ms
Speed: 0.0ms preprocess, 182.0ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)
Processed 1frame_120.jpg, saved txt and image.

0: 384x640 1 person, 174.9ms
Speed: 3.0ms preprocess, 174.9ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)
Processed 1frame_150.jpg, saved txt and image.

0: 384x640 1 person, 207.0ms
Speed: 7.5ms preprocess, 207.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
Processed 1frame_180.jpg, saved txt and image.

0: 384x640 1 person, 204.8ms
Speed: 6.8ms preprocess, 204.8ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)
Processed 1frame_210.jpg, saved txt and image.

0: 384x640 1 person, 187.4ms
Speed: 10.1ms preprocess, 187.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 64

In [23]:
import cv2
import mediapipe as mp
import numpy as np
import os
from ultralytics import YOLO

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True)
mp_drawing = mp.solutions.drawing_utils

# Load YOLOv8 model from Ultralytics
yolo_model = YOLO('yolov8n.pt')  # Replace with your YOLOv8 model

# Function to normalize landmark points
def normalize_landmarks(landmarks, image_shape):
    h, w = image_shape[:2]
    normalized = []
    for lm in landmarks:
        x = lm.x * w
        y = lm.y * h
        # Ensure the coordinates are within [0, 1]
        normalized_x = np.clip(x / w, 0, 1)
        normalized_y = np.clip(y / h, 0, 1)
        normalized.append((normalized_x, normalized_y, lm.z, lm.visibility))
    return normalized

# Function to write YOLO format
def write_yolo_format(yolo_detections, landmarks, output_file, image_shape):
    h, w = image_shape[:2]
    with open(output_file, 'w') as f:
        # YOLO detection line for each detected object (person in this case)
        for det in yolo_detections:
            x_center, y_center, width, height = det
            f.write(f"0 {x_center} {y_center} {width} {height}\n")

        # Pose landmarks line (assuming all keypoints belong to person class 2)
        for lm in landmarks:
            x, y, z, visibility = lm
            f.write(f"2 {x} {y}\n")

# Function to process an image and save the results
def process_image(image_path, txt_output_dir, img_output_dir):
    image = cv2.imread(image_path)

    # Pose detection using MediaPipe
    results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    
    # Initialize an empty list for normalized landmarks
    normalized_landmarks = []
    
    if results.pose_landmarks:
        # Normalize the pose landmarks for YOLO format
        normalized_landmarks = normalize_landmarks(results.pose_landmarks.landmark, image.shape)

    # Run YOLOv8 for person detection
    yolo_results = yolo_model(image)

    # Collect bounding box and confidence for person detection
    yolo_detections = []
    for result in yolo_results:
        boxes = result.boxes
        for box in boxes:
            # Only consider the 'person' class (usually class_id 0)
            if int(box.cls) == 0:
                # Extract and normalize bounding box coordinates
                x_center, y_center, width, height = box.xywh[0] / np.array([image.shape[1], image.shape[0], image.shape[1], image.shape[0]])
                yolo_detections.append((x_center, y_center, width, height))
    
    # Define output file names
    base_name = os.path.basename(image_path).split('.')[0]
    txt_output_path = os.path.join(txt_output_dir, f"{base_name}.txt")
    img_output_path = os.path.join(img_output_dir, f"{base_name}.jpg")
    
    # Write the data to YOLO format .txt file
    write_yolo_format(yolo_detections, normalized_landmarks, txt_output_path, image.shape)
    
    # Save the detected image to output folder
    cv2.imwrite(img_output_path, image)

    print(f"Processed {image_path}, saved .txt to {txt_output_path} and image to {img_output_path}")

# Main function to process a folder of images
def process_folder(input_folder, txt_output_folder, img_output_folder):
    # Ensure output directories exist
    os.makedirs(txt_output_folder, exist_ok=True)
    os.makedirs(img_output_folder, exist_ok=True)

    # Loop through all images in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith(('.jpg', '.jpeg', '.png')):  # Filter for image files
            image_path = os.path.join(input_folder, file_name)
            process_image(image_path, txt_output_folder, img_output_folder)

# Define input and output folders
input_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img'
txt_output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'
img_output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl'

# Process the folder of images
process_folder(input_folder, txt_output_folder, img_output_folder)



0: 384x640 1 person, 284.3ms
Speed: 2.3ms preprocess, 284.3ms inference, 3.9ms postprocess per image at shape (1, 3, 384, 640)
Processed C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img\1frame_0.jpg, saved .txt to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels\1frame_0.txt and image to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl\1frame_0.jpg

0: 384x640 1 person, 152.7ms
Speed: 4.0ms preprocess, 152.7ms inference, 5.5ms postprocess per image at shape (1, 3, 384, 640)
Processed C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img\1frame_120.jpg, saved .txt to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels\1frame_120.txt and image to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl\1frame_120.jpg

0: 384x640 1 person, 152.5ms
Speed: 8.0ms preprocess, 152.5ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
Processed C:\Users\Admin\Downloads

In [25]:
import cv2
import mediapipe as mp
import numpy as np
import os
from ultralytics import YOLO

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True)
mp_drawing = mp.solutions.drawing_utils

# Load YOLOv8 model from Ultralytics
yolo_model = YOLO('yolov8n.pt')  # Replace with your YOLOv8 model

# Function to normalize landmark points
def normalize_landmarks(landmarks, image_shape):
    h, w = image_shape[:2]
    normalized = []
    for lm in landmarks:
        x = lm.x * w
        y = lm.y * h
        # Normalize coordinates to [0, 1]
        normalized_x = np.clip(x / w, 0, 1)
        normalized_y = np.clip(y / h, 0, 1)
        # Include z and visibility, but YOLO format does not use them
        normalized.append((normalized_x, normalized_y))
    return normalized

# Function to write YOLO format
def write_yolo_format(yolo_detections, landmarks, output_file):
    with open(output_file, 'w') as f:
        # YOLO detection line for each detected object (person in this case)
        for det in yolo_detections:
            x_center, y_center, width, height = det
            if all(0 <= val <= 1 for val in [x_center, y_center, width, height]):
                f.write(f"0 {x_center} {y_center} {width} {height} ")  # Space added for subsequent landmarks

        # Pose landmarks line (appending the class '2' after each pair of coordinates)
        for lm in landmarks:
            x, y = lm
            if 0 <= x <= 1 and 0 <= y <= 1:
                f.write(f"{x} {y} 2 ")  # '2' after each coordinate pair

        f.write('\n')  # Add a newline character at the end of the file


# Function to process an image and save the results
def process_image(image_path, txt_output_dir, img_output_dir):
    image = cv2.imread(image_path)

    # Pose detection using MediaPipe
    results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    
    # Initialize an empty list for normalized landmarks
    normalized_landmarks = []
    
    if results.pose_landmarks:
        # Normalize the pose landmarks for YOLO format
        normalized_landmarks = normalize_landmarks(results.pose_landmarks.landmark, image.shape)

    # Run YOLOv8 for person detection
    yolo_results = yolo_model(image)

    # Collect bounding box and confidence for person detection
    yolo_detections = []
    for result in yolo_results:
        boxes = result.boxes
        for box in boxes:
            # Only consider the 'person' class (usually class_id 0)
            if int(box.cls) == 0:
                # Extract and normalize bounding box coordinates
                x_center, y_center, width, height = box.xywh[0] / np.array([image.shape[1], image.shape[0], image.shape[1], image.shape[0]])
                yolo_detections.append((x_center, y_center, width, height))
    
    # Define output file names
    base_name = os.path.basename(image_path).split('.')[0]
    txt_output_path = os.path.join(txt_output_dir, f"{base_name}.txt")
    img_output_path = os.path.join(img_output_dir, f"{base_name}.jpg")
    
    # Write the data to YOLO format .txt file
    write_yolo_format(yolo_detections, normalized_landmarks, txt_output_path)
    
    # Save the detected image to output folder
    cv2.imwrite(img_output_path, image)

    print(f"Processed {image_path}, saved .txt to {txt_output_path} and image to {img_output_path}")

# Main function to process a folder of images
def process_folder(input_folder, txt_output_folder, img_output_folder):
    # Ensure output directories exist
    os.makedirs(txt_output_folder, exist_ok=True)
    os.makedirs(img_output_folder, exist_ok=True)

    # Loop through all images in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith(('.jpg', '.jpeg', '.png')):  # Filter for image files
            image_path = os.path.join(input_folder, file_name)
            process_image(image_path, txt_output_folder, img_output_folder)

# Define input and output folders
input_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img'
txt_output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels'
img_output_folder = r'C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl'

# Process the folder of images
process_folder(input_folder, txt_output_folder, img_output_folder)



0: 384x640 1 person, 266.2ms
Speed: 0.0ms preprocess, 266.2ms inference, 3.2ms postprocess per image at shape (1, 3, 384, 640)
Processed C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img\1frame_0.jpg, saved .txt to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels\1frame_0.txt and image to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl\1frame_0.jpg

0: 384x640 1 person, 170.0ms
Speed: 4.0ms preprocess, 170.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
Processed C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img\1frame_120.jpg, saved .txt to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\labels\1frame_120.txt and image to C:\Users\Admin\Downloads\Body Language Detection with mediapipe\img_lbl\1frame_120.jpg

0: 384x640 1 person, 144.0ms
Speed: 9.9ms preprocess, 144.0ms inference, 6.5ms postprocess per image at shape (1, 3, 384, 640)
Processed C:\Users\Admin\Downloads