## Imports 

In [8]:
from ultralytics import YOLO
import os 
import cv2 
import mediapipe as mp 
import math

## Constant

In [7]:
input_video = './20211118173000-20211118183000/car cam.mp4'
output_video = './20211118173000-20211118183000/car cam processed YOLO8.mp4'

In [5]:
width,height = 1280,720
fbs = 30
prev_landmarks = {}
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fbs, (width, height))
worker_activity = {}  # Format: {person_id: {'standing': 0, 'sitting': 0, 'moving': 0}} # to track the activites for person 
person_stabilization = {}  # Format: {person_id: stabilization_frames}


In [36]:
yolo_model = YOLO('yolov10n.pt')
# tracker_config = 'bytetrack.yaml'  # Replace with your tracker config if needed
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()


In [7]:
# Function to compute Euclidean distance
def euclidean_distance(point1, point2):
    return math.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)

In [8]:
def classify_activity (landmarks, prev_landmarks, bbox_id):
    text = ''
    left_hip = landmarks[mp_pose.PoseLandmark.LEFT_HIP]
    right_hip = landmarks[mp_pose.PoseLandmark.RIGHT_HIP]
    left_knee = landmarks[mp_pose.PoseLandmark.LEFT_KNEE]
    right_knee = landmarks[mp_pose.PoseLandmark.RIGHT_KNEE]
    left_shoulder = landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER]
    right_shoulder = landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER]
    
    # Compute average positions for hips, knees, and shoulders
    avg_hip_y = (left_hip.y + right_hip.y) / 2
    avg_knee_y = (left_knee.y + right_knee.y) / 2
    avg_shoulder_y = (left_shoulder.y + right_shoulder.y) / 2
    current_shoulder = ((left_shoulder.x + right_shoulder.x) / 2, avg_shoulder_y)
    movement_detection = False
    if bbox_id in prev_landmarks:
        prev_shoulder = prev_landmarks[bbox_id]
        displacement = euclidean_distance(current_shoulder, prev_shoulder)
        if displacement >0.2:
            movement_detection = True
    prev_landmarks[bbox_id] = current_shoulder
        
    if movement_detection:
        text=  'moving'
    elif abs(avg_hip_y - avg_knee_y) < 0.1 and avg_hip_y > avg_shoulder_y:
        text= "sitting"
    elif avg_hip_y < avg_knee_y and avg_shoulder_y < avg_hip_y:
        text= 'standing'
    else:
        text= 'unknown'

    return text, prev_landmarks
    

In [9]:
# Initialize unique ID tracking
person_idx = 0
person_ids = {}

# Function to compute IoU for bounding box matching
def calculate_iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    
    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = box1_area + box2_area - inter_area
    
    return inter_area / union_area if union_area > 0 else 0

In [10]:
cap = cv2.VideoCapture(input_video)
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.resize(frame, (width, height))

    # Step 1: Detect humans with YOLO
    detections = yolo_model(frame)
    detections = detections[0].boxes

    for box_data in detections:
        box = box_data.xyxy[0].tolist()  # Bounding box coordinates
        conf = box_data.conf[0]  # Confidence score
        cls = int(box_data.cls[0])  # Class index

        if cls == 0:  # Class 0 corresponds to 'person'
            x1, y1, x2, y2 = map(int, box)
            bbox = (x1, y1, x2, y2)
            matched_id = None

            # Match bounding box with existing person IDs
            for pid, prev_bbox in person_ids.items():
                iou = calculate_iou(bbox, prev_bbox)
                if iou > 0.5:  # IoU threshold for matching
                    matched_id = pid
                    break

            if matched_id is None:  # New person detected
                matched_id = person_idx
                person_idx += 1

            # Update the bounding box in the dictionary
            person_ids[matched_id] = bbox

            # Step 2: Crop region for the detected person
            person_crop = frame[y1:y2, x1:x2]
            if person_crop.size > 0:
                rgb_crop = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                results = pose.process(rgb_crop)

                # Process pose landmarks
                if results.pose_landmarks:
                    # Draw pose landmarks
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame[y1:y2, x1:x2], results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                    )

                    # Classify activity
                    landmarks = results.pose_landmarks.landmark
                    activity, prev_landmarks = classify_activity(landmarks, prev_landmarks, matched_id)
                    label = f"ID: {matched_id} | {activity.upper()}"
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

                    # Count activity occurrences
                    if matched_id not in worker_activity:
                        worker_activity[matched_id] = {'standing': 0, 'sitting': 0, 'moving': 0, 'unknown': 0}
                    worker_activity[matched_id][activity] += 1

                 

    # Write frame to output video
    out.write(frame)

    # Display frame
    cv2.imshow("Multi-Worker Activity Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
out.release()
cv2.destroyAllWindows()


0: 384x640 3 persons, 1 chair, 6 dining tables, 1 book, 302.9ms
Speed: 135.8ms preprocess, 302.9ms inference, 30.2ms postprocess per image at shape (1, 3, 384, 640)





0: 384x640 3 persons, 1 chair, 6 dining tables, 1 book, 147.4ms
Speed: 0.0ms preprocess, 147.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)


NameError: name 'math' is not defined

In [35]:
cap = cv2.VideoCapture(input_video)
width = min(1903,int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
height = min(945,int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
fps = cap.get(cv2.CAP_PROP_FPS)
while True:
    success, frame = cap.read()
    if success:
        frame = cv2.resize(frame,(width, height))
        results = yolo_model(frame)
        detections = results[0].boxes
        print(detections)
            
        cv2.imshow('Production line', frame)
        
        if cv2.waitKey(1)& 0XFF== ord('q'):
            break
    else:
        break
cap.release()
cv2.destroyAllWindows()
    
        


0: 320x640 (no detections), 70.0ms
Speed: 15.7ms preprocess, 70.0ms inference, 0.0ms postprocess per image at shape (1, 3, 320, 640)
ultralytics.engine.results.Boxes object with attributes:

cls: tensor([])
conf: tensor([])
data: tensor([], size=(0, 6))
id: None
is_track: False
orig_shape: (945, 1903)
shape: torch.Size([0, 6])
xywh: tensor([], size=(0, 4))
xywhn: tensor([], size=(0, 4))
xyxy: tensor([], size=(0, 4))
xyxyn: tensor([], size=(0, 4))

0: 320x640 (no detections), 85.9ms
Speed: 4.0ms preprocess, 85.9ms inference, 0.0ms postprocess per image at shape (1, 3, 320, 640)
ultralytics.engine.results.Boxes object with attributes:

cls: tensor([])
conf: tensor([])
data: tensor([], size=(0, 6))
id: None
is_track: False
orig_shape: (945, 1903)
shape: torch.Size([0, 6])
xywh: tensor([], size=(0, 4))
xywhn: tensor([], size=(0, 4))
xyxy: tensor([], size=(0, 4))
xyxyn: tensor([], size=(0, 4))

0: 320x640 (no detections), 83.3ms
Speed: 1.9ms preprocess, 83.3ms inference, 0.0ms postprocess 

In [34]:
detections

ultralytics.engine.results.Boxes object with attributes:

cls: tensor([])
conf: tensor([])
data: tensor([], size=(0, 6))
id: None
is_track: False
orig_shape: (945, 1903)
shape: torch.Size([0, 6])
xywh: tensor([], size=(0, 4))
xywhn: tensor([], size=(0, 4))
xyxy: tensor([], size=(0, 4))
xyxyn: tensor([], size=(0, 4))

In [19]:
results = yolo_model('test.jpg')


image 1/1 D:\Abdelrahman\data analysis\production analysis\test.jpg: 192x640 11 persons, 286.8ms
Speed: 31.3ms preprocess, 286.8ms inference, 0.0ms postprocess per image at shape (1, 3, 192, 640)


In [30]:
results[0].boxes

ultralytics.engine.results.Boxes object with attributes:

cls: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
conf: tensor([0.8557, 0.8472, 0.8287, 0.8260, 0.8062, 0.7949, 0.7906, 0.7865, 0.7794, 0.4885, 0.3377])
data: tensor([[1.2898e+02, 3.4969e+01, 1.9323e+02, 2.5158e+02, 8.5574e-01, 0.0000e+00],
        [4.4010e+02, 2.2434e+01, 5.0883e+02, 2.4875e+02, 8.4720e-01, 0.0000e+00],
        [2.5097e+01, 1.4739e+01, 9.3937e+01, 2.4817e+02, 8.2875e-01, 0.0000e+00],
        [6.3770e+02, 1.5032e+01, 7.1361e+02, 2.4984e+02, 8.2597e-01, 0.0000e+00],
        [9.4329e+02, 1.2361e+01, 1.0131e+03, 2.5240e+02, 8.0621e-01, 0.0000e+00],
        [2.3603e+02, 3.5781e+01, 2.9336e+02, 2.5111e+02, 7.9486e-01, 0.0000e+00],
        [5.4555e+02, 2.5912e+01, 6.0371e+02, 2.4788e+02, 7.9064e-01, 0.0000e+00],
        [8.5152e+02, 3.2743e+01, 9.0517e+02, 2.5035e+02, 7.8653e-01, 0.0000e+00],
        [7.5113e+02, 2.2702e+01, 8.1014e+02, 2.4713e+02, 7.7936e-01, 0.0000e+00],
        [3.2738e+02, 2.2164e+01, 4.04

In [None]:
# starting the video 
cap = cv2.VideoCapture(input_video)
while True:
    ret, frame = cap.read()
    frame = cv2.resize(frame, (width,height))
    if not ret:
        break
    # step one detect the human by YOLO & create the rectangle 
    detections = yolo_model(frame)
    detections = detections[0].boxes
    for idx, (*box, conf, cls) in (detections):
        x1, y1, x2, y2 = map(int, box)   # Bounding box coordinates
        if int(cls) == 0: # check if the object is human
            # cv2.rectangle(frame, (x1,y1),(x2,y2), (0,255,0),2)
            person_id = idx
            if person_id not in person_stabilization: # add the person if this is first time 
                person_stabilization[person_id] = 0
        
            # Step 2: Crop region for each person
            person_crop = frame[y1:y2,x1:x2]
            if person_crop.size>0:
                rgb_crop = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                results = pose.process(rgb_crop)
                
                # Draw pose landmarks on the cropped person, and save the landmarks, create activities and label it 
                if results.pose_landmarks:  
                    if person_stabilization[person_id]<5:
                        person_stabilization[person_id]+=1
                        continue
                        
                    landmarks = results.pose_landmarks.landmark
                    # Draw the landmarks
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame[y1:y2, x1:x2], results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                    )

                    
                    # make the activity 
                    activity, prev_landmarks = classify_activity(landmarks, prev_landmarks, idx)
                    label = f"{activity.upper()}"
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

                    # Count the activity
                    if idx not in worker_activity:
                        worker_activity[idx]= {'standing': 0, 'sitting': 0, 'moving': 0, 'unknown':0}
                    worker_activity[idx][activity] +=1
                    
                    # # save the img
                    # save_img(rgb_crop, idx, output_dir)
                    
                    # #extract the landmarks
                    # landmark_data.append(extract_landmark(landmarks, activity))


    # Display the frame with bounding boxes and poses
    out.write(frame)
    cv2.imshow("Multi-Worker Activity Recognition", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
out.release()
cv2.destroyAllWindows()
            