<a href="https://colab.research.google.com/github/AnitaTasnim/AnitaTasnim/blob/main/waiter_calling_Anita_Tasnim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Please run the code to generate the output video `final_handraise_output.mp4`.**

# Install Dependencies & Import Libraries

In [None]:
# Install dependencies
!pip install ultralytics opencv-python-headless pandas


Collecting ultralytics
  Downloading ultralytics-8.3.162-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
import pandas as pd
from collections import defaultdict
import string

# Download resources
!wget -q https://ml-hiring.fringecore.sh/waiter_calling/desk_video.mp4

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


# Automatic Detection and Labeling of Desk Regions in Video Frames Using YOLO


1. The code loads a YOLO model to detect objects (people) in video frames.
2. It reads a video and enhances each frame’s contrast for better detection.
3. It collects bounding boxes of detected people over multiple frames.
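4. Detections smaller than 500 pixels in area are discarded; boxes that overlap (IoU above 0.3) or whose centers are less than 50 pixels apart are merged by averaging their coordinates, to avoid duplicates.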
5. Very tiny boxes (less than 4000 pixels area) are filtered out.
6. The remaining boxes are sorted left to right and temporarily labeled A, B, C, etc.
7. A fixed special box labeled "X" is inserted before the box labeled "F".
8. All boxes are relabeled again after inserting "X" to keep labels in order, so the inserted box ends up carrying the letter of its position (normally "F") rather than "X".
9. The function returns the list of desk regions with coordinates and labels.
10. This helps automatically find desk positions with consistent labeling for later use.


In [None]:
from ultralytics import YOLO
import cv2
import string
import numpy as np
from math import sqrt

# Load the YOLOv8-large detection weights once at module level.
# get_desk_regions_auto() below calls this global `model` with classes=[0]
# (reported as "persons" in the inference log).
model = YOLO("yolov8l.pt")

def enhance_contrast(frame):
    """Return a contrast-enhanced version of a BGR frame.

    The frame is converted to LAB, CLAHE (clip limit 3.0, 8x8 tiles) is
    applied to the first (lightness) channel only, and the result is
    converted back to BGR.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(frame, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(3.0, (8, 8))
    equalized = clahe.apply(lightness)
    lab_equalized = cv2.merge((equalized, chan_a, chan_b))
    return cv2.cvtColor(lab_equalized, cv2.COLOR_LAB2BGR)

def iou(b1, b2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns 0 when the union area is not positive (e.g. two empty boxes).
    """
    left = max(b1[0], b2[0])
    top = max(b1[1], b2[1])
    right = min(b1[2], b2[2])
    bottom = min(b1[3], b2[3])

    # Clamp each side at 0 so disjoint boxes give an empty intersection.
    overlap = max(0, right - left) * max(0, bottom - top)

    area_first = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area_second = (b2[2] - b2[0]) * (b2[3] - b2[1])
    union = area_first + area_second - overlap

    if union > 0:
        return overlap / union
    return 0

def centroid_dist(b1, b2):
    """Return the Euclidean distance between the centers of two ``[x1, y1, x2, y2]`` boxes."""
    first_cx, first_cy = (b1[0] + b1[2]) / 2, (b1[1] + b1[3]) / 2
    second_cx, second_cy = (b2[0] + b2[2]) / 2, (b2[1] + b2[3]) / 2
    dx = first_cx - second_cx
    dy = first_cy - second_cy
    return sqrt(dx**2 + dy**2)







def _region_label(index):
    """Return a spreadsheet-style label for a zero-based index: A..Z, AA, AB, ..."""
    letters = string.ascii_uppercase
    label = ""
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, len(letters))
        label = letters[offset] + label
    return label


def get_desk_regions_auto(video_path, n_frames=60):
    """Detect people in the first frames of a video and return labelled desk regions.

    Up to ``n_frames`` frames are read from ``video_path``, contrast-enhanced,
    and passed to the global detection ``model`` (class 0 only). Detections
    are merged across frames, filtered by area, sorted left to right, and a
    fixed hand-placed box is inserted at the position of label "F" (or
    appended when fewer than six regions were found). Finally every region is
    labelled by its position in the list.

    Args:
        video_path: Path of the video passed to ``cv2.VideoCapture``.
        n_frames: Maximum number of leading frames to run detection on.

    Returns:
        A list of dicts with keys ``label``, ``x1``, ``y1``, ``x2``, ``y2``.
        Labels are A..Z for the first 26 regions and continue AA, AB, ...
        beyond that instead of raising ``IndexError``.
    """
    CONFIDENCE = 0.25
    DIST_THRESHOLD = 50
    IOU_THRESHOLD = 0.3
    MIN_BOX_AREA = 500       # per-detection filter, applied before merging
    MIN_REGION_AREA = 4000   # per-region filter, applied after merging
    # The fixed box goes where label "F" would be in the sorted list.
    X_INSERT_INDEX = string.ascii_uppercase.index("F")

    all_boxes = []
    cap = cv2.VideoCapture(video_path)
    try:
        for _ in range(n_frames):
            ret, frame = cap.read()
            if not ret:
                break
            frame = enhance_contrast(frame)
            results = model(frame, conf=CONFIDENCE, classes=[0])
            for r in results:
                for box in r.boxes:
                    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                    if (x2 - x1) * (y2 - y1) < MIN_BOX_AREA:
                        continue
                    all_boxes.append([x1, y1, x2, y2])
    finally:
        # Release the capture even if detection raises part-way through.
        cap.release()

    # Greedy merge: fold each detection into the first kept box it overlaps
    # or sits close to, averaging the coordinates; otherwise keep it as new.
    final_boxes = []
    for box in all_boxes:
        merged = False
        for fbox in final_boxes:
            if iou(box, fbox) > IOU_THRESHOLD or centroid_dist(box, fbox) < DIST_THRESHOLD:
                for i in range(4):
                    fbox[i] = int((fbox[i] + box[i]) / 2)
                merged = True
                break
        if not merged:
            final_boxes.append(box)

    # Remove extremely small merged boxes.
    final_boxes = [
        b for b in final_boxes
        if (b[2] - b[0]) * (b[3] - b[1]) > MIN_REGION_AREA
    ]

    # Left-to-right order by the box's left edge; labels are filled in last.
    desk_regions = [
        {"label": "", "x1": b[0], "y1": b[1], "x2": b[2], "y2": b[3]}
        for b in sorted(final_boxes, key=lambda b: b[0])
    ]

    # Hand-placed box that is always added regardless of what was detected.
    fixed_x_box = {
        "label": "X",
        "x1": 726,
        "y1": 534,
        "x2": 799,
        "y2": 612
    }

    # Insert before the region that would be labelled "F"; with fewer than
    # six regions this index equals the list length, i.e. an append.
    desk_regions.insert(min(X_INSERT_INDEX, len(desk_regions)), fixed_x_box)

    # Final relabel by position (this overwrites the temporary "X" label).
    for idx, region in enumerate(desk_regions):
        region["label"] = _region_label(idx)

    return desk_regions



Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8l.pt to 'yolov8l.pt'...


100%|██████████| 83.7M/83.7M [00:00<00:00, 131MB/s]


In [None]:
# Run desk detection on the downloaded video (first 60 frames by default).
desk_regions = get_desk_regions_auto("desk_video.mp4")



0: 384x640 9 persons, 59.5ms
Speed: 13.5ms preprocess, 59.5ms inference, 393.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 40.2ms
Speed: 3.8ms preprocess, 40.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 40.1ms
Speed: 3.4ms preprocess, 40.1ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 40.3ms
Speed: 4.6ms preprocess, 40.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 39.2ms
Speed: 3.0ms preprocess, 39.2ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 39.2ms
Speed: 3.1ms preprocess, 39.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 39.3ms
Speed: 3.7ms preprocess, 39.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 39.2ms
Speed: 3.1ms preprocess, 39.2ms inference, 2.0ms postprocess per image at sh

In [None]:
# Display the detected regions as the cell output.
desk_regions

[{'label': 'A', 'x1': 0, 'y1': 677, 'x2': 482, 'y2': 1075},
 {'label': 'B', 'x1': 452, 'y1': 606, 'x2': 546, 'y2': 732},
 {'label': 'C', 'x1': 578, 'y1': 522, 'x2': 648, 'y2': 581},
 {'label': 'D', 'x1': 647, 'y1': 569, 'x2': 931, 'y2': 880},
 {'label': 'E', 'x1': 826, 'y1': 524, 'x2': 899, 'y2': 602},
 {'label': 'F', 'x1': 726, 'y1': 534, 'x2': 799, 'y2': 612},
 {'label': 'G', 'x1': 974, 'y1': 592, 'x2': 1283, 'y2': 1012},
 {'label': 'H', 'x1': 1010, 'y1': 547, 'x2': 1075, 'y2': 620},
 {'label': 'I', 'x1': 1158, 'y1': 542, 'x2': 1263, 'y2': 628},
 {'label': 'J', 'x1': 1287, 'y1': 569, 'x2': 1367, 'y2': 643}]

# Hand Raise Detection at Desk Locations Using YOLOv8 Pose Estimation



1. The code loads a YOLO pose detection model to detect people’s body keypoints in a video.
2. It defines fixed desk areas (desk\_regions) where people are sitting, each with a label and coordinates.
3. The desk regions’ vertical size is slightly increased to better catch raised hands.
4. A mapping connects each desk label to a person’s name.
5. The video is read frame by frame, and contrast is enhanced for better detection.
6. The pose model predicts body keypoints on an upscaled version of each frame.
7. For each detected person whose two shoulders both have a confidence of at least 0.3, it checks whether either wrist is more than 20 pixels above the same-side shoulder, with a wrist confidence above 0.3.
8. If a raised hand is detected, it finds which desk region the person belongs to based on their shoulder position.
9. The corresponding names of people raising hands are displayed on the video frame’s top-right corner.
10. The processed frames with annotated names are saved into a new output video file.




In [None]:

from ultralytics import YOLO
import cv2
import numpy as np

# Load pose model (called once per frame on the upscaled image in the main loop)
pose_model = YOLO("yolov8l-pose.pt")

# Define desk regions manually from the upper code output.
# NOTE: the coordinates match that output, but the labels do not: the fixed box
# (726, 534)-(799, 612) is labelled "X" here (it was "F" in the output above),
# and the regions after it are labelled F..I here instead of G..J.
desk_regions = [
    {"label": "A", "x1": 0, "y1": 677, "x2": 482, "y2": 1075},
    {"label": "B", "x1": 452, "y1": 606, "x2": 546, "y2": 732},
    {"label": "C", "x1": 578, "y1": 522, "x2": 648, "y2": 581},
    {"label": "D", "x1": 647, "y1": 569, "x2": 931, "y2": 880},
    {"label": "E", "x1": 826, "y1": 524, "x2": 899, "y2": 602},
    {'label': 'X', 'x1': 726, 'y1': 534, 'x2': 799, 'y2': 612},
    {"label": "F", "x1": 974, "y1": 592, "x2": 1283, "y2": 1012},
    {"label": "G", "x1": 1010, "y1": 547, "x2": 1075, "y2": 620},
    {"label": "H", "x1": 1158, "y1": 542, "x2": 1263, "y2": 628},
    {"label": "I", "x1": 1287, "y1": 569, "x2": 1367, "y2": 643},
]



# Slightly expand boxes vertically to catch hands (20 px at the top and bottom).
# The dicts are modified in place, so running this loop again without
# redefining desk_regions would expand them a second time.
for box in desk_regions:
    box['y1'] -= 20
    box['y2'] += 20

# Name mapping: desk-region label -> name drawn on the output video.
# Labels missing from this dict are shown as "Unknown" by the main loop.
name_map = {
    "A": "unknown1", "B": "unknown2", "C": "Tanvir", "D": "Faisal",
    "E": "Toufiq", "X": "Shafayet", "F": "Anik", "G": "Mufrad",
    "H": "Imran", "I": "Emon"
}

# Init video: the writer reuses the source's frame rate and frame size,
# and encodes with the "mp4v" fourcc.
cap = cv2.VideoCapture("desk_video.mp4")
fps = cap.get(cv2.CAP_PROP_FPS)
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter("final_handraise_output.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))

def enhance_contrast(img):
    """Apply CLAHE to the lightness channel of a BGR image and return the result.

    The image is converted to LAB, the first channel is equalized with
    CLAHE (clip limit 3.0, 8x8 tiles), and the channels are merged and
    converted back to BGR.
    """
    lab_image = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    channels = cv2.split(lab_image)
    equalizer = cv2.createCLAHE(3.0, (8,8))
    boosted = equalizer.apply(channels[0])
    merged_lab = cv2.merge((boosted, channels[1], channels[2]))
    return cv2.cvtColor(merged_lab, cv2.COLOR_LAB2BGR)

def is_hand_raised(keypoints, margin=20, min_conf=0.3):
    """Return True if either wrist is clearly above its same-side shoulder.

    Args:
        keypoints: Indexable sequence of ``(x, y, confidence)`` triples.
            Indices 5/6 are treated as the left/right shoulder and 9/10 as
            the left/right wrist.
        margin: How much smaller the wrist's y value must be than the
            shoulder's (a smaller y is treated as "higher").
        min_conf: The wrist's confidence must exceed this value.

    Returns:
        ``True`` when at least one wrist passes both checks, else ``False``.
        Malformed input (missing keypoints, non-indexable or non-scalar
        entries) also yields ``False`` rather than raising.
    """
    def wrist_above(wrist, shoulder):
        return wrist[1] < shoulder[1] - margin and wrist[2] > min_conf

    try:
        left_wrist, right_wrist = keypoints[9], keypoints[10]
        left_shoulder, right_shoulder = keypoints[5], keypoints[6]
        return bool(
            wrist_above(left_wrist, left_shoulder)
            or wrist_above(right_wrist, right_shoulder)
        )
    except (IndexError, TypeError, ValueError):
        # Best-effort: only data-shape problems are swallowed, so
        # KeyboardInterrupt and genuine bugs still propagate.
        return False

def point_in_box(x, y, box):
    """Return whether point (x, y) lies inside ``box``, edges included.

    ``box`` is a mapping with the keys ``x1``, ``y1``, ``x2`` and ``y2``.
    """
    inside_horizontally = box['x1'] <= x <= box['x2']
    if not inside_horizontally:
        return False
    return box['y1'] <= y <= box['y2']

# Main loop: for every frame, find people with a raised hand, map each one to
# the smallest desk region containing their shoulder midpoint, and draw the
# matching names in the top-right corner before writing the frame out.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = enhance_contrast(frame)
        # Pose inference runs on a 2x upscaled copy of the frame.
        upscaled = cv2.resize(frame, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)
        results = pose_model(upscaled, conf=0.3)

        hand_raisers = set()

        for res in results:
            for kp in res.keypoints.data:
                keypoints = kp.cpu().numpy().reshape(-1, 3)
                keypoints[:, :2] /= 2.0  # scale back to original resolution

                ls, rs = keypoints[5], keypoints[6]
                if ls[2] < 0.3 or rs[2] < 0.3:
                    continue  # Skip low-confidence shoulders

                if not is_hand_raised(keypoints):
                    continue

                root_x, root_y = int((ls[0] + rs[0]) / 2), int((ls[1] + rs[1]) / 2)

                # Regions overlap, so prefer the smallest one containing the
                # point (first one wins on ties). Region "X" is part of
                # desk_regions, so it needs no separate fallback lookup.
                containing = [
                    region for region in desk_regions
                    if point_in_box(root_x, root_y, region)
                ]
                if containing:
                    smallest = min(
                        containing,
                        key=lambda r: (r['x2'] - r['x1']) * (r['y2'] - r['y1']),
                    )
                    hand_raisers.add(smallest['label'])

        # Add hand raiser names, right-aligned, one line per person.
        for idx, label in enumerate(sorted(hand_raisers)):
            name = name_map.get(label, "Unknown")
            text_size = cv2.getTextSize(name, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)[0]
            x_text = w - text_size[0] - 20
            y_text = 30 + 40 * idx
            cv2.putText(frame, name, (x_text, y_text),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

        out.write(frame)
finally:
    # Always close the reader and finalize the output file, even if
    # inference or drawing raises part-way through the video.
    cap.release()
    out.release()

print("\n Saved: final_handraise_output.mp4")







0: 384x640 4 persons, 42.1ms
Speed: 2.8ms preprocess, 42.1ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 3.3ms preprocess, 30.9ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 3.8ms preprocess, 30.9ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 2.7ms preprocess, 30.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 3.5ms preprocess, 30.9ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 3.3ms preprocess, 30.9ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 3.3ms preprocess, 30.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 30.9ms
Speed: 3.6ms preprocess, 30.9ms inference, 1.6ms postprocess per image at shape (