In [7]:
import cv2
import numpy as np
from ultralytics import YOLO
from collections import deque
import os

def calculate_angle(a, b, c):
    a, b, c = np.array(a), np.array(b), np.array(c)
    ba = a - b
    bc = c - b
    cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
    return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))

def get_elbow_angle_with_confidence(keypoints, confs):
    try:
        ls, le, lw = keypoints[5], keypoints[7], keypoints[9]
        l_conf = min(confs[5], confs[7], confs[9])
        l_angle = calculate_angle(ls, le, lw) if l_conf > 0.3 else None

        rs, re, rw = keypoints[6], keypoints[8], keypoints[10]
        r_conf = min(confs[6], confs[8], confs[10])
        r_angle = calculate_angle(rs, re, rw) if r_conf > 0.3 else None

        if l_angle and r_angle:
            return min(l_angle, r_angle)
        return l_angle or r_angle
    except:
        return None

def add_summary_frame(video_writer, width, height, exercise_name, count):
    calories_per_rep = {'pushup': 0.5, 'lunge': 0.3}
    cal_burned = round(count * calories_per_rep.get(exercise_name.lower(), 0.4), 2)

    summary_frame = np.zeros((height, width, 3), dtype=np.uint8)
    summary_frame[:] = (30, 30, 30)

    font = cv2.FONT_HERSHEY_SIMPLEX
    x = 50
    y_start = 150
    spacing = 60

    cv2.putText(summary_frame, "Workout Summary", (x, y_start), font, 2, (255, 255, 255), 4)
    cv2.putText(summary_frame, f"Exercise: {exercise_name}", (x, y_start + spacing * 2), font, 1.2, (180, 200, 255), 2)
    cv2.putText(summary_frame, f"Reps: {count}", (x, y_start + spacing * 3), font, 1.2, (200, 255, 200), 2)
    cv2.putText(summary_frame, f"Estimated Calories: {cal_burned} kcal", (x, y_start + spacing * 4), font, 1.2, (255, 220, 200), 2)

    for _ in range(60):  # 2 seconds @ 30 fps
        video_writer.write(summary_frame)

class PushupCounter:
    def __init__(self, model_name='yolo11s-pose.pt'):
        self.model = YOLO(model_name)
        self.count = 0
        self.stage = 'up'
        self.down_threshold = 110
        self.up_threshold = 145
        self.angle_history = deque(maxlen=5)
        self.frame_count = 0
        self.last_count_frame = -15
        self.cooldown_frames = 15

    def detect(self, frame):
        results = self.model(frame)
        keypoints = results[0].keypoints.xy[0].cpu().numpy() if results[0].keypoints is not None else None
        confs = results[0].keypoints.conf[0].cpu().numpy() if results[0].keypoints.conf is not None else None
        return results[0], keypoints, confs

    def update(self, angle):
        if angle is None:
            return False
        self.angle_history.append(angle)
        smoothed = np.mean(self.angle_history)
        detected = False
        current_frame = self.frame_count
        if smoothed < self.down_threshold and self.stage == 'up':
            self.stage = 'down'
        elif smoothed > self.up_threshold and self.stage == 'down':
            if (current_frame - self.last_count_frame) > self.cooldown_frames:
                self.stage = 'up'
                self.count += 1
                self.last_count_frame = current_frame
                detected = True
        return detected

def process_pushups(video_path, model_name='yolo11s-pose.pt', output_path="output_pushup_clean.mp4"):
    counter = PushupCounter(model_name)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Video load failed.")
        return
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        counter.frame_count += 1
        result, keypoints, confs = counter.detect(frame)
        if keypoints is not None:
            frame = result.plot()
        angle = get_elbow_angle_with_confidence(keypoints, confs)
        counter.update(angle)
        cv2.putText(frame, f"PUSH-UPS: {counter.count}", (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 100), 3)
        cv2.putText(frame, f"XP: {counter.count * 10}", (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 150, 50), 3)
        if angle:
            cv2.putText(frame, f"Angle: {angle:.1f}", (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 0), 2)
        out.write(frame)

    add_summary_frame(out, width, height, "Pushup", counter.count)
    cap.release()
    out.release()
    print(f"Done: {output_path}")

if __name__ == "__main__":
    VIDEO_DIR = "/Users/shruti.kalaskar/Documents/Northwestern/Spring 2025/Computer Vision/Project/"
    MODEL_NAME = "yolo11s-pose.pt"
    video_files = ["push_up_1.mp4", "push_up_2.mp4"]
    for file in video_files:
        input_path = os.path.join(VIDEO_DIR, file)
        output_path = os.path.join(VIDEO_DIR, file.replace(".mp4", "_output.mp4"))
        process_pushups(input_path, MODEL_NAME, output_path)


0: 384x640 1 person, 373.8ms
Speed: 4.7ms preprocess, 373.8ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 326.5ms
Speed: 2.5ms preprocess, 326.5ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 407.4ms
Speed: 2.8ms preprocess, 407.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 269.9ms
Speed: 1.5ms preprocess, 269.9ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 323.0ms
Speed: 1.6ms preprocess, 323.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 350.2ms
Speed: 1.8ms preprocess, 350.2ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 296.8ms
Speed: 2.3ms preprocess, 296.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 334.9ms
Speed: 7.2ms preprocess, 334.9ms inference, 0.9ms postprocess per image at

  cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))


0: 384x640 1 person, 384.8ms
Speed: 2.2ms preprocess, 384.8ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 332.4ms
Speed: 2.8ms preprocess, 332.4ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 332.5ms
Speed: 4.3ms preprocess, 332.5ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 322.0ms
Speed: 1.7ms preprocess, 322.0ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 296.2ms
Speed: 1.6ms preprocess, 296.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 350.7ms
Speed: 1.6ms preprocess, 350.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 412.4ms
Speed: 3.9ms preprocess, 412.4ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 430.1ms
Speed: 1.8ms preprocess, 430.1ms inference, 1.0ms postprocess per image at 