In [2]:
import cv2
import matplotlib.pyplot as plt
import torch

# Load custom YOLOv5 model
model_path = 'best.pt'  # Update with your model path

model = torch.hub.load('ultralytics/yolov5', 'custom', path=model_path)

def process_frame(frame, object_detector, model, kernel, roi_coords):
    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Perform inference
    results = model(frame_rgb)

    # Extract bounding box coordinates and calculate center points and heights
    max_area = 0
    max_box = None
    for result in results.xyxy[0]:  # xyxy format
        x_min, y_min, x_max, y_max, confidence, class_id = result.tolist()
        area = (x_max - x_min) * (y_max - y_min)
        if area > max_area:
            max_area = area
            max_box = (x_min, y_min, x_max, y_max)

    if max_box:
        x_min, y_min, x_max, y_max = [int(coord) for coord in max_box]
        center_x = (x_min + x_max) / 2
        center_y = (y_min + y_max) / 2
        height = y_max - y_min

        return (center_x, center_y), height, frame_rgb

    return None, None, frame_rgb


def apply_bg_subtraction(frame, object_detector, kernel, roi_coords):
    roi_x, roi_y, roi_width, roi_height = roi_coords

    # Apply Gaussian blur to reduce noise
    blurred_frame = cv2.GaussianBlur(frame, (5, 5), 0)

    # Apply the background subtractor to get the mask
    mask = object_detector.apply(blurred_frame)

    # Apply morphological operations to reduce noise
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    # Extract the ROI from the mask
    roi_mask = mask[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width]

    # Count the number of non-zero pixels in the ROI mask
    movement = cv2.countNonZero(roi_mask)

    return movement

def determine_state(height_diff, movement_detected):
    threshold1 = 2
    threshold2 = 10
    if height_diff > threshold2 and movement_detected:
        return 'working'
    elif height_diff > threshold1 and movement_detected:
        return 'moving'
    else:
        return 'idle'

def annotate_frame(frame, center_point, height, state, movement_detected, text_x):
    center_x, center_y = center_point

    # Annotate the state and probabilities on the frame
    cv2.putText(frame, f'Movement: {movement_detected}', (text_x, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2, cv2.LINE_AA)
    cv2.putText(frame, f'State: {state}', (text_x, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2, cv2.LINE_AA)
    
    # Draw the center point on the frame
    cv2.circle(frame, (int(center_x), int(center_y)), 5, (0, 0, 255), -1)
    # Annotate the height on the frame
    cv2.putText(frame, f'Height: {int(height)}', (int(center_x), int(center_y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2, cv2.LINE_AA)

    return frame


def plot_metrics(center_points, heights, height_diffs):
    if center_points:
        tracked_centers_x = [pt[0] for pt in center_points]
        tracked_centers_y = [pt[1] for pt in center_points]
        tracked_heights = heights
        tracked_height_diffs = height_diffs

        plt.figure(figsize=(18, 6))

        # Plotting center points
        plt.subplot(1, 3, 1)
        plt.plot(tracked_centers_x, tracked_centers_y, marker='o')
        plt.title('Movement of Center Points')
        plt.xlabel('Frame')
        plt.ylabel('Center Point (x, y)')

        # Plotting heights
        plt.subplot(1, 3, 2)
        plt.plot(tracked_heights, marker='o')
        plt.title('Heights of Bounding Boxes')
        plt.xlabel('Frame')
        plt.ylabel('Height')

        # Plotting height differences
        plt.subplot(1, 3, 3)
        plt.plot(tracked_height_diffs, marker='o')
        plt.title('Height Differences')
        plt.xlabel('Frame')
        plt.ylabel('Height Difference')

        plt.show()

def test_model_on_video(video_path, output_video_path, model, interval_seconds=30, movement_threshold=8000):
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    roi_coords = (250, 200, 500, frame_height - 200)
    object_detector = cv2.createBackgroundSubtractorMOG2()
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))

    initial_height = None
    frames_processed = 0
    status_counts = {'working': 0, 'moving': 0, 'idle': 0}
    tags = []

    interval_frames = interval_seconds * fps
    current_interval_frames = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        center_point, height, frame_rgb = process_frame(frame, object_detector, model, kernel, roi_coords)

        if center_point:
            if initial_height is None:
                initial_height = height

            height_diff = abs(height - initial_height)
            movement = apply_bg_subtraction(frame, object_detector, kernel, roi_coords)
            movement_detected = movement > movement_threshold
            state = determine_state(height_diff, movement_detected)
            status_counts[state] += 1

            text_x = frame_width - 200
            frame = annotate_frame(frame, center_point, height, state, movement_detected, text_x)
            cv2.rectangle(frame, (roi_coords[0], roi_coords[1]), (roi_coords[0] + roi_coords[2], roi_coords[1] + roi_coords[3]), (255, 0, 0), 2)

        out.write(frame)
        frames_processed += 1
        current_interval_frames += 1

        if current_interval_frames >= interval_frames:
            if status_counts['working'] + status_counts['moving'] > status_counts['idle']:
                tags.append(1)
            else:
                tags.append(0)
            status_counts = {'working': 0, 'moving': 0, 'idle': 0}
            current_interval_frames = 0

        key = cv2.waitKey(30)
        if key == 27: 
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    print('Tags for each interval:', tags)

# Example usage: Prompt user for interval duration
interval_seconds = int(input("Enter the interval duration in seconds: "))
test_model_on_video('input_video/10.mp4', 'output_video/processed-video10.mp4', model, interval_seconds=interval_seconds, movement_threshold=7500)


Using cache found in /Users/arnaav/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-6-27 Python-3.11.5 torch-2.1.1 CPU

Fusing layers... 
Model summary: 157 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


Enter the interval duration in seconds: 1


OpenCV: FFMPEG: tag 0x44495658/'XVID' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


Tags for each interval: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
import cv2
import matplotlib.pyplot as plt
import torch

# Load custom YOLOv5 model
model_path = 'best.pt'  # Update with your model path

model = torch.hub.load('ultralytics/yolov5', 'custom', path=model_path)

def process_frame(frame, object_detector, model, kernel, roi_coords):
    # Convert frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Perform inference
    results = model(frame_rgb)

    # Extract bounding box coordinates and calculate center points and heights
    max_area = 0
    max_box = None
    for result in results.xyxy[0]:  # xyxy format
        x_min, y_min, x_max, y_max, confidence, class_id = result.tolist()
        area = (x_max - x_min) * (y_max - y_min)
        if area > max_area:
            max_area = area
            max_box = (x_min, y_min, x_max, y_max)

    if max_box:
        x_min, y_min, x_max, y_max = [int(coord) for coord in max_box]
        center_x = (x_min + x_max) / 2
        center_y = (y_min + y_max) / 2
        height = y_max - y_min

        return (center_x, center_y), height, frame_rgb

    return None, None, frame_rgb


def apply_bg_subtraction(frame, object_detector, kernel, roi_coords):
    roi_x, roi_y, roi_width, roi_height = roi_coords

    # Apply Gaussian blur to reduce noise
    blurred_frame = cv2.GaussianBlur(frame, (5, 5), 0)

    # Apply the background subtractor to get the mask
    mask = object_detector.apply(blurred_frame)

    # Apply morphological operations to reduce noise
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    # Extract the ROI from the mask
    roi_mask = mask[roi_y:roi_y + roi_height, roi_x:roi_x + roi_width]

    # Count the number of non-zero pixels in the ROI mask
    movement = cv2.countNonZero(roi_mask)

    return movement

def determine_state(height_diff, movement_detected):
    threshold1 = 2
    threshold2 = 10
    if height_diff > threshold2 and movement_detected:
        return 'working'
    elif height_diff > threshold1 and movement_detected:
        return 'moving'
    else:
        return 'idle'

def annotate_frame(frame, center_point, height, state, movement_detected, text_x):
    center_x, center_y = center_point

    # Annotate the state and probabilities on the frame
    cv2.putText(frame, f'Movement: {movement_detected}', (text_x, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2, cv2.LINE_AA)
    cv2.putText(frame, f'State: {state}', (text_x, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2, cv2.LINE_AA)
    
    # Draw the center point on the frame
    cv2.circle(frame, (int(center_x), int(center_y)), 5, (0, 0, 255), -1)
    # Annotate the height on the frame
    cv2.putText(frame, f'Height: {int(height)}', (int(center_x), int(center_y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2, cv2.LINE_AA)

    return frame


def plot_metrics(center_points, heights, height_diffs):
    if center_points:
        tracked_centers_x = [pt[0] for pt in center_points]
        tracked_centers_y = [pt[1] for pt in center_points]
        tracked_heights = heights
        tracked_height_diffs = height_diffs

        plt.figure(figsize=(18, 6))

        # Plotting center points
        plt.subplot(1, 3, 1)
        plt.plot(tracked_centers_x, tracked_centers_y, marker='o')
        plt.title('Movement of Center Points')
        plt.xlabel('Frame')
        plt.ylabel('Center Point (x, y)')

        # Plotting heights
        plt.subplot(1, 3, 2)
        plt.plot(tracked_heights, marker='o')
        plt.title('Heights of Bounding Boxes')
        plt.xlabel('Frame')
        plt.ylabel('Height')

        # Plotting height differences
        plt.subplot(1, 3, 3)
        plt.plot(tracked_height_diffs, marker='o')
        plt.title('Height Differences')
        plt.xlabel('Frame')
        plt.ylabel('Height Difference')

        plt.show()

def test_model_on_video(video_path, output_video_path, model, movement_threshold=8000):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Unable to open video file {video_path}")
        return

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    total_duration_seconds = total_frames / fps

    # Determine video length and prompt for interval
    print(f"Total video duration: {total_duration_seconds} seconds")
    interval_seconds = int(input("Enter the interval in seconds for state evaluation: "))
    interval_frames = interval_seconds * fps

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))  # Use the actual FPS of the input video

    roi_coords = (250, 200, 500, frame_height - 200)
    object_detector = cv2.createBackgroundSubtractorMOG2()
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))

    initial_height = None
    frames_processed = 0
    status_counts = {'working': 0, 'moving': 0, 'idle': 0}
    tags = []

    current_interval_frames = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        center_point, height, frame_rgb = process_frame(frame, object_detector, model, kernel, roi_coords)

        if center_point:
            if initial_height is None:
                initial_height = height

            height_diff = abs(height - initial_height)
            movement = apply_bg_subtraction(frame, object_detector, kernel, roi_coords)
            movement_detected = movement > movement_threshold
            state = determine_state(height_diff, movement_detected)
            status_counts[state] += 1

            text_x = frame_width - 200
            frame = annotate_frame(frame, center_point, height, state, movement_detected, text_x)
            cv2.rectangle(frame, (roi_coords[0], roi_coords[1]), (roi_coords[0] + roi_coords[2], roi_coords[1] + roi_coords[3]), (255, 0, 0), 2)

            print(f"Frame {frames_processed}: State = {state}, Movement Detected = {movement_detected}")

        out.write(frame)
        frames_processed += 1
        current_interval_frames += 1

        if current_interval_frames >= interval_frames:
            if status_counts['working'] + status_counts['moving'] > status_counts['idle']:
                tags.append(1)
            else:
                tags.append(0)
            print(f"Interval {len(tags)}: Working = {status_counts['working']}, Moving = {status_counts['moving']}, Idle = {status_counts['idle']}")
            status_counts = {'working': 0, 'moving': 0, 'idle': 0}
            current_interval_frames = 0

        key = cv2.waitKey(30)
        if key == 27: 
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    print('Tags for each interval:', tags)

# Example usage:
video_path = 'input_video/9.mp4'
output_video_path = 'output_video/processed-video9.mp4'
test_model_on_video(video_path, output_video_path, model)


Using cache found in /Users/arnaav/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-6-27 Python-3.11.5 torch-2.1.1 CPU

Fusing layers... 
Model summary: 157 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


Total video duration: 18.178571428571427 seconds
Enter the interval in seconds for state evaluation: 2


OpenCV: FFMPEG: tag 0x44495658/'XVID' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


Frame 0: State = idle, Movement Detected = True
Frame 1: State = idle, Movement Detected = True
Frame 2: State = idle, Movement Detected = False
Frame 3: State = idle, Movement Detected = False
Frame 4: State = idle, Movement Detected = False
Frame 5: State = idle, Movement Detected = False
Frame 6: State = idle, Movement Detected = False
Frame 7: State = idle, Movement Detected = False
Frame 8: State = idle, Movement Detected = False
Frame 9: State = idle, Movement Detected = False
Frame 10: State = idle, Movement Detected = False
Frame 11: State = idle, Movement Detected = False
Frame 12: State = idle, Movement Detected = False
Frame 13: State = idle, Movement Detected = False
Frame 14: State = idle, Movement Detected = False
Frame 15: State = idle, Movement Detected = True
Frame 16: State = idle, Movement Detected = True
Frame 17: State = idle, Movement Detected = True
Frame 18: State = idle, Movement Detected = True
Frame 19: State = idle, Movement Detected = True
Frame 20: State =

Frame 165: State = idle, Movement Detected = False
Frame 166: State = idle, Movement Detected = False
Frame 167: State = idle, Movement Detected = False
Interval 3: Working = 0, Moving = 3, Idle = 53
Frame 168: State = idle, Movement Detected = False
Frame 169: State = idle, Movement Detected = False
Frame 170: State = idle, Movement Detected = False
Frame 171: State = idle, Movement Detected = False
Frame 172: State = idle, Movement Detected = False
Frame 173: State = idle, Movement Detected = False
Frame 174: State = idle, Movement Detected = False
Frame 175: State = idle, Movement Detected = False
Frame 176: State = idle, Movement Detected = False
Frame 177: State = idle, Movement Detected = False
Frame 178: State = idle, Movement Detected = False
Frame 179: State = idle, Movement Detected = False
Frame 180: State = idle, Movement Detected = False
Frame 181: State = idle, Movement Detected = False
Frame 182: State = idle, Movement Detected = False
Frame 183: State = idle, Movement D

Frame 327: State = idle, Movement Detected = True
Frame 328: State = moving, Movement Detected = True
Frame 329: State = moving, Movement Detected = True
Frame 330: State = moving, Movement Detected = True
Frame 331: State = moving, Movement Detected = True
Frame 332: State = idle, Movement Detected = True
Frame 333: State = moving, Movement Detected = True
Frame 334: State = moving, Movement Detected = True
Frame 335: State = idle, Movement Detected = True
Interval 6: Working = 0, Moving = 8, Idle = 48
Frame 336: State = idle, Movement Detected = True
Frame 337: State = idle, Movement Detected = True
Frame 338: State = idle, Movement Detected = True
Frame 339: State = idle, Movement Detected = True
Frame 340: State = idle, Movement Detected = True
Frame 341: State = idle, Movement Detected = True
Frame 342: State = idle, Movement Detected = True
Frame 343: State = idle, Movement Detected = True
Frame 344: State = idle, Movement Detected = True
Frame 345: State = idle, Movement Detecte

Frame 485: State = moving, Movement Detected = True
Frame 486: State = idle, Movement Detected = False
Frame 487: State = idle, Movement Detected = False
Frame 488: State = idle, Movement Detected = False
Frame 489: State = idle, Movement Detected = False
Frame 490: State = idle, Movement Detected = False
Frame 491: State = idle, Movement Detected = False
Frame 492: State = idle, Movement Detected = False
Frame 493: State = idle, Movement Detected = False
Frame 494: State = idle, Movement Detected = False
Frame 495: State = idle, Movement Detected = False
Frame 496: State = idle, Movement Detected = False
Frame 497: State = idle, Movement Detected = False
Frame 498: State = idle, Movement Detected = False
Frame 499: State = idle, Movement Detected = False
Frame 500: State = idle, Movement Detected = False
Frame 501: State = idle, Movement Detected = False
Frame 502: State = idle, Movement Detected = False
Frame 503: State = idle, Movement Detected = False
Interval 9: Working = 0, Movin