In [None]:
import cv2
import numpy as np
import os
from glob import glob
import torch
import matplotlib.pyplot as plt

# --- Dataset location -------------------------------------------------------
# Root folder of the dataset and the sequence sub-folders to process.
base_dir = r"C:\Users\irisv\Documents\DTU\Perception for autonomous systems\FINAL PROJECT\34759_final_project_rect"
sequences = [f"seq_{index:02d}" for index in (1, 2, 3)]

# --- Stereo geometry --------------------------------------------------------
focal_length = 707.0493  # focal length, in pixels
baseline = 0.537  # camera baseline; presumably metres (depth is labelled "m") - TODO confirm

# --- Block-matching stereo --------------------------------------------------
# StereoBM needs numDisparities divisible by 16 and an odd blockSize.
num_disparities = 320  # 16 * 20
block_size = 31
stereo = cv2.StereoBM_create(num_disparities, block_size)

# --- Object detector --------------------------------------------------------
# Pretrained YOLOv5-small fetched (or read from the local cache) via torch.hub.
model = torch.hub.load(repo_or_dir='ultralytics/yolov5', model='yolov5s', pretrained=True)

# Process each sequence: run YOLO on every left frame, estimate a depth map
# from the stereo pair, and write one line per detection that has usable depth
# ("<frame> <class> <X> <Y> <Z>") to "<seq>_detections.txt".
for seq in sequences:
    left_images = sorted(glob(os.path.join(base_dir, seq, "image_02", "data", "*.png")))
    right_images = sorted(glob(os.path.join(base_dir, seq, "image_03", "data", "*.png")))

    if len(left_images) != len(right_images):
        print(f"Mismatch in the number of images in {seq}")
        continue

    # An empty or missing folder passes the length check above (0 == 0) but
    # has no first frame to size the videos from.
    if not left_images:
        print(f"No images found for {seq}")
        continue

    print(f"Processing sequence: {seq}")

    # Load the first frame to get the frame size for the video writers.
    first_frame = cv2.imread(left_images[0])
    if first_frame is None:  # cv2.imread signals failure by returning None
        print(f"Could not read {left_images[0]}; skipping {seq}")
        continue
    frame_size = (first_frame.shape[1], first_frame.shape[0])  # (width, height)

    # Prepare Video Writers for output videos
    output_annotated_video_path = f"{seq}_annotated_video4.avi"
    output_depth_video_path = f"{seq}_depth_video4.avi"
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    fps = 30  # Assuming 30 FPS for video
    out_annotated = cv2.VideoWriter(output_annotated_video_path, fourcc, fps, frame_size)
    out_depth = cv2.VideoWriter(output_depth_video_path, fourcc, fps, frame_size)

    label_file_path = f"{seq}_detections.txt"
    try:
        with open(label_file_path, 'w') as label_file:
            for frame_idx, (left_image_path, right_image_path) in enumerate(zip(left_images, right_images)):
                img_left = cv2.imread(left_image_path)
                img_right = cv2.imread(right_image_path)
                if img_left is None or img_right is None:
                    print(f"Could not read frame {frame_idx} of {seq}; skipping it")
                    continue

                # cv2.imread returns BGR, while the YOLOv5 hub model expects RGB
                # for numpy input, so reverse the channel axis before inference.
                results = model(img_left[..., ::-1])
                detections = results.xyxy[0]  # rows: x1, y1, x2, y2, confidence, class

                # StereoBM returns fixed-point disparities scaled by 16.
                gray_left = cv2.cvtColor(img_left, cv2.COLOR_BGR2GRAY)
                gray_right = cv2.cvtColor(img_right, cv2.COLOR_BGR2GRAY)
                disparity = stereo.compute(gray_left, gray_right).astype(np.float32) / 16.0

                # Depth = f * B / d, only where the disparity is positive and
                # below the outlier threshold; everything else stays 0.
                valid_disparity = (disparity > 0) & (disparity < 96)
                depth_map = np.zeros(disparity.shape, dtype=np.float32)
                depth_map[valid_disparity] = (focal_length * baseline) / disparity[valid_disparity]

                height, width = img_left.shape[:2]
                annotated_image = img_left.copy()
                for *xyxy, confidence, class_id in detections:
                    x1, y1, x2, y2 = map(int, xyxy)
                    # A negative index would wrap around in NumPy slicing and
                    # sample the wrong region, so clamp the box to the image.
                    x1, y1 = max(x1, 0), max(y1, 0)
                    x2, y2 = min(x2, width), min(y2, height)

                    # Keep only plausible depth samples inside the box.
                    object_depth = depth_map[y1:y2, x1:x2]
                    valid_object_depth = object_depth[(object_depth > 0.5) & (object_depth < 100)]
                    if valid_object_depth.size == 0:
                        continue  # no usable depth: detection is neither logged nor drawn

                    # Median rather than mean, for robustness to outliers.
                    avg_depth = np.median(valid_object_depth)

                    # Back-project the box centre to 3D with the pinhole model.
                    # NOTE(review): the image centre is used as the principal
                    # point; substitute calibrated cx, cy if they are available.
                    object_center_x = (x1 + x2) / 2
                    object_center_y = (y1 + y2) / 2
                    X = (object_center_x - (width / 2)) * avg_depth / focal_length
                    Y = (object_center_y - (height / 2)) * avg_depth / focal_length
                    Z = avg_depth

                    label = model.names[int(class_id)]
                    label_file.write(f"{frame_idx} {label} {X:.6f} {Y:.6f} {Z:.6f}\n")

                    # Draw the bounding box and label
                    box_color = (255, 255, 255)  # white
                    cv2.rectangle(annotated_image, (x1, y1), (x2, y2), box_color, 2)
                    label_text = f"{label}: {avg_depth:.2f}m"
                    cv2.putText(annotated_image, label_text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

                # TODO: video output is currently disabled; re-enable by writing
                # annotated_image to out_annotated and a colour-mapped depth
                # image to out_depth here.
    finally:
        # Always close the writers opened above, even if processing fails.
        out_annotated.release()
        out_depth.release()

    print(f"Saved label file for {seq} to {label_file_path}")

print("Completed processing and saving videos for all sequences.")




Using cache found in C:\Users\irisv/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-11-14 Python-3.11.7 torch-2.5.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


Processing sequence: seq_01


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Saved label file for seq_01 to seq_01_detections.txt
Processing sequence: seq_02


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Saved label file for seq_02 to seq_02_detections.txt
Processing sequence: seq_03


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a