<a href="https://colab.research.google.com/github/EHaarer/Translating-Pedestrian-Indoor-Images-into-Maps/blob/main/YOLO_depth_BEV_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the Python dependencies (versions are not pinned).
!pip install torch
!pip install torchvision
!pip install filterpy timm lap
!pip install --upgrade scipy
# Download the pretrained YOLOv4 Darknet weights and the matching network config
# into the current working directory.
!wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4.weights -O yolov4.weights
!wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg -O yolov4.cfg

# Select the non-interactive Agg backend before pyplot is imported.
import matplotlib
matplotlib.use('Agg')

# Clone the SORT repository; the `||` fallback prints a message instead of
# failing the cell when the clone does not succeed (e.g. on a re-run).
!git clone https://github.com/abewley/sort.git || echo "SORT directory already exists"

# Comment out sort.py's own matplotlib.use('TkAgg') call so that importing it
# does not override the Agg backend chosen above.
!sed -i 's/matplotlib.use(.TkAgg.)/# matplotlib.use("TkAgg")/' /content/sort/sort.py

# Make the cloned SORT sources importable.
import sys
sys.path.append('/content/sort')

import cv2
import numpy as np
import torch
from torchvision.transforms import Compose, ToTensor, Normalize, Resize
import matplotlib.pyplot as plt
from PIL import Image as PILImage
from IPython.display import Image as IPImage, display

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
# Model setup. This cell relies on the imports from the setup cell (torch, cv2,
# torchvision transforms); run that cell first, otherwise the names below are
# undefined (NameError).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize MiDaS model for depth estimation
midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large", pretrained=True).to(device).eval()

# Setup transforms for MiDaS model.
# The DPT variants are normalised with mean = std = 0.5 (see the MiDaS hub
# `dpt_transform`); the ImageNet statistics only apply to the non-DPT models.
midas_transforms = Compose([
    Resize((384, 384)),
    ToTensor(),
    Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Setup YOLO model for object detection
yolo_cfg = '/content/yolov4.cfg'
yolo_weights = '/content/yolov4.weights'
net = cv2.dnn.readNetFromDarknet(yolo_cfg, yolo_weights)

# Only request the CUDA backend when this OpenCV build can actually see a CUDA
# device; otherwise pin the default CPU backend explicitly.
_cv_cuda = getattr(cv2, "cuda", None)
if _cv_cuda is not None and _cv_cuda.getCudaEnabledDeviceCount() > 0:
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)
else:
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

def process_image(image, conf_threshold=0.5, nms_threshold=0.4):
    """Detect people in an image with the module-level YOLOv4 network `net`.

    Args:
        image: H x W x 3 array as returned by ``cv2.imread`` (channels are
            swapped to RGB when the blob is built).
        conf_threshold: minimum class score for a detection to be kept.
        nms_threshold: IoU threshold passed to non-maximum suppression.

    Returns:
        ``np.ndarray`` of shape ``(N, 5)`` whose rows are
        ``[x, y, width, height, confidence]``; ``(x, y)`` is the top-left
        corner in pixels of ``image``. Shape is ``(0, 5)`` when nothing is found.
    """
    img_h, img_w = image.shape[:2]
    blob = cv2.dnn.blobFromImage(image, scalefactor=1/255.0, size=(416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    outputs = net.forward(net.getUnconnectedOutLayersNames())

    # The network emits box geometry normalised to [0, 1]; scale back to pixels.
    to_pixels = np.array([img_w, img_h, img_w, img_h])

    boxes = []
    confidences = []
    for output in outputs:
        for detection in output:
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = float(scores[classID])
            if classID == 0 and confidence > conf_threshold:  # Person class
                centerX, centerY, width, height = (detection[0:4] * to_pixels).astype("int")
                x, y = int(centerX - width / 2), int(centerY - height / 2)
                boxes.append([x, y, int(width), int(height)])
                confidences.append(confidence)

    if not boxes:
        return np.empty((0, 5))

    # Several anchors/scales usually fire on the same person; keep only the
    # best box per overlapping cluster.
    keep = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
    # NMSBoxes returns indices as (N, 1) or (N,) depending on the OpenCV version.
    keep = np.array(keep, dtype=int).reshape(-1)

    kept_rows = [boxes[i] + [confidences[i]] for i in keep]
    return np.array(kept_rows, dtype=float).reshape(-1, 5)

def main(input_path):
    """Plot the input photo next to a scatter of back-projected person detections.

    Loads the image, estimates a depth map with MiDaS, detects people with
    ``process_image``, back-projects each box centre through the camera
    intrinsics and saves/displays the resulting two-panel figure as
    ``detections_with_camera_position.png``.

    Args:
        input_path: path of the image file to read with ``cv2.imread``.

    Returns:
        None. Prints an error and returns early when the image cannot be read.
    """
    image = cv2.imread(input_path)
    if image is None:
        print("Error opening image file.")
        return

    original_width, original_height = image.shape[1], image.shape[0]
    # OpenCV loads BGR; both PIL/MiDaS and matplotlib expect RGB.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = PILImage.fromarray(rgb_image)
    input_batch = midas_transforms(pil_image).to(device).unsqueeze(0)

    with torch.no_grad():
        depth_map = midas(input_batch).squeeze().cpu().numpy()
    depth_h, depth_w = depth_map.shape[0], depth_map.shape[1]

    dets = process_image(image)
    fig, (ax_image, ax_map) = plt.subplots(1, 2, figsize=(12, 6))

    # Plot input image
    ax_image.imshow(rgb_image)
    ax_image.set_title('Input Image')
    ax_image.axis('off')

    # One random colour shared by all detected-person markers.
    colors = tuple(np.random.rand(3))

    # Camera intrinsic parameters, expressed for a 4032 x 3024 frame.
    ref_width, ref_height = 4032, 3024
    f_x = (28 / 5.76) * ref_width
    f_y = (28 / 5.76) * ref_height
    c_x = ref_width / 2
    c_y = ref_height / 2
    # Rescale the intrinsics to the loaded image so K and the pixel coordinates
    # used below live in the same coordinate system.
    # NOTE(review): assumes the image is a uniformly resized copy of the full
    # 4032 x 3024 frame (not a crop) - confirm.
    k_scale_x = original_width / ref_width
    k_scale_y = original_height / ref_height
    K = np.array([
        [f_x * k_scale_x, 0, c_x * k_scale_x],
        [0, f_y * k_scale_y, c_y * k_scale_y],
        [0, 0, 1],
    ])
    K_inv = np.linalg.inv(K)  # loop-invariant, so invert once

    # Scaling factors from original-image pixels to depth-map pixels
    scale_x = depth_w / original_width
    scale_y = depth_h / original_height

    # Track already plotted points
    plotted_points = []

    # Store the depths of detected people
    depths = []

    for d in dets:
        # Box centre in original-image pixel coordinates.
        u = d[0] + d[2] / 2
        v = d[1] + d[3] / 2

        # Matching depth-map cell, clamped so boxes that spill over the image
        # border neither wrap around (negative index) nor raise IndexError.
        col = int(np.clip(u * scale_x, 0, depth_w - 1))
        row = int(np.clip(v * scale_y, 0, depth_h - 1))

        depth = depth_map[row, col]
        depths.append(depth)

        # Back-project the pixel through K.
        # NOTE(review): MiDaS is documented to predict relative inverse depth,
        # so these coordinates are presumably not metric - confirm.
        world_coords = K_inv @ np.array([u * depth, v * depth, depth])
        point_xy = world_coords[:2]

        # Skip points that land within 1.0 unit of one already drawn.
        if any(np.linalg.norm(p - point_xy) < 1.0 for p in plotted_points):
            continue

        # Label only the first marker so the legend has a single entry.
        label = 'Detected Person' if not plotted_points else '_nolegend_'
        ax_map.scatter(point_xy[0], point_xy[1], color=colors, label=label)
        plotted_points.append(point_xy)

    if not depths:
        print("No people detected.")

    # Average depth of detected people (NaN when there are none).
    avg_depth = float(np.mean(depths)) if depths else float("nan")

    # Approximate camera position; only the first two components are plotted.
    camera_position = np.array([0, 0, avg_depth])

    # Plot camera position
    ax_map.scatter(camera_position[0], camera_position[1], color='red', label='Camera Position', marker='^')

    # NOTE(review): this panel plots camera-frame X against Y; a true top-down
    # view would presumably plot X against depth (Z) - confirm intent.
    ax_map.set_xlabel('X Position')
    ax_map.set_ylabel('Y Position')
    ax_map.set_title('Top-Down View of Detected People with Camera Position')
    ax_map.legend()
    ax_map.axis('equal')  # Ensure aspect ratio is equal for top-down view
    fig.tight_layout()

    fig.savefig('detections_with_camera_position.png')
    plt.close(fig)
    display(IPImage('detections_with_camera_position.png'))

# Run the pipeline on a sample photo read from the mounted Google Drive.
# NOTE(review): the path is hard-coded to one Drive layout - adjust before
# running in another account/environment.
if __name__ == "__main__":
    input_path = '/content/drive/My Drive/Adv CV files/3 people.jpg'
    main(input_path)

NameError: name 'torch' is not defined