In [14]:
# Mount Google Drive at /content/drive so the image files read later in this
# notebook are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.155-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [3]:
from ultralytics import YOLO
import numpy as np
import scipy
import torch
import cv2
# Load the yolov8m.pt checkpoint once at module level; recognise_distance()
# reads this global `model`. `yolo_classes` keeps the model's class names.
model = YOLO("yolov8m.pt")
yolo_classes = model.names

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8m.pt to 'yolov8m.pt'...


100%|██████████| 49.7M/49.7M [00:00<00:00, 126MB/s]


In [4]:
def yolo_predict(imgs, YOLO_model):
    """Run a detector over each image and collect boxes, labels and plots.

    Args:
        imgs: Iterable of images; each one is passed unchanged to
            ``YOLO_model.predict``.
        YOLO_model: An already-loaded model exposing ``predict(img)``. The first
            element of its return value must provide ``boxes``, ``names`` and
            ``plot()``.

    Returns:
        A tuple ``(det, lbls, plotting)`` of lists with one entry per image:
        ``det[k]`` is a list of ``box.xyxy[0].tolist()`` coordinates,
        ``lbls[k]`` the class name of each box in the same order, and
        ``plotting[k]`` whatever ``result.plot()`` returned for that image.
    """
    det, lbls, plotting = [], [], []

    for img in imgs:
        # Only one image is passed per call, so only the first result is used.
        result = YOLO_model.predict(img)[0]
        plot = result.plot()

        boxes = result.boxes
        det.append([box.xyxy[0].tolist() for box in boxes])
        # box.cls holds the class index; result.names maps it to a readable name.
        lbls.append([result.names[box.cls[0].item()] for box in boxes])
        plotting.append(plot)

    return det, lbls, plotting

In [5]:
# det holds the bounding boxes, lbls holds the class label of each detection, and plotting holds the left and right images ready to be shown.
# Get the centre, top-left corner and bottom-right corner of the boxes.

def tlbr_to_center1(boxes):
    """Return the midpoint of every box as a ``[cx, cy]`` list.

    Args:
        boxes: Iterable of 4-element boxes ``(tlx, tly, brx, bry)``.

    Returns:
        List with one ``[cx, cy]`` pair per box, in input order.
    """
    return [
        [(left + right) / 2, (top + bottom) / 2]
        for left, top, right, bottom in boxes
    ]


def tlbr_to_corner(boxes):
    """Return the top-left corner of every box as an ``(x, y)`` tuple.

    Args:
        boxes: Iterable of 4-element boxes ``(tlx, tly, brx, bry)``.

    Returns:
        List with one ``(tlx, tly)`` tuple per box, in input order.
    """
    # Averaging a coordinate with itself leaves its value unchanged; the true
    # division means integer inputs come back as floats.
    return [
        ((left + left) / 2, (top + top) / 2)
        for left, top, _right, _bottom in boxes
    ]


def tlbr_to_corner_br(boxes):
    """Return the bottom-right corner of every box as an ``(x, y)`` tuple.

    Args:
        boxes: Iterable of 4-element boxes ``(tlx, tly, brx, bry)``.

    Returns:
        List with one ``(brx, bry)`` tuple per box, in input order.
    """
    # Averaging a coordinate with itself leaves its value unchanged; the true
    # division means integer inputs come back as floats.
    return [
        ((right + right) / 2, (bottom + bottom) / 2)
        for _left, _top, right, bottom in boxes
    ]


def tlbr_to_area(boxes):
    """Return the absolute area (width times height) of every box.

    Args:
        boxes: Iterable of 4-element boxes ``(tlx, tly, brx, bry)``.

    Returns:
        List with one non-negative area per box, in input order.
    """
    # abs() keeps the area non-negative even when the corners are swapped.
    return [
        abs((right - left) * (bottom - top))
        for left, top, right, bottom in boxes
    ]

In [6]:
# get all distances from every object box to every other object box
# left image is boxes[0]
# right image is boxes[1]

# Use broadcasting.
# In NumPy, a column vector minus a row vector gives a matrix:
# [a] - [c,d] = [a-c, a-d]
# [b]           [b-c, b-d]

def get_horiz_dist_centre(boxes):
    """Pairwise horizontal offsets between left-image and right-image box centres.

    Args:
        boxes: Two-element sequence; ``boxes[0]`` holds the left-image boxes and
            ``boxes[1]`` the right-image boxes, each as ``(tlx, tly, brx, bry)``.

    Returns:
        Array of shape ``(len(boxes[0]), len(boxes[1]))`` whose entry ``[i, j]``
        is the centre x of left box ``i`` minus the centre x of right box ``j``.
    """
    # reshape(-1, 2) keeps the column index valid when an image has no boxes:
    # np.array([]) is 1-D and would otherwise raise IndexError on [:, 0].
    left_x = np.array(tlbr_to_center1(boxes[0])).reshape(-1, 2)[:, 0]
    right_x = np.array(tlbr_to_center1(boxes[1])).reshape(-1, 2)[:, 0]
    return left_x[:, None] - right_x[None]


def get_horiz_dist_corner_tl(boxes):
    """Pairwise horizontal offsets between top-left corners of left and right boxes.

    Args:
        boxes: Two-element sequence; ``boxes[0]`` holds the left-image boxes and
            ``boxes[1]`` the right-image boxes, each as ``(tlx, tly, brx, bry)``.

    Returns:
        Array of shape ``(len(boxes[0]), len(boxes[1]))`` whose entry ``[i, j]``
        is the top-left x of left box ``i`` minus the top-left x of right box ``j``.
    """
    # reshape(-1, 2) keeps the column index valid when an image has no boxes:
    # np.array([]) is 1-D and would otherwise raise IndexError on [:, 0].
    left_x = np.array(tlbr_to_corner(boxes[0])).reshape(-1, 2)[:, 0]
    right_x = np.array(tlbr_to_corner(boxes[1])).reshape(-1, 2)[:, 0]
    return left_x[:, None] - right_x[None]


def get_horiz_dist_corner_br(boxes):
    """Pairwise horizontal offsets between bottom-right corners of left and right boxes.

    Args:
        boxes: Two-element sequence; ``boxes[0]`` holds the left-image boxes and
            ``boxes[1]`` the right-image boxes, each as ``(tlx, tly, brx, bry)``.

    Returns:
        Array of shape ``(len(boxes[0]), len(boxes[1]))`` whose entry ``[i, j]``
        is the bottom-right x of left box ``i`` minus the bottom-right x of
        right box ``j``.
    """
    # reshape(-1, 2) keeps the column index valid when an image has no boxes:
    # np.array([]) is 1-D and would otherwise raise IndexError on [:, 0].
    left_x = np.array(tlbr_to_corner_br(boxes[0])).reshape(-1, 2)[:, 0]
    right_x = np.array(tlbr_to_corner_br(boxes[1])).reshape(-1, 2)[:, 0]
    return left_x[:, None] - right_x[None]


def get_vertic_dist_centre(boxes):
    """Pairwise vertical offsets between left-image and right-image box centres.

    Args:
        boxes: Two-element sequence; ``boxes[0]`` holds the left-image boxes and
            ``boxes[1]`` the right-image boxes, each as ``(tlx, tly, brx, bry)``.

    Returns:
        Array of shape ``(len(boxes[0]), len(boxes[1]))`` whose entry ``[i, j]``
        is the centre y of left box ``i`` minus the centre y of right box ``j``.
    """
    # reshape(-1, 2) keeps the column index valid when an image has no boxes:
    # np.array([]) is 1-D and would otherwise raise IndexError on [:, 1].
    left_y = np.array(tlbr_to_center1(boxes[0])).reshape(-1, 2)[:, 1]
    right_y = np.array(tlbr_to_center1(boxes[1])).reshape(-1, 2)[:, 1]
    return left_y[:, None] - right_y[None]


def get_area_diffs(boxes):
    """Pairwise absolute differences in box area between the two images.

    Args:
        boxes: Two-element sequence; ``boxes[0]`` holds the left-image boxes and
            ``boxes[1]`` the right-image boxes, each as ``(tlx, tly, brx, bry)``.

    Returns:
        Array whose entry ``[i, j]`` is ``|area(left box i) - area(right box j)|``.
    """
    left_areas = np.array(tlbr_to_area(boxes[0]))
    right_areas = np.array(tlbr_to_area(boxes[1]))
    # Column vector minus row vector broadcasts to the full pairwise matrix.
    signed_diff = left_areas[:, None] - right_areas[None]
    return abs(signed_diff)


def get_dist_to_centre_tl(box, img):
    """Horizontal distance of each box's top-left corner from the image midline.

    Args:
        box: Iterable of boxes for one image, each as ``(tlx, tly, brx, bry)``.
        img: Array-like image; ``img.shape[1]`` is used as the image width.

    Returns:
        1-D array with ``|tlx - img.shape[1] / 2|`` for every box.
    """
    midline_x = img.shape[1] / 2
    # reshape(-1, 2) keeps the column index valid when there are no boxes:
    # np.array([]) is 1-D and would otherwise raise IndexError on [:, 0].
    corner_x = np.array(tlbr_to_corner(box)).reshape(-1, 2)[:, 0]
    return abs(corner_x - midline_x)


def get_dist_to_centre_br(box, img):
    """Horizontal distance of each box's bottom-right corner from the image midline.

    Args:
        box: Iterable of boxes for one image, each as ``(tlx, tly, brx, bry)``.
        img: Array-like image; ``img.shape[1]`` is used as the image width.

    Returns:
        1-D array with ``|brx - img.shape[1] / 2|`` for every box.
    """
    midline_x = img.shape[1] / 2
    # reshape(-1, 2) keeps the column index valid when there are no boxes:
    # np.array([]) is 1-D and would otherwise raise IndexError on [:, 0].
    corner_x = np.array(tlbr_to_corner_br(box)).reshape(-1, 2)[:, 0]
    return abs(corner_x - midline_x)

In [7]:
# Create the tracking cost function.
# It consists of three parts:
#  1. The vertical movement (up or down) of the object's box centre. Scale this up because we do not expect it to be very large.
#  2. The move left or right by the object. We only expect it to move right (from the left eye image), so penalise it if it moves left.
#  3. The difference in box area in pixels. The area of the image is width x height; the code divides by the image width (img.shape[1]), so this term is bounded by the image height.

def get_cost(boxes, img, lbls=None, beta=10, gamma=5, class_penalty=150):
    """Build the left-to-right matching cost matrix for two sets of boxes.

    The cost of pairing left box ``i`` with right box ``j`` is the sum of a
    weighted vertical centre offset, a horizontal centre offset (weighted more
    heavily when negative) and the area difference divided by the image width.
    An optional fixed penalty is added when the two class labels differ.

    Args:
        boxes: Two-element sequence; ``boxes[0]`` holds the left-image boxes and
            ``boxes[1]`` the right-image boxes, each as ``(tlx, tly, brx, bry)``.
        img: Array-like image; ``img.shape[1]`` scales the area term.
        lbls: Optional two-element sequence of label lists aligned with
            ``boxes``. When ``None`` no class penalty is applied.
        beta: Multiplier applied to the magnitude of negative horizontal offsets.
        gamma: Multiplier applied to the absolute vertical offsets.
        class_penalty: Amount added to a pair whose labels differ.

    Returns:
        Array of shape ``(len(boxes[0]), len(boxes[1]))`` with the summed cost.
    """
    width = img.shape[1]

    # Vertical offset, scaled by gamma because little vertical movement is expected.
    vert_cost = gamma * abs(get_vertic_dist_centre(boxes))

    # Horizontal offset (left centre x minus right centre x). Non-negative
    # entries are kept as they are; negative ones are replaced by beta times
    # their magnitude so that direction is penalised.
    horiz_cost = get_horiz_dist_centre(boxes)
    negative = horiz_cost < 0
    horiz_cost[negative] = beta * abs(horiz_cost[negative])

    # Area difference, divided by the image width.
    area_cost = get_area_diffs(boxes) / width

    cost = vert_cost + horiz_cost + area_cost

    # Add the penalty term for pairs with different object classes.
    if lbls is not None:
        for i, j in np.ndindex(*cost.shape):
            if lbls[0][i] != lbls[1][j]:
                cost[i, j] += class_penalty
    return cost

In [9]:
def get_horiz_dist(masks, prob_thresh=0.7):
    """Pairwise horizontal offsets between mask centres of mass.

    Args:
        masks: Two-element sequence of tensors; ``masks[0]`` belongs to the left
            image and ``masks[1]`` to the right image. Each is thresholded and
            then squeezed on dim 1, after which dims 1 and 2 are reduced.
            # assumes shape (N, 1, H, W) - TODO confirm against the caller
        prob_thresh: Values strictly above this threshold count as mask pixels.

    Returns:
        Tensor whose entry ``[i, j]`` is the column centre of mass of left mask
        ``i`` minus the column centre of mass of right mask ``j``.
    """

    def _mask_params(probabilities):
        # Binarise the mask and squeeze dim 1.
        binary = (probabilities > prob_thresh).squeeze(1)
        pixel_count = binary.sum(dim=[1, 2])

        # Index ramps along dim 1 (rows) and dim 2 (columns) of the binary mask.
        row_index = torch.tensor(range(binary.shape[1])).unsqueeze(1)
        col_index = torch.tensor(range(binary.shape[2])).unsqueeze(0)

        # Centre of mass = mean index of the pixels that are set.
        com_rows = (row_index * binary).sum(dim=[1, 2]) / pixel_count
        com_cols = (col_index * binary).sum(dim=[1, 2]) / pixel_count

        # One row per mask: (row centre, column centre, pixel count).
        return torch.stack((com_rows, com_cols, pixel_count)).transpose(1, 0)

    left_params = _mask_params(masks[0])
    right_params = _mask_params(masks[1])

    # Broadcast to all left/right pairs and keep only the column-centre component.
    pairwise = left_params[:, None] - right_params[None]
    return pairwise[:, :, 1]


In [10]:
def get_tracks(cost):
    """Solve the assignment problem for a cost matrix.

    Args:
        cost: 2-D cost matrix; entry ``[i, j]`` is the cost of pairing row ``i``
            with column ``j``.

    Returns:
        The ``(row_indices, col_indices)`` pair returned by
        ``scipy.optimize.linear_sum_assignment``.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to expose `scipy.optimize` on every SciPy release.
    from scipy.optimize import linear_sum_assignment

    return linear_sum_assignment(cost)

In [11]:
def get_object_dist(object_name, final_dists_list, tantheta, fl, sz1):
    """Convert the stored value of the first entry labelled ``object_name`` to a distance.

    Args:
        object_name: Label to look for.
        final_dists_list: Iterable of ``(dist, label)`` pairs; the first pair
            whose label equals ``object_name`` is used.
        tantheta: Divisor used in the conversion formula below.
        fl: Offset added to the result of the conversion formula.
        sz1: Multiplier used in the conversion formula below.

    Returns:
        ``(7.05 / 2) * sz1 * (1 / tantheta) / dist + fl`` rounded to one
        decimal, or ``None`` (after printing a message) when no entry has the
        requested label.
    """
    for dist, label in final_dists_list:
        if label == object_name:
            distance = dist
            break
    else:
        # Not found: report and return None. Handled without raising, so
        # genuine ValueErrors from the arithmetic below are no longer swallowed.
        print(f"No object found with label: {object_name}")
        return None

    # NOTE(review): 7.05 is a hard-coded constant whose meaning is not stated
    # in this notebook - confirm and name it.
    x = (7.05 / 2) * sz1 * (1 / tantheta) / distance + fl
    return round(x, ndigits=1)

In [12]:
def recognise_distance(left, right, object_name):
    """Estimate the distance to ``object_name`` from a left/right image pair.

    Both images are run through the module-level ``model``. Detections are then
    matched across the two images with ``get_cost`` and ``get_tracks``. For each
    matched pair the horizontal offset of either the top-left or the
    bottom-right corners is kept, and ``get_object_dist`` converts the offset of
    the requested label into a distance.

    Args:
        left: Path of the left image.
        right: Path of the right image.
        object_name: Class label of the object whose distance is wanted.

    Returns:
        The value returned by ``get_object_dist`` (rounded to one decimal), or
        ``None`` when either image has no detections or the label is not
        among the matched pairs.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read either image.
    """
    Left_img = cv2.imread(left)
    Right_img = cv2.imread(right)
    # cv2.imread signals failure by returning None instead of raising.
    for path, image in ((left, Left_img), (right, Right_img)):
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")

    det, lbls, _plots = yolo_predict([Left_img, Right_img], model)

    # Nothing can be matched when one image has no detections at all.
    if not det[0] or not det[1]:
        print(f"No object found with label: {object_name}")
        return None

    sz1 = Right_img.shape[1]

    cost = get_cost(det, Right_img, lbls=lbls)
    tracks = get_tracks(cost)

    dists_tl = get_horiz_dist_corner_tl(det)
    dists_br = get_horiz_dist_corner_br(det)
    dctl = get_dist_to_centre_tl(det[0], Left_img)
    dcbr = get_dist_to_centre_br(det[0], Left_img)

    # For every matched pair use the corner of the left box that lies closer
    # to the horizontal middle of the left image.
    final_dists = []
    for i, j in zip(*tracks):
        offsets = dists_tl if dctl[i] < dcbr[i] else dists_br
        final_dists.append((offsets[i][j], lbls[0][i]))

    # NOTE(review): 30, 50, 37.9, 68.2459 and 7.05 are hard-coded; presumably
    # calibration measurements for this camera setup - confirm and name them.
    fl = 30 - 37.9 * 50 / 68.2459
    tantheta = (1 / (50 - fl)) * (7.05 / 2) * sz1 / 37.9

    return get_object_dist(
        final_dists_list=final_dists,
        object_name=object_name,
        fl=fl,
        tantheta=tantheta,
        sz1=sz1,
    )


In [15]:
# Example run on an image pair stored on the mounted Drive.
# `distance` is None when no matched detection is labelled "bottle".
distance=recognise_distance(
    "/content/drive/MyDrive/distance/left_eye_50cm.jpg",
    "/content/drive/MyDrive/distance/right_eye_50cm.jpg",
    "bottle",
)


0: 480x640 1 bottle, 1 tv, 1 cell phone, 1203.6ms
Speed: 10.8ms preprocess, 1203.6ms inference, 5.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 bottle, 1 tv, 1 cell phone, 1012.3ms
Speed: 4.6ms preprocess, 1012.3ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)


In [16]:
# Report the estimate; this prints "None" if the object was not found above.
print(f"The final distance is {distance} cm")

The final distance is 49.7 cm
