In [3]:
import pixellib
from pixellib.torchbackend.instance import instanceSegmentation
import cv2
import torch
import timm
import numpy as np
import pyttsx3


In [4]:
segment_video = instanceSegmentation()
segment_video.load_model("pointrend_resnet50.pkl",detection_speed="fast")
target_classes = segment_video.select_target_classes(person = True,bicycle = True, car = True, motorcycle= True,bus= True,truck= True, 
                                           fire_hydrant= True, stop_sign= True, parking_meter= True, bench= True
                                          )

midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
midas.to(device)
midas.eval()
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.small_transform

The checkpoint state_dict contains keys that are not used by the model:
  [35mproposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}[0m
Using cache found in /home/adit/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /home/adit/.cache/torch/hub/intel-isl_MiDaS_master


In [5]:
def get_instance_boundary(instance_mask,height,width):
    instance_boundary = np.zeros((int(height),int(width)))
    for i in instance_mask:
        instance_boundary[i[1]][i[0]] = 1
    return instance_boundary

def check_mask(mask):
    mask_segments = len(mask)
    if mask_segments > 1:
        max_idx = 0
        max_len = 0
        for i in range(mask_segments):
            tmp = len(mask[i])
            if tmp > max_len:
                max_len = tmp
                max_idx = i
        ret_mask = mask[max_idx]
    else:
        ret_mask = mask[0]
    return ret_mask

def get_mean_distance(depth_map,instance_boundary):
    distance_map = (1/depth_map)*1000
    object_boundary = np.multiply(distance_map,instance_boundary)
    count = 0
    for i in object_boundary:
        for j in i:
            if j>0:
                count += 1
    mean_distance = np.sum(object_boundary)/count
    return mean_distance

def create_positional_list(class_names,masks,output):
    positional_list = list()
    for idx, name in enumerate(class_names):
        mask = check_mask(masks[idx])
        instance_boundary = get_instance_boundary(mask,height,width)
        mean_distance = get_mean_distance(output,instance_boundary)
        tmp_dict = {'class_name':name,'mean_distance': mean_distance}    
        positional_list.append(tmp_dict)
    return positional_list

def speak(text):
    engine = pyttsx3.init()
    engine.setProperty('rate', 112) 
    engine.say(text)
    engine.runAndWait()

In [8]:
depth_map = masks =  class_names  = list()
resu = []
cap = cv2.VideoCapture(-1)
width = cap.get(cv2.CAP_PROP_FRAME_WIDTH )
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT )
while cap.isOpened():
    ret, frame = cap.read()
    
    #Instance segmentation
    res = segment_video.segmentFrame(frame,
                                     segment_target_classes= target_classes,
                                     mask_points_values=True)
    masks = res[0]['masks']
    class_names = res[0]['class_names']
    image = res[1]
    cv2.imshow('Instance Segmentation', image)

    #Depth Mapping
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    input_batch = transform(img).to(device)
    with torch.no_grad():
        prediction = midas(input_batch)
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()
        
        depth_map = prediction.cpu().numpy()
        positional_list = create_positional_list(class_names,masks,depth_map)
        for i in positional_list:
            if i['mean_distance'] < 5:
                print(i['mean_distance'])
                text = f"Please be alert there is a {i['class_name']} coming towards you"
                print(text)
                speak(text)
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


0.8618791904300451
Please be alert there is a car coming towards you
0.8679078946942869
Please be alert there is a car coming towards you
0.8633994447941683
Please be alert there is a truck coming towards you
0.7329586165052064
Please be alert there is a car coming towards you
0.7664462712903818
Please be alert there is a car coming towards you
0.5735914102524002
Please be alert there is a person coming towards you


In [None]:
cap.release()
cv2.destroyAllWindows()

In [16]:
print((1/depth_map)*1000)

[[1.3116437  1.3131962  1.3172106  ... 2.141976   2.4051151  2.5498366 ]
 [1.3153832  1.3160732  1.3181812  ... 2.1085997  2.3279576  2.4456408 ]
 [1.3282143  1.3262472  1.3225685  ... 2.029001   2.15222    2.2148237 ]
 ...
 [0.35039243 0.35044178 0.35047305 ... 0.6281327  0.62910354 0.62952954]
 [0.35022214 0.35025615 0.35022512 ... 0.62683433 0.6274361  0.62770694]
 [0.3501146  0.35014945 0.35011068 ... 0.6262101  0.6267089  0.6269356 ]]
