# Imports

In [1]:
import numpy as np
import cv2
import os
import time 
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Dense, Dropout, MaxPool2D, BatchNormalization,Input,Flatten, LeakyReLU, Reshape
from tensorflow.keras.models import Model
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import itertools
from Models import yoloModel
from Models import alpha_model
from scipy.optimize import fsolve


# Paths used

In [3]:
# Locations of the KITTI files, all under one dataset root directory
_kitti_root="./kitti/"
file_path=_kitti_root+"data_object_label_2/training/label_2/"    # label text files
image_path=_kitti_root+"data_object_image_2/training/image_2/"   # training images
test_path=_kitti_root+"data_object_image_3/testing/image_3/"     # testing images

# Camera and object-class parameters of the system

In [4]:
# The average dimensions of the objects in the kitti dataset, one 3-vector per class.
# The vectors are used as the "Dims" entry of a detection (index 1 = width, index 2 = length).
dims_avg = {label: np.array(size) for label, size in (
    ('Cyclist',    [1.73532436, 0.58028152, 1.77413709]),
    ('Van',        [2.18928571, 1.90979592, 5.07087755]),
    ('Tram',       [3.56092896, 2.39601093, 18.34125683]),
    ('Car',        [1.52159147, 1.64443089, 3.85813679]),
    ('Pedestrian', [1.75554637, 0.66860882, 0.87623049]),
    ('Truck',      [3.07392252, 2.63079903, 11.2190799]),
)}

# Image geometry
Iw=1242   # image width
Ih=375    # image height
x_c=610   # center position of the image on the x-axis
y_c=175   # center position of the image on the y-axis

# Camera parameters
fx=725    # focal length on the x-axis
fy=370    # focal length on the y-axis
fov=1.6372261694540755   # the field of view angle about the y-axis

# Orientation model
BIN=16    # the number of bins for the orientation model

# Helper functions

In [5]:
#Reads a kitti image and, for training images, the lines of its label file
def read_text_image_file(n,test=False):
    """Load the KITTI image with index ``n`` and its label lines.

    Parameters
    ----------
    n : int
        Index of the sample. Only its last four decimal digits are used; the
        file name is that number zero-padded to six digits (e.g. 3107 -> "003107").
    test : bool
        When True the image is read from ``test_path`` and no label file is
        read: ``file_path`` points at the *training* labels, which do not
        describe the testing images. When False the image is read from
        ``image_path`` and the label file from ``file_path``.

    Returns
    -------
    (lines, image)
        ``lines`` is the list of raw label lines (empty for test images) and
        ``image`` is whatever ``cv2.imread`` returns for the image path.
    """
    file="{:06d}".format(int(n)%10000)
    if(test):
        file_image=test_path+file+".png"
        lines=[]
    else:
        file_image=image_path+file+".png"
        # Context manager so the label file handle is always closed
        with open(file_path+file+".txt",'r') as label_file:
            lines=label_file.readlines()
    image=cv2.imread(file_image,cv2.IMREAD_COLOR)
    return lines,image

In [6]:
#Extract the information from one line of a kitti label file, for training
def get_data_line(data):
    """Parse one whitespace-separated label line into a dictionary.

    Fields 0-2 (label, truncated, occluded) are kept as strings; fields 3-14
    are converted to float and grouped as alpha, box (4 values), Dims
    (3 values), location (3 values) and rotation_y. Any further fields are
    ignored.

    Raises IndexError when the line has fewer than 15 fields and ValueError
    when one of the numeric fields is not a valid float.
    """
    fields=data.split()
    numbers=[float(value) for value in fields[3:15]]
    return {"label":fields[0],
            "truncated":fields[1],
            "occluded":fields[2],
            "alpha":numbers[0],
            "box":numbers[1:5],
            "Dims":numbers[5:8],
            "location":numbers[8:11],
            "rotation_y":numbers[11]}

In [7]:
#Get the angle of the object about the y axis using the apparent angle in the image
def get_rot_y(data_dict_loc,field_of_view=fov):
    """Convert the apparent angle ``alpha`` of a detection into its rotation about the y axis.

    Parameters
    ----------
    data_dict_loc : dict
        Must contain "box" (indices 0 and 2 are the left and right x pixel
        coordinates) and "alpha".
    field_of_view : float
        Field of view angle spread linearly over the image width ``Iw``.

    Returns
    -------
    The rotation angle, wrapped by the steps below into the range [-pi, pi].
    """
    box=data_dict_loc["box"]
    xc=(box[2]+box[0])/2
    Xc=xc-x_c   # horizontal offset of the box centre from the image centre
    # Ray angle: linear in the pixel offset, using the shared image width constant
    theta_ray=(3*np.pi/2)+(Xc/Iw)*field_of_view

    # Bring alpha into [0, 2*pi) before combining it with the ray angle
    alpha=data_dict_loc["alpha"]
    if(alpha<0):
        alpha=2*np.pi+alpha
    theta_y=alpha+theta_ray-3*np.pi/2

    # Wrap the result back into [-pi, pi]
    if(theta_y>2*np.pi):
        theta_y=theta_y-2*np.pi
    if(theta_y>np.pi):
        theta_y=theta_y-2*np.pi
    if(theta_y<-np.pi):
        theta_y=theta_y+2*np.pi

    return theta_y

In [8]:
#Get the rotation matrix
def get_rotation_y_mat(data):
    """Return the 3x3 matrix of a rotation about the y axis by ``data["rotation_y"]``.

    The angle is read from the ``data`` argument itself, so the function does
    not depend on any global dictionary being defined.
    """
    t_y=data["rotation_y"]
    cos_t,sin_t=np.cos(t_y),np.sin(t_y)
    Mat_y=np.array([[cos_t,0,sin_t],
                    [0,1,0],
                    [-sin_t,0,cos_t]])
    return Mat_y

In [9]:
#Relation between the camera frame and the image frame for one point
def get_projection_mat(point,fx=fx,fy=fy):
    """Return the 2x3 projection [[fx/z,0,0],[0,fy/z,0]] for ``point``.

    ``z`` is the third component of ``point``; the result is a nested list.
    """
    depth=point[2]
    row_x=[fx/depth,0,0]
    row_y=[0,fy/depth,0]
    return [row_x,row_y]

In [10]:
#Get the position of the object using the geometric constraints
def get_translation(data,x0):
    """Solve for the translation (Tx,Ty,Tz) of an object from its 2D box.

    Parameters
    ----------
    data : dict
        Must contain "alpha", "Dims" (index 1 = width, index 2 = length) and
        "box" (indices 0/2 = left/right x pixel, indices 1/3 = y pixels).
    x0 : sequence of 3 numbers
        Initial guess for (Tx,Ty,Tz), passed to ``fsolve``.

    Returns
    -------
    The ``fsolve`` solution for (Tx,Ty,Tz). If its third component is
    negative, the whole vector is negated before it is returned.
    """

    
    alpha=data["alpha"]
 
    L=data["Dims"][2]  # The length of the object
    W=data["Dims"][1]  # The width of the object

    
    xi_r=data["box"][2]-x_c  #The position of the right most pixel of the 2D bounding box, relative to the image center
    xi_l=data["box"][0]-x_c  #The position of the left most pixel of the 2D bounding box, relative to the image center


    # Vertical center of the 2D bounding box, relative to the image center
    yi=0.5*(data["box"][1]+data["box"][3])-y_c
    
    # (Xcr,Zcr) is the coordinates of the right most edge of the car
    # (Xcl,Zcl) is the coordinates of the left most edge of the car
    # Which corner (+-L/2, +-W/2) is right most / left most depends on the
    # quadrant of alpha, hence the four cases below.
    
    if(alpha<=-np.pi/2):
        Xcr=-(L/2)
        Zcr=-(W/2)
        Xcl=(L/2)
        Zcl=(W/2)

    elif(alpha<=0):
        Xcr=(L/2)
        Zcr=-(W/2)
        Xcl=-(L/2)
        Zcl=(W/2)

    elif(alpha<=np.pi/2):
        Xcr=(L/2)
        Zcr=(W/2)
        Xcl=-(L/2)
        Zcl=-(W/2)

    else:
        Xcr=-(L/2)
        Zcr=(W/2)
        Xcl=(L/2)
        Zcl=-(W/2)

    # Geometric constraints between the 2D bounding box and the actual object
    def constraints(Input):
        # Residuals of the three constraints; fsolve looks for (Tx,Ty,Tz) making them all zero
        Tx,Ty,Tz=Input
        
        
        # Right and left box edges against the selected corners rotated by alpha
        eq1=(Tz)*(xi_r/fx)-Xcr*np.cos(alpha)-Zcr*np.sin(alpha)-Tx
        eq3=(Tz)*(xi_l/fx)-Xcl*np.cos(alpha)-Zcl*np.sin(alpha)-Tx
        # Vertical box center against the ratio Ty/Tz
        eq5=(Ty/Tz)-(yi/fy)
        
        return [eq1,eq3,eq5]
    out=fsolve(constraints,x0)
    # A negative Tz is flipped by negating the whole solution
    # NOTE(review): presumably to keep the object in front of the camera - confirm that
    # negating Tx and Ty as well is intended
    if(out[2]<0):
        out=-out
    return out

In [11]:
#Draw the 3D box
def Draw_box(data_dict,image):
    """Draw the projected 3D bounding box of one object onto ``image`` (in place).

    Parameters
    ----------
    data_dict : dict
        Must contain "Dims", "location" and "rotation_y".
    image : image array accepted by ``cv2.circle`` / ``cv2.line``
        Modified in place; nothing is returned.

    The eight corners are the sign combinations of the half-dimensions. Each is
    rotated about the y axis, shifted by the object location, projected with
    ``get_projection_mat`` and offset by the image center (x_c, y_c).
    """
    perm=list(itertools.product([-1,1], repeat=3))
    rotation=get_rotation_y_mat(data_dict)

    # Half-dimensions reordered to (Dims[2], Dims[0], Dims[1]); constant for all corners
    half_dims=0.5*np.asarray(data_dict["Dims"],dtype=float)
    half_extent=np.array([half_dims[2],half_dims[0],half_dims[1]])
    image_center=np.array([x_c,y_c])

    pixels=[]
    for seq in perm:
        point_car=half_extent*seq # The point in the car frame
        point_cam=data_dict["location"]+np.dot(rotation,point_car) # The point in the camera frame
        project_mat=get_projection_mat(point_cam)
        project_point=np.dot(project_mat,point_cam)+image_center # The point in the image
        pixels.append((int(project_point[0]),int(project_point[1])))

    for pixel in pixels:
        cv2.circle(image,pixel,2,(255,0,0),2)

    # Two corners share an edge when exactly one sign differs (absolute difference sum of 2).
    # Each unordered pair is visited once, so every edge is drawn a single time.
    for i,j in itertools.combinations(range(len(perm)),2):
        if sum(abs(np.array(perm[i])-np.array(perm[j])))==2:
            cv2.line(image,pixels[i],pixels[j],(255,0,0),1)

# Load the deep learning models used

In [12]:
# Build the YoloV3 detector (input size 416, 80 classes) and load its weights
weights_path='./Checkpoints/yolov3.tf'
yolo=yoloModel.YoloV3(classes=80,size=416)
yolo.load_weights(weights_path)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5670467400>

In [13]:
# Build the orientation model with BIN bins and load its weights
angle_weights_path="./Checkpoints/angle_16_Bin_5.h5"
angle_model=alpha_model.Get_Model(BIN=BIN)
angle_model.load_weights(angle_weights_path)

In [14]:
#Get the range for each bin
step=2*np.pi/BIN
bin_ranges=[[-np.pi+i*step,-np.pi+(i+1)*step] for i in range(BIN)]

# Test the model

In [15]:

kitti_image_number=3107   # index of the kitti test image to annotate
thres=0.6                 # only detections with a score above this are used
_,image=read_text_image_file(kitti_image_number,test=True) #Get the image

# Detector input: a batch of one image, resized to 416x416 and divided by 255
img=np.expand_dims(image,0)
img=tf.image.resize(img,(416,416))
img=img/255

# Detector class ids handled here -> label whose average dimensions are used
class_to_label={2:"Car",0:"Pedestrian"}

data_dict={}

t=time.time()
boxes, scores, classes, nums = yolo(img)
print(scores)
keep=scores>thres
boxes_2=boxes[keep]
classes_2=classes[keep]
for box,Class in zip(boxes_2,classes_2):
    label=class_to_label.get(int(Class))
    if label is None:
        continue   # class without average dimensions: nothing to draw

    # Scale the box by the image size (the detector boxes are presumably normalised - verify)
    box_2=box*np.array([Iw,Ih,Iw,Ih])

    # Clamp all four bounds at 0 so a negative bound cannot slice from the end of the image
    x_min,y_min,x_max,y_max=[max(int(box_2[k]),0) for k in range(4)]
    img_cropped=image[y_min:y_max,x_min:x_max,:]
    if img_cropped.size==0:
        continue   # empty crop: cv2.resize would raise on it
    img_cropped=cv2.resize(img_cropped,(64,64))

    # Fresh dictionary per detection, still bound to the notebook-level name data_dict
    data_dict={"Dims":dims_avg[label],"box":box_2}

    # Orientation: take the center of the most probable bin as alpha
    bins_prob=angle_model(np.expand_dims(img_cropped,0))
    n_bin=np.argmax(bins_prob)
    alpha=np.mean(bin_ranges[n_bin])
    data_dict["alpha"]=alpha

    # Position from the geometric constraints, then rotation about the y axis
    T=get_translation(data_dict,[1,1,1])
    data_dict["location"]=list(T)
    data_dict["rotation_y"]=get_rot_y(data_dict,fov)
    Draw_box(data_dict,image)

dt=time.time()-t
print("The elapsed time is "+str(dt))
# Shows the result in an OpenCV window and waits for a key press
cv2.imshow("Annotated image",image)
cv2.waitKey(0)
cv2.destroyAllWindows()


tf.Tensor(
[[0.98971224 0.8785205  0.82696307 0.6382371  0.5124569  0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.   