In [1]:
from typing import List, Dict
import numpy as np
import cv2
import sort
from utils import *
import pandas as pd 

In [2]:

"""
YOLO-v3 based object detector. This YOLO-v3 is pretrained on MS-COCO dataset.
"""
network = cv2.dnn.readNet("yolo/weights/yolov3.weights","yolo/cfg/yolov3.cfg") #  "yolo/cfg/coco.data"


def detect(img):
  """
  Parameters
  ----------
  img: PIL Input Image
  category: category of the object to filter(should be one of the categories from MS-COCO dataset)

  Returns
  ---------- 
  detections: List of detections. Each detection is a tuple of form (object_name, score, bbox).
  """  
  
  classes = []
  with open("yolo/data/coco.names", "r") as f: # read the coco dataset
      classes = f.read().splitlines()  

  # capture the height and width of every frame that we are going to use it scale back to the original image size
  height, width, _ = img.shape  # Frame shape (1440, 2560, 3) 

  # creating a blob input (image, scaling, size of the image) Shape (1, 3, 416, 416)
  blob = cv2.dnn.blobFromImage(img, 1/255, (416, 416), (0,0,0), swapRB=True, crop=False)

  # passing the blob into input function
  network.setInput(blob)

  # getting the output layers name ['yolo_82', 'yolo_94', 'yolo_106']
  output_layers_names = network.getUnconnectedOutLayersNames()

  # getting the output layer list len 3 [0.9875224 , 0.99220854, 0.18105118, ..., 0. ,0.,0.]], dtype=float32)] 
  layerOutputs = network.forward(output_layers_names) 

  boxes = []
  confidences = []
  class_ids = [] # represent the predicted classes

  detections = [] 

  for output in layerOutputs: # extract the information from each of the input
      # print(type(output), output.shape) <class 'numpy.ndarray'> (507, 85) <class 'numpy.ndarray'> (2028, 85) <class 'numpy.ndarray'> (8112, 85)
      
      for detection in output: # extract the information from each of the output
          det_data = []
          scores = detection[5:]
          class_id = np.argmax(scores) 
          confidence = scores[class_id]
          # print(detection[0],detection[1]) 0.8738878 0.5129194

          if confidence > 0.5:   # 0.5
              center_x = int(detection[0]*width)
              center_y = int(detection[1]*height) 

              w = int(detection[2]*width)
              h = int(detection[3]*height)

              x = int(center_x - w/2)
              y = int(center_y - h/2)

              class_name = str(classes[class_id])
              if class_name == 'person':
                  det_data.append(class_name)
                  boxes.append([x, y, w, h])
                  confidences.append((float(confidence)))
                  class_ids.append(class_id) 
                  det_data.append(confidence)
                  det_data.append([x,y,w,h])
              # first 4 coeffcient is the location of the bounding box and the 5th element is the box confidence
          if det_data:
            detections.append(tuple(det_data)) 
          #else:
          #  detections.append(np.empty((0,5)))
      # (obj, score, [cx,cy,w,h])
  return detections


In [45]:

def detect_and_track(video_filename: str) -> Dict[str, List]:
  """
  Detection and Tracking function based on YOLO-v3 object detector and kalman filter based SORT tracker.
  Parameters
    ----------
    video_frames: path to the video file. Video would be a 4 dimesional np array of shape <N, C, H, W>.
    
    Returns
    ----------
    tracks: Dictionary of tracks where each key is the objectID and value is the list of the center of the
    object on the floor plane.
  """

  data_input = np.array([0,0,0,0])
  tracks = {}
  person_tracker = sort.Sort() 

  # 1. Start reading the video file frame by frame
  cap = cv2.VideoCapture(video_filename) 

  frameID = 0
  while cap.isOpened():
    frameID += 1
    try:
      # 2. Iterate through each frame in the video
      ret, frame = cap.read() 

      img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 
      
      # If video end reached
      if not ret:
          break 

      # 3. Get the detections from the object detector
      detections = detect(img)

      # 4. Transform the detected points on floor plane from camera image plane
      detections_on_floor_plane = []
      dets = np.empty((0,5))  
      for (obj, score, [cx,cy,w,h]) in detections:
          #convert coordinates cx,cy,w,h to x1,y1,x2,y2. Project them onto floor plane and
          # reorder the results to (bbox, score, object_name)
          x1, y1, x2, y2 = get_corner_coordinates([cx, cy, w, h])
          detection = np.array([x1, y1, x2, y2, score])   
          dets = np.vstack((dets,detection)) 

      try:
          # 5. Find association of the detected objects and add the objects into list of tracks Using SORT.
          if detections is not None:
            # 6. Update the tracks
            tracked_persons = person_tracker.update(dets)
            #print(tracked_persons)

            for x1, y1, x2, y2, personid in tracked_persons:
              # 7. For each tracked object, get the center pixel on the image plane and add it to the object trajectory.
              center_pos = (int((x1 + x2)/2), int(y1 + y2)/2)
              tracks[personid] = tracks.get(personid, []) + [center_pos]
              for each_record in tracks[personid]:
                record = np.array([frameID, personid, each_record[0], each_record[1] ]) 
                data_input=np.vstack((data_input, record))

      except Exception as e:
          print(e) 
          break
    except Exception as ex:
      print(ex)
      break 
  return tracks, data_input 

In [54]:
if __name__ == '__main__':
  video_path= 'Videos/cam3_004.mp4'
  tracks, data_input = detect_and_track(video_path) 
  data_input = data_input[data_input[:, 1].argsort()]
  data_input = data_input[data_input[:, 0].argsort(kind='mergesort')]    

OpenCV(4.5.5) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'



In [55]:
print(data_input)

[[   0.     0.     0.     0. ]
 [  24.  2156.  1994.   171. ]
 [  24.  2157.  2108.   174. ]
 ...
 [ 619.  2675.   628.    87. ]
 [ 619.  2675.   630.    86.5]
 [ 619.  2675.   630.    87.5]]


In [56]:
print(data_input.shape) 

(23621, 4)


In [57]:
pixel_pos_raw = pd.DataFrame(data_input)
pixel_pos = pixel_pos_raw.T 
pixel_pos = pixel_pos.iloc[:,1:]  
pixel_pos.to_csv('pixel_pos_pixels.csv', index=None)  

In [58]:
from typing import List
import cv2
import numpy as np

def get_inverse_pespective(perspective_matrix: np.array)-> np.array:
  """
  This method calculates the inverse of perspective matrix by homography. 
  - Takes 4 random points on the floor plane(destination_plane) and calculates the corresponding points 
  on the camera image plane(src_plane) using perspective matrix.
  - Calculates the Homography matrix to map any point in image plane to floor plane.

  Parameters
  ----------
  perspective_matrix: 3 x 4 camera prespective matrix to convert 3d homogeneous world coordinates to 
  2d homogeneous camera coordinates.

  Returns
  ----------
  3x3 homography matrix for moving from 2d homogeneous image plane to world floor plane(at z=0)
  """
  
  #Take 5 homogenous points on the floor(Unit is in Meters)
  pts_dst = np.array([[0,0,0,1],
                      [0,1,0,1],
                      [1,0,0,1],
                      [1,1,0,1],
                      [0,0,0,1]
                    ])
  #Obtain respective homogenous points on the image plane
  pts_src = (perspective_matrix @ pts_dst.T).T
  
  #convert homogenous coordinates to cartesian coordinates
  pts_src_cart = np.array([[x/w, y/w] for x,y,w in pts_src])
  pts_dst_cart = np.array([[x/w, y/w] for x,y,z,w in pts_dst])
  
  #find the 3x3 Homography Matrix for transforming image plane to floor plane
  h, status = cv2.findHomography(pts_src_cart, pts_dst_cart)
  return h


def project_to_floor(image_coordinates: List[int], h: np.array) -> List[int]: 
  """
  This method takes the Homography matrix and the 2d image cartesian coordinates. It returns the (x, y)
  cartesian coordinates in 3d cartesian world coordinates on floor plane(at z=0). Notice that z coordinate is omitted
  here and added inside the tracking function. 
  
  Parameters
  ----------
  image_coordinates: 2d pixel coordinates (x,y)
  h: 3x3 Homography matrix np.array[3x3]

  Returns
  ----------
  floor_coordinates: List of x, y coordinates in 3d world of same pixel on floor plane i.e. (x,y,z) Considering z=0 and 
  ommitted here.
  """
  #adding 1 for homogenous coordinate system
  x, y, w = h@np.array([[*image_coordinates, 1]]).T
  return [x/w, y/w]


In [59]:
# https://towardsdatascience.com/how-to-transform-a-2d-image-into-a-3d-space-5fc2306e3d36

def transform(video_filename,
              translation=(0, 0, 0),
              rotation=(0, 0, 0),
              scaling=(1, 1, 1),
              shearing=(0, 0, 0)):
    import numpy as np
    import cv2
    
    cap = cv2.VideoCapture(video_filename) 
    _, frame = cap.read()  


    # get the values on each axis
    t_x, t_y, t_z = translation
    r_x, r_y, r_z = rotation
    sc_x, sc_y, sc_z = scaling
    sh_x, sh_y, sh_z = shearing
    
    # convert degree angles to rad
    theta_rx = np.deg2rad(r_x)
    theta_ry = np.deg2rad(r_y)
    theta_rz = np.deg2rad(r_z)
    theta_shx = np.deg2rad(sh_x)
    theta_shy = np.deg2rad(sh_y)
    theta_shz = np.deg2rad(sh_z)
    
    # get the height and the width of the image
    h, w = frame.shape[:2]
    # compute its diagonal
    diag = (h ** 2 + w ** 2) ** 0.5
    # compute the focal length
    f = diag
    if np.sin(theta_rz) != 0:
        f /= 2 * np.sin(theta_rz)
        
    # set the image from cartesian to projective dimension
    H_M = np.array([[1, 0, -w / 2],
                    [0, 1, -h / 2],
                    [0, 0,      1],
                    [0, 0,      1]])
    # set the image projective to carrtesian dimension
    Hp_M = np.array([[f, 0, w / 2, 0],
                     [0, f, h / 2, 0],
                     [0, 0,     1, 0]])
    """
            We will define our matrices here in next parts
                                                            """
    Identity = np.array([[1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]])
    
    # adjust the translation on z
    t_z = (f - t_z) / sc_z ** 2
    # translation matrix to translate the image
    T_M = np.array([[1, 0, 0, t_x],
                    [0, 1, 0, t_y],
                    [0, 0, 1, t_z],
                    [0, 0, 0,  1]])

        # calculate cos and sin of angles
    sin_rx, cos_rx = np.sin(theta_rx), np.cos(theta_rx)
    sin_ry, cos_ry = np.sin(theta_ry), np.cos(theta_ry)
    sin_rz, cos_rz = np.sin(theta_rz), np.cos(theta_rz)
    # get the rotation matrix on x axis
    R_Mx = np.array([[1,      0,       0, 0],
                     [0, cos_rx, -sin_rx, 0],
                     [0, sin_rx,  cos_rx, 0],
                     [0,      0,       0, 1]])
    # get the rotation matrix on y axis
    R_My = np.array([[cos_ry, 0, -sin_ry, 0],
                     [     0, 1,       0, 0],
                     [sin_ry, 0,  cos_ry, 0],
                     [     0, 0,       0, 1]])
    # get the rotation matrix on z axis
    R_Mz = np.array([[cos_rz, -sin_rz, 0, 0],
                     [sin_rz,  cos_rz, 0, 0],
                     [     0,       0, 1, 0],
                     [     0,       0, 0, 1]])
    # compute the full rotation matrix
    R_M = np.dot(np.dot(R_Mx, R_My), R_Mz)


    # get the scaling matrix
    Sc_M = np.array([[sc_x,     0,    0, 0],
                     [   0,  sc_y,    0, 0],
                     [   0,     0, sc_z, 0],
                     [   0,     0,    0, 1]])

    # get the tan of angles
    tan_shx = np.tan(theta_shx)
    tan_shy = np.tan(theta_shy)
    tan_shz = np.tan(theta_shz)
    # get the shearing matrix on x axis
    Sh_Mx = np.array([[      1, 0, 0, 0],
                      [tan_shy, 1, 0, 0],
                      [tan_shz, 0, 1, 0],
                      [      0, 0, 0, 1]])
    # get the shearing matrix on y axis
    Sh_My = np.array([[1, tan_shx, 0, 0],
                      [0,       1, 0, 0],
                      [0, tan_shz, 1, 0],
                      [0,       0, 0, 1]])
    # get the shearing matrix on z axis
    Sh_Mz = np.array([[1, 0, tan_shx, 0],
                      [0, 1, tan_shy, 0],
                      [0, 0,       1, 0],
                      [0, 0,       0, 1]])
    # compute the full shearing matrix
    Sh_M = np.dot(np.dot(Sh_Mx, Sh_My), Sh_Mz)


    # compute the full transform matrix
    M = Identity
    M = np.dot(T_M,  M)
    M = np.dot(R_M,  M)
    M = np.dot(Sc_M, M)
    M = np.dot(Sh_M, M)
    M = np.dot(Hp_M, np.dot(M, H_M))
    # apply the transformation
    return M 

In [60]:
tracks

{2157.0: [(2108, 174.0), (2099, 171.5)],
 2156.0: [(1994, 171.0), (1990, 167.0)],
 2158.0: [(2097, 193.0)],
 2159.0: [(1225, 161.0),
  (1222, 161.5),
  (1219, 163.0),
  (1218, 162.0),
  (1216, 163.5),
  (1211, 164.5),
  (1209, 166.0),
  (1207, 164.5),
  (1205, 164.0)],
 2165.0: [(1515, 866.5), (1517, 879.0), (1575, 901.5), (1587, 913.0)],
 2164.0: [(1507, 883.0), (1515, 854.0)],
 2160.0: [(1667, 174.5),
  (1668, 173.5),
  (1668, 173.0),
  (1668, 173.0),
  (1668, 174.5),
  (1666, 174.5)],
 2168.0: [(1516, 843.5),
  (1517, 844.0),
  (1519, 846.0),
  (1522, 850.5),
  (1526, 852.5),
  (1539, 857.0),
  (1548, 856.0)],
 2167.0: [(973, 484.0),
  (977, 485.0),
  (977, 486.0),
  (972, 486.5),
  (964, 486.0),
  (961, 482.0),
  (961, 479.5),
  (964, 479.5),
  (969, 478.5),
  (972, 477.0),
  (979, 473.5),
  (987, 470.5),
  (990, 470.0),
  (995, 465.0),
  (999, 465.0),
  (998, 469.0),
  (997, 472.5),
  (995, 473.0),
  (999, 470.0),
  (998, 472.5),
  (993, 474.5),
  (990, 479.0)],
 2166.0: [(1838, 3

In [61]:
# Dummy perspective matrix 
import numpy  as np 
v_path = 'Videos/cam3_004.mp4'
perspective_matrix = transform(v_path)


In [62]:
perspective_matrix_padded = np.hstack((perspective_matrix,np.ones((3,1))))
perspective_matrix_padded = perspective_matrix_padded + 0.01
perspective_matrix_padded

array([[2.93721956e+03, 1.00000000e-02, 1.28001000e+03, 1.01000000e+00],
       [1.00000000e-02, 2.93721956e+03, 7.20010000e+02, 1.01000000e+00],
       [1.00000000e-02, 1.00000000e-02, 2.93821956e+03, 1.01000000e+00]])

In [63]:
h= get_inverse_pespective(perspective_matrix_padded)  

In [64]:
data_input_copy = data_input.copy()
data_input_copy= data_input_copy[1:]
data_input_copy

array([[  24. , 2156. , 1994. ,  171. ],
       [  24. , 2157. , 2108. ,  174. ],
       [  25. , 2156. , 1994. ,  171. ],
       ...,
       [ 619. , 2675. ,  628. ,   87. ],
       [ 619. , 2675. ,  630. ,   86.5],
       [ 619. , 2675. ,  630. ,   87.5]])

In [67]:
new_data_input = np.zeros_like(data_input_copy[0]) 

for row in data_input_copy:
    point = (row[2],row[3])
    [x_floor, y_floor] = project_to_floor(point,h) 
    x_floor_val = x_floor[0] 
    y_floor_val = y_floor[0] 
    frameID = row[0]
    personID = row[1] 
    #record = np.array([frameID, personid, each_record[0], each_record[1] ]) 
    record2 = np.array([frameID, personID, x_floor_val, y_floor_val ])
    print(record2) 
    new_data_input=np.vstack((new_data_input, record2)) 
new_data_input
new_data_input = new_data_input[new_data_input[:, 1].argsort()]
new_data_input = new_data_input[new_data_input[:, 0].argsort(kind='mergesort')]    

[2.40000000e+01 2.15600000e+03 6.90404760e-01 5.88905214e-02]
[2.40000000e+01 2.15700000e+03 7.30189071e-01 5.99538250e-02]
[2.50000000e+01 2.15600000e+03 6.90404760e-01 5.88905214e-02]
[2.50000000e+01 2.15600000e+03 6.89000196e-01 5.75032843e-02]
[2.50000000e+01 2.15700000e+03 7.30189071e-01 5.99538250e-02]
[2.50000000e+01 2.15700000e+03 7.27041398e-01 5.90851088e-02]
[2.50000000e+01 2.15800000e+03 7.26396918e-01 6.65401757e-02]
[2.60000000e+01 2.15900000e+03 4.22881873e-01 5.52786762e-02]
[2.70000000e+01 2.15900000e+03 4.22881873e-01 5.52786762e-02]
[2.70000000e+01 2.15900000e+03 4.21841791e-01 5.54509479e-02]
[2.70000000e+01 2.16400000e+03 5.22103654e-01 3.05773853e-01]
[2.70000000e+01 2.16500000e+03 5.24861799e-01 3.00044840e-01]
[2.80000000e+01 2.15900000e+03 4.20803165e-01 5.59688939e-02]
[2.80000000e+01 2.15900000e+03 4.22881873e-01 5.52786762e-02]
[2.80000000e+01 2.15900000e+03 4.21841791e-01 5.54509479e-02]
[2.80000000e+01 2.16000000e+03 5.76487463e-01 6.00363595e-02]
[2.80000

In [68]:
new_data_input = new_data_input[1:]
new_data_input
zero_row = np.zeros((len(new_data_input),))

In [69]:
pixel_pos_raw2 = pd.DataFrame(new_data_input)
pixel_pos2 = pixel_pos_raw2.T 
zseries = pd.Series(zero_row)
pixel_pos2 = pixel_pos2.append(zseries,ignore_index=True)
pixel_pos2

  pixel_pos2 = pixel_pos2.append(zseries,ignore_index=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23610,23611,23612,23613,23614,23615,23616,23617,23618,23619
0,24.0,24.0,25.0,25.0,25.0,25.0,25.0,26.0,27.0,27.0,...,618.0,618.0,618.0,618.0,619.0,619.0,619.0,619.0,619.0,619.0
1,2156.0,2157.0,2156.0,2156.0,2157.0,2157.0,2158.0,2159.0,2159.0,2159.0,...,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0
2,0.690405,0.730189,0.690405,0.689,0.730189,0.727041,0.726397,0.422882,0.421842,0.422882,...,0.215092,0.216128,0.216127,0.215784,0.216127,0.215092,0.215784,0.216128,0.216818,0.216818
3,0.058891,0.059954,0.058891,0.057503,0.059954,0.059085,0.06654,0.055279,0.055451,0.055279,...,0.030161,0.029989,0.029644,0.030851,0.029644,0.030161,0.030851,0.029989,0.029472,0.029817
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
pixel_pos2.to_csv('pixel_pos_group_pix.csv', index=None, columns=None, header=None)  