# Imports

In [36]:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile

import collections
from numpy import dot
from scipy.linalg import inv, block_diag
import cv2

# Open the clip that the detector will be run on.
cap = cv2.VideoCapture("prank.mp4")

# OpenCV reports the frame dimensions as floats and the values depend on the
# source, so they are read from the capture and cast to integers for the writer.
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Annotated frames are written as Motion-JPEG at 10 frames per second.
frame_size = (frame_width, frame_height)
mjpg_fourcc = cv2.VideoWriter_fourcc('M','J','P','G')
out = cv2.VideoWriter('output.avi', mjpg_fourcc, 10, frame_size)

from distutils.version import StrictVersion
from collections import defaultdict
from io import StringIO
import matplotlib.pyplot as plt
from PIL import Image

%matplotlib inline
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# seen with some TensorFlow/matplotlib installs -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# The notebook is stored in the object_detection folder, so the parent
# directory must be on the import path before object_detection is imported.
sys.path.append("..")
from object_detection.utils import ops as utils_ops

# Refuse to run on TensorFlow releases older than 1.9.0.
installed_tf_version = StrictVersion(tf.__version__)
minimum_tf_version = StrictVersion('1.9.0')
if installed_tf_version < minimum_tf_version:
  raise ImportError('Please upgrade your TensorFlow installation to v1.9.* or later!')


## Object detection imports
Here are the imports from the object detection module.

In [4]:
from utils import label_map_util
from utils import visualization_utils as vis_util

# Model preparation 

## Variables

Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_FROZEN_GRAPH` to point to a new .pb file.  

By default we use an "SSD with Mobilenet" model here. See the [detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run out-of-the-box with varying speeds and accuracies.

In [5]:
# Name of the pre-trained model; the archive name and the extracted folder
# are both derived from it.
MODEL_NAME = 'ssd_mobilenet_v1_coco_2017_11_17'

# Where the archive is fetched from and what it is called.
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
MODEL_FILE = ''.join((MODEL_NAME, '.tar.gz'))

# Frozen detection graph: the actual model used for object detection.
PATH_TO_FROZEN_GRAPH = ''.join((MODEL_NAME, '/frozen_inference_graph.pb'))

# Label map file used to attach the correct label string to each box.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')

## Download Model

In [6]:
# Fetch the model archive into the working directory. urlretrieve is used
# because the URLopener class is deprecated.
urllib.request.urlretrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)

# Unpack only the frozen inference graph. The context manager guarantees the
# archive is closed even if extraction fails.
# NOTE(review): the archive comes from a plain-HTTP download and its member
# paths are extracted as-is -- only use with a trusted DOWNLOAD_BASE.
with tarfile.open(MODEL_FILE) as tar_file:
  for member in tar_file.getmembers():
    if 'frozen_inference_graph.pb' in os.path.basename(member.name):
      tar_file.extract(member, os.getcwd())

## Load a (frozen) Tensorflow model into memory.

In [7]:
# Read the serialized GraphDef from disk.
with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
  serialized_graph = fid.read()

# Parse the bytes into a GraphDef message.
od_graph_def = tf.GraphDef()
od_graph_def.ParseFromString(serialized_graph)

# Import the parsed definition into a dedicated graph; the empty name keeps
# the tensor names un-prefixed.
detection_graph = tf.Graph()
with detection_graph.as_default():
  tf.import_graph_def(od_graph_def, name='')

## Loading label map
Label maps map indices to category names, so that when our convolutional network predicts `5`, we know that this corresponds to `airplane`. Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine.

In [8]:
# Build the class-id -> category lookup from the label map file, asking for
# display names.
category_index = label_map_util.create_category_index_from_labelmap(
    PATH_TO_LABELS,
    use_display_name=True)

# Utility functions

In [9]:
from utils.visualization_utils import STANDARD_COLORS
from utils.visualization_utils import draw_bounding_box_on_image_array as draw_box

def boxes_to_class_map(
    boxes,
    classes,
    scores,
    category_index,
    max_boxes_to_draw=20,
    min_score_thresh=.5,
    main_class='person'):
  """Keep only confident detections of a single class.

  Args:
    boxes: Array-like of detection boxes, four values per box.
    classes: Class id of every detection, used as a key into category_index.
    scores: Confidence score of every detection.
    category_index: Mapping from class id to a dict with a 'name' entry.
    max_boxes_to_draw: Accepted for interface compatibility; not applied.
    min_score_thresh: A detection is kept only if its score is strictly
      greater than this value.
    main_class: Category name to keep. Defaults to 'person'.

  Returns:
    A tuple (final_boxes, final_scores) of float arrays with shapes (k, 4)
    and (k,), holding the detections whose category name equals main_class
    and whose score exceeds min_score_thresh, in their original order.
  """

  # Normalise the inputs so that a single detection squeezed down to one
  # dimension is handled the same way as a batch of detections.
  boxes = np.asarray(boxes, dtype=float).reshape(-1, 4)
  classes = np.atleast_1d(classes)
  scores = np.atleast_1d(scores)

  keep = []
  for i in range(boxes.shape[0]):
    if not scores[i] > min_score_thresh:
      continue
    # Class ids that are missing from the label map are skipped rather than
    # aborting the whole frame.
    category = category_index.get(classes[i])
    if category is not None and category.get('name') == main_class:
      keep.append(i)

  # Select all surviving rows at once instead of growing the arrays one row
  # at a time.
  keep = np.array(keep, dtype=int)
  final_boxes = boxes[keep].reshape(-1, 4)
  final_scores = np.asarray(scores, dtype=float)[keep]

  return (final_boxes, final_scores)

def visualize_boxes(
    image, 
    boxes_dict,
    classes,
    category_index,
    main_class = 'person'):
  """Draw one labelled box per tracked instance onto the image.

  Args:
    image: Image array passed to draw_box, which draws onto it.
    boxes_dict: Mapping from integer track id to a box given as
      (ymin, xmin, ymax, xmax); coordinates are passed on as normalized.
    classes: Unused.
    category_index: Unused.
    main_class: Text placed before the track id in each box caption.

  Returns:
    The image that was passed in.
  """

  colors_by_box = {}
  captions_by_box = {}

  # Collect a color and the captions for every distinct box. Ids that share
  # the same coordinates end up on one box: captions accumulate and the color
  # of the last id wins.
  for track_id in boxes_dict:
    coords = tuple(boxes_dict[track_id])
    caption = '{} {}'.format(main_class, track_id)
    captions_by_box.setdefault(coords, []).append(caption)
    colors_by_box[coords] = STANDARD_COLORS[track_id % len(STANDARD_COLORS)]

  # Render every distinct box once.
  for coords in colors_by_box:
    ymin, xmin, ymax, xmax = coords
    draw_box(
        image,
        ymin,
        xmin,
        ymax,
        xmax,
        color=colors_by_box[coords],
        thickness=8,
        display_str_list=captions_by_box[coords],
        use_normalized_coordinates=True)

  return image
  
def get_iou(box1, box2):
  """Return the intersection-over-union of two boxes.

  Args:
    box1: Four values in the order (ymin, xmin, ymax, xmax).
    box2: Four values in the same order as box1.

  Returns:
    The overlap area divided by the union area. 0.0 is returned when the
    boxes do not overlap or when the union has no area.
  """

  ymin1, xmin1, ymax1, xmax1 = box1
  ymin2, xmin2, ymax2, xmax2 = box2

  overlap_width = min(xmax1, xmax2) - max(xmin1, xmin2)
  overlap_height = min(ymax1, ymax2) - max(ymin1, ymin2)

  # A non-positive extent on either axis means the boxes are disjoint.
  # Without this guard two negative extents would multiply into a positive
  # "intersection" and report overlap for boxes that are far apart.
  if overlap_width <= 0 or overlap_height <= 0:
    return 0.0

  intersection = overlap_width*overlap_height
  area1 = (xmax1 - xmin1)*(ymax1 - ymin1)
  area2 = (xmax2 - xmin2)*(ymax2 - ymin2)
  union = area1 + area2 - intersection

  # Degenerate boxes would otherwise cause a division by zero.
  if union <= 0:
    return 0.0

  return intersection/union

# Main class for object tracking with Kalman filter

In [20]:
class Tracker():
  """Kalman filter with a constant-velocity model for one bounding box.

  The state vector interleaves the four box coordinates with their rates of
  change: [up, up_dot, left, left_dot, down, down_dot, right, right_dot].
  Only the four coordinates are measured.
  """

  def __init__(self):
    # Bookkeeping
    self.id = 0  # tracker's id
    self.box = []  # coordinates of the current bounding box
    self.hits = 0  # number of detection matches
    self.no_losses = 0  # number of unmatched tracks (track loss)

    # Filter state; the caller assigns the initial state vector.
    self.x_state = []
    self.dt = 1.  # time interval

    # Process matrix: every (coordinate, rate) pair advances independently
    # under the constant-velocity assumption.
    pair_transition = np.array([[1, self.dt],
                                [0, 1]])
    self.F = block_diag(pair_transition, pair_transition,
                        pair_transition, pair_transition)

    # Measurement matrix: picks the coordinates (state entries 0, 2, 4, 6).
    self.H = np.zeros((4, 8), dtype=int)
    self.H[np.arange(4), 2*np.arange(4)] = 1

    # State covariance
    self.L = 100.0
    self.P = self.L*np.eye(8)

    # Process covariance, one identical 2x2 block per (coordinate, rate) pair
    dt = self.dt
    self.Q_comp_mat = np.array([[dt**4/2., dt**3/2.],
                                [dt**3/2., dt**2]])
    self.Q = block_diag(*([self.Q_comp_mat]*4))

    # Measurement covariance
    self.R_ratio = 1.0/16.0
    self.R_diag_array = np.full(4, self.R_ratio*self.L)
    self.R = np.diag(self.R_diag_array)

  def update_R(self):
    """Rebuild the measurement covariance R from the current R_ratio and L."""
    self.R = np.diag(np.full(4, self.R_ratio*self.L))

  def _propagate(self):
    """Advance the covariance one step and return the predicted state.

    self.P is updated in place; self.x_state is left for the caller to set.
    """
    predicted = dot(self.F, self.x_state)
    self.P = dot(self.F, self.P).dot(self.F.T) + self.Q
    return predicted

  def kalman_filter(self, z):
    """Run one predict + update cycle with the measurement z.

    The corrected state is stored in self.x_state and the covariance in
    self.P.
    """
    state = self._propagate()

    # Update
    innovation_cov = dot(self.H, self.P).dot(self.H.T) + self.R
    gain = dot(self.P, self.H.T).dot(inv(innovation_cov))  # Kalman gain
    residual = z - dot(self.H, state)
    state += dot(gain, residual)
    self.P = self.P - dot(gain, self.H).dot(self.P)
    self.x_state = state

  def predict_only(self):
    """Run only the predict stage.

    Used for unmatched detections and unmatched tracks.
    """
    self.x_state = self._propagate()

  def get_bbox(self):
    """Return the entries of self.box reordered as indices 2, 1, 0, 3."""
    return [self.box[i] for i in (2, 1, 0, 3)]
    

# Instance identification method

In [34]:
def _state_to_box(x_state):
  """Extract the four box coordinates (entries 0, 2, 4, 6) from a column state."""
  flat = x_state.T[0].tolist()
  return [flat[0], flat[2], flat[4], flat[6]]


def boxes_to_ids_map(
    boxes, 
    tracker_list, 
    min_iou_thresh=.3):
  """Assign a track id to every detected box.

  Each box is compared with the trackers that existed when the call started.
  If the best IOU exceeds min_iou_thresh, that tracker is corrected with the
  box; otherwise a new Tracker is created and appended to tracker_list with
  id len(tracker_list) + 1. Trackers that received no box are advanced with
  a predict-only step.

  Args:
    boxes: Array of shape (n, 4) with one detection box per row.
    tracker_list: List of Tracker objects; mutated in place.
    min_iou_thresh: Minimum IOU for a box to be treated as an existing track.

  Returns:
    A dictionary mapping track id to the filtered box (array of 4 values)
    for every box passed in.
  """

  current_boxes_dict = collections.defaultdict(list)
  # Snapshot, so that trackers created during this call are neither matched
  # against nor predicted a second time below.
  known_trackers = list(tracker_list)
  next_id = len(known_trackers) + 1
  matched_indices = set()

  # Evaluate the IOU of every input box against every known tracker
  for detection in boxes:
    best_iou = 0
    best_idx = None

    for idx, trk in enumerate(known_trackers):
      iou = get_iou(detection, trk.box)

      if iou > best_iou:
        best_iou = iou
        best_idx = idx

    if best_idx is not None and best_iou > min_iou_thresh:
      # Existing instance: correct the tracker with the measurement. The
      # tracker is taken by position, so the lookup does not depend on ids
      # being equal to index + 1.
      trk = known_trackers[best_idx]
      measurement = np.asarray(detection, dtype=float).reshape(4, 1)
      trk.kalman_filter(measurement)
      matched_indices.add(best_idx)
    else:
      # Brand new instance: start from the detected coordinates with zero
      # velocity. The state is filled in as a plain float column; mixing
      # size-1 arrays with scalars in one literal produces a ragged array,
      # which recent NumPy releases reject.
      trk = Tracker()
      initial_state = np.zeros((8, 1))
      initial_state[::2, 0] = detection
      trk.x_state = initial_state
      trk.predict_only()
      trk.id = next_id
      tracker_list.append(trk)
      next_id += 1

    trk.box = _state_to_box(trk.x_state)
    current_boxes_dict[trk.id] = np.squeeze(trk.box)

  # Advance the tracks that were not matched by any box
  for idx, trk in enumerate(known_trackers):
    if idx not in matched_indices:
      trk.predict_only()
      trk.box = _state_to_box(trk.x_state)

  return current_boxes_dict

# Inference

In [None]:
with detection_graph.as_default():
  with tf.Session(graph=detection_graph) as sess:
    # Remembers all entrances of every instance in a video
    tracker_list = []

    # The graph's input and output tensors are the same for every frame, so
    # they are looked up once instead of on each iteration.
    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
    fetches = [
        # Each box represents a part of the image where an object was detected.
        detection_graph.get_tensor_by_name('detection_boxes:0'),
        # Each score is the level of confidence for the corresponding box.
        detection_graph.get_tensor_by_name('detection_scores:0'),
        detection_graph.get_tensor_by_name('detection_classes:0'),
        detection_graph.get_tensor_by_name('num_detections:0'),
    ]

    try:
      while True:
        ret, image_np = cap.read()

        # cap.read() reports failure at the end of the video (or on a read
        # error); stop instead of feeding an invalid frame to the model.
        if not ret:
          break

        # NOTE(review): OpenCV delivers frames in BGR channel order while the
        # detection models are presumably trained on RGB input -- confirm
        # whether a colour conversion is wanted before inference.

        # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
        image_np_expanded = np.expand_dims(image_np, axis=0)

        # Get the inference
        (boxes, scores, classes, num_detections) = sess.run(
            fetches,
            feed_dict={image_tensor: image_np_expanded})

        # Drop all boxes except those for 'person' class
        final_boxes, final_scores = boxes_to_class_map(
          np.squeeze(boxes),
          np.squeeze(classes),
          np.squeeze(scores),
          category_index)

        # Recognise all of the received instances
        output_dict = boxes_to_ids_map(final_boxes, tracker_list)

        # Render boxes with ids
        visualize_boxes(
          image_np,
          output_dict,
          np.squeeze(classes).astype(np.int32),
          category_index)

        out.write(image_np)

        if cv2.waitKey(25) & 0xFF == ord('q'):
          break
    finally:
      # Release the capture and the writer even when a frame fails, so the
      # output file is finalised.
      cap.release()
      out.release()
      cv2.destroyAllWindows()