### Introduction to Computer Vision (Spring 2020)
Instructor: Muhammad Fahim \\
TA: Marcus Ebner

### Acknowledgement
This lab was maintained by Marcus

# Object Tracking


In [1]:
from IPython.display import clear_output
import cv2
import sys
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (16, 10) # (w, h)
import numpy as np
import time
import math
from google.colab.patches import cv2_imshow
import imutils
!pip install tf-nightly-gpu-2.0-preview

# For running inference on the TF-Hub module.
import tensorflow as tf
import tensorflow_hub as hub

# For downloading the image.
import matplotlib.pyplot as plt
import tempfile
from six.moves.urllib.request import urlopen
from six import BytesIO

# For drawing onto the image.
import numpy as np
from PIL import Image
clear_output()

ModuleNotFoundError: No module named 'google.colab'

In [0]:
!wget "https://drive.google.com/uc?export=view&id=1skt6SmYqjQ_jWh4jIkpe3njILEIaS72-" -O vid.mp4
!wget "https://drive.google.com/uc?id=1Ckgprba7Yd6a-nQQi9AfXvSEGsEZXpcz&authuser=0&export=download" -O cars.mp4

## Helper functions

In [0]:
def show_in_row(list_of_images, titles = None, disable_ticks = False):
  count = len(list_of_images)
  for idx in range(count):
    subplot = plt.subplot(1, count, idx+1)
    if titles is not None:
      subplot.set_title(titles[idx])
      
    img = list_of_images[idx]
    cmap = 'gray' if (len(img.shape) == 2 or img.shape[2] == 1) else None
    subplot.imshow(img, cmap=cmap)
    if disable_ticks:
      plt.xticks([]), plt.yticks([])
  plt.show()
  

Additional functions

In [0]:
def detect_red(frame):
  """
  Detects the largest red object in the frame and returns its bounding box.
  """
  state = []
  track_frame = cv2.inRange(frame.copy(), (0, 0, 140), (60, 60, 255))
  track_frame = cv2.dilate(track_frame, np.ones((39,39), np.uint8))
  track_frame = cv2.erode(track_frame, np.ones((29,29), np.uint8))
  
  contour = cv2.findContours(track_frame, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  contour = imutils.grab_contours(contour)
  if len(contour)>0:
    c = max(contour, key=cv2.contourArea)
    (cent_x, cent_y), radius = cv2.minEnclosingCircle(c)
    cent_x, cent_y, radius = int(cent_x),int(cent_y), int(radius)
    state.append((cent_x-radius, cent_y-radius, 2*radius, 2*radius))
    if (4*radius*radius)<1000:
      state = []
  return state


def create_tracker(tracker_type='KCF'):
  if tracker_type == 'BOOSTING':
      tracker = cv2.TrackerBoosting_create()
  elif tracker_type == 'MIL':
      tracker = cv2.TrackerMIL_create()
  elif tracker_type == 'KCF':
      tracker = cv2.TrackerKCF_create()
  elif tracker_type == 'TLD':
      tracker = cv2.TrackerTLD_create()
  elif tracker_type == 'MEDIANFLOW':
      tracker = cv2.TrackerMedianFlow_create()
  elif tracker_type == 'GOTURN':
      tracker = cv2.TrackerGOTURN_create()
  elif tracker_type == 'MOSSE':
      tracker = cv2.TrackerMOSSE_create()
  elif tracker_type == "CSRT":
      tracker = cv2.TrackerCSRT_create()
  else:
      print("No valid tracker type given! Using type 'KCF'.")
      tracker = cv2.TrackerKCF_create()

  return tracker

## Detecting and Tracking of a Single Object

In [0]:
def track(video, frame, bbox, output_writer, num_frames=5):
  """
  Tracks given object for num_frames frames, draws a bounding box around the object
  and writes those frames to an output.
  """
  
  # Create and initialize a tracker using the function from above
  # See documentation for cv::Tracker to see how to initialize it
  < your code here >

  for i in range(num_frames):
    ok, frame = video.read()
    if not ok:
        print("[function 'track'] Error reading frame from video!")
        break
    
    # Update the tracker with the current frame
    # If the tracking was successful, draw a rectangle/bounding box around the object
    # If not successful, draw text on the frame indicating a tracking error
    < your code here>

    # Display the frame (in the output below or into a video file)
    show_in_row([frame])
    # output_writer.write(frame)


def detect_object_and_track(path, output_writer, num_frames):
  """
  Opens a video, tries to detect a large, red object and then tries to track it
  for the next num_frames frames and draw a bounding box around it.
  """
  video = cv2.VideoCapture(path)

  try:
    while(True):
      ret, frame = video.read()
      if not ret:
        print("[function 'detect_object_and_track'] Error reading frame from video!")
        video.release()
        break

      # Detect a red object. If found, draw a bounding box around the object,
      # display or write the frame (image) with the bounding box to a video
      # and track the object for the next num_frames frames.
      # If it wasn't found, just display or write the unchanged image.
      < your code here >

  except KeyboardInterrupt:
    video.release()

out = cv2.VideoWriter('output_single.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 30, (640, 360))
detect_object_and_track('vid.mp4', out, 5)
out.release()

## Detect (via SSD) and Track Multiple Objects

In [0]:
# Easy to use car detector that takes an image and ouputs a list of bounding boxes
class CarDetector():
  def __init__(self):
    module_handle = "https://tfhub.dev/google/openimages_v4/ssd/mobilenet_v2/1"
    self.detector = hub.load(module_handle).signatures['default']
  
  def filter_boxes(self, image, boxes, class_names, scores, max_boxes=10, min_score=0.6, class_of_interest='Car'):
    im_height, im_width = image.shape[:2]
    state=[]

    for i in range(min(boxes.shape[0], max_boxes)):
      if scores[i] >= min_score and class_names[i].decode("ascii")==class_of_interest:
        [ymin, xmin, ymax, xmax] = boxes[i]
        (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                  ymin * im_height, ymax * im_height)
        left, right, top, bottom = min(left, right), max(left, right), min(top, bottom), max(top, bottom)
        left, right, top, bottom  = int(left), int(right), int(top), int(bottom) 
        state.append((left, top, right-left, bottom-top))
    return state

  def detect(self, img):
    converted_img  = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]
    result = self.detector(converted_img)

    result = {key:value.numpy() for key,value in result.items()}
    bbox = self.filter_boxes(img, result["detection_boxes"],
        result["detection_class_entities"], result["detection_scores"])
    return bbox


def track_multiple(video, frame, bboxes, output_writer, num_frames=5):
  """
  Tracks object for num_frames frames
  """
  
  # Create a MultiTracker and for each bounding box, add a tracker to it
  # See the documentation for cv::MultiTracker to find the necessary functions
  < your code here >

  for i in range(num_frames):
    ok, frame = video.read()
    if not ok:
        break
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Detector uses RGB, so we will work in this space
    
    # Feed the tracker with a new image and draw a bounding box for each
    # tracked object onto the frame
    < your code here >

    # show_in_row([frame])
    output_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) # Converting back to BGR, to write correct colors



def detect_object_and_track(path, output_writer, num_frames=300, start_frame=0):
  video = cv2.VideoCapture(path)
  video.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
  ok, frame = video.read()

  try:
   while video.get(cv2.CAP_PROP_POS_FRAMES) < start_frame + num_frames:
      ret, frame = video.read()
      if not ret:
        video.release()
        break
      frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

      state = car_detect.detect(frame)
      # If we found objects, draw a bounding box around each detected object,
      # display or write the resulting image and then track all found objects.
      # If nothing was found, display or draw the unchanged image.
      # REMEMBER TO CONVERT BACK TO BGR if you are writing to a video file!
      < your code here >
      
  except KeyboardInterrupt:
    video.release()

out = cv2.VideoWriter('output_multiple.avi', cv2.VideoWriter_fourcc('M','J','P','G'), 25, (640, 360))
car_detect = CarDetector()
detect_object_and_track("cars.mp4", out, 750, 4950) # start video at 3 min 18 sec, run for 750 frames
out.release()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore
