## **1. Mount Google Drive**
---

In [1]:
# Mount Google Drive at /content/gdrive; the model files and the video used by
# later cells are referenced through paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## **2. Import the libraries**
---

In [2]:
import cv2
import sys
import os
import numpy as np

from IPython.display import clear_output
from google.colab.patches import cv2_imshow

# Report the versions of the two key libraries imported above.
print("Versions of key libraries", "---", sep="\n")
for label, module in (("numpy:     ", np), ("opencv    :", cv2)):
    print(label, module.__version__)

Versions of key libraries
---
numpy:      1.19.5
opencv    : 4.1.2


## **3. Setup the classes and load the MobileNet SSD**
---


In [3]:
# Class labels keyed by the integer class id reported by the detector
# (id 0 is 'background'); ids are contiguous, so enumerate() assigns them.
classNames      = dict(enumerate(['background',
                                  'aeroplane',
                                  'bicycle',
                                  'bird',
                                  'boat',
                                  'bottle',
                                  'bus',
                                  'car',
                                  'cat',
                                  'chair',
                                  'cow',
                                  'diningtable',
                                  'dog',
                                  'horse',
                                  'motorbike',
                                  'person',
                                  'pottedplant',
                                  'sheep',
                                  'sofa',
                                  'train',
                                  'tvmonitor']))

# Network definition (.prototxt) and trained weights (.caffemodel) on the mounted drive.
prototxt        = '/content/gdrive/My Drive/iss/RTAVS/Week 3/data/MobileNetSSD_deploy.prototxt'
caffemodel      = '/content/gdrive/My Drive/iss/RTAVS/Week 3/data/MobileNetSSD_deploy.caffemodel'

# Load the Caffe model through OpenCV's dnn module.
net             = cv2.dnn.readNetFromCaffe(prototxt, caffemodel)

print("caffemodel '", caffemodel, "' loaded")

caffemodel ' /content/gdrive/My Drive/iss/RTAVS/Week 3/data/MobileNetSSD_deploy.caffemodel ' loaded


## **4. Before the analysis**
---
* Step 1: Specify the video to be analyzed and its output path
* Step 2: Load the video. Check the frames per second (`int` and `round` must be applied since the output can be `float`). Check the width and height of each frame
* Step 3: Set up the codec and video writer. Note: Colab currently does not support X264 or H264 encoding, so use MJPG, and therefore the .avi extension, for the output. OpenCV itself raises no error when a codec is not supported; the video file is simply not saved.
* Step 4: Set the threshold to determine if the identified object should be retained

In [4]:
                                                                                # Step 1
videopath       = '/content/gdrive/My Drive/iss/RTAVS/Week 3/data/ff7.mp4'
outpath         = '/content/gdrive/My Drive/iss/RTAVS/Week4/colab/ssd_ff7.avi'

# Step 2: open the source video and read its properties.
vs              = cv2.VideoCapture(videopath)
if not vs.isOpened():
    # VideoCapture does not raise on a bad path; without this check the
    # analysis loop would simply end on the first read with nothing processed.
    raise IOError("Cannot open video: " + videopath)

fps             = int(round(vs.get(cv2.CAP_PROP_FPS)))      # property values are returned as float
W               = int(vs.get(cv2.CAP_PROP_FRAME_WIDTH))
H               = int(vs.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Step 3: set up the codec and the writer.
outdir          = os.path.dirname(outpath)
if outdir:
    # The writer cannot create missing folders and would fail silently.
    os.makedirs(outdir, exist_ok=True)

fourcc          = cv2.VideoWriter_fourcc(*"MJPG")
writer          = cv2.VideoWriter(outpath,
                                  fourcc,
                                  fps,
                                  (W, H),
                                  True)
if not writer.isOpened():
    # An unsupported codec or unwritable path does not raise either; fail
    # loudly here instead of silently producing no output file.
    vs.release()
    raise IOError("Cannot open video writer for: " + outpath)

# Step 4: detections with a confidence at or below this value are discarded.
scoreThres      = 0.5

print("Video to be analyzed.  :", videopath)
print("Output will be saved at:", outpath)

Video to be analyzed.  : /content/gdrive/My Drive/iss/RTAVS/Week 3/data/ff7.mp4
Output will be saved at: /content/gdrive/My Drive/iss/RTAVS/Week4/colab/ssd_ff7.avi


## **5. Run the analysis (to be completed)**
---
* Step 1: Setup running number `fr` for reporting of the frame being analyzed
* Step 2: Read a frame from video stream
* Step 3: If there is no frame left to be analyzed, exit the while loop
* Step 4: Prepare the blob for `net`. Get the `rows` and `cols` of the blob. The shape of `blob` is `(1,3,300,300)`
* Step 5: Perform the prediction with MobileNet SSD. The shape of `pred` is `(1,1,n,7)`, `n` is the number of objects detected.
* Step 6: For each detected object, check its confidence score. If the score exceeds threshold, get the class and the `(x1,y1,x2,y2)` for bounding box. Re-scale the positions (relative to the size of blob, which is `(300, 300)`).
* Step 7: Get the actual positions of the bounding box in original frame. Express the bounding box in the form of `(x,y,w,h)`.
* Step 8: Setup the text to be displayed on the bounding box. Get the size of the text.
* Step 9: Draw the bounding box, put up the text.
* Step 10: Write the frame into the output
* Step 11: Report the number of frames analyzed
* Step 12: After all frames are done, close the writer and release video stream (of the original video)

In [5]:
# Runs the detector on every frame of `vs`, draws the retained detections and
# writes the annotated frames to `writer`. Relies on `vs`, `writer`, `net`,
# `classNames`, `scoreThres`, `W` and `H` defined in the previous cells.

# Step 1: running frame counter used for progress reporting.
fr = 1

try:
    while True:
        # Step 2: read the next frame from the video stream.
        grabbed, frame = vs.read()

        # Step 3: no frame left, stop.
        if not grabbed:
            break

        output = frame.copy()

        # Step 4: build the (1, 3, 300, 300) input blob for the network.
        blob = cv2.dnn.blobFromImage(image=cv2.resize(frame, (300, 300)),
                                     scalefactor=0.007843,
                                     size=(300, 300),
                                     mean=(127.5, 127.5, 127.5),
                                     swapRB=False,
                                     crop=False)
        rows = blob.shape[2]
        cols = blob.shape[3]

        # Blob-to-frame scale factors are the same for every detection in the
        # frame, so compute them once here rather than inside the loop.
        hFactor = H / float(rows)
        wFactor = W / float(cols)

        # Step 5: forward pass; `pred` has shape (1, 1, n, 7).
        net.setInput(blob)
        pred = net.forward()
        numOfObjects = pred.shape[2]

        # Step 6: keep only detections whose confidence exceeds the threshold.
        for i in range(numOfObjects):
            confidence = pred[0, 0, i, 2]

            if confidence > scoreThres:
                classId = int(pred[0, 0, i, 1])

                # Box corners are relative values; scale them to blob size.
                x1 = int(pred[0, 0, i, 3] * cols)
                y1 = int(pred[0, 0, i, 4] * rows)
                x2 = int(pred[0, 0, i, 5] * cols)
                y2 = int(pred[0, 0, i, 6] * rows)

                # Step 7: map to the original frame and express as (x, y, w, h).
                x1 = int(wFactor * x1)
                y1 = int(hFactor * y1)
                x2 = int(wFactor * x2)
                y2 = int(hFactor * y2)
                x = x1
                y = y1
                w = x2 - x1
                h = y2 - y1

                # Step 8: label text and its rendered size. Fall back to the
                # numeric id so an unexpected class id cannot abort the run.
                txtlbl = "{} : {:.2f}".format(classNames.get(classId, str(classId)),
                                              confidence)
                txtsize = cv2.getTextSize(txtlbl,
                                          cv2.FONT_HERSHEY_SIMPLEX,
                                          0.5,
                                          1)
                bsize = txtsize[0]      # (width, height) of the text
                bsline = txtsize[1]     # baseline offset below the text

                # Step 9: bounding box, filled label background, then the label.
                cv2.rectangle(output,
                              (x, y),
                              (x + w, y + h),
                              (0, 255, 0),
                              2)
                cv2.rectangle(output,
                              (x - 1, y),
                              (x + bsize[0], y + bsize[1] + bsline),
                              (0, 255, 0),
                              -1)
                cv2.putText(output,
                            txtlbl,
                            (x - 1, y + bsize[1]),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5,
                            (0, 0, 0),
                            1,
                            cv2.LINE_AA)

        # Step 10: write the annotated frame to the output video.
        writer.write(output)

        # Step 11: report progress every 10 frames, replacing the previous message.
        clear_output(wait=True)
        if fr % 10 == 0:
            print(fr, "of frames analyzed ...")

        fr = fr + 1
finally:
    # Step 12: always release the writer and the video stream, even when the
    # loop is interrupted or fails, so the output file is finalised.
    print("Closing ...")
    writer.release()
    vs.release()

print("Done.")

Closing ...
Done.
