# Video detection using a trained model

You can use the trained model to run detection on a video. 

The principle is to split the video into hundreds of frames (images), run detection on the frames one by one, and then put the detected frames back together to form the output video. 

## Table of Contents
* Set up env
* Load the trained model
* Separate video into frames and run detection on them
* Make new video using detected frames

# Set up env

In [1]:
# Print the path of the `python` executable that shell commands resolve to,
# followed by its version, so the environment is recorded with the notebook.
# NOTE(review): this reports the shell's `python`; presumably it is the same
# interpreter the kernel runs on -- confirm if the two can differ.
!which python
!python --version

/usr/local/bin/python
Python 3.8.16


In [2]:
# Check the tensorflow and keras version.
# The two prints below emit the Keras version first, then the TensorFlow
# version (the outputs saved with this notebook show 2.9.0 and 2.9.2).
import tensorflow as tf
import keras
print(keras.__version__)
print(tf.__version__)

2.9.0
2.9.2


In [3]:
# clone Mask_RCNN repo and install packages

%%shell
git clone https://github.com/akTwelve/Mask_RCNN.git
cd Mask_RCNN
python setup.py install

Cloning into 'Mask_RCNN'...
remote: Enumerating objects: 979, done.[K
remote: Total 979 (delta 0), reused 0 (delta 0), pack-reused 979[K
Receiving objects: 100% (979/979), 137.72 MiB | 27.54 MiB/s, done.
Resolving deltas: 100% (570/570), done.
running install
running bdist_egg
running egg_info
creating mask_rcnn.egg-info
writing mask_rcnn.egg-info/PKG-INFO
writing dependency_links to mask_rcnn.egg-info/dependency_links.txt
writing top-level names to mask_rcnn.egg-info/top_level.txt
writing manifest file 'mask_rcnn.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
adding license file 'LICENSE'
writing manifest file 'mask_rcnn.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build
creating build/lib
creating build/lib/mrcnn
copying mrcnn/visualize.py -> build/lib/mrcnn
copying mrcnn/model.py -> build/lib/mrcnn
copying mrcnn/parallel_model.py -> build/lib/mrcnn
copying mrcnn/utils.py -> build/lib/mrcn



In [4]:
import os
import sys
import random
import math
import numpy as np
import skimage.io
import matplotlib
import matplotlib.pyplot as plt

# Root directory of the project: the Mask_RCNN checkout, resolved to an
# absolute path relative to the current working directory.
ROOT_DIR = os.path.abspath("./Mask_RCNN/")

# Import Mask RCNN.
# The sys.path entry must be added before the mrcnn imports below.
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize

# Import COCO config.
# Adds <ROOT_DIR>/samples/coco/ to sys.path so that `import coco` can resolve
# (presumably to a coco.py module in that folder -- confirm against the repo).
sys.path.append(os.path.join(ROOT_DIR, "samples/coco/"))  # find local version
import coco

# Render matplotlib figures inline in the notebook output.
%matplotlib inline 

# Directory to save logs and trained model (passed as model_dir when the
# MaskRCNN object is created later in this notebook).
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Download COCO trained weights from Releases if needed.
# The download only happens when the file is not already on disk, so
# re-running this cell does not fetch the weights again.
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

# Directory of images to run detection on.
# NOTE(review): not referenced by any other cell shown in this notebook.
IMAGE_DIR = os.path.join(ROOT_DIR, "images")

Downloading pretrained model to /content/Mask_RCNN/mask_rcnn_coco.h5 ...
... done downloading pretrained model!


In [5]:
# download original video and model from my github
!git lfs clone https://github.com/BaosenZ/amoeba-video-detection.git

          with new flags from 'git clone'

'git clone' has been updated in upstream Git to have comparable
speeds to 'git lfs clone'.
Cloning into 'amoeba-video-detection'...
remote: Enumerating objects: 250, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 250 (delta 7), reused 39 (delta 7), pack-reused 211[K
Receiving objects: 100% (250/250), 142.83 MiB | 38.87 MiB/s, done.
Resolving deltas: 100% (110/110), done.
Git LFS: (1 of 1 files) 225.96 MB / 225.96 MB


In [6]:
# !rm -rf amoeba-video-detection

In [7]:
!unzip amoeba-video-detection/trained-amoeba-model.zip

Archive:  amoeba-video-detection/trained-amoeba-model.zip
  inflating: trained-amoeba-model/mask_rcnn_amoeba_cfg_0009.h5  


# Load the trained model

In [8]:
# Class labels of the trained model: 'BG' (background) first, then 'amoeba'.
class_names = [
    "BG",
    "amoeba",
]

In [9]:
class InferenceConfig(coco.CocoConfig):
    """Inference settings for the amoeba model, overriding the COCO sample config."""
    # Configuration name; matches the "amoeba_cfg" part of the weights file
    # name loaded later in this notebook.
    NAME = "amoeba_cfg"
    # One GPU processing three images at a time. The config.display() output
    # below reports the resulting BATCH_SIZE as 3; the detection loop must
    # feed the model batches of that size.
    GPU_COUNT = 1
    IMAGES_PER_GPU = 3
    # NOTE(review): lower-case attribute. Presumably the library derives
    # BATCH_SIZE itself from GPU_COUNT * IMAGES_PER_GPU, which would make this
    # line informational only -- confirm against the Config base class.
    batch_size = 3  # Batch size = GPU_COUNT * IMAGES_PER_GPU
    # Background class + the single 'amoeba' class.
    NUM_CLASSES = 1 + 1

# Instantiate the configuration and print all of its values.
config = InferenceConfig()
config.display()


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     3
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 3
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  1024
IMAGE_META_SIZE                14
IMAGE_MIN_DIM                  800
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [1024 1024    3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE         

In [10]:
# Create model object in inference mode and Load weights.
# The weights path is relative to the current working directory; it is the
# file extracted from trained-amoeba-model.zip by the unzip cell above.
Trained_Model_Path = "trained-amoeba-model/mask_rcnn_amoeba_cfg_0009.h5"
# Build the network with the InferenceConfig instance created above;
# MODEL_DIR is handed to the library as its model/log directory.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# by_name=True is forwarded to the library's weight loader.
# NOTE(review): presumably this matches weights to layers by layer name
# (Keras semantics) -- confirm in mrcnn.model.
model.load_weights(Trained_Model_Path, by_name=True)

Instructions for updating:
Use fn_output_signature instead


## Optional: run the pretrained COCO model from the Mask R-CNN package instead

In [11]:
# COCO dataset class names
# class_names = ['BG', 'person', 'bicycle','car', 'motorcycle','airplane',
#                   'bus', 'train', 'truck', 'boat', 'traffic light',
#                   'fire hydrant', 'stop sign', 'parking meter','bench','bird',
#                   'cat','dog','horse','sheep','cow','elephant','bear',
#                   'zebra','giraffe','backpack','umbrella','handbag','tie',
#                   'suitcase', 'frisbee', 'skis','snowboard','sports ball',
#                   'kite','baseball bat','baseball glove', 'skateboard',
#                   'surfboard','tennis racket','bottle','wine glass','cup',
#                   'fork','knife','spoon','bowl','banana','apple',
#                   'sandwich','orange','broccoli','carrot','hot dog', 'pizza',
#                   'donut','cake','chair','couch','potted plant','bed',
#                   'dining table','toilet','tv','laptop','mouse','remote',
#                   'keyboard','cell phone','microwave','oven','toaster',
#                   'sink','refrigerator','book','clock','vase','scissors',
#                   'teddy bear','hair drier','toothbrush']

In [12]:
# class InferenceConfig(coco.CocoConfig):
#     GPU_COUNT = 1
#     IMAGES_PER_GPU = 3
#     batch_size = 3 # Batch size = GPU_COUNT * IMAGES_PER_GPU
#     NUM_CLASSES = 80 + 1
    
# config = InferenceConfig()
# config.display()

In [13]:
# Create model object in inference mode and Load weights.
# model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# model.load_weights(COCO_MODEL_PATH, by_name=True, exclude=[ "mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"])

# Separate video into frames and run detection on them

In [14]:
import glob

# Working directory that holds this notebook's inputs and outputs.
# Stored under its own name rather than re-assigning ROOT_DIR, which an
# earlier cell defines as the Mask_RCNN checkout; reusing that name for a
# different directory makes the notebook depend on cell execution order.
WORK_DIR = os.getcwd()

# Input video to run detection on.
VIDEO_PATH = os.path.join(WORK_DIR, "amoeba-video-detection/original-video/amoeba.mp4")   # choose video file
print(VIDEO_PATH)

# Folder that receives one annotated image per video frame.
DETECT_IMAGE_SAVE_DIR = os.path.join(WORK_DIR, "output-video-images")
print(DETECT_IMAGE_SAVE_DIR)

# exist_ok=True makes the cell safe to re-run and avoids the separate
# "check, then create" step.
os.makedirs(DETECT_IMAGE_SAVE_DIR, exist_ok=True)

/content/amoeba-video-detection/original-video/amoeba.mp4
/content/output-video-images


In [15]:
# import random
# import math
# import numpy as np
# import scipy.misc

# define random colors
# def random_colors(N):
#     np.random.seed(1)
#     colors = [tuple(255 * np.random.rand(3)) for _ in range(N)]
#     return colors

#apply mask to image
# def apply_mask(image, mask, color, alpha=0.5):
#     for n, c in enumerate(color):
#         image[:,:,n]=np.where(mask==1, image[:,:,n]*(1-alpha)+alpha *c, image[:,:,n])
#     return image

#take the image and apply the mask, box, and Label
# def display_instances(image, boxes, masks, ids, names, scores):
#     n_instances = boxes.shape[0]
#     colors = random_colors(n_instances)
#     if not n_instances:
#         print('NO INSTANCES TO DISPLAY')
#     else:
#         assert boxes.shape[0] == masks.shape[-1] == ids.shape[0]
    
#     for i, color in enumerate(colors):
#          if not np.any(boxes[i]):
#             continue
    
#     y1, x1, y2, x2 = boxes[i]
#     label = names[ids[i]]
#     score = scores[i] if scores is not None else None
#     caption = '{} {:.2f}'.format(label, score) if score else label
#     # mask = masks[:, :, i]
#     # image = apply_mask(image, mask, color)
#     image = cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
#     image = cv2.putText(image, caption, (x1, y1), cv2.FONT_HERSHEY_COMPLEX, 0.7, color, 2)
#     return image

In [None]:
import cv2

# model.detect() only accepts batches of exactly the configured batch size,
# so read the size from the model's config instead of hard-coding it.
batch_size = model.config.BATCH_SIZE

capture = cv2.VideoCapture(VIDEO_PATH)
if not capture.isOpened():
    # Without this check a bad path just produces zero frames and no error.
    raise IOError("Could not open video file: {0}".format(VIDEO_PATH))

# Find OpenCV version. Only the major number is needed; taking the first
# component works regardless of how many parts the version string has.
major_ver = int(cv2.__version__.split('.')[0])
if major_ver < 3:
    fps = capture.get(cv2.cv.CV_CAP_PROP_FPS)
    print("Frames per second using video.get(cv2.cv.CV_CAP_PROP_FPS): {0}".format(fps))
else:
    fps = capture.get(cv2.CAP_PROP_FPS)
    print("Frames per second using video.get(cv2.CAP_PROP_FPS) : {0}".format(fps))


def _detect_and_save_batch(batch, first_index):
    """Run detection on up to `batch_size` frames and save the annotated images.

    batch        list of BGR frames as returned by cv2.VideoCapture.read()
    first_index  0-based frame number of batch[0]; frame k of the batch is
                 written to DETECT_IMAGE_SAVE_DIR as '<first_index + k>.jpg'
    """
    # The last batch of a video can be short. Pad it with copies of its final
    # frame so the model still receives a full batch; the padded results are
    # dropped again by the zip() below, which stops at the end of `batch`.
    padded = batch + [batch[-1]] * (batch_size - len(batch))
    # OpenCV decodes frames as BGR while Mask R-CNN expects RGB input, so
    # detection runs on converted copies. Boxes are drawn on the untouched
    # BGR frames, which is the channel order cv2.imwrite expects.
    rgb_frames = [cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in padded]
    results = model.detect(rgb_frames, verbose=0)
    for offset, (frame, r) in enumerate(zip(batch, results)):
        for box in r['rois']:
            # Each ROI is (y1, x1, y2, x2); cast to plain ints for OpenCV.
            startY, startX, endY, endX = (int(v) for v in box)
            cv2.rectangle(frame, (startX, startY), (endX, endY), (255, 0, 0), 2)
        name = '{0}.jpg'.format(first_index + offset)
        name = os.path.join(DETECT_IMAGE_SAVE_DIR, name)
        print(name)
        cv2.imwrite(name, frame)


# Separate video into frames and run detection on them
frames = []
frame_count = 0
try:
    while True:
        ret, frame = capture.read()
        # Bail out when the video file ends
        if not ret:
            break
        # Collect frames until a full batch is available
        frame_count += 1
        frames.append(frame)
        if len(frames) == batch_size:
            _detect_and_save_batch(frames, frame_count - batch_size)
            # Start the next batch
            frames = []
    # Process the frames left over when the total frame count is not a
    # multiple of batch_size; previously these were silently discarded.
    if frames:
        _detect_and_save_batch(frames, frame_count - len(frames))
        frames = []
finally:
    # Always free the video handle, even if detection raised.
    capture.release()

In [None]:
def _frame_index(path):
    """Return the frame number encoded in a '<number>.jpg' file name, else None."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem) if stem.isdigit() else None


# Get all detected-frame image paths.
# The detection loop writes frames as '<number>.jpg'; any other file that
# happens to be in the folder is ignored instead of breaking the sort.
images = [
    path
    for path in glob.glob(os.path.join(DETECT_IMAGE_SAVE_DIR, '*.jpg'))
    if _frame_index(path) is not None
]

# Sort the images numerically by frame index (a plain string sort would put
# '10.jpg' before '2.jpg').
images.sort(key=_frame_index)

# Make new video using detected frames

In [None]:
def make_video(outvid, images=None, fps=30, size=None,
               is_color=True, format="FMP4"):
    """
    Create a video from a list of image files.

    All paths are checked for existence before any output is written, so a
    missing file is reported before a video file is created.

    @param      outvid      path of the output video file
    @param      images      iterable of image file paths, in playback order
    @param      fps         frames per second of the output video
    @param      size        (width, height) of the output frames; defaults to
                            the size of the first image. Images of a different
                            size are resized to it.
    @param      is_color    passed to cv2.VideoWriter as its color flag
    @param      format      four-character codec code, see http://www.fourcc.org/codecs.php
    @return                 the cv2.VideoWriter used (already released)
    @raise      ValueError          if images is None or empty
    @raise      FileNotFoundError   if an image path does not exist
    @raise      OSError             if an image file cannot be decoded
    """
    image_paths = list(images) if images is not None else []
    if not image_paths:
        # Previously this surfaced as a TypeError / AttributeError further down.
        raise ValueError("make_video needs at least one image")
    for path in image_paths:
        if not os.path.exists(path):
            raise FileNotFoundError(path)

    # Imported here (as in the original) so the function only needs OpenCV
    # once it actually starts writing.
    from cv2 import VideoWriter, VideoWriter_fourcc, imread, resize
    fourcc = VideoWriter_fourcc(*format)
    vid = None
    try:
        for path in image_paths:
            img = imread(path)
            if img is None:
                # imread returns None rather than raising on unreadable files.
                raise OSError("Could not read image: {0}".format(path))
            if vid is None:
                # The writer is created lazily because the default frame size
                # comes from the first image.
                if size is None:
                    size = img.shape[1], img.shape[0]
                vid = VideoWriter(outvid, fourcc, float(fps), size, is_color)
            # Resize when EITHER dimension differs. The previous `and` let a
            # frame with only one mismatching dimension through unresized.
            if size[0] != img.shape[1] or size[1] != img.shape[0]:
                img = resize(img, size)
            vid.write(img)
    finally:
        # Finalize the file even when a frame fails part-way through.
        if vid is not None:
            vid.release()
    return vid

outvid = "output.mp4"
# Write the output at the frame rate measured from the source video (`fps`,
# set by the detection cell) so playback speed and duration match the
# original; fall back to 30 fps when the container reported no usable rate.
make_video(outvid, images, fps=fps if fps and fps > 0 else 30)

In [None]:
# from google.colab import files
# files.download('output.mp4')

# References

1. Good visualization examples with OpenCV: https://pyimagesearch.com/2018/11/19/mask-r-cnn-with-opencv/ and https://zhuanlan.zhihu.com/p/84149055.
2. Some of the code is adapted from this blog post: https://www.dlology.com/blog/how-to-run-object-detection-and-segmentation-on-video-fast-for-free/. 