# First Step

This notebook shows, step by step, the implementation I used for my video. Any input video is valid, but note that its main subject should belong to one of the 
MS-COCO pre-trained classes. The class to extract is selected in the `display_instances` function, where the desired label is checked (currently `'person'`).

In [36]:
# pip install ffpyplayer
import cv2, os, sys
import numpy as np

from google.colab import drive

# Mount Google Drive; the project folders referenced below live on it.
drive.mount('/content/drive')

# path_PARENT: folder that holds the project (and, later, the weights file).
# path:        the Mask_RCNN checkout inside that folder.
path_PARENT = '/content/drive/My Drive/Colab Notebooks/Object Detection'
path = '/content/drive/My Drive/Colab Notebooks/Object Detection/Mask_RCNN'

# Switch the working directory to the Mask_RCNN checkout (no-op when already
# there). The `samples` / `mrcnn` imports that follow presumably rely on this.
if os.getcwd() != path:
    os.chdir(path)

from samples import coco
from mrcnn import utils
from mrcnn import model as modellib


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Pretrained model and configuration info

Load the pre-trained model weights ([Mask-RCNN](https://github.com/matterport/Mask_RCNN/releases), trained on the COCO dataset), \\
taking care that all paths are correct relative to the CURRENT working directory.

In [0]:
# Locate the pre-trained COCO weights, fetching them on first use.

# Weights file, kept next to (not inside) the Mask_RCNN checkout.
# Releases are published at https://github.com/matterport/Mask_RCNN/releases
COCO_MODEL_PATH = os.path.join(path_PARENT, 'mask_rcnn_coco.h5')

# Directory the notebook is currently running from.
ROOT_DIR = os.getcwd()

# Download only when the file is not already present on Drive.
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

The original configuration is stored in the `config.py` file and can be changed if necessary.

The default values are recommended, but you can adjust the GPU settings (number of GPUs, images per GPU) to match your own hardware.

In [0]:
# Change the config infermation
class InferenceConfig(coco.CocoConfig):
    GPU_COUNT = 1
    
    # Number of images to train with on each GPU. A 12GB GPU can typically
    # handle 2 images of 1024x1024px.
    # Adjust based on your GPU memory and image sizes. Use the highest
    # number that your GPU can handle for best performance.
    IMAGES_PER_GPU = 1
    
config = InferenceConfig()
# config.print()

In [0]:
# Build the Mask R-CNN network in inference mode and load the COCO weights.
# `model_dir` is meant to be a directory (used for logs/checkpoints), not the
# weights file itself, so point it at a "logs" folder under the project root.
MODEL_DIR = os.path.join(ROOT_DIR, 'logs')
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# by_name=True matches weights to layers by layer name.
model.load_weights(COCO_MODEL_PATH, by_name=True)


In [0]:

# COCO dataset object names.
# The list is indexed by class id: display_instances looks a label up as
# names[ids[i]], so position i must be the label for class id i.
# Index 0 ('BG') is presumably the background class.
# NOTE(review): the order is assumed to match the class ids produced by the
# loaded COCO weights — confirm before editing or reordering entries.
class_names = [
    'BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
    'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
    'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
    'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
    'teddy bear', 'hair drier', 'toothbrush']
    



```
# This is formatted as code
```

# Define the processing functions

Now define two image-processing functions that are applied to each frame of the input video. 

apply_mask is used to change the background information to grayscale.

display_instances takes the detection results for a frame, picks the object to keep in colour, and applies the mask to the original image.

In [0]:
def apply_mask(image, mask):
    """Turn every pixel outside ``mask`` into its grayscale value.

    A grayscale version of ``image`` is computed with ``cv2.COLOR_BGR2GRAY``.
    Wherever ``mask`` equals 0, the first three channels of ``image`` are
    overwritten with that gray value; pixels where ``mask`` is non-zero keep
    their original colour.

    Args:
        image: colour frame; assumed to be an (H, W, 3) BGR array as produced
            by OpenCV — TODO confirm against caller.
        mask: (H, W) array; zero marks pixels to desaturate.

    Returns:
        The same ``image`` object, which has been modified in place.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    outside = mask == 0

    # Channel order follows the BGR conversion above: 0 = B, 1 = G, 2 = R.
    for channel in range(3):
        image[:, :, channel] = np.where(outside, grayscale, image[:, :, channel])

    return image

In [0]:
def display_instances(image, boxes, masks, ids, names, scores,
                      target_label='person'):
    """Keep the largest detected ``target_label`` object in colour.

    Among all detections whose label equals ``target_label``, the one with the
    largest bounding-box area is treated as the main character. Its mask is
    passed to ``apply_mask`` exactly once, so everything else in the frame
    (including other objects of the same class) is rendered in grayscale.

    The mask is applied only after the largest instance has been selected.
    Applying it inside the loop would gray out a larger instance that appears
    later in the detection list before its own mask is reached.

    Args:
        image: frame to process; passed on to ``apply_mask``.
        boxes: (N, 4) array of boxes as ``y1, x1, y2, x2``.
        masks: (H, W, N) array, one mask per detection.
        ids: (N,) array of class ids, used to index ``names``.
        names: sequence mapping class id to label.
        scores: detection scores (currently unused).
        target_label: label of the class to keep in colour.

    Returns:
        The frame returned by ``apply_mask``, or ``image`` unchanged when
        there are no detections or none carries ``target_label``.
    """
    # n_instances saves the amount of all objects
    n_instances = boxes.shape[0]

    if not n_instances:
        print('NO INSTANCES TO DISPLAY')
        return image

    assert boxes.shape[0] == masks.shape[-1] == ids.shape[0]

    max_area = 0
    main_mask = None

    for i in range(n_instances):
        # An all-zero box carries no detection.
        if not np.any(boxes[i]):
            continue
        if names[ids[i]] != target_label:
            continue

        # Bounding-box area of this object.
        y1, x1, y2, x2 = boxes[i]
        area = (y2 - y1) * (x2 - x1)

        # Remember the largest object as the main character; all other
        # instances are regarded as background.
        if area > max_area:
            max_area = area
            main_mask = masks[:, :, i]

    if main_mask is not None:
        image = apply_mask(image, main_mask)

    return image

# Process Video

The following script is the main loop. It uses the OpenCV video capture tools \\
and runs the Mask RCNN detection on each frame of the selected footage. \\
Finally, the result is saved to the current working directory.

In [43]:
from google.colab.patches import cv2_imshow

input_video = 'Aladdin_low_res.mp4'
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    # Fail loudly instead of silently producing an empty output file.
    raise IOError("Cannot open input video: {}".format(input_video))

# Recording Video
fps = cap.get(cv2.CAP_PROP_FPS)  # Find original frames rate
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fcc = cv2.VideoWriter_fourcc('D', 'I', 'V', 'X')
vid_out = cv2.VideoWriter("video_output.avi", fcc, fps, (width, height))

# ----- Extracting desired footage segments ----- #
# Segment boundaries are expressed as fractions of the total frame count.
fr_i = 0
fr_tot = cap.get(cv2.CAP_PROP_FRAME_COUNT)
fr_0_a, fr_end_a = 0.205*fr_tot, 0.321*fr_tot
# fr_0_b, fr_end_b = 0.800*fr_tot, 0.936*fr_tot

# Nothing past the end of the last segment is used, so stop reading there.
# If the second segment above is enabled, extend this to fr_end_b.
fr_stop = min(fr_tot, fr_end_a)

try:
    while fr_i < fr_stop:
        ret, frame = cap.read()
        if not ret:
            # The reported frame count can exceed the number of decodable
            # frames; stop instead of passing an invalid frame to the detector.
            break

        # ----- Desirable segment for detection ----- #
        # if (fr_0_a < fr_i < fr_end_a) or (fr_0_b < fr_i < fr_end_b):
        if fr_0_a < fr_i < fr_end_a:
            # ---------- Print live index ----------- #
            if fr_i % 25 == 0:
                print("Progress: {:.2f} [%]".format( 100*(fr_i - fr_0_a)/(fr_end_a - fr_0_a) ))
            # ------- Activate CNN detection -------- #
            results = model.detect([frame], verbose=0)
            r = results[0]
            frame = display_instances(
                frame, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])
            # cv2_imshow(frame)         # uncomment for online presentation
            vid_out.write(frame)        # Recording Video

        # Allows aborting with 'q' when an OpenCV window is active.
        # NOTE(review): no window is opened in this cell, so this is
        # presumably a no-op in Colab — confirm before removing.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        fr_i += 1
finally:
    # -------- Close and Save video footage --------- #
    # Always release, even if detection raises, so the frames written so far
    # are flushed to the output file.
    cap.release()
    vid_out.release()
    cv2.destroyAllWindows()


Progress: 3.19 [%]
Progress: 7.37 [%]
Progress: 11.56 [%]
Progress: 15.74 [%]
Progress: 19.92 [%]
Progress: 24.11 [%]
Progress: 28.29 [%]
Progress: 32.48 [%]
Progress: 36.66 [%]
Progress: 40.84 [%]
Progress: 45.03 [%]
Progress: 49.21 [%]
Progress: 53.40 [%]
Progress: 57.58 [%]
Progress: 61.76 [%]
Progress: 65.95 [%]
Progress: 70.13 [%]
Progress: 74.32 [%]
Progress: 78.50 [%]
Progress: 82.68 [%]
Progress: 86.87 [%]
Progress: 91.05 [%]
Progress: 95.24 [%]
Progress: 99.42 [%]
