# Experimentation with querying over test videos

In [1]:
# basic
import math
import os
import pickle
from PIL import Image
import random
import time

# data
import numpy as np
import pandas as pd

# plot
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# opencv
import cv2

# torch
import torch
import torchvision
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

## Check gpu

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## Config vars

In [3]:
root = '.'
model_store_path = os.path.join(root, 'models')
model_file = os.path.join(model_store_path, 'bdd_fasterrcnn_mobilenet_v3_large_fpn_1622545030.pt')
backbone_model = 'fasterrcnn_mobilenet_v3_large_fpn'
target_labels = ['car', 'traffic sign', 'pedestrian']
num_classes = len(target_labels) + 1

# path to a sample video
test_video_folder = os.path.join(root, 'bdd100k_videos_test_00', 'bdd100k', 'videos', 'test')
test_videos = sorted(os.listdir(test_video_folder))

# change this to see different test videos
video_index = 0
test_video_path = os.path.join(test_video_folder, test_videos[video_index])

target_labels = ['car', 'traffic sign', 'pedestrian']
#target_labels = ['car']

# create a map for label->id
label_id_map = {}
id_label_map = {}
id_color_map = {}
colors = ['r', 'b', 'g']
for i in range(1, len(target_labels)+1):
    label_id_map[target_labels[i-1]] = i
    id_label_map[i] = target_labels[i-1]
    color = colors[i-1]
    id_color_map[i] = color

## Utility functions

In [4]:
def play_video(video_path):
    # Create a VideoCapture object and read from input file

    cap = cv2.VideoCapture(video_path)
    
    # get fps
    fps = round(cap.get(cv2.CAP_PROP_FPS))
    frame_count = round(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    print('Video info:')
    print(f'fps: {fps}')
    print(f'duration: {frame_count//fps} seconds')
    
    # Check if camera opened successfully
    if (cap.isOpened()== False): 
        print("Error opening video  file")
    
    frame_i = 0
    # Read until video is completed
    while(cap.isOpened()):

        # Capture frame-by-frame
        ret, frame = cap.read()
        if ret == True:

            # Display the resulting frame
            cv2.imshow('Frame', frame)
            frame_i += 1

            # Press Q on keyboard to  exit
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break

        # Break the loop
        else: 
            break
    
    # When everything done, release 
    # the video capture object
    cap.release()

    # Closes all the frames
    cv2.destroyAllWindows()

In [5]:
def check_condition(labels, target_counts, condition):
    
    detected_label_counts = [0 for i in range(len(target_labels))]
    
    # populate each label frequency
    for label in labels:
        detected_label_counts[label-1] += 1
    
    if condition == 'greater_than':
        if detected_label_counts > target_counts:
            return True
        else:
            return False
    elif condition == 'lesser_than':
        if detected_label_counts < target_counts:
            return True
        else:
            return False
    elif condition == 'equal':
        if detected_label_counts == target_counts:
            return True
        else:
            return False

In [6]:
def iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    
    # intersection area
    intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)

    # individual areas
    box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
    box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)
    
    iou = intersection_area / float(box1_area + box2_area - intersection_area)

    return iou

In [7]:
def merge_bounding_boxes(labels, boxes):
    
    total_boxes = len(boxes)
    
    # Bool array indicating which initial bounding rect has
    # already been used
    rectsUsed = [False for i in range(total_boxes)]
    
    # sort boxes and labels together based on the x1 coordinate of the box
    labels, boxes = (list(t) for t in zip(*sorted(zip(labels, boxes), key=lambda i:i[1][0])))
       
    # Array of accepted rects
    acceptedLabels = []
    acceptedRects = []

    # Merge threshold for x coordinate distance. 
    xThr = 3

    # Iterate all initial bounding rects
    for supIdx, supVal in enumerate(boxes):
        if (rectsUsed[supIdx] == False):

            # Initialize current rect
            currxMin = supVal[0]
            currxMax = supVal[2]
            curryMin = supVal[1]
            curryMax = supVal[3]

            # This bounding rect is used
            rectsUsed[supIdx] = True

            # Iterate all initial bounding rects
            # starting from the next
            for subIdx, subVal in enumerate(boxes[(supIdx+1):], start = (supIdx+1)):

                # Initialize merge candidate
                candxMin = subVal[0]
                candxMax = subVal[2]
                candyMin = subVal[1]
                candyMax = subVal[3]

                # Check if x distance between current rect
                # and merge candidate is small enough
                #if iou(boxes[supIdx], boxes[subIdx]) == 0.0:
                if (candxMin <= currxMax + xThr) and labels[supIdx] == labels[subIdx]:

                    # Reset coordinates of current rect
                    currxMax = candxMax
                    curryMin = min(curryMin, candyMin)
                    curryMax = max(curryMax, candyMax)

                    # Merge candidate (bounding rect) is used
                    rectsUsed[subIdx] = True

            # No more merge candidates possible, accept current rect
            acceptedRects.append([currxMin, curryMin, currxMax, curryMax])
            acceptedLabels.append(labels[supIdx])
                        
    return acceptedLabels, acceptedRects

In [8]:
def get_fastrcnn_model(num_classes):

    # load the pretrained model
    if backbone_model == 'fasterrcnn_resnet50_fpn':
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    elif backbone_model == 'fasterrcnn_mobilenet_v3_large_fpn':
        model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)

    # number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    # replace with pretrained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    return model

In [9]:
def plot_image_with_boxes(img, labels, boxes):
    
    # Create figure and axes
    fig, ax = plt.subplots(1)

    # Display the image
    ax.imshow(img.permute(1, 2, 0))

    # draw each box
    for i in range(len(boxes)):
        if int(labels[i]) in id_label_map:
            bottom_left = (boxes[i][0], boxes[i][1])
            width = boxes[i][2] - boxes[i][0]
            height = boxes[i][3] - boxes[i][1]

            label = int(labels[i])

            # Create a Rectangle patch
            rect = patches.Rectangle(bottom_left, width, height, linewidth=1,
                                     edgecolor=id_color_map[label], facecolor="none")

            # Add the patch to the Axes
            ax.add_patch(rect)
    plt.show()

In [10]:
# get the model definition and load from file
model = get_fastrcnn_model(num_classes)
model.load_state_dict(torch.load(model_file))
model = model.to(device)
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): ConvBNActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): ConvBNActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): ConvBNActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): Identity()
          )
        )
      )
      (2): InvertedResidual(


In [11]:
def image_loader(loader, image):
    image = Image.fromarray(image).convert('RGB')
    image = loader(image).float()
    return image

In [12]:
def get_predictions(frame, data_transforms):
    tensor_img = image_loader(data_transforms, frame)

    start_time = time.time()
    with torch.no_grad():
        outputs = model([tensor_img.to(device)])
        
    #print(f"input image shape: {tensor_img.shape}")
    #print(f"model eval time: {int(time.time()) - start_time}")
    #print(f"outputs: {outputs}")
    
    # for now we are checking only 1 image at at time
    target = outputs[0]
    
    labels = target['labels'].to('cpu').detach().numpy().tolist()
    boxes = target['boxes'].to('cpu').detach().numpy().tolist()
    
    # Need to do compression of overlapping (beyond a threshold) boxes
    compressed_labels, compressed_boxes = merge_bounding_boxes(labels, boxes)
    
    #plot_image_with_boxes(tensor_img, compressed_labels, compressed_boxes)
    
    return tensor_img, compressed_labels, compressed_boxes

In [13]:
# just a single transform for now
data_transforms = transforms.Compose([
    transforms.ToTensor()
])

In [14]:
def query(video_path, sample_freq, target_counts, condition, see_video, output_plots):
    
    # timestamps that satisfy the given condition
    result_timestamps = []
    
    # open capture
    cap = cv2.VideoCapture(video_path)
    
    # video info
    fps = round(cap.get(cv2.CAP_PROP_FPS))
    total_frame_count = round(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    # Check if camera opened successfully
    if (cap.isOpened()== False): 
        print("Error opening video  file")
    
    # frame counter
    frame_i = 1
    
    # Read until video is completed
    while frame_i <= total_frame_count:
        
        # found a frame with the conditions
        found = False
        
        # Capture frame-by-frame
        ret, frame = cap.read()
        if ret == True:
            
            # if frame count is at an integer second
            if (frame_i % fps) == 0:
                seconds_elapsed = frame_i // fps
                
                # if seconds elapsed equals the sampling frequency the user wants
                if seconds_elapsed % sample_freq == 0:
                    
                    # then run model
                    tensor_img, compressed_labels, compressed_boxes = get_predictions(frame, data_transforms)
                    
                    # and check for conditions
                    if check_condition(compressed_labels, target_counts, condition):
                        result_timestamps.append(seconds_elapsed)
                        found = True
                        
                        # if you want to output the image
                        if output_plots:
                            plot_image_with_boxes(tensor_img, compressed_labels, compressed_boxes)
            
            if see_video:
                
                # add a green border momentarily when the frame satisfies the user conditions
                if found:
                    frame = cv2.copyMakeBorder(frame, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[0, 255, 0])
                else:
                    frame = cv2.copyMakeBorder(frame, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[0, 0, 255])
                
                # display the frame
                cv2.imshow('Frame', frame)
                
                # Press Q on keyboard to  exit
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                
            frame_i += 1
        
        # Break the loop
        else: 
            break
        
    # release the video capture object
    cap.release()

    # Closes all the frames
    cv2.destroyAllWindows()
    
    return result_timestamps

In [15]:
# The below values correspond to the query 'Check every 3 seconds and find all frames with more than 2 cars, more than 1 traffic sign, more than 1 pedestrian'
sample_freq = 3
target_counts = [3,1,2]

# greater_than, lesser_than or equal
condition = 'greater_than'

In [16]:
result_timestamps = query(test_video_path, sample_freq, target_counts, condition, see_video=True, output_plots=False)

In [17]:
print(f"Timestamps with {condition} {target_counts[0]} cars, {target_counts[1]} signs, {target_counts[2]} pedestrians, sampled every {sample_freq} seconds:\n{result_timestamps}")

Timestamps with greater_than 3 cars, 1 signs, 2 pedestrians, sampled every 3 seconds:
[3, 6, 9, 18]
