Tracking Objects in Video with YOLOv3
===============================================

YOLO (You Only Look Once) is a very fast object detector.

In [1]:
import cv2 
import numpy as np 
import os 

In [2]:
# Court corner points in image pixel coordinates, as (x, y):
# bottom-left, bottom-right, top-right and top-left.
BL = (195, 654)
BR = (1068, 654)
TR = (848, 273)
TL = (398, 273)

# Transpose the four corners into one tuple of x values and one of y values.
court_x, court_y = zip(BL, BR, TR, TL)

# Origin and extent of the axis-aligned rectangle that encloses the court.
start_x, start_y = min(court_x), min(court_y)
range_x = max(court_x) - start_x
range_y = max(court_y) - start_y

In [3]:
# Absolute locations of the YOLOv3 weights and network configuration.
PATH_WEIGHTS, PATH_CFG = (
    os.path.abspath(os.path.join('YoloV3', model_file))
    for model_file in ('yolov3.weights', 'yolov3.cfg')
)
# Text file holding one class label per line (kept as a relative path).
PATH_NAMES = os.path.join('YoloV3', 'coconames.txt')

# Load the network once; later cells use this module-level `net`.
net = cv2.dnn.readNet(PATH_WEIGHTS, PATH_CFG)

# A label's position in `classes` is the class id used to look it up later.
with open(PATH_NAMES, 'r') as names_file:
    classes = names_file.read().splitlines()

print(classes)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [4]:
# Frame size in pixels; yoloPredict multiplies detection coordinates by these.
width, height = 1280, 720

In [5]:
def get_frames(filename):
    """Yield the frames of a video one by one, then a final ``None`` sentinel.

    Parameters
    ----------
    filename : str
        Path of the video, passed straight to ``cv2.VideoCapture``.

    Yields
    ------
    frame
        Each frame returned by ``VideoCapture.read`` while reads succeed,
        followed by a single ``None`` once the video is exhausted (or if it
        could not be opened at all).

    The capture is released as soon as reading stops, including when the
    consumer abandons the generator early (``break`` or ``.close()``).
    """
    video = cv2.VideoCapture(filename)
    try:
        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                break
            yield frame
    finally:
        # Without try/finally an early exit by the consumer closes the
        # generator at the `yield` above and the capture would never be released.
        video.release()
    # End-of-stream marker that existing callers check for.
    yield None

In [6]:
# Turn an image into the input blob for the COCO-trained model (Display blobs here)
def getBlob(frame):
    """Build the network input blob for one frame.

    Pixel values are scaled by 1/255, the frame is resized to 416x416
    without cropping, no mean is subtracted, and the R and B channels are
    swapped.

    Parameters
    ----------
    frame
        Image to convert, e.g. as returned by ``cv2.imread`` or a video read.

    Returns
    -------
    The blob produced by ``cv2.dnn.blobFromImage``.
    """
    return cv2.dnn.blobFromImage(
        frame,
        scalefactor=1/255,
        size=(416, 416),
        mean=(0, 0, 0),
        swapRB=True,
        crop=False,
    )

In [7]:
def getLayerOutputs(blob):
    """Feed ``blob`` to the module-level ``net`` and run one forward pass.

    Parameters
    ----------
    blob
        Network input, as produced by ``getBlob``.

    Returns
    -------
    Whatever ``net.forward`` returns when asked for the layers named by
    ``net.getUnconnectedOutLayersNames()``.
    """
    net.setInput(blob)
    return net.forward(net.getUnconnectedOutLayersNames())

In [8]:
def yoloPredict(layerOutputs, conf_threshold=0.5, x_buffer=0, y_buffer=70):
    """Extract 'person' detections whose centre lies on (or near) the court.

    Relies on the module-level ``width``/``height`` (frame size in pixels) and
    ``start_x``/``range_x``/``start_y``/``range_y`` (court bounding rectangle).

    Parameters
    ----------
    layerOutputs
        Iterable of network outputs; each row is one detection laid out as
        ``[center_x, center_y, w, h, <element 4>, class scores...]`` with the
        first four values expressed as fractions of the frame size.
    conf_threshold : float, default 0.5
        A detection is kept only if its person-class score is strictly greater.
    x_buffer, y_buffer : int, default 0 and 70
        Pixels by which the court rectangle is widened on each side before
        testing the box centre (presumably to keep players standing just
        outside the court lines -- TODO confirm).

    Returns
    -------
    boxes : list of [x, y, w, h]
        Top-left corner and size in pixels.
    confidences : list of float
        Person-class score of each kept box.
    class_ids : list
        Class id of each kept box (always the person class).
    """
    PERSON_CLASS_ID = 0  # index of 'person' in `classes`

    boxes = []
    confidences = []
    class_ids = []

    # Loop-invariant acceptance window: court rectangle plus the buffers.
    x_min, x_max = start_x - x_buffer, start_x + range_x + x_buffer
    y_min, y_max = start_y - y_buffer, start_y + range_y + y_buffer

    for output in layerOutputs:
        for detection in output:
            scores = detection[5:]  # per-class scores start at element 5
            class_id = np.argmax(scores)
            if class_id != PERSON_CLASS_ID:  # only the person class is wanted
                continue

            confidence = scores[class_id]
            if not confidence > conf_threshold:
                continue

            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            if not (x_min <= center_x <= x_max and y_min <= center_y <= y_max):
                continue

            w = int(detection[2] * width)
            h = int(detection[3] * height)

            # OpenCV draws rectangles from a corner, so convert centre -> top-left.
            x = int(center_x - w/2)
            y = int(center_y - h/2)

            boxes.append([x, y, w, h])
            confidences.append(float(confidence))
            class_ids.append(class_id)

    return boxes, confidences, class_ids

In [9]:
def displayBoxes(frame, boxes , confidences , class_ids):
    """Draw the surviving boxes on ``frame``, show it, and poll the keyboard.

    Overlapping boxes are reduced with ``cv2.dnn.NMSBoxes`` (score threshold
    0.5, NMS threshold 0.4). Each kept box is drawn on ``frame`` IN PLACE with
    a random colour and a ``"<index> <label> <confidence>"`` caption. The frame
    is shown in the window named ``'frame'`` even when there are no boxes, so
    video playback and key handling do not skip detection-free frames.

    Keyboard: ESC pauses; while paused, ESC again requests termination and any
    other key resumes.

    Parameters
    ----------
    frame
        Image to annotate and display (modified in place).
    boxes : list of [x, y, w, h]
    confidences : list of float
    class_ids : list of int
        Indexes into the module-level ``classes`` list.

    Returns
    -------
    bool
        ``True`` if the user asked to stop (ESC pressed twice), else ``False``.
    """
    if len(boxes) > 0:
        indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)  # unique boxes
        # NMSBoxes may hand back an empty tuple when nothing survives, and a
        # tuple has no .flatten(); normalise to a flat integer array first.
        kept = np.array(indexes, dtype=int).flatten()
        font = cv2.FONT_HERSHEY_PLAIN
        colors = np.random.uniform(0, 255, size=(len(boxes), 3))

        for i in kept:
            label = str(classes[class_ids[i]])
            x, y, w, h = boxes[i]
            confidence = str(round(confidences[i], 2))
            color = colors[i]

            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, str(i) + ' ' + label + ' ' + confidence, (x, y + 20), font, 2, (255, 255, 255), 2)

    cv2.imshow('frame', frame)
    # 27 is the escape key; the waitKey argument is the wait time in ms
    # (0 = wait indefinitely). Mask to 8 bits because some platforms set
    # higher bits in the returned key code.
    if cv2.waitKey(30) & 0xFF == 27:
        if cv2.waitKey(0) & 0xFF == 27:
            return True

    return False

#### Detecting Objects in an Image

In [10]:
# Sample frame used for the single-image walkthrough below.
f = os.path.join('data','train','10','frame1.jpg')
img = cv2.imread(f)
# cv2.imread does not raise on a missing/unreadable file -- it returns None,
# which would only surface later as an obscure error on `img.shape`.
if img is None:
    raise FileNotFoundError('Could not read image: {}'.format(f))

In [11]:
# Replace the default width/height globals with this image's real size;
# yoloPredict reads them when scaling detections to pixels.
height, width, _channels = img.shape
print(height, width)

# Optional preview of the raw image:
# cv2.imshow('Image', img) 
# cv2.waitKey(0) 
# cv2.destroyAllWindows() 

720 1280


In [12]:
# Convert the sample image into the network input blob (see getBlob for the
# scaling / resize / channel-swap settings).

blob = getBlob(img)  

# Optional: show every image contained in the blob, one window per index.
# for b in blob: 
#     for n, blob_img in enumerate(b): 
#         cv2.imshow(str(n), blob_img) 

# cv2.waitKey(0) 
# cv2.destroyAllWindows() 

In [13]:
# Run the network on the blob and collect the raw outputs of its output layers.
layerOutputs = getLayerOutputs(blob) 

In [14]:
# Getting predictions: person boxes (pixel [x, y, w, h]) centred on/near the
# court, with their confidences and class ids.

boxes , confidences , class_ids = yoloPredict(layerOutputs) 

In [15]:
# Candidate boxes as [x, y, w, h], before non-maximum suppression removes overlaps.
boxes

[[586, 159, 68, 148],
 [616, 314, 83, 197],
 [603, 312, 109, 199],
 [610, 322, 95, 191],
 [602, 317, 111, 199]]

In [16]:
# Remove duplicate bounding boxes drawn over the same object (non-maximum
# suppression; score threshold 0.5, NMS threshold 0.4) and display the indexes
# of the boxes that are kept.

indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
# NMSBoxes may return an empty tuple when nothing survives; np.array() makes
# .flatten() safe in that case and leaves a normal index array unchanged.
print(np.array(indexes, dtype=int).flatten())

[4 0]


In [17]:
# Display the bounding boxes on the image.
# displayBoxes draws on the array it is given, so pass a copy: `img` stays
# clean and this cell (and the earlier blob/prediction cells) can be re-run
# without boxes from a previous run baked into the input.

displayBoxes(img.copy(), boxes, confidences, class_ids)

# Keep the window open until any key is pressed, then close it.
cv2.waitKey(0)
cv2.destroyAllWindows()

#### Detecting Objects in Video 

In [10]:
VFILENAME = os.path.join('data', 'match2.mp4')

# Detect -> filter -> draw for every frame of the video.
frames = get_frames(VFILENAME)
try:
    for frame in frames:
        if frame is None:  # end-of-video sentinel yielded by get_frames
            break

        blob = getBlob(frame)
        layerOutputs = getLayerOutputs(blob)
        boxes, confidences, class_ids = yoloPredict(layerOutputs)
        terminate = displayBoxes(frame, boxes, confidences, class_ids)
        if terminate:  # user pressed ESC twice
            break
finally:
    # Always clean up, even on an exception or a kernel interrupt:
    # stop the frame generator and close the display window(s).
    frames.close()
    cv2.destroyAllWindows()
    

THINGS THAT CAN BE DONE 


1) Spread particles only within court space 
2) This is YOLO608 (although getBlob currently resizes frames to 416x416). We can change the model to a lower input dimension.
3) We can also try YoloV5 

4) Yolo => People detect 
   Particle Filter -> Color detect on Yolo detected people
   
5) Use shape + color difference for the particle filter instead of color alone (not sure how to do this yet)