Imports OpenCV (Open Source Computer Vision Library) and NumPy (Numerical Python).

Helpful resource: https://learnopencv.com/deep-learning-based-object-detection-using-yolov3-with-opencv-python-c/

In [27]:
import cv2
import numpy as np

Initializes the YOLO object detection model version 3 by loading the pre-trained weights and configuration from https://github.com/pjreddie/darknet

In [28]:
# net is the YOLO neural network: OpenCV's dnn module builds it from the
# pre-trained YOLOv3 weights file and the matching config file. Both paths are
# relative, so the files must be in the current working directory.
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")

Gets the names of the output layers needed for object detection. Checks compatibility with OpenCV versions.

In [29]:
# Gets the names of all the layers in the YOLO neural network.
layer_names = net.getLayerNames()

# getUnconnectedOutLayers() returns the 1-based indices of the output layers.
# Depending on the OpenCV version, the indices arrive either as an N x 1 array
# (each index wrapped in its own sub-array) or as a flat 1-D array.
# Flattening handles both shapes explicitly, instead of probing with a bare
# `except:` that would also swallow unrelated errors such as KeyboardInterrupt.
out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()

# Subtract 1 because OpenCV's layer indices are 1-based while Python
# sequences are 0-based.
output_layers = [layer_names[i - 1] for i in out_layer_ids]

Reads the class labels that YOLO was trained on (`coco.names`, also from https://github.com/pjreddie/darknet). Handles `FileNotFoundError` if the file is missing.

In [30]:
# Class labels, one per line of coco.names; remains empty if the file
# cannot be read.
classes = []

try:
    with open("coco.names", "r") as names_file:
        # Strip surrounding whitespace (including the newline) from each label.
        classes = list(map(str.strip, names_file))
except FileNotFoundError:
    # Without the label file detections cannot be named, so stop here.
    print("Error: 'coco.names' file not found.")
    exit()

Prepares the video file for object detection. Handles failure to open file and failure to properly read frames of video.

In [31]:
video_path = 'CityScene.mp4'

# Open the video file for frame-by-frame decoding.
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Probe read: confirms that frames can actually be decoded.
# ret is True if a frame was read correctly; frame is the decoded image
# represented as an array.
ret, frame = cap.read()

if not ret:
    print("Failed to grab a frame from the video.")
    cap.release()
    exit()

# The probe read above consumed the first frame. Rewind to frame 0 so that
# later reads start from the beginning of the video instead of silently
# skipping its first frame. If the backend cannot seek, set() returns False
# and reading simply continues from the second frame, as it did before.
cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

1. Loop for Video Processing: The infinite loop is used to process each frame of the video sequentially until the video ends or the loop is manually terminated by the user (q).

2. Frame Acquisition: Each iteration of the loop reads the next frame from the video. If there are no more frames to read (ret is False), the loop breaks, ending the processing.

3. Frame to Blob: The current frame is transformed into a blob (a preprocessed image format compatible with the neural network). This resizes the image to 416x416, scales the pixel values by 1/255, and swaps the channel order from BGR to RGB; no mean value is subtracted.

4. Forward Pass: The blob is fed into the YOLO network by setting it as the input and then performing a forward pass to obtain the detection results in outs.

5. Processing Detections: For each detection in the YOLO output:
        Extract the confidence scores and class IDs.
        Filter out detections with confidence below a threshold.
        Calculate the bounding box coordinates.
        Store the coordinates, confidence scores, and class IDs.

6. Non-Max Suppression: Applies suppression to reduce overlapping bounding boxes.

7. Drawing Bounding Boxes and Labels: For each detection after suppression:
        Draw a bounding box around the detected object.
        Put a label with the class name and confidence score on the bounding box.

8. Displaying the Frame: Displays an annotated frame with boxes and labels.

9. End: Releases the capture and destroys all OpenCV windows.

In [32]:
# Main processing loop: read a frame, run YOLO on it, draw the surviving
# detections and display the result. The try/finally guarantees that the
# capture and the display windows are released even if a frame raises.
try:
    while True:
        # ret is True if a frame was read correctly; frame is the image
        # represented as an array. Each call advances to the next frame.
        ret, frame = cap.read()

        # No more frames (or a read failure): stop processing.
        if not ret:
            break

        # Frame size in pixels, used to scale the detections below.
        frame_height, frame_width = frame.shape[:2]

        # 0.00392 ~= 1/255 scales the pixel values for normalization.
        # (416, 416) is the input size the image is resized to.
        # (0, 0, 0) is the mean to subtract, i.e. nothing is subtracted.
        # True swaps the red and blue channels, because OpenCV reads images
        # as BGR while the network uses RGB.
        blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)

        # Sets up 'blob' as the input for the YOLO network 'net'.
        net.setInput(blob)

        # Forward pass; 'outs' holds one array of detections per output layer.
        outs = net.forward(output_layers)

        # Class ids, confidence scores and box coordinates of kept detections.
        class_ids = []
        confidences = []
        boxes = []

        for out in outs:
            for detection in out:
                # detection[0:4] are the box centre x, centre y, width and
                # height, as fractions of the frame size. detection[4] is
                # the objectness score, so the per-class scores start at
                # index 5.
                scores = detection[5:]

                # Index of the highest class score, i.e. the most likely class.
                class_id = np.argmax(scores)
                confidence = scores[class_id]

                # Only detections above 0.5 are kept; weaker ones are ignored.
                if confidence <= 0.5:
                    continue

                # Scale the box to the frame's dimensions, then convert from
                # centre coordinates to the top-left corner (x, y).
                center_x = int(detection[0] * frame_width)
                center_y = int(detection[1] * frame_height)
                w = int(detection[2] * frame_width)
                h = int(detection[3] * frame_height)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

        # Non-Maximum Suppression over the collected boxes.
        # 0.5 is the score threshold: boxes scoring below it are discarded.
        # 0.4 is the NMS (overlap) threshold: of boxes overlapping more than
        # this, only the highest-scoring one is kept, so a single object is
        # not reported several times.
        indices = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

        # NMSBoxes may return an N x 1 array, a flat array, or an empty
        # tuple when nothing was detected. np.array(...).flatten() handles
        # all three, whereas calling .flatten() directly on an empty tuple
        # raises AttributeError.
        for i in np.array(indices).flatten():
            # x, y are the top-left corner; w, h the box size in pixels.
            x, y, w, h = boxes[i]
            label = str(classes[class_ids[i]])
            confidence = confidences[i]

            # Draws the bounding box. OpenCV colours are BGR; (0, 255, 0) is
            # green. 2 is the line thickness.
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

            # Labels the box with the predicted class and its confidence.
            # (x, y - 10) places the text 10 pixels above the box's top-left
            # corner; the offset can be tuned to suit the video.
            cv2.putText(frame, f"{label} {confidence:.2f}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("YOLO Object Detection", frame)

        # Stops processing when the 'q' key is pressed.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Releases the video capture and closes the cv2 window(s).
    cap.release()
    cv2.destroyAllWindows()