In [2]:
#imports
import cv2
import numpy as np
import time

# Load the YOLOv3 network into memory. readNet picks the framework from the
# file names given: yolov3.weights holds the pre-trained weights and
# cfg/yolov3.cfg describes the network architecture.
net = cv2.dnn.readNet("yolov3.weights", "cfg/yolov3.cfg")

print("Welcome to Detection System! What do you want to get detected???\n")

# Read the class labels from coco.names, one label per line.
with open("coco.names", "r") as f:
    classes = [line.strip() for line in f]

# Names of every layer in the network, in network order.
layer_names = net.getLayerNames()

# Names of the output layers. getUnconnectedOutLayers() yields 1-based layer
# indices; depending on the OpenCV version they come back either as a flat
# array or as an Nx1 array, so flatten and cast before indexing.
output_layers = [
    layer_names[int(i) - 1]
    for i in np.asarray(net.getUnconnectedOutLayers()).flatten()
]

# One random colour (three values in [0, 255)) per class label, used when
# drawing that class's boxes and text.
colors = np.random.uniform(0, 255, size=(len(classes), 3))

# Ask whether to run on the default camera (0) or on a video file (1).
# Keep prompting until a valid choice is given; otherwise `cap` would be left
# undefined and the capture loop below would fail with a NameError.
cap = None
while cap is None:
    try:
        option = int(input("Enter 0 for real-time detection, 1 for video detection: "))
    except ValueError:
        print("Invalid choice, please enter 0 or 1.")
        continue

    if option == 0:
        cap = cv2.VideoCapture(0)
    elif option == 1:
        path = input("enter path for video: ")
        cap = cv2.VideoCapture(path)
    else:
        print("Invalid choice, please enter 0 or 1.")

# Report a source that could not be opened instead of ending silently;
# cap.read() will simply return no frame in that case.
if not cap.isOpened():
    print("Could not open the selected video source.")

# Font used for every piece of text drawn on a frame.
font = cv2.FONT_HERSHEY_PLAIN

# Detection thresholds.
CONFIDENCE_THRESHOLD = 0.2   # minimum class score for a raw detection to be kept
NMS_SCORE_THRESHOLD = 0.8    # minimum score for a box to be considered by NMS
NMS_OVERLAP_THRESHOLD = 0.3  # overlap threshold used by non-maximum suppression

# Record the start time; frame_id counts the frames processed so far, so it
# starts at 0 and the FPS figure is frames-processed / seconds-elapsed.
starting_time = time.time()
frame_id = 0

try:
    while True:
        # ret is False when no frame is available (end of video or a
        # capture failure); frame is the image array for one video frame.
        ret, frame = cap.read()
        if not ret:
            break
        frame_id += 1

        # Only height and width are needed; slicing also copes with frames
        # that have no channel axis.
        height, width = frame.shape[:2]

        # Build the network input: scale pixel values by 0.00392, resize to
        # 416x416, no mean subtraction, swap the red and blue channels.
        blob = cv2.dnn.blobFromImage(
            frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False
        )

        # Run a forward pass and collect the predictions of each output layer.
        net.setInput(blob)
        outs = net.forward(output_layers)

        # For each detection row: entries 0-3 are the box centre/size relative
        # to the frame, entries 5+ are the per-class scores.
        class_ids = []
        confidences = []
        boxes = []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = int(np.argmax(scores))
                confidence = scores[class_id]
                if confidence <= CONFIDENCE_THRESHOLD:
                    continue  # ignore weak detections

                # Scale the box back to pixel coordinates of the frame.
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                # Convert centre coordinates to the top-left corner.
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

        # Non-maximum suppression: drops boxes scoring below
        # NMS_SCORE_THRESHOLD and suppresses overlapping boxes.
        indexes = cv2.dnn.NMSBoxes(
            boxes, confidences, NMS_SCORE_THRESHOLD, NMS_OVERLAP_THRESHOLD
        )

        # The kept indices may come back flat, as an Nx1 array, or as an empty
        # tuple; flatten them and draw in ascending index order.
        for i in sorted(int(k) for k in np.asarray(indexes).flatten()):
            x, y, w, h = boxes[i]
            label = str(classes[class_ids[i]])
            confidence = confidences[i]
            color = colors[class_ids[i]]
            # Draw the bounding box and its "label confidence" caption.
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, f"{label} {round(confidence, 2)}", (x, y + 30), font, 3, color, 3)

        # Average frames per second since the loop started.
        elapsed_time = time.time() - starting_time
        fps = frame_id / elapsed_time if elapsed_time > 0 else 0.0
        cv2.putText(frame, f"FPS: {round(fps, 2)}", (10, 50), font, 4, (0, 0, 0), 3)
        cv2.imshow("Image", frame)

        # Stop when Esc (key code 27) is pressed. Masking keeps only the low
        # byte, since some platforms set extra high bits in the return value.
        key = cv2.waitKey(1) & 0xFF
        if key == 27:
            break
finally:
    # Always free the capture device and close the window, even if the loop
    # exits through an exception.
    cap.release()
    cv2.destroyAllWindows()

Welcome to Detection System! What do you want to get detected???

Enter 0 for real-time detection, 1 for video detection: 1
enter path for video: fruits.mp4
