In [1]:
!pip install opencv-python



# Importing Libraries

In [19]:
import cv2
import numpy as np

# Loading YOLO Model

In [20]:
net = cv2.dnn.readNet('yolov3.weights', 'yolov3.cfg')

# Loading Classes

In [21]:
classes = []
with open('coco.names', 'r') as f:
    classes= f.read().splitlines()
    
print(classes)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


# Reading Image

In [39]:
img = cv2.imread('office2.jpg')
height, width, _ = img.shape 

# Creating Blob

In [40]:
blob = cv2.dnn.blobFromImage(img, 1/255, (416,416), (0,0,0), swapRB=True, crop=False)
'''
1. img:
This is the input image that you want to process.

2. 1/255:
This is a scaling factor applied to each pixel value of the image. It is common to scale pixel values to the range [0, 1] 
for neural network input. In this case, each pixel value is divided by 255.

3. (416, 416):
This is the spatial size to which the input image will be resized. The YOLO model used in this code expects input images 
to be of size 416x416 pixels.

4. (0, 0, 0):
This represents the mean subtraction values for each channel. Subtracting the mean helps center the data around zero. 
In this case, (0, 0, 0) means no mean subtraction.

5. swapRB=True:
This parameter specifies whether to swap the Red and Blue channels in the input image. 
OpenCV loads images in BGR (Blue, Green, Red) order by default, but many pre-trained models, including YOLO, 
expect images in RGB order. Setting swapRB=True swaps the channels accordingly.

6. crop=False:
This parameter determines whether to crop the image after resizing it to the specified size. 
In this case, crop=False means no cropping is performed.
'''

#for b in blob:
#    for n,img_blob in enumerate(b):
#        cv2.imshow(str(n), img_blob)

'\n1. img:\nThis is the input image that you want to process.\n\n2. 1/255:\nThis is a scaling factor applied to each pixel value of the image. It is common to scale pixel values to the range [0, 1] \nfor neural network input. In this case, each pixel value is divided by 255.\n\n3. (416, 416):\nThis is the spatial size to which the input image will be resized. The YOLO model used in this code expects input images \nto be of size 416x416 pixels.\n\n4. (0, 0, 0):\nThis represents the mean subtraction values for each channel. Subtracting the mean helps center the data around zero. \nIn this case, (0, 0, 0) means no mean subtraction.\n\n5. swapRB=True:\nThis parameter specifies whether to swap the Red and Blue channels in the input image. \nOpenCV loads images in BGR (Blue, Green, Red) order by default, but many pre-trained models, including YOLO, \nexpect images in RGB order. Setting swapRB=True swaps the channels accordingly.\n\n6. crop=False:\nThis parameter determines whether to crop th

In [41]:
net.setInput(blob) #sets the blob as the input to the YOLO network.


output_layers_names = net.getUnconnectedOutLayersNames()
layerOutputs = net.forward(output_layers_names) #performs a forward pass through the YOLO network to obtain the output layers.


boxes = []
confidences = []
class_ids = []

for output in layerOutputs:
    for detection in output:
        # Processing each detection, extracting information,
        # and storing relevant data in lists.
        scores = detection[5:]
        class_id = np.argmax(scores)
        confidence = scores[class_id]
        if confidence > 0.5:
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            
            x = int(center_x - w/2)
            y = int(center_y - h/2)
            
            boxes.append([x,y,w,h])
            confidences.append((float(confidence)))
            class_ids.append(class_id)
            
            '''processes the output of the YOLO network, extracting bounding boxes, confidence scores, 
            and class IDs for detections with confidence greater than 0.5.'''
            
            
print(len(boxes))
indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4) #applies Non-Maximum Suppression to filter out overlapping 
                                                        #and low-confidence bounding boxes. The result is stored in the indexes variable.
print(indexes.flatten())

font = cv2.FONT_HERSHEY_PLAIN
colors = np.random.uniform(0, 155, size=(len(boxes), 3))

for i in indexes.flatten():
    # Extracting box coordinates, label, confidence, and color.
    # Drawing rectangle and text on the image.

    x,y,w,h = boxes[i]
    label = str(classes[class_ids[i]])
    confidence = str(round(confidences[i], 2))
    color = colors[i]
    cv2.rectangle(img, (x,y), (x+w, y+h), color, 2)
    cv2.putText(img, label + " " + confidence, (x, y+20), font,2, (255,255,255), 2)
    
    

cv2.imshow('Image', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

15
[10  0  3  9  1 11  7]
