## YOLO: You Only Look Once
- yolov3.cfg 
- yolov3.weights
- classes.txt

## YOLO Theory
- https://stackoverflow.com/questions/50575301/yolo-object-detection-how-does-the-algorithm-predict-bounding-boxes-larger-than
- https://www.kaggle.com/utkarshxy/object-detection-with-yolo-complete-theory-5mins
- https://www.youtube.com/watch?v=1LCb1PVqzeY&t=0s



## Import Library

In [2]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

## Configuration YOLO 
readNet(model[, config[, framework]]) -> retval
   * @brief Read deep learning network represented in one of the supported formats.
   * @param[in] model Binary file contains trained weights. The following file
   *                  extensions are expected for models from different frameworks:
   *                  * `*.caffemodel` (Caffe, http://caffe.berkeleyvision.org/)
   *                  * `*.pb` (TensorFlow, https://www.tensorflow.org/)
   *                  * `*.t7` | `*.net` (Torch, http://torch.ch/)
   *                  * `*.weights` (Darknet, https://pjreddie.com/darknet/)
   *                  * `*.bin` (DLDT, https://software.intel.com/openvino-toolkit)
   * @param[in] config Text file contains network configuration. It could be a
   *                   file with the following extensions:
   *                  * `*.prototxt` (Caffe, http://caffe.berkeleyvision.org/)
   *                  * `*.pbtxt` (TensorFlow, https://www.tensorflow.org/)
   *                  * `*.cfg` (Darknet, https://pjreddie.com/darknet/)
   *                  * `*.xml` (DLDT, https://software.intel.com/openvino-

In [3]:
# Build the YOLOv3 network from the Darknet weights file and its matching config file.
net = cv2.dnn.readNet('yolov3.weights', 'yolov3.cfg')

# Read the class labels, one per line; a label's position in the list is the
# class id used later to look the name up.
classes = []
with open('classes.txt', 'r') as labels_file:
    classes = labels_file.read().splitlines()

In [4]:
# Sanity-check the label file: report how many class names were loaded, then list them.
print('Total classes: ', len(classes))
print(classes)

Total classes:  80
['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


## Import Image for object Detection

In [86]:
# Load the test image. cv2.imread does not raise when the file is missing or cannot be
# decoded - it returns None - so fail here with a clear message instead of letting a
# later cell crash on `image.shape`.
image=cv2.imread('people.jpg')
if image is None:
    raise FileNotFoundError("Could not read 'people.jpg': check that the file exists and is a valid image")

# Preview the input in a native OpenCV window and close it after a key press.
cv2.imshow('Input Image: ',image)
cv2.waitKey()
cv2.destroyAllWindows()

In [87]:
# image.shape is unpacked as (height, width, channel). width and height are used later
# to scale the network's box coordinates to pixel values.
print('Image shape: ',image.shape)
height,width,channel=image.shape
print(height)
print(width)

Image shape:  (775, 936, 3)
775
936


In [88]:
# Work on a copy: the bounding boxes are drawn on yolo_input_image later, while the
# original `image` is kept unmodified for the side-by-side display.
yolo_input_image=np.copy(image)
yolo_input_image.shape

(775, 936, 3)

## Blob from Image
blobFromImage(image[, scalefactor[, size[, mean[, swapRB[, crop[, ddepth]]]]]]) -> retval

In [89]:
# Convert the image into the network's input blob. Arguments, in signature order:
#   1/255        - scalefactor applied to the pixel values
#   (416,416)    - spatial size of the blob
#   (0,0,0)      - mean to subtract (zero, i.e. none)
#   swapRB=True  - swap the first and last channels (OpenCV loads images as BGR)
#   crop=False   - do not crop after resizing
blob=cv2.dnn.blobFromImage(yolo_input_image,1/255,(416,416),(0,0,0),swapRB=True,crop=False)

In [90]:
# Inspect the blob dimensions: one image, 3 channels, 416 x 416 each.
blob.shape

(1, 3, 416, 416)

In [91]:
for b in blob:
    print(b.shape)
    # Each blob entry holds 3 channels, and each channel is a 416 x 416 plane (the size
    # passed to blobFromImage), not the original image's height x width.

(3, 416, 416)


In [92]:
# Show each channel of the blob in its own window, titled with the channel index.
# The loop variable is called channel_idx so that it does not overwrite the `channel`
# value unpacked from image.shape earlier in the notebook.
for b in blob:
    for channel_idx,img_blob in enumerate(b):
        cv2.imshow(str(channel_idx),img_blob)
cv2.waitKey()
cv2.destroyAllWindows()

## Initialize YOLO with Blobs

In [93]:
# Hand the preprocessed blob to the network as the input for the forward pass.
net.setInput(blob)

In [94]:
# Names of the network's unconnected output layers; these are the layers whose results
# are requested from the forward pass.
output_layers_name=net.getUnconnectedOutLayersNames()
output_layers_name

['yolo_82', 'yolo_94', 'yolo_106']

In [95]:
# Run a forward pass and collect the output of every layer listed in output_layers_name
# (one result per requested layer name).
layerOutputs=net.forward(output_layers_name)

In [96]:
# Number of output arrays returned by the forward pass.
len(layerOutputs)

3

In [97]:
# First value of the first detection row of the first output; the parsing loop reads
# this position (index 0) as the box centre x.
layerOutputs[0][0][0]

0.032563165

## Define Box Parameter, Confidence and Class ids

In [98]:
# Accumulators filled by the detection-parsing loop; the three lists are index-aligned.
boxes=[]        # one [x, y, w, h] entry per kept detection
confidences=[]  # best class score of each kept detection
class_ids=[]    # index of the best-scoring class of each kept detection

In [99]:
# Layout of one detection row: the first four values are the box (center x, center y,
# box width, box height). The parsing loop skips index 4 and reads the values from
# index 5 onward as one score per class; the best score gives the predicted class id.
# NOTE(review): index 4 is presumably the objectness score - confirm against the docs.
layerOutputs[0][0]

array([3.2563165e-02, 4.6636049e-02, 4.3901926e-01, 1.3100865e-01,
       7.0275079e-09, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e

In [100]:
# Turn the raw network outputs into pixel-space boxes, confidences and class ids.
# The three accumulators are re-created at the top of this cell so that running the
# cell again starts from empty lists instead of appending every detection a second time.
CONFIDENCE_THRESHOLD=0.5  # a detection is kept only if its best class score exceeds this
boxes=[]
confidences=[]
class_ids=[]

for output in layerOutputs:
    for detection in output:
        # Values from index 5 onward are the per-class scores; the best one decides
        # both the predicted class and the confidence of this detection.
        scores=detection[5:]
        class_id=np.argmax(scores)
        confidence=scores[class_id]
        if confidence>CONFIDENCE_THRESHOLD:
            # Box centre and size are scaled by the image dimensions to get pixels.
            center_x=int(detection[0]*width)
            center_y=int(detection[1]*height)
            w=int(detection[2]*width)
            h=int(detection[3]*height)

            # Shift from the box centre by half the size to get the top-left corner.
            x=int(center_x -(w/2))
            y=int(center_y -(h/2))

            boxes.append([x,y,w,h])
            confidences.append(float(confidence))
            # Store a plain Python int so the list prints and indexes the same way
            # regardless of the NumPy version.
            class_ids.append(int(class_id))
            

In [101]:
# Class id of every detection that passed the confidence filter (before NMS).
class_ids

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 60,
 60,
 60,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 40,
 41,
 41,
 41,
 41,
 41,
 41,
 41,
 40]

In [102]:
# Number of candidate boxes that passed the confidence filter, followed by the
# [x, y, w, h] entries themselves.
print('Box found: ',len(boxes))
print(boxes)

Box found:  67
[[1, 196, 193, 179], [-8, 206, 207, 209], [584, 320, 185, 157], [371, 362, 197, 187], [227, 385, 228, 244], [260, 387, 233, 231], [721, 319, 176, 415], [519, 399, 210, 366], [684, 341, 193, 428], [714, 326, 180, 450], [205, 592, 445, 167], [201, 585, 474, 173], [205, 569, 465, 206], [645, 157, 141, 205], [650, 163, 146, 195], [0, 185, 177, 214], [243, 161, 136, 255], [263, 168, 126, 240], [642, 164, 146, 228], [653, 171, 141, 218], [782, 170, 114, 235], [0, 193, 177, 231], [245, 168, 131, 269], [657, 183, 137, 238], [782, 173, 120, 272], [-1, 200, 179, 271], [469, 301, 121, 163], [602, 303, 145, 161], [1, 318, 121, 173], [-4, 303, 165, 199], [108, 321, 85, 146], [104, 320, 96, 167], [468, 298, 124, 189], [475, 302, 136, 188], [603, 308, 146, 180], [615, 311, 143, 177], [0, 328, 124, 186], [128, 346, 146, 183], [602, 320, 143, 205], [613, 317, 148, 216], [124, 343, 158, 233], [138, 341, 163, 238], [379, 356, 155, 223], [385, 343, 174, 243], [121, 352, 161, 276], [236, 379

## Non Maximum Suppression
NMSBoxes(bboxes, scores, score_threshold, nms_threshold[, eta[, top_k]]) -> indices

When two boxes overlap heavily, they most likely describe the same object, so only one of them should be kept. In that case <b>NMSBoxes</b> discards the lower-scoring box and keeps the higher-scoring one.

In [103]:
# Check the Python type of the stored confidence values.
type(confidences[0])

float

In [104]:
# Check the Python type of the stored box coordinates.
type(boxes[0][0])

int

In [105]:
# Non-maximum suppression over all candidate boxes. Positional arguments after the
# boxes and their scores:
#   0.3 - score_threshold
#   0.4 - nms_threshold
# The result holds the indices (into `boxes`) of the detections that are kept.
indexes=cv2.dnn.NMSBoxes(boxes,confidences,0.3,0.4)
indexes

array([[34],
       [16],
       [42],
       [21],
       [18],
       [ 9],
       [66],
       [32],
       [28],
       [40],
       [60],
       [30],
       [54],
       [11],
       [51],
       [61],
       [20],
       [58]], dtype=int32)

In [106]:
# Report how many detections survive NMS and show their indices as a flat array.
# Depending on the OpenCV version, NMSBoxes may return an empty tuple (which has no
# .flatten()) when nothing survives, so the result is normalised through np.asarray first.
print('After Non Maximum Suppression (NMS): ',len(indexes))
np.asarray(indexes).flatten()

After Non Maximum Suppression (NMS):  18


array([34, 16, 42, 21, 18,  9, 66, 32, 28, 40, 60, 30, 54, 11, 51, 61, 20,
       58], dtype=int32)

## Draw the Bounding Box on Image 

In [107]:
# Font used for the text labels drawn next to each box.
font=cv2.FONT_HERSHEY_PLAIN
# One random colour triple per candidate box (looked up by box index at draw time).
# A fixed seed keeps the colours identical across Restart & Run All.
colors=np.random.RandomState(42).uniform(0,255,size=(len(boxes),3))

In [108]:
# Shape is (number of candidate boxes, 3): one colour triple per box.
print(colors.shape)
colors

(67, 3)


array([[209.44495171,  55.13474942, 136.84524274],
       [ 87.09833817, 119.36918663,  59.41495622],
       [150.84755871, 235.96936183, 155.72133873],
       [205.75305396,  94.89788033,  70.10456361],
       [216.36528489, 222.54316639, 104.81143333],
       [109.94677867, 135.26829369, 143.97084279],
       [ 40.60829598,  73.53139886, 185.61902291],
       [ 71.97381052, 103.43667405, 142.95401093],
       [178.85985686, 210.523659  , 202.42262349],
       [152.07275653, 149.36186439,  47.43372557],
       [121.59896722,  33.36432299, 212.61459708],
       [247.43531759, 225.52080007, 179.21852054],
       [222.1744657 ,  24.74637119,  65.67487216],
       [182.01838346, 174.95463972, 197.77835304],
       [123.70778303, 189.3357475 ,  34.46545299],
       [186.49226965, 116.14462533, 239.00225323],
       [227.21157013,   2.80756899, 243.28102575],
       [186.43206657, 244.82664405, 144.29049591],
       [227.34197342,  26.51486501,  46.68035212],
       [254.12131523,  17.54540

In [109]:
# Draw every detection that survived NMS onto the working copy; when NMS kept
# nothing there is simply nothing to draw.
kept = indexes.flatten() if len(indexes) > 0 else []
for i in kept:
    x, y, w, h = boxes[i]
    label = str(classes[class_ids[i]])
    color = colors[i]
    confidence = str(round(confidences[i], 2))

    # Box outline in this box's own colour, from top-left to bottom-right corner.
    top_left = (x, y)
    bottom_right = (x + w, y + h)
    cv2.rectangle(yolo_input_image, top_left, bottom_right, color, 2)

    # "<class name> <score>" caption, placed 20 px below the box's top edge.
    caption = '{} {}'.format(label, confidence)
    cv2.putText(yolo_input_image, caption, (x, y + 20), font, 2, (0, 0, 255), 2)

# Show the untouched input next to the annotated result, then close both windows
# once a key has been pressed.
for title, frame in (('YOLO Input', image), ('YOLO Output', yolo_input_image)):
    cv2.imshow(title, frame)
cv2.waitKey()
cv2.destroyAllWindows()