# Computational Intelligence Assignment

* Title : YOLOv3: An Incremental Improvement
* Research Paper link : https://pjreddie.com/media/files/papers/YOLOv3.pdf
* Author : Joseph Redmon, Ali Farhadi @ University of Washington

### Implementation : A real time Object Detector based on YOLOv3 model
* Author/Developer : Bharat Dadwaria (https://bharatdadwaria.github.io)
* Roll No. : 1941000147
* M.Tech Statistical Computing (JNU)

I have implemented "A real time object detector based on the YOLOv3 model". I took the pretrained YOLOv3 model, predicted the bounding-box coordinates and drew those bounding boxes with OpenCV. For this we need to download the following files.
* YOLOv3 weights : https://pjreddie.com/media/files/yolov3.weights
* YOLOv3 Configuration file : https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg

### Analysing the YOLOv3 Model 
#### Setting up the configuration file yolov3.cfg 
This file describes the YOLOv3 model: it holds the configuration of each of its 106 layers, including the type of every layer (convolutional / detection / shortcut / ...).

In [37]:
import torch
import torch.nn as nn
import numpy as np

In [38]:
def construct_cfg(construct_cfg):
    """Parse a Darknet ``.cfg`` file into a list of block dictionaries.

    Every ``[section]`` header starts a new dict whose ``'type'`` key holds
    the section name; each following ``key=value`` line is stored in that
    dict.  Keys and values are kept as strings (no numeric conversion).

    Blank lines, whitespace-only lines and comment lines (``#``, optionally
    indented) are ignored.

    Args:
        construct_cfg: Path of the configuration file to read.

    Returns:
        A list of dicts, one per section, in file order.  The block being
        built is always appended at the end, so a file with no sections and
        no options yields ``[{}]``.

    Raises:
        ValueError: If an option line does not contain ``=``.
    """
    # `with` closes the file handle even when parsing fails.
    with open(construct_cfg,'r') as config:
        rawLines=config.read().split('\n')

    # Strip BEFORE filtering so whitespace-only lines and indented comments
    # are dropped instead of reaching the parser below.
    stripped=[line.strip() for line in rawLines]
    lines=[line for line in stripped if line and not line.startswith('#')]

    networkBlocks=[]
    networkBlock={}
    for line in lines:
        if line.startswith('['):
            # A new header closes the block collected so far.
            if networkBlock:
                networkBlocks.append(networkBlock)
                networkBlock={}
            networkBlock['type']=line[1:-1].rstrip()
        else:
            # Split on the first '=' only, so values may contain '='.
            entity,separator,value=line.partition('=')
            if not separator:
                raise ValueError("expected 'key=value' in config line: %r" % line)
            networkBlock[entity.rstrip()]=value.lstrip()
    networkBlocks.append(networkBlock)
    return networkBlocks

In [39]:
# Parse yolov3.cfg into a list of per-section dictionaries.
con_file=construct_cfg('yolov3.cfg')

In [40]:
# Display the parsed configuration blocks.
print(con_file)

[{'type': 'net', 'batch': '1', 'subdivisions': '1', 'width': '320', 'height': '320', 'channels': '3', 'momentum': '0.9', 'decay': '0.0005', 'angle': '0', 'saturation': '1.5', 'exposure': '1.5', 'hue': '.1', 'learning_rate': '0.001', 'burn_in': '1000', 'max_batches': '500200', 'policy': 'steps', 'steps': '400000,450000', 'scales': '.1,.1'}, {'type': 'convolutional', 'batch_normalize': '1', 'filters': '32', 'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'}, {'type': 'convolutional', 'batch_normalize': '1', 'filters': '64', 'size': '3', 'stride': '2', 'pad': '1', 'activation': 'leaky'}, {'type': 'convolutional', 'batch_normalize': '1', 'filters': '32', 'size': '1', 'stride': '1', 'pad': '1', 'activation': 'leaky'}, {'type': 'convolutional', 'batch_normalize': '1', 'filters': '64', 'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'}, {'type': 'shortcut', 'from': '-3', 'activation': 'linear'}, {'type': 'convolutional', 'batch_normalize': '1', 'filters': '128', 'size'




### Implementing the Real time Object Detector based on YOLOv3
To implement a real time object detector based on YOLOv3, I used the pretrained Darknet model, which is trained on the COCO dataset with 80 classes. I first tried to implement the whole network on my own, but the YOLOv3 model proved too complex to implement from scratch, so the pretrained model is used instead.

In [41]:
# Importing libraries 
import cv2 as cv
import numpy as np

In [42]:
# Confidence threshold: postprocess() only keeps detections whose best class
# score is greater than this value; it is also passed to cv.dnn.NMSBoxes.
confThreshold=0.05

In [43]:
# Non-maximum-suppression threshold passed to cv.dnn.NMSBoxes in postprocess().
# NOTE(review): per the OpenCV documentation this is the overlap threshold used
# to suppress boxes that overlap a higher-scoring box, not a score cut-off.
nmsThreshold=0.40

In [44]:
# Network input size: every image/frame is resized to inpWidth x inpHeight
# when the input blob is built with cv.dnn.blobFromImage.
inpWidth=416
inpHeight=416

In [45]:
# File holding the class names, one per line; `classes` is filled from it below.
classesFile="coco.names"
classes=None

In [46]:
# Read the class names into a list, one entry per line of the file.
with open(classesFile,'rt') as namesFile:
    rawNames=namesFile.read()
    classes=rawNames.rstrip('\n').split('\n')

In [49]:
# Show the loaded class names.
print(classes)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [50]:
# Paths of the Darknet configuration file and the pretrained weight file.
modelConf='yolov3.cfg'
modelWeights='yolov3.weights'

In [51]:
# Setting up the neural network through OpenCV's dnn module
net = cv.dnn.readNetFromDarknet(modelConf, modelWeights)
# Use OpenCV's own implementation as the computation backend
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
# Setting up the target computational device as CPU
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

In [52]:
# Show the Python type of the loaded network object.
print(type(net))

<class 'cv2.dnn_Net'>


#### The 3 different scaled output layers (Detection layers)

In [53]:
#This function will output the name of the layers which are detection layer (3 scales)
def getOutputsNames(net):
    """Return the names of the network's unconnected (output) layers.

    Args:
        net: Network object providing ``getLayerNames()`` and
            ``getUnconnectedOutLayers()``.

    Returns:
        A list with the name of each output layer, in the order reported by
        ``getUnconnectedOutLayers()``.
    """
    # Names of all the layers in the network
    layersNames = net.getLayerNames()
    # Depending on the OpenCV version the ids arrive as an (N,1) array or as
    # a flat (N,) array; flattening accepts both shapes.
    outLayerIds = np.asarray(net.getUnconnectedOutLayers()).reshape(-1)
    # The reported ids are 1-based, the name list is 0-based.
    return [layersNames[int(layerId) - 1] for layerId in outLayerIds]

In [54]:
# Look up and display the names of the network's output layers.
Detectionlayers=getOutputsNames(net)
print(Detectionlayers)

['yolo_82', 'yolo_94', 'yolo_106']


### Predicting the coordinates for an image

In [55]:
image=cv.imread('image.jpg')
# cv.imread signals an unreadable/missing file by returning None rather than
# raising, so fail here with a clear message instead of inside blobFromImage.
if image is None:
    raise FileNotFoundError("could not read image file 'image.jpg'")

# Scale pixel values by 1/255 and resize to the network input size.
blob=cv.dnn.blobFromImage(image,1/255,(inpWidth,inpHeight),[0,0,0],1,crop=False)
net.setInput(blob)
# p holds one output array per output layer returned by getOutputsNames(net).
p=net.forward(getOutputsNames(net))
print('At scale-1(82nd layer) total prediction(boxes) : ',len((p[0])))
print('At scale-2(94th layer) total prediction(boxes):',len(p[1]))
print('At scale-3(106th layer) total prediction(boxes):',len(p[2]))
print('Number of vectors for each boxes: ',len(p[2][0]))

At scale-1(82nd layer) total prediction(boxes) :  507
At scale-2(94th layer) total prediction(boxes): 2028
At scale-3(106th layer) total prediction(boxes): 8112
Number of vectors for each boxes:  85


### Detecting the objects for the above image file 

In [56]:
# Draw the detections for the image using `p`, the network output computed
# for this image in the previous cell.  (`out` belongs to the webcam loop
# further down and is not defined at this point.)
# NOTE: postprocess() and drawPred() are defined in later cells and must be
# executed before this one.
postprocess(image,p)
cv.imshow('Detection for a image',image)
# Block until a key is pressed, then close the window.
cv.waitKey(0)
cv.destroyAllWindows()

### Drawing the post-processed coordinates

In [57]:
def drawPred(img,classid,conf,left,top,right,bottom):
    """Draw one detection (bounding box plus text label) onto ``img`` in place.

    Args:
        img: Image to draw on; it is modified by the OpenCV drawing calls.
        classid: Index of the detected class in the module-level ``classes``.
        conf: Confidence value, shown with two decimals in the label.
        left, top, right, bottom: Pixel coordinates of the box corners.

    Raises:
        IndexError: If ``classes`` is loaded and ``classid`` is outside it.
    """
    cv.rectangle(img,(left,top),(right,bottom),(255,178,50),1)
    label="%.2f" %conf
    if classes:
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if not 0<=classid<len(classes):
            raise IndexError("class id %d is outside the %d known classes" % (classid,len(classes)))
        # Separate the class name from the score so the label stays readable.
        label="%s:%s" %(classes[classid],label)
    # putText anchors the text at its bottom-left corner; keep that baseline at
    # least one text-height below the top edge so that labels of boxes
    # touching the top of the image are not drawn outside it.
    (_,labelHeight),_=cv.getTextSize(label,cv.FONT_HERSHEY_SIMPLEX,1,1)
    textTop=max(top,labelHeight)
    cv.putText(img,label,(left,textTop),cv.FONT_HERSHEY_SIMPLEX,1,(255,255,255),1)

### Handling the predicted values from model and postprocessing it

In [58]:
def postprocess(img,outs):
    """Filter the raw network outputs and draw the surviving boxes on ``img``.

    Every detection row is read as: values 0-3 are the box centre x, centre y,
    width and height relative to the image size, and values from index 5 on
    are the per-class scores (index 4 is not used here).  A detection is kept
    when its best class score exceeds ``confThreshold``; the kept boxes are
    then reduced with ``cv.dnn.NMSBoxes`` and drawn via ``drawPred``.

    Args:
        img: Image the detections refer to; boxes are drawn onto it in place.
        outs: Iterable of output arrays, one per output layer, each iterable
            row by row.

    Returns:
        None.
    """
    frameHeight=img.shape[0]
    frameWidth=img.shape[1]

    classIds=[]
    confidances=[]
    boxes=[]

    for out in outs:
        for detection in out:
            # Per-class scores of this detection
            scores=detection[5:]
            classid=np.argmax(scores)
            confidance=scores[classid]

            if confidance>confThreshold:
                # Convert the relative centre/size values to pixel units
                centerX=int(detection[0]*frameWidth)
                centerY=int(detection[1]*frameHeight)
                width=int(detection[2]*frameWidth)
                height=int(detection[3]*frameHeight)
                left=int(centerX-width/2)
                top=int(centerY-height/2)

                classIds.append(classid)
                confidances.append(float(confidance))
                boxes.append([left,top,width,height])
    indices=cv.dnn.NMSBoxes(boxes,confidances,confThreshold,nmsThreshold)
    # Depending on the OpenCV version the kept indices come back as an (N,1)
    # array, a flat (N,) array, or an empty tuple when nothing is kept;
    # flattening handles all three.
    for i in np.asarray(indices).reshape(-1):
        i=int(i)
        left,top,width,height=boxes[i]
        drawPred(img,classIds[i],confidances[i],left,top,left+width,top+height)

### Testing the model at real time using WebCam

In [23]:
winName="A real time Object Detector"
cv.namedWindow(winName,cv.WINDOW_NORMAL)
cv.resizeWindow(winName,1000,1000)

cap=cv.VideoCapture(0)
# The output layer names do not change between frames, so look them up once.
outputNames=getOutputsNames(net)
try:
    # Run until any key is pressed (waitKey returns a negative value otherwise).
    while (cv.waitKey(1)<0):
        hasFrame,frame=cap.read()
        # Stop when the camera delivers no frame; passing the resulting None
        # frame on would crash blobFromImage.
        if not hasFrame:
            break
        blob=cv.dnn.blobFromImage(frame,1/255,(inpWidth,inpHeight),[0,0,0],1,crop=False)
        net.setInput(blob)
        out=net.forward(outputNames)
        postprocess(frame,out)
        cv.imshow(winName,frame)
finally:
    # Release the camera and close the window even if processing fails.
    cap.release()
    cv.destroyAllWindows()