# Detecting on Video

In [1]:
import os
import time
import cv2
import numpy as np
from model.yolo_model import YOLO
from lxml import etree
import xml.etree.cElementTree as ET

Using TensorFlow backend.


In [2]:
def get_classes(file):
    """Read class names from a text file.

    # Argument:
        file: path of the text file that lists the class names,
            one name per line.

    # Returns
        class_names: List, one entry per line of the file (in file
            order) with leading/trailing whitespace removed.

    """
    with open(file) as handle:
        # Iterating the handle yields the same lines readlines() would.
        class_names = [line.strip() for line in handle]

    return class_names

In [3]:
# Build the detector used by the video loop.
# NOTE(review): the two positional arguments are presumably the object-confidence
# and NMS thresholds — confirm against model/yolo_model.py.
yolo = YOLO(0.6, 0.5)
# Text file with one class name per line (read by get_classes).
file = 'data/coco_classes.txt'
# Class names, looked up later by the class index the detector returns.
all_classes = get_classes(file)



In [4]:
# Step by which the video loop advances its frame counter before seeking with
# camera.set(); for a 30 fps video a value of 30 moves about one second ahead.
frame_number = 30

In [5]:
# Path of the input video handed to cv2.VideoCapture.
video = 'car1.mp4'

In [None]:

# Sample the video every `frame_number` frames, run YOLO on each sampled frame
# and, whenever the detector returns boxes, write a Pascal VOC-style XML file
# plus the frame as JPEG into `sourceDirectory`. Only 'person' and 'car'
# detections are drawn and annotated. Press 'q' in the preview window to stop.
camera = cv2.VideoCapture(video)
sourceDirectory = r'E:\Computer-Vision-with-Python\06-Deep-Learning-Computer-Vision\06-YOLOv3\xmls'
# Make sure the output folder exists; cv2.imwrite only returns False (no
# exception) when it cannot write, so a missing folder is easy to overlook.
os.makedirs(sourceDirectory, exist_ok=True)

wanted_classes = ('person', 'car')
# Index of the frame being processed; also used as the output file stem.
count = 0
try:
    while True:
        res, frame = camera.read()

        if not res:
            # End of the video (or a read failure).
            break

        # Network input: 416x416, float32 scaled to [0, 1], with a leading
        # batch axis.
        image = cv2.resize(frame, (416, 416),
                           interpolation=cv2.INTER_CUBIC)
        image = np.array(image, dtype='float32')
        image /= 255.
        image = np.expand_dims(image, axis=0)

        boxes, classes, scores = yolo.predict(image, frame.shape)

        if boxes is not None:
            height, width = frame.shape[:2]

            annotation = ET.Element('annotation')
            ET.SubElement(annotation, 'folder').text = 'kitty'
            ET.SubElement(annotation, 'filename').text = str(count) + '.jpg'
            ET.SubElement(annotation, 'segmented').text = '0'
            size = ET.SubElement(annotation, 'size')
            ET.SubElement(size, 'width').text = str(width)
            ET.SubElement(size, 'height').text = str(height)
            ET.SubElement(size, 'depth').text = '3'

            for box, cl, score in zip(boxes, classes, scores):
                label = all_classes[cl]
                if label not in wanted_classes:
                    continue

                x, y, w, h = box

                # Round to the nearest pixel and clip to the frame; plain
                # ints keep OpenCV and str() independent of numpy scalars.
                x_min = max(0, int(np.floor(x + 0.5)))
                y_min = max(0, int(np.floor(y + 0.5)))
                x_max = min(width, int(np.floor(x + w + 0.5)))
                y_max = min(height, int(np.floor(y + h + 0.5)))

                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max),
                              (255, 0, 0), 2)
                cv2.putText(frame, '{0} {1:.2f}'.format(label, score),
                            (x_min, y_min - 6),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.6, (0, 0, 255), 1,
                            cv2.LINE_AA)

                ob = ET.SubElement(annotation, 'object')
                ET.SubElement(ob, 'name').text = label
                ET.SubElement(ob, 'pose').text = 'Unspecified'
                ET.SubElement(ob, 'truncated').text = '0'
                ET.SubElement(ob, 'difficult').text = '0'
                bbox = ET.SubElement(ob, 'bndbox')
                ET.SubElement(bbox, 'xmin').text = str(x_min)
                ET.SubElement(bbox, 'ymin').text = str(y_min)
                ET.SubElement(bbox, 'xmax').text = str(x_max)
                ET.SubElement(bbox, 'ymax').text = str(y_max)

            # Round-trip through lxml only to get pretty-printed output.
            xml_str = ET.tostring(annotation)
            root = etree.fromstring(xml_str)
            xml_str = etree.tostring(root, pretty_print=True)
            save_path = os.path.join(sourceDirectory, str(count) + '.xml')
            with open(save_path, 'wb') as temp_xml:
                temp_xml.write(xml_str)

            # NOTE(review): the saved JPEG already has the boxes and labels
            # drawn on it; save a copy taken before drawing if clean training
            # images are wanted.
            cv2.imwrite(os.path.join(sourceDirectory, str(count) + '.jpg'), frame)

            cv2.imshow("detection", frame)

            # waitKey(0) blocks until a key is pressed: 'q' quits, any other
            # key moves on to the next sampled frame.
            if cv2.waitKey(0) & 0xff == ord('q'):
                break

        # Advance on every processed frame, with or without detections.
        # Advancing only after a detection lets `count` fall behind the real
        # read position, so file names stop matching frame indices and the
        # seek below can jump backwards and re-process the same frames.
        count += frame_number  # e.g. at 30 fps, 30 advances one second
        camera.set(cv2.CAP_PROP_POS_FRAMES, count)
finally:
    # Release the capture and close the preview window even on errors.
    camera.release()
    cv2.destroyAllWindows()