In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from utils.vino_functions import PrepareNetWork,PrepareInputImage,MakePrediction
from pathlib import Path
import time

## Models

In [2]:
# Every model lives below this folder.
model_root = Path('model/intel')

# Face detector: weights (.bin) and topology (.xml).
facenet_bin = model_root.joinpath(
    'face-detection-retail-0005', 'FP16', 'face-detection-retail-0005.bin'
)
facenet_xml = model_root.joinpath(
    'face-detection-retail-0005', 'FP16', 'face-detection-retail-0005.xml'
)

# Emotion classifier: weights (.bin) and topology (.xml).
emotion_bin = model_root.joinpath(
    'emotions-recognition-retail-0003', 'FP16', 'emotions-recognition-retail-0003.bin'
)
emotion_xml = model_root.joinpath(
    'emotions-recognition-retail-0003', 'FP16', 'emotions-recognition-retail-0003.xml'
)

# Style-transfer network, stored as a single ONNX file.
style_trans = model_root.joinpath('style-GAN', 'StyleGAN.onnx')

# Inference devices handed to the network loader.
device1 = 'CPU'
device2 = 'GPU'

# Class names, indexed by the arg-max of the emotion network's output.
emotion_label = (
    'Neutral',
    'Happy',
    'Sad',
    'Surprise',
    'Anger',
)

#### Prepare Models

In [3]:
# Load the three networks. Each call yields a (names, executable network,
# input shape) triple; later cells pass names[0] to MakePrediction and use
# names[1] to index its result.
# NOTE(review): PrepareNetWork is a project helper not shown here -- confirm
# the exact meaning of each returned element against utils.vino_functions.

# Face detector, loaded on device1.
facenet_names, facenet_exec_net, facenet_img_shape = PrepareNetWork(facenet_xml,facenet_bin,device1)
# Emotion classifier, loaded on device2.
emotion_names, emotion_exec_net, emotion_img_shape = PrepareNetWork(emotion_xml,emotion_bin,device2)
# Style-transfer model: a single ONNX file, so no separate weights path is
# passed (None) and the loader is told the format via flag='onnx'.
style_names, style_exec_net, style_img_shape = PrepareNetWork(style_trans,None,device2,flag='onnx')

Extract Model Input Dimension: 1 3 300 300
Extract Model Input Dimension: 1 3 64 64
Extract Model Input Dimension: 1 3 224 224


In [4]:
# image = cv2.imread('data/face.jpg')
# image = cv2.resize(image,(224,224))
# image.shape

In [5]:
# _, in_frame = PrepareInputImage(image,style_img_shape)
# result = MakePrediction(style_exec_net, style_names[0], in_frame)

In [6]:
# styled_image = result[style_names[1]]
# styled_image.shape
# # styled_image = styled_image[0]
# # styled_image = styled_image.transpose((1,2,0))
# # styled_image = np.clip(styled_image, 0, 255)

# # plt.imshow(styled_image/255)
# # plt.axis('off')
# # plt.show()

In [11]:
def detection_box(result, output_key, prob_thresh, image, padding=10):
    """Stylise every confidently detected face in ``image`` and label its emotion.

    For each detection whose confidence reaches ``prob_thresh`` the padded
    face crop is classified by the emotion network, passed through the
    style-transfer network, blurred and written back over the crop. A green
    rectangle and an ``Emotion Status`` caption are then drawn on ``image``.
    ``image`` is modified in place.

    The function reads these notebook-level names: ``PrepareInputImage``,
    ``MakePrediction``, ``emotion_exec_net``, ``emotion_names``,
    ``emotion_img_shape``, ``emotion_label``, ``style_exec_net``,
    ``style_names`` and ``style_img_shape``.

    Parameters
    ----------
    result : mapping
        Face-detector output. ``result[output_key]`` must be an array that
        can be reshaped into rows of 7 values; column 1 is treated as the
        class id, column 2 as the confidence and columns 3-6 as
        xmin, ymin, xmax, ymax expressed as fractions of the frame size.
    output_key : hashable
        Key of the detection array inside ``result``.
    prob_thresh : float
        Minimum confidence a detection needs in order to be processed.
    image : numpy.ndarray
        Frame to annotate; its first two dimensions are height and width.
    padding : int, optional
        Pixels added around each box before cropping (default 10, the value
        that used to be hard-coded).

    Returns
    -------
    None
    """
    # -1 lets the row count follow the model instead of assuming 200 rows.
    detections = result[output_key].reshape(-1, 7)
    height, width = image.shape[:2]

    for det in detections:
        conf = det[2]
        # Written as "not >=" so a NaN confidence is skipped, as before.
        if not conf >= prob_thresh:
            continue

        # Box corners are fractions of the frame size; scale to pixels.
        xmin = int(det[3] * width)
        ymin = int(det[4] * height)
        xmax = int(det[5] * width)
        ymax = int(det[6] * height)

        # Clamp the padded crop to the frame. A negative slice start would
        # wrap around to the other side of the array and produce an empty
        # (or wrong) crop for faces close to the top/left border.
        top = max(ymin - padding, 0)
        bottom = min(ymax + padding, height)
        left = max(xmin - padding, 0)
        right = min(xmax + padding, width)
        if bottom <= top or right <= left:
            # Nothing of the box lies inside the frame.
            continue

        face_image = image[top:bottom, left:right]

        # Emotion: the label is the class with the highest score.
        _, emo_frame = PrepareInputImage(face_image, emotion_img_shape)
        emo_result = MakePrediction(emotion_exec_net, emotion_names[0], emo_frame)
        emo_proba = emo_result[emotion_names[1]].flatten()
        emo_label = emotion_label[np.argmax(emo_proba)]

        # Style transfer: take the first batch item, move channels last,
        # clip to the displayable range and resize back to the crop size.
        _, style_frame = PrepareInputImage(face_image, style_img_shape)
        style_result = MakePrediction(style_exec_net, style_names[0], style_frame)
        styled_image = style_result[style_names[1]][0].transpose((1, 2, 0))
        styled_image = np.clip(styled_image, 0, 255)
        styled_image = cv2.resize(
            styled_image, (face_image.shape[1], face_image.shape[0])
        )

        image[top:bottom, left:right] = cv2.blur(styled_image, (13, 13))

        # Annotate the frame that was passed in, not a notebook global.
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color=(0, 255, 0))
        cv2.putText(image, text=f"Emotion Status: {emo_label}",
                    org=(xmin, ymin - 5),  # X, Y
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.5,
                    color=(0, 255, 125), thickness=2, lineType=cv2.LINE_AA)
            
            
        
            
            


In [13]:
# Webcam loop: detect faces in every frame, stylise them, overlay the
# processing rate, show the frame and save it as a numbered PNG.
cap = cv2.VideoCapture(0)
# Kept at notebook level for code that reads a global named `font`.
font = cv2.FONT_HERSHEY_SIMPLEX
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

frame_name = 0
last_error = None  # repr of the most recent detection failure already reported

try:
    while cap.isOpened():
        ret, frame = cap.read()

        frame_name += 1

        if not ret:
            # No frames left to render.
            break

        # perf_counter is monotonic and fine-grained, unlike time.time().
        start_time = time.perf_counter()

        _, in_frame = PrepareInputImage(frame, facenet_img_shape)
        result = MakePrediction(facenet_exec_net, facenet_names[0], in_frame)

        try:
            detection_box(result, facenet_names[1], 0.9, frame)
        except Exception as exc:
            # Best effort: a failed frame is still displayed, but the error
            # is reported (once per distinct message) instead of being
            # silently discarded. KeyboardInterrupt is no longer swallowed.
            message = repr(exc)
            if message != last_error:
                print(f"detection_box failed on frame {frame_name}: {message}")
                last_error = message

        elapsed = time.perf_counter() - start_time
        # Guard against a zero interval so the FPS overlay cannot raise.
        fps = np.round(1 / elapsed) if elapsed > 0 else 0.0
        cv2.putText(frame, text=f"Current FPS: {fps}", org=(0, 30),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.5, color=(0, 0, 255),
                    thickness=2, lineType=cv2.LINE_AA)

        cv2.imshow('WebCAM', frame)
        cv2.imwrite(f'{str(frame_name).zfill(5)}.png', frame)

        # Mask to the low byte so the comparison with ord('q') also holds on
        # platforms where waitKey returns extra high bits.
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
finally:
    # Very important: always release the camera and close the windows, even
    # after an error or an interrupt, so the device is not left locked.
    cap.release()
    cv2.destroyAllWindows()
