# Evaluation with video feed

This notebook runs the hand detection and gesture classification models in the same manner as the gesture control pipeline. It uses the webcam of a laptop and displays the live video feed, drawing boxes around detected hands and printing the recognized gestures. It can be used to manually test the gesture recognition pipeline on the laptop under changing light conditions or any other factors you want to vary.

All needed adjustments are marked with "Todo".

In [1]:
import numpy as np
import cv2
import tensorflow as tf
import pathlib
import time


In [2]:
# ToDo: Only necessary if you changed the selected gestures in the script
# data_preprocessing_lared.ipynb: adjust the names and the number of gestures
# here too. The position of a name in the tuple is the class index that is
# looked up after classification.
INDEX_TO_LABEL = dict(enumerate(("crawl", "fist", "five", "palm", "peace")))

# Side lengths (in pixels) of the square image handed to the hand detector.
# Detected boxes are scaled to pixel coordinates with these values.
IMAGE_HEIGHT = IMAGE_WIDTH = 300

# Todo: change the thresholds and see how the models react.
# Lower the hand detection threshold to see objects like ears being detected
# as hands.
THRESHOLD_HAND_DETECTION = 0.8
THRESHOLD_GESTURE_CLASSIFICATION = 0.95

# Drawing parameters shared by all overlays (see add_boxes_to_img).
# Font face used for the "hand <score>" and gesture labels.
font = cv2.FONT_HERSHEY_SIMPLEX
# Colour of boxes and labels. The overlays are drawn while the frame is in
# RGB channel order (it is converted back to BGR only right before display),
# so this tuple is read as (R, G, B).
color = (0,205,205)
# Scale factor passed to cv2.putText for the label text.
fontScale = 0.35
# Stroke thickness of the label text (the box outline uses a literal 1).
thickness = 1

### Helper methods

In [3]:
def get_detected_hand_boxes(img):
    """ Detect hands in an image and keep only the confident detections.

    The image is wrapped into a batch of one, passed to the hand detection
    model and the raw detections are filtered with THRESHOLD_HAND_DETECTION.

    Input:
        img: Image to search for hands
    Returns:
        boxes: Bounding boxes (xmin, ymin, xmax, ymax) of the detected hands,
            scaled with IMAGE_WIDTH / IMAGE_HEIGHT by filter_detected_boxes
        hand_scores: Probability that the detected object is a hand
    """
    # Add a leading axis so the single image is passed as a batch of size one.
    batched_input = tf.convert_to_tensor(img)[tf.newaxis, ...]

    raw_detections = model_detect(batched_input)

    return filter_detected_boxes(raw_detections, threshold=THRESHOLD_HAND_DETECTION)


def filter_detected_boxes(hand_detections, threshold):
    """ Keep only the detections whose score exceeds the threshold.

    Input:
        hand_detections: Detector output; the entries 'detection_boxes' and
            'detection_scores' are read and the first batch element is used
        threshold: Minimum score (exclusive) a detection needs to be kept
    Returns:
        boxes: Bounding boxes of detected hands as (xmin, ymin, xmax, ymax)
            in pixels, scaled with IMAGE_WIDTH / IMAGE_HEIGHT
        hand_scores: Probability that the detected object is a hand,
            rounded to two decimals
    """
    all_boxes = hand_detections['detection_boxes'].numpy()[0]
    all_scores = hand_detections['detection_scores'].numpy()[0]

    boxes, hand_scores = [], []
    for idx, candidate_score in enumerate(all_scores):
        if not (candidate_score > threshold):
            continue

        # The raw box is stored as (ymin, xmin, ymax, xmax) fractions.
        top, left, bottom, right = (float(value) for value in all_boxes[idx][:4])

        boxes.append((
            int(left * IMAGE_WIDTH),
            int(top * IMAGE_HEIGHT),
            int(right * IMAGE_WIDTH),
            int(bottom * IMAGE_HEIGHT),
        ))
        hand_scores.append(round(candidate_score, 2))

    return boxes, hand_scores


def add_boxes_to_img(img, boxes, hand_scores, gesture_names, gesture_scores):
    """ Annotate an image with hand boxes, hand scores and gesture labels.

    Every box gets a rectangle and a "hand <score>" label at its top-left
    corner. If a gesture name is present for the box, the name and its score
    are written at the bottom-left corner as well. The four lists are walked
    in lockstep, so only as many boxes are drawn as the shortest list has
    entries.

    Input:
        img: Image to draw on
        boxes: The bounding boxes to draw, each as (xmin, ymin, xmax, ymax)
        hand_scores: The probability of a detected hand, one per box
        gesture_names: The names of classified gestures (empty string for none)
        gesture_scores: The probability of a classified gesture, as strings
    Returns:
        img: The input image with drawn bounding boxes and scores
    """
    detections = zip(boxes, hand_scores, gesture_names, gesture_scores)

    for (left, top, right, bottom), hand_score, gesture_name, gesture_score in detections:
        img = cv2.rectangle(img, (left, top), (right, bottom), color, 1)

        hand_label = 'hand ' + str(hand_score)
        img = cv2.putText(img, hand_label, (left+2, top+10), font, fontScale, color, thickness, cv2.LINE_AA)

        # Boxes without a confident gesture only get the hand label.
        if not gesture_name:
            continue

        gesture_label = " ".join((gesture_name, gesture_score))
        img = cv2.putText(img, gesture_label, (left+2, bottom-5), font, fontScale, color, thickness, cv2.LINE_AA)

    return img


def square_crop_window(xmin, xmax, ymin, ymax, img_height, img_width):
    """ Compute a square window around a bounding box that lies inside the image.

    The shorter side of the box is enlarged symmetrically until the window is
    square. The window is then shifted (not shrunk) so that it stays inside
    the image. The side length is at least 1 pixel and at most the smaller
    image dimension, so the resulting slice is never empty for a non-empty
    image.

    Input:
        xmin, xmax, ymin, ymax: Coordinates of a hand bounding box in pixels
        img_height, img_width: Size of the image the box refers to
    Returns:
        x, y: Top-left corner of the square window
        side: Side length of the square window
    """
    w = xmax - xmin  # width
    h = ymax - ymin  # height

    # At least one pixel so a degenerate box does not give an empty crop,
    # and never larger than the image itself.
    side = max(w, h, 1)
    side = min(side, img_height, img_width)

    # Grow the shorter side equally in both directions.
    x = xmin - (side - w) // 2
    y = ymin - (side - h) // 2

    # Keep the window inside the picture on BOTH axes.
    x = min(max(x, 0), img_width - side)
    y = min(max(y, 0), img_height - side)

    return x, y, side


def crop_rect(img, xmin, xmax, ymin, ymax):
    """ Crop a square region around a bounding box and resize it to 64x64.

    The bounding box is extended to a square (see square_crop_window), kept
    inside the image, cut out and scaled to 64x64 pixels. The image size is
    taken from the image itself.

    Input:
        img: Image that contains the bounding box
        xmin, xmax, ymin, ymax: Coordinates of a hand bounding box
    Returns:
        img: Square crop of the input image, resized to 64x64
    """
    img_height, img_width = img.shape[:2]
    x, y, side = square_crop_window(xmin, xmax, ymin, ymax, img_height, img_width)

    img = img[y:y+side, x:x+side]
    img = cv2.resize(img, (64,64))

    return img


def get_prediction_gesture(img):
    """ Predict the gesture on the input image

    The image is resized to 64x64, divided by 255, converted to float32 and
    passed to the classification model as a batch of one. The class with the
    highest score is looked up in INDEX_TO_LABEL. A gesture name is only
    reported if the unrounded score is above THRESHOLD_GESTURE_CLASSIFICATION.

    Input:
        img: Image that contains the gesture
    Returns:
        gesture_name: The name of the predicted gesture, or "" if the score
            does not exceed the threshold
        confidence: The probability for that gesture, rounded to two decimals
            and converted to a string (ready to be drawn on the image)
    """
    img = cv2.resize(img, (64,64))
    # Presumably maps 8-bit pixel values to the range [0, 1] - confirm
    # against the preprocessing used when the classifier was trained.
    img = (img/255).astype('float32')

    # Add a leading axis so the single image is passed as a batch of size one.
    input_tensor = tf.convert_to_tensor(img)[tf.newaxis,...]

    predictions = model_classify(input_tensor)

    predicted_index = int(np.argmax(predictions, axis=1)[0])
    raw_confidence = np.max(predictions[0])

    # Compare the unrounded score against the threshold. Rounding first would
    # let the rounding step decide borderline cases instead of the model
    # output. The rounded value is only used for display.
    if raw_confidence > THRESHOLD_GESTURE_CLASSIFICATION:
        gesture_name = INDEX_TO_LABEL[predicted_index]
    else:
        gesture_name = ""

    confidence = round(raw_confidence, 2)

    return gesture_name, str(confidence)


def classify_gestures_for_boxes(boxes, image=None):
    """ Classify the gestures for all bounding boxes

    Input:
        boxes: Bounding boxes containing hand detections, each as
            (xmin, ymin, xmax, ymax)
        image: Image the boxes refer to. If omitted, the notebook-global
            variable `img` (the current frame set by the video loop) is used,
            which is what this function always relied on implicitly.
    Returns:
        gesture_names: The names of the classified gestures
        gesture_scores: The probability of the classifed gestures
    """
    # Nothing to classify: return early without touching any image.
    if len(boxes) == 0:
        return [], []

    if image is None:
        # Backward-compatible fallback for callers that pass only the boxes.
        image = img

    gesture_names = []
    gesture_scores = []

    for (xmin, ymin, xmax, ymax) in boxes:
        cropped_image = crop_rect(image, xmin, xmax, ymin, ymax)
        gesture_name, gesture_score = get_prediction_gesture(cropped_image)
        gesture_names.append(gesture_name)
        gesture_scores.append(gesture_score)

    return gesture_names, gesture_scores


# NOTE: this is a second definition of filter_detected_boxes in the same
# cell. Being defined later, it is the one that is in effect at runtime.
def filter_detected_boxes(hand_detections, threshold):
    """ Select the detected hands that score above the given threshold.

    Input:
        hand_detections: Detector output; the entries 'detection_boxes' and
            'detection_scores' are read and the first batch element is used
        threshold: Minimum score (exclusive) a detection needs to be kept
    Returns:
        boxes: Bounding boxes of detected hands as (xmin, ymin, xmax, ymax)
            in pixels, scaled with IMAGE_WIDTH / IMAGE_HEIGHT
        hand_scores: Probability that the detected object is a hand,
            rounded to two decimals
    """
    raw_boxes = hand_detections['detection_boxes'].numpy()[0]
    raw_scores = hand_detections['detection_scores'].numpy()[0]

    # Positions of all detections that pass the threshold, in original order.
    kept = [pos for pos, value in enumerate(raw_scores) if value > threshold]

    # Raw boxes are (ymin, xmin, ymax, xmax) fractions; convert to pixel
    # coordinates in (xmin, ymin, xmax, ymax) order.
    boxes = [
        (
            int(float(raw_boxes[pos][1]) * IMAGE_WIDTH),
            int(float(raw_boxes[pos][0]) * IMAGE_HEIGHT),
            int(float(raw_boxes[pos][3]) * IMAGE_WIDTH),
            int(float(raw_boxes[pos][2]) * IMAGE_HEIGHT),
        )
        for pos in kept
    ]
    hand_scores = [round(raw_scores[pos], 2) for pos in kept]

    return boxes, hand_scores


### Load models

The detection and classification models are needed.

In [4]:
# hand detection model
# Loaded as a TensorFlow SavedModel from a path relative to this notebook.
model_dir = pathlib.Path("../2_detection/model_ssd_mobilenetV2")
model_detect = tf.saved_model.load(str(model_dir))
# Only the 'serving_default' signature is kept; it is called with a batched
# image tensor and its result is read as a dict with 'detection_boxes' and
# 'detection_scores' (see filter_detected_boxes).
model_detect = model_detect.signatures['serving_default']
print("hand detection model loaded")

# gesture classification model
# The loaded object itself is called directly with a batched float32 image
# (see get_prediction_gesture). `model_dir` is reused for the second path.
model_dir = pathlib.Path("../3_classification/model_mobilenet")
model_classify = tf.saved_model.load(str(model_dir))
print("gesture classification model loaded")

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
hand detection model loaded
gesture classification model loaded


### Start webcam displaying the live video feed

This script has only been tested on Windows, but it should
also run on other operating systems.


In [7]:
# Open the default webcam (device index 0) and fail early with a clear
# message instead of crashing later on an empty frame.
video = cv2.VideoCapture(0)
if not video.isOpened():
    raise RuntimeError("Could not open webcam (device index 0)")

try:
    # Read one frame up front and discard it, as the original cell did
    # (presumably to let the camera warm up).
    video.read()

    prev_time = time.time()

    while True:
        # load current video image
        ok, frame = video.read()
        if not ok or frame is None:
            print("No frame received from webcam, stopping.")
            break

        # The models are fed RGB images, OpenCV delivers BGR.
        # NOTE: `img` must stay a notebook-global here, because
        # classify_gestures_for_boxes reads the current frame from it.
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # crop black stride; the fixed window assumes a 640x480 camera frame
        # (TODO confirm for other cameras) and yields a 4:3 region.
        img = img[60:420, 80:560]
        # cv2.resize expects the target size as (width, height).
        img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))
        # Mirror horizontally so the preview behaves like a mirror.
        img = cv2.flip(img, 1)

        # Detect hands
        boxes, hand_scores = get_detected_hand_boxes(img)

        # Classify gestures
        gesture_names, gesture_scores = classify_gestures_for_boxes(boxes)

        # Draw boxes and labels on image
        img = add_boxes_to_img(img, boxes, hand_scores, gesture_names, gesture_scores)

        # Print current frames per second (skipped if the clock did not
        # advance, to avoid a division by zero).
        now = time.time()
        elapsed = now - prev_time
        prev_time = now
        if elapsed > 0:
            fps = round(1 / elapsed, 1)
            print(str(fps)+"fps", "\r", end="")

        # Back to BGR for display with OpenCV.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        img = cv2.resize(img, (600,600)) # display image a bit bigger for convenience

        cv2.imshow('webcam', img)
        # Mask to the low byte so the key code compares reliably.
        if (cv2.waitKey(1) & 0xFF) == 27:
            break  # esc to quit
finally:
    # Always free the camera and close the preview window, even if an
    # error occurs inside the loop.
    video.release()
    cv2.destroyAllWindows()

28.6fps            