## Hand Gesture Recognition

This notebook implements hand gesture recognition using OpenCV for the purpose of playing rock paper scissors.

In [1]:
import numpy as np
import cv2
import imutils

### Define helper functions

In [2]:
def getBlurredRegion(img, start, end, kernel=(7, 7)):
    """Crop a rectangle out of a BGR image and return it as a blurred grayscale image.

    img: image array indexed as [row, column, channel].
    start: (x, y) of the rectangle's top-left corner.
    end: (x, y) of the rectangle's bottom-right corner (exclusive).
    kernel: Gaussian kernel size handed to cv2.GaussianBlur.
    """
    # points are (x, y) while the array is indexed [row, column], hence the swap below
    left, top = start[0], start[1]
    right, bottom = end[0], end[1]

    gray = cv2.cvtColor(img[top:bottom, left:right], cv2.COLOR_BGR2GRAY)
    # sigma of 0 lets OpenCV derive the standard deviation from the kernel size
    return cv2.GaussianBlur(gray, kernel, 0)

In [3]:
def getForegroundMask(img, bg, threshold=15):
    """Build a binary foreground mask by differencing an image against a background model.

    img: current grayscale region.
    bg: background model for the same region; converted to uint8 before differencing.
    threshold: absolute differences above this value are marked as foreground (255),
        everything else as background (0).
    """
    background_u8 = bg.astype(np.uint8)
    difference = cv2.absdiff(background_u8, img)
    # cv2.threshold returns (threshold_used, image); only the image is needed
    _, foreground = cv2.threshold(difference, threshold, 255, cv2.THRESH_BINARY)
    return foreground

In [4]:
def getHandContour(img):
    """Returns the contour corresponding to the hand from a masked image.

    img: binary foreground mask.
    Returns the contour with the largest area, or None when the mask contains
    no contours at all.
    """
    # cv2.findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contour list is second-to-last in
    # every version, so index from the end instead of unpacking a fixed arity.
    contours = cv2.findContours(img, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # contour with maximum area should be the hand
    return max(contours, key=cv2.contourArea, default=None)

In [12]:
def recognizeGesture(cnt, thresh=30):
    """Determines whether a hand gesture corresponds to rock, paper, or scissors.

    The gesture is inferred from the number of deep convexity defects of the
    contour, which are taken to be the gaps between extended fingers.

    cnt: hand contour, as produced by cv2.findContours.
    thresh: minimum defect depth (after dividing OpenCV's value by 256) for a
        defect to count as a gap between fingers.
    Returns "paper", "scissors", "rock", or None when the gesture is not
    recognized or the defects could not be computed.
    """
    hull = cv2.convexHull(cnt, returnPoints=False)
    try:
        defects = cv2.convexityDefects(cnt, hull)
    except cv2.error:
        # OpenCV rejects some contours (for example self-intersecting ones);
        # treat those frames as "no gesture". Only OpenCV errors are caught so
        # that KeyboardInterrupt and genuine programming errors still surface.
        return None

    if defects is None:
        return None

    # Each defect row is (start_index, end_index, farthest_index, depth). The
    # depth is a fixed-point value; dividing by 256 gives the approximate
    # distance from the hull to the farthest point. Defects deeper than the
    # threshold are considered to be the space between fingers.
    n = sum(1 for depth in defects[:, 0, 3] if depth / 256.0 > thresh)

    if n >= 4:  # should really be n == 4, but n >= 4 can be less finicky for paper detection
        return "paper"
    if n == 1:
        return "scissors"
    if n == 0:
        return "rock"  # cannot actually distinguish between 1 and 0 fingers held up
    # 2 or 3 gaps do not correspond to any of the three gestures
    return None

### Set parameters
* `start_point` and `end_point` are the top-left and bottom-right (x, y) pixel coordinates of the bounding box marking the region of the video where your hand will be placed, measured in the frame after it has been resized to a width of 800 pixels. The scene inside this region must stay still for the first 30 frames after the cell starts running so that a background model of the region can be initialized.

In [6]:
# (x, y) corners of the hand region rectangle: top-left, then bottom-right
# (an alternative placement is (250, 250) to (550, 550))
start_point, end_point = (10, 10), (310, 310)

# alpha passed to cv2.accumulateWeighted for the running average background model
alpha = 0.5

# 5x5 kernel of ones for morphological transformations
kernel = np.ones(shape=(5, 5), dtype=np.uint8)

## Recognize gestures from livestream
Let the background model initialize by keeping the scene in the bounding box region still for the first 30 frames. Placing your hand in the bounding box afterwards outputs a text prediction of the gesture (rock, paper, or scissors) whenever one is recognized. The hand contour is displayed in red and the convex hull in green. An additional window shows the foreground mask for the hand. Press `q` with one of the video windows focused to quit.

In [13]:
# capture frames from camera
capture = cv2.VideoCapture(0)

# try/finally guarantees the camera is released and the windows are closed even
# when a frame cannot be read or an OpenCV call raises part-way through the loop
try:
    # initialize background from the first frame
    ok, bg = capture.read()
    if not ok:
        raise RuntimeError("Could not read a frame from the camera")
    bg = imutils.resize(bg, width=800)
    # cv2.accumulateWeighted needs a floating-point accumulator
    bg = np.float32(bg)
    bg = getBlurredRegion(bg, start_point, end_point)

    frame_count = 0

    while True:
        ok, frame = capture.read()
        if not ok:
            # no frame was delivered; stop instead of crashing on a None frame
            break

        # resize frame
        frame = imutils.resize(frame, width=800)

        # get hand region, convert to grayscale, and blur
        region = getBlurredRegion(frame, start_point, end_point)

        # update background model if frame_count < 30 otherwise get foreground mask
        if frame_count < 30:
            cv2.accumulateWeighted(region, bg, alpha)
        else:
            fg_mask = getForegroundMask(region, bg)

            # apply morphological transformations to remove holes in foreground mask
            fg_mask = cv2.morphologyEx(fg_mask, cv2.MORPH_CLOSE, kernel)

            # get hand contour and draw; contour coordinates are relative to the
            # cropped region, so shift them by start_point to draw on the full frame
            cnt = getHandContour(fg_mask)
            if cnt is not None:
                cv2.drawContours(frame, [cnt + start_point], -1, (0, 0, 255), 2)

                # get convex hull and draw
                hull = cv2.convexHull(cnt)
                cv2.drawContours(frame, [hull + start_point], -1, (0, 255, 0), 2)

                # detect gesture and display text
                gesture = recognizeGesture(cnt)
                if gesture is not None:
                    cv2.putText(frame, gesture, (end_point[0], start_point[1] + 20),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4)

            # display mask
            cv2.imshow('FG Mask', fg_mask)

        # draw rectangle for hand region
        cv2.rectangle(frame, start_point, end_point, (255, 0, 0), 2)

        # display frame
        cv2.imshow('Frame', frame)

        frame_count += 1

        # keep only the low byte of the key code so that platforms which set
        # extra high bits still match ord('q'); "no key" (-1) becomes 255
        keypress = cv2.waitKey(1) & 0xFF
        if keypress == ord('q'):
            break
finally:
    capture.release()
    cv2.destroyAllWindows()

## Play rock paper scissors
With another person (or by yourself!), make the appropriate gestures in the two bounding boxes (blue on top, green below) after the background models have initialized. The gesture of each player and the winner are displayed on screen. Press `q` to quit.

In [9]:
def determineWinner(gesture_b, gesture_g):
    """Decide who wins a round of rock paper scissors.

    gesture_b: gesture of the blue player.
    gesture_g: gesture of the green player.
    Returns "blue" or "green" for the winning player, and "draw" for every
    other combination (equal gestures or values that are not valid gestures).
    """
    # (winner's gesture, loser's gesture) for each winning combination
    winning_pairs = (
        ("paper", "rock"),
        ("rock", "scissors"),
        ("scissors", "paper"),
    )

    if (gesture_b, gesture_g) in winning_pairs:
        return "blue"
    if (gesture_g, gesture_b) in winning_pairs:
        return "green"
    return "draw"

In [14]:
# (x, y) corners of each player's hand region rectangle: top-left, then bottom-right
start_point_b, end_point_b = (10, 10), (260, 260)      # blue player
start_point_g, end_point_g = (10, 270), (260, 520)     # green player

# alpha passed to cv2.accumulateWeighted for the running average background models
alpha = 0.5

# 5x5 kernel of ones for morphological transformations
kernel = np.ones(shape=(5, 5), dtype=np.uint8)

In [15]:
# capture frames from camera
capture = cv2.VideoCapture(0)

# try/finally guarantees the camera is released and the windows are closed even
# when a frame cannot be read or an OpenCV call raises part-way through the loop
try:
    # initialize backgrounds for both players from the first frame
    ok, bg = capture.read()
    if not ok:
        raise RuntimeError("Could not read a frame from the camera")
    bg = imutils.resize(bg, width=800)
    # cv2.accumulateWeighted needs floating-point accumulators
    bg = np.float32(bg)
    bg_b = getBlurredRegion(bg, start_point_b, end_point_b)
    bg_g = getBlurredRegion(bg, start_point_g, end_point_g)

    frame_count = 0

    while True:
        ok, frame = capture.read()
        if not ok:
            # no frame was delivered; stop instead of crashing on a None frame
            break

        # resize frame
        frame = imutils.resize(frame, width=800)

        # get hand regions, convert to grayscale, and blur
        region_b = getBlurredRegion(frame, start_point_b, end_point_b)
        region_g = getBlurredRegion(frame, start_point_g, end_point_g)

        # update background models if frame_count < 30 otherwise get foreground masks
        if frame_count < 30:
            cv2.accumulateWeighted(region_b, bg_b, alpha)
            cv2.accumulateWeighted(region_g, bg_g, alpha)
        else:
            fg_mask_b = getForegroundMask(region_b, bg_b)
            fg_mask_g = getForegroundMask(region_g, bg_g)

            # apply morphological transformations to remove holes in foreground masks
            fg_mask_b = cv2.morphologyEx(fg_mask_b, cv2.MORPH_CLOSE, kernel)
            fg_mask_g = cv2.morphologyEx(fg_mask_g, cv2.MORPH_CLOSE, kernel)

            # get hand contours
            cnt_b = getHandContour(fg_mask_b)
            cnt_g = getHandContour(fg_mask_g)
            gesture_b, gesture_g = None, None
            if cnt_b is not None:
                # contour coordinates are relative to the cropped region, so
                # shift them by the region's top-left corner before drawing
                cv2.drawContours(frame, [cnt_b + start_point_b], -1, (0, 0, 255), 2)
                # detect gesture and display text
                gesture_b = recognizeGesture(cnt_b)
                if gesture_b is not None:
                    cv2.putText(frame, gesture_b, (start_point_b[0], end_point_b[1]),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4)
            if cnt_g is not None:
                cv2.drawContours(frame, [cnt_g + start_point_g], -1, (0, 0, 255), 2)
                # detect gesture and display text
                gesture_g = recognizeGesture(cnt_g)
                if gesture_g is not None:
                    cv2.putText(frame, gesture_g, (start_point_g[0], end_point_g[1]),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4)
            # a winner is only announced when both players show a recognized gesture
            if gesture_b is not None and gesture_g is not None:
                # determine winner and display
                winner = determineWinner(gesture_b, gesture_g)
                if winner == "blue":
                    cv2.putText(frame, "blue wins!", (320, 160), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4)
                elif winner == "green":
                    cv2.putText(frame, "green wins!", (320, 160), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4)
                else:
                    cv2.putText(frame, "draw!", (320, 160), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 4)

        # draw rectangles for hand regions
        cv2.rectangle(frame, start_point_b, end_point_b, (255, 0, 0), 2)
        cv2.rectangle(frame, start_point_g, end_point_g, (0, 255, 0), 2)

        # display frame
        cv2.imshow('Frame', frame)

        frame_count += 1

        # keep only the low byte of the key code so that platforms which set
        # extra high bits still match ord('q'); "no key" (-1) becomes 255
        keypress = cv2.waitKey(1) & 0xFF
        if keypress == ord('q'):
            break
finally:
    capture.release()
    cv2.destroyAllWindows()