In [4]:
# load packages
import os
import cv2
import pickle
from keras.models import model_from_json
from PIL import Image, ImageTk
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np
import matplotlib.pyplot as plt
import tkinter as tk
from tkinter import messagebox
from datetime import datetime
import xgboost as xgb

# make sure matplotlib shows images inline
%matplotlib inline

In [5]:
# set directory
os.chdir("C:/GitHub/HandSign_Recognition")
#os.chdir("C:/Users/e.v.nistelrooij/Desktop/SSGN_2703/GUI")

In [6]:
# define the location of the cascade files we will use to detect hands
#fist_cascade_path = '00 Data/Haarcascades/fist.xml' 
#palm_cascade_path =  '00 Data/Haarcascades/palm.xml'
#closed_frontal_palm_cascade_path =  '00 Data/Haarcascades/closed_frontal_palm.xml'

In [7]:
# load cascade file
#fistCascade = cv2.CascadeClassifier(fist_cascade_path)
#palmCascade = cv2.CascadeClassifier(palm_cascade_path)
#closedFrontalPalmCascade = cv2.CascadeClassifier(closed_frontal_palm_cascade_path)

In [20]:
# load json and create model
json_file = open('01 Models/model_resnet_datagen_git.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("01 Models/model_resnet_datagen_git.h5")
print("Loaded model from disk")

Loaded model from disk


In [9]:
loaded_model

<keras.engine.training.Model at 0x22b65523d30>

In [10]:
# load pickle with XGB model
#loaded_model = pickle.load(open('01 Models/xgb_model_own.pickle.dat', 'rb'))



In [11]:
loaded_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=nan, n_estimators=500,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [12]:
# load the dictionary that translates the label matrix to values
label_dict = pickle.load(open('01 Models/LabelDictionary.pkl', 'rb'))

In [13]:
def im_to_wide(grey):
    y = []
    for r in range(0,len(grey)):
        for col in range(0,len(grey[r])):
            y.append(grey[r][col])
    return y

In [14]:
# define the threshold for the minimum probability a prediction must have
thresh = .4

In [15]:
# define the function to predict which letter is shown with handsigns
def predict_letter(model, img, target_size, xgboost = False):
    #    model: keras model
    #    img: PIL format image
    #    target_size: (width, height) tuple
    #    predict xgboost model or not
    
    if img.size != target_size:
        print("the original size of the image is: " + str(img.size))
        img = img.resize(target_size)
        print("the new size of the image is: " + str(img.size))

    # convert to numpy array
    x = np.array(img)
    # add a shape parameter which defines the number of images (which is 1)
    x = np.expand_dims(x, axis = 0)
    # convert to float
    x = x.astype(float)
    # normalize
    x = x / 255
    #print(x)
    #print(x.shape)
    
    # for xgboost: turn matrix into array
    if xgboost:
        x  = np.array([im_to_wide(x)])
        x = x[:,:,0]
        
    # set the channels when necessary
    elif(len(x.shape) == 3): # number of channels = 1
        x = x.reshape((x.shape[0], x.shape[1], x.shape[2], 1))
        #print(x.shape)
    
    
    # make a prediction
    if xgboost:
        pred = model.predict_proba(x)
    else:
        pred = model.predict(x)
    
    #print(pred)
    #print(pred < thresh)
    #print(x.shape)
    
    # set all elements below the threshold to zero
    pred[pred < thresh] = 0
    #print(pred)
    
    # if matrix contains all zeros, no prediction can be done
    if np.any(pred):
        print("prediction can be done")
        # check which column contains the highest probability
        # translate that label to the letter, using the label dictionary
        label = list(label_dict.keys())[list(label_dict.values()).index(np.argmax(pred))]
        
    else:
        print("no prediction possible")
        label = "Unknown"    
    
    # return the label of the prediction
    return label

In [16]:
# define a function for showing the webcam screen
def show_frame():
    _, frame = camera.read()
    frame = cv2.flip(frame, 1)
    cv2image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)
    img = Image.fromarray(cv2image)
    imgtk = ImageTk.PhotoImage(image = img)
    lmain.imgtk = imgtk
    lmain.configure(image = imgtk)
    lmain.after(10, show_frame)

In [21]:
# define the webcam screen
width, height = 850, 640
camera = cv2.VideoCapture(0)
camera.set(cv2.CAP_PROP_FRAME_WIDTH, width)
camera.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

# create the GUI screen
root = tk.Tk()
#root = tk.Toplevel()

# create an empty variable to save the sentence and the new letter
the_sentence = "de "
new_letter = ""

# configure the GUI
root.configure(bg = '#%02x%02x%02x' % (101, 116, 129))

# configure the number of columns and rows in the grid
root.columnconfigure(3, pad = 7)
root.rowconfigure(5, pad = 7)

# make column(s) and/or(s) row expandable
root.columnconfigure(1, weight = 1)
root.rowconfigure(1, weight = 1)
root.rowconfigure(4, weight = 1)

# make sure the GUI is in fullscreen mode
root.state("zoom")

# Define the title of the GUI
root.title("Gebarentaal herkenning met behulp van Artificial Intelligence")

# put a logo on the GUI
#logo = tk.PhotoImage(file = 'Logo.png')
#img = tk.Label(root, image = logo, bg = '#%02x%02x%02x' % (101, 116, 129))
#img.image = logo
#img.grid(row = 0, column = 0, padx = 5, pady = 5)

# create a placeholder for the webcam screen
lmain = tk.Label(root)
lmain.grid(row = 1, column = 1)

# create a placeholder textbox for the complete sentence
text = tk.Text(root
               , bd = 0 # size of the border
               , bg = '#%02x%02x%02x' % (218, 218, 222) # background color
               , height = 5 # number of lines
               , padx = 5 # left and right padding in pixels
               , pady = 5 # top and bottom padding in pixels
               , relief = "solid" # 3D appearance of widget: flat, groove, raised, ridge, solid, or sunken
               , wrap = "word" # truncate line after last whole word that fits
               , font = ('Verdana', 20, 'bold')
               , fg = '#%02x%02x%02x' % (4, 55, 133) # textcolor (Cmotions darkblue)
               , width = 40 # the number of characters that fit on a single line
              )
text.grid(row = 4, column = 1)

# show the sentence
text.insert(tk.INSERT, the_sentence)


# create a function that calls the model and processes the outcome
def call_model(the_sentence = the_sentence):
    return_value, image = camera.read()
    print(datetime.now())
    
    # check if the camera gives an image
    if return_value:
        
        # flip the image
        image = cv2.flip(image, 1)
    
        # make sure the image is read as an RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGBA)
        
        # set to grayscale
        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        
        # binarize (image to black and white)
        #thr, image = cv2.threshold(image, 135, 300, cv2.THRESH_BINARY)
              
        # convert opencv image to PIL
        pil_im = Image.fromarray(image)
        pil_im.show()
        
        
        # try to define the letter from the handsign
        new_letter = predict_letter(loaded_model, pil_im, (64,64), False)
        print(new_letter)
        
        # let the user decide if the letter should be added to the sentence
        add_letter = messagebox.askyesno(title = "Bevestig toevoegen van letter aan zin"
                                         , message = "Ik heb de nieuwe letter herkend als een:\n\n" + str(new_letter)
                                         + "\n\nWil je deze toevoegen aan de zin?")
        
        # check is letter should be added and act accordingly
        if add_letter:
            
            # make sure the text is editable
            #text.configure(state = 'normal')
            
            # update the value of the complete sentence
            the_sentence = the_sentence + new_letter
            print('the sentence now is: ' + the_sentence)
            
            # show the complete sentence that has been created so far
            text.insert(tk.INSERT, new_letter)
            root.update_idletasks()
            
            # make sure the text isn't editable
            #text.configure(state = 'disabled')
        else:
            print('WRONG!')

# create a button that calls the model
startButton = tk.Button(root, text = 'Bepaal letter', command = call_model
                        , height = 5, width = 20, bg = 'red', foreground = 'white'
                       , relief = 'raised', justify = 'center', font = ('Verdana', 15, 'bold'))
startButton.grid(row = 1, column = 0)

# show the webcam stream
show_frame()

# start the GUI
root.mainloop()  

# turn off the camera
camera.release()
cv2.destroyAllWindows()

2018-03-26 13:17:01.166981
the original size of the image is: (640, 480)
the new size of the image is: (64, 85)
[[  1.10011752e-05   7.90103525e-02   9.14084092e-02   8.29545259e-01
    2.47349926e-05   3.07476654e-07]]
[[ True  True  True False  True  True]]
(1, 85, 64, 1)
prediction can be done
D
the sentence now is: de D
2018-03-26 13:17:14.126963
the original size of the image is: (640, 480)
the new size of the image is: (64, 85)
[[  1.39611902e-05   8.50935932e-03   9.70486104e-01   2.09734030e-02
    1.64949324e-05   7.13464715e-07]]
[[ True  True False  True  True  True]]
(1, 85, 64, 1)
prediction can be done
C
the sentence now is: de C
2018-03-26 13:17:20.574960
the original size of the image is: (640, 480)
the new size of the image is: (64, 85)
[[  1.30622269e-04   2.51854420e-01   4.42388088e-01   3.05463701e-01
    1.59932664e-04   3.22925894e-06]]
[[ True  True False  True  True  True]]
(1, 85, 64, 1)
prediction can be done
C
WRONG!


In [18]:
# turn off the camera
camera.release()
cv2.destroyAllWindows()

In [19]:
loaded_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=nan, n_estimators=500,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)