In [1]:
#importing libraries to prepare inputs
import cv2
import numpy as np
import os
import mediapipe as mp

In [2]:
#loading mediapipe solutions for detecting key points on the body, hands and face (holistic) and for drawing skeletons (drawing_utils)
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
#defining the pose detector by keypoints
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a BGR frame.

    Parameters:
        image: frame in OpenCV's BGR channel order.
        model: object exposing ``process(rgb_image)`` (the callers in this
            notebook pass a ``mp_holistic.Holistic`` instance).

    Returns:
        (image, results): a BGR copy of the frame obtained by converting the
        RGB version back, and whatever ``model.process`` returned.
    """
    #the model is fed an RGB version of the frame
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    #the frame is read-only while the model runs
    #NOTE(review): MediaPipe's examples do this as a performance hint - confirm it is still needed
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    #back to BGR so that the frame can be drawn on and displayed with OpenCV
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [4]:
#definition for the skeleton drawing - based on the key points - in the image
def draw_landmarks(image, results):
    """Draw the pose, left-hand and right-hand skeletons on ``image`` in place.

    Parameters:
        image: frame to draw on (modified by ``mp_drawing.draw_landmarks``).
        results: detection results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    #(attribute of results, connection set of mp_holistic, landmark color, connection color)
    layers = (
        ("pose_landmarks", "POSE_CONNECTIONS", (80,22,10), (80,44,121)),
        ("left_hand_landmarks", "HAND_CONNECTIONS", (121,22,76), (121,44,250)),
        ("right_hand_landmarks", "HAND_CONNECTIONS", (245,117,66), (245,66,230)),
    )
    for landmarks_attr, connections_attr, point_color, line_color in layers:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, landmarks_attr),
            getattr(mp_holistic, connections_attr),
            mp_drawing.DrawingSpec(color=point_color, thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=line_color, thickness=2, circle_radius=2),
        )

In [5]:
#definition for converting keypoint data to numpy array
def _flatten_landmarks(landmark_list, fields, n_landmarks):
    """Flatten one landmark list into a 1-D array, or zeros when it is missing."""
    if not landmark_list:
        #nothing detected: keep the vector length constant by filling with zeros
        return np.zeros(n_landmarks * len(fields))
    return np.array([[getattr(point, field) for field in fields]
                     for point in landmark_list.landmark]).flatten()


def extract_keypoints(results):
    """Convert detection results into one flat numpy feature vector.

    Layout of the returned array (258 values when every part has the expected
    number of landmarks):
        - pose: 33 landmarks * (x, y, z, visibility)
        - left hand: 21 landmarks * (x, y, z)
        - right hand: 21 landmarks * (x, y, z)

    A part that was not detected (falsy attribute) is replaced by zeros.

    Parameters:
        results: object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` iterable.

    Returns:
        1-D numpy array with pose, left-hand and right-hand values, in that order.
    """
    pose = _flatten_landmarks(results.pose_landmarks, ("x", "y", "z", "visibility"), 33)
    lh = _flatten_landmarks(results.left_hand_landmarks, ("x", "y", "z"), 21)
    rh = _flatten_landmarks(results.right_hand_landmarks, ("x", "y", "z"), 21)
    #no debug print here: this runs once per processed frame
    return np.concatenate([pose, lh, rh])

In [7]:
#gestures to be detected; the position of each name in this array is the class index used for the labels
actions = np.array(['vol-up', 'vol-down', 'bright-up', 'bright-down', 'nothing'])
#number of videos per gesture
no_sequences = 30
#number of frames sampled from each video (length of one input sequence)
sequence_length = 30

In [8]:
#creation of the file structure for saving entries: gesture-info/<gesture>/<video index>/
DATA_PATH = os.path.join('gesture-info')
for action in actions:
    for sequence in range(no_sequences):
        #exist_ok=True keeps the cell re-runnable when the folders already exist,
        #while real failures (e.g. missing permissions) are still raised instead of being silenced
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

In [None]:
#extraction of the keypoints of every training video; for each video, sequence_length evenly
#spaced frames are processed and saved as gesture-info/<gesture>/<video index>/<frame index>.npy
with mp_holistic.Holistic(min_detection_confidence=0.4, min_tracking_confidence=0.4) as holistic:
    #loop for each gesture
    for action in actions:
        #loop for each video of the gesture
        for sequence in range(0, no_sequences):
            #opening the video (video files are numbered from 1, folders from 0)
            video_path = 'videos/' + action + '-' + str(sequence + 1) + '.mp4'
            cap = cv2.VideoCapture(video_path)

            try:
                #a missing/unreadable video would otherwise be skipped silently and only
                #surface later as a missing .npy file when the dataset is loaded
                if not cap.isOpened():
                    raise IOError('could not open video: ' + video_path)

                #obtaining number of frames
                cap_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

                #distance between usable frames
                sequence_gap = cap_length // sequence_length
                if sequence_gap < 1:
                    raise ValueError('video ' + video_path + ' has ' + str(cap_length) +
                                     ' frames, fewer than the required ' + str(sequence_length))

                #the frames that do not fit into sequence_length equal steps are dropped,
                #half at the beginning of the video (rounded down) and the rest at the end
                first_frame = (cap_length % sequence_length) // 2

                count = 0
                frame_num = 0

                #reading stops as soon as the last usable frame has been saved
                while count < sequence_length:
                    ret, frame = cap.read()
                    if not ret:
                        raise IOError('could not read frame ' + str(frame_num) + ' of ' + video_path)

                    #only every sequence_gap-th frame, starting at first_frame, is used
                    if frame_num >= first_frame and (frame_num - first_frame) % sequence_gap == 0:
                        frame = cv2.resize(frame, (640,480), interpolation=cv2.INTER_AREA)

                        #application of the keypoint detection model
                        image, results = mediapipe_detection(frame, holistic)

                        #conversion of data obtained into numpy array
                        keypoints = extract_keypoints(results)

                        #saving input data in the created structure
                        npy_path = os.path.join(DATA_PATH, action, str(sequence), str(count))
                        np.save(npy_path, keypoints)

                        count = count + 1

                    frame_num = frame_num + 1
            finally:
                #every capture is released, including when an error interrupts the extraction
                cap.release()

cv2.destroyAllWindows()

In [10]:
#import of libraries to structure input and output data
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [11]:
#mapping of each gesture name to its integer class index (its position in actions)
#vol-up = 0; vol-down = 1; bright-up = 2; bright-down = 3; nothing = 4
label_map = {gesture: index for index, gesture in enumerate(actions)}

In [12]:
#reading entries - numpy arrays - previously saved
#sequences receives one list of sequence_length keypoint arrays per video,
#lables receives the class index of the gesture of that video
sequences, lables = [], []
for action in actions:
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
        sequences.append([
            np.load(os.path.join(sequence_dir, "{}.npy".format(frame_num)))
            for frame_num in range(sequence_length)
        ])
        lables.append(label_map[action])

In [13]:
#definition of input (x) and output (y) variables
x = np.array(sequences)
#one-hot encoding of the class indices; num_classes is given explicitly so that the width of y
#always matches the output layer, which has one unit per entry of actions
y = to_categorical(lables, num_classes=len(actions)).astype(int)

#separation of input and output in training and testing
#random_state is fixed so that re-running the notebook reproduces the same split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.05, random_state = 42)

In [14]:
#import of libraries for structuring the neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [15]:
#definition of the neural network model
#each frame is described by 258 values, as produced by extract_keypoints:
#33 pose landmarks * 4 values + 2 hands * 21 landmarks * 3 values
n_features = 33*4 + 2*21*3
model = Sequential()
#first three layers using LSTM model, due to the possibility of storing information in the short and long term; interesting for variable time series,
#such as action detection in videos
#the input shape follows sequence_length instead of repeating the literal 30
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, n_features)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
#next layers using Dense model, more traditional, with all neurons fully interconnected
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
#last layer with softmax activation: it outputs one probability per gesture (values between 0 and 1 that add up to 1),
#matching the one-hot categorization structure of the labels
model.add(Dense(actions.shape[0], activation='softmax'))

In [16]:
#model compilation: Adam optimizer, categorical cross-entropy loss for the one-hot labels
#and categorical accuracy as the metric reported during training
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
#model training on the training split only, for a fixed 2000 epochs
#NOTE(review): no validation data or early stopping is configured, so overfitting would go unnoticed here - confirm this is intended
model.fit(x_train, y_train, epochs=2000)

In [71]:
#Saving the weights of the trained model (weights only; the architecture must be rebuilt by the model definition cell before loading)
model.save_weights('gesture.h5')

In [19]:
#Loading the weights of the trained model into the model defined above
#NOTE(review): requires 'gesture.h5' to exist, i.e. the saving cell must have been run at least once before - confirm when running on a fresh kernel
model.load_weights('gesture.h5')

In [20]:
#Import of analytical metrics from the sklearn library
from sklearn.metrics import confusion_matrix, accuracy_score

In [21]:
#test detection: the model output for the test split
#the last layer is a softmax with one unit per gesture, so each row holds one probability per gesture
yhat = model.predict(x_test)



In [22]:
#conversion of the one-hot test labels and of the predicted probabilities into class indices
#the prediction is recomputed here instead of reading yhat, so that re-running this cell
#never applies argmax(axis=1) to a list that was already converted (which raises an error)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(model.predict(x_test), axis=1).tolist()

In [23]:
#confusion matrix - used to map errors and confusion in detection
#NOTE(review): no labels= argument is passed, so presumably only the classes that occur in ytrue or yhat
#get a row/column - a 4x4 result for 5 gestures would mean one gesture is absent from the test split; confirm
confusion_matrix(ytrue, yhat)

array([[2, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 4]], dtype=int64)

In [24]:
#accuracy - hits/detections, computed on the test split only
#NOTE(review): with test_size = 0.05 the test split is very small, so this number is a coarse estimate - confirm
accuracy_score(ytrue, yhat)

1.0

In [25]:
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
import screen_brightness_control as pct

In [26]:
#Definition of activities triggered by gestures
def _step_master_volume(step_up):
    """Move the master volume of the speakers up to five steps up or down."""
    #Identifies the audio output device and obtains its volume interface
    speakers = AudioUtilities.GetSpeakers()
    activated = speakers.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
    endpoint = cast(activated, POINTER(IAudioEndpointVolume))

    #scalar volume at which stepping stops: 1 when raising, 0 when lowering
    boundary = 1 if step_up else 0
    #One unit per iteration
    for _ in range(0, 5):
        if endpoint.GetMasterVolumeLevelScalar() != boundary:
            if step_up:
                endpoint.VolumeStepUp(None)
            else:
                endpoint.VolumeStepDown(None)


def system_action(action):
    """Trigger the system activity associated with a detected gesture.

    Parameters:
        action: gesture name. "vol-up" / "vol-down" step the master volume,
            "bright-up" / "bright-down" change the screen brightness by 5,
            limited to the range 0-100. Any other value does nothing.

    Returns:
        None.
    """
    if action == "vol-up":
        _step_master_volume(True)
    elif action == "vol-down":
        _step_master_volume(False)
    elif action == "bright-up":
        #brightness of the first display reported, raised by 5 and limited to 100
        target = pct.get_brightness()[0] + 5
        pct.set_brightness(str(target) if target < 100 else "100")
    elif action == "bright-down":
        #brightness of the first display reported, lowered by 5 and limited to 0
        target = pct.get_brightness()[0] - 5
        pct.set_brightness(str(target) if target > 0 else "0")


In [None]:
#real-time gesture recognition from the webcam
#control variables
sequence = []       #keypoint arrays accumulated for the next detection
action = ""         #last gesture detected (text shown in the frame)
prob = ""           #probability of the last gesture detected (text shown in the frame)
control1 = 0        #frame skipping control: 0 = frame is processed, 1 = frame is skipped
show_count = 0      #number of frames for which the current text has been shown

#webcam capture
cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.4, min_tracking_confidence=0.4) as holistic:
        #loop executed while webcam capture is open
        while cap.isOpened():
            #frame reading; the loop ends when the webcam stops delivering frames
            ret, frame = cap.read()
            if not ret:
                break

            if control1 == 0:
                frame = cv2.resize(frame, (640,480), interpolation=cv2.INTER_AREA)

                #application of the keypoint detection model
                image, results = mediapipe_detection(frame, holistic)

                #drawing the keypoints -- skeleton
                draw_landmarks(image, results)

                #conversion of data obtained into numpy array
                keypoints = extract_keypoints(results)
                sequence.append(keypoints)

                #accumulated sequence_length frames (the length the model was built for), detection occurs
                if len(sequence) == sequence_length:
                    res = model.predict(np.expand_dims(sequence, axis=0))[0]
                    sequence = []
                    best = np.argmax(res)
                    prob = str(res[best])
                    action = actions[best]
                    system_action(action)
                    show_count = show_count + 1

            #control of used frames -- 1 in 2 frames
            if control1 == 1:
                control1 = 0
            else:
                control1 = control1 + 1

            #control the time in which the gesture is detected and shown in the frame -- 10 frames
            if show_count != 0:
                if show_count == 10:
                    show_count = 0
                    action = ""
                    prob = ""
                else:
                    show_count = show_count + 1

            #writing the gesture detected in the frame and its probability
            #on skipped frames, image is still the last processed frame
            cv2.putText(image, action + ' ' + prob, (3,30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
            #showing frame
            cv2.imshow("OpenCV feed", image)

            #interrupt condition: press 'q' key
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    #Release webcam capture and close the window, including when an error interrupts the loop
    cap.release()
    cv2.destroyAllWindows()