In [1]:
import os, time
import tensorflow as tf
import numpy as np
import pandas as pd
import cv2
import mediapipe as mp
from tensorflow.keras.models import load_model

# Path of the trained Keras model, relative to the current working directory.
# os.path.join picks the right separator for the host OS; the previous
# hard-coded '\\' separators only resolved on Windows.
pwd = os.path.join(os.getcwd(), 'models', 'model_200_tanh.h5')

# Load the saved model and print its layer summary as a sanity check.
model = load_model(pwd)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 64)                41984     
                                                                 
 dense_8 (Dense)             (None, 32)                2080      
                                                                 
 dense_9 (Dense)             (None, 5)                 165       
                                                                 
Total params: 44,229
Trainable params: 44,229
Non-trainable params: 0
_________________________________________________________________


In [2]:
# Inference configuration.
# `actions` is indexed by the argmax of the model output, so its order must
# not change; `seq_length` is the number of frames fed to the model at once.
seq_length = 30
actions = ['come', 'grip', 'hello', 'Ok', 'spin']

# Load the MediaPipe Hands model and its drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1,
                       min_tracking_confidence=0.5,
                       min_detection_confidence=0.5)

# Open the default camera (index 0) and request a 1280x720 frame size.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

True

In [4]:
# Real-time gesture recognition loop.
#
# Reads frames from `cap`, extracts hand landmarks with MediaPipe, turns each
# frame into a feature vector (21 landmarks x 4 values + 15 joint angles),
# feeds the last `seq_length` vectors to `model`, and overlays the recognised
# gesture plus a reply on the frame. Press "q" in the window to stop.

# Rolling buffers: per-frame feature vectors and recent confident predictions.
seq = []
action_seq = []

# A gesture is accepted once this many confident predictions in a row agree.
# (3, 4 and 5 were all tried; from 4 onward the response gets slightly slower.)
n_consecutive = 3
# Predictions whose top score is below this value are ignored.
min_confidence = 0.9

# Reply drawn for each accepted gesture:
# action -> (text, x landmark index, x offset, y landmark index, y offset, colour)
replies = {
    'come': ("Sorry, I can't come to you", 0, 0, 0, 150, (255, 102, 102)),
    'grip': ("Ok, I'm stop", 20, 150, 0, -50, (102, 102, 255)),
    'hello': ("Hello, my friend!", 0, 0, 12, -130, (102, 204, 0)),
    'Ok': ("Ok", 20, 150, 0, -50, (255, 204, 229)),
    'spin': ("Oh, I feel dizzy X.X", 8, 0, 8, 150, (153, 255, 255)),
}

if not cap.isOpened():
    print('Can not find any camera...')

else:
    # try/finally guarantees the camera and window are released even when the
    # cell is interrupted (e.g. KeyboardInterrupt from the notebook UI).
    try:
        while cap.isOpened():
            ret, img = cap.read()
            if not ret or img is None:
                # Frame grab failed (camera unplugged or stream ended).
                break

            # Mirror the frame, run MediaPipe on RGB, then go back to BGR for drawing.
            img = cv2.flip(img, 1)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            result = hands.process(img)
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            height, width = img.shape[:2]

            if result.multi_hand_landmarks is not None:
                for res in result.multi_hand_landmarks:
                    joint = np.zeros((21, 4))
                    for j, lm in enumerate(res.landmark):
                        joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

                    # Vectors between pairs of landmarks (start index -> end index).
                    v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
                    v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
                    v = v2 - v1 # [20, 3]

                    # Normalise to unit length.
                    v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                    # Angle between selected pairs of unit vectors. The dot product is
                    # clipped because rounding can push it slightly outside [-1, 1],
                    # which would make arccos return NaN.
                    cosines = np.einsum('nt,nt->n',
                        v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                        v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:]) # [15,]
                    angle = np.degrees(np.arccos(np.clip(cosines, -1.0, 1.0)))

                    d = np.concatenate([joint.flatten(), angle])

                    seq.append(d)
                    # Keep only the window the model needs so memory stays bounded.
                    del seq[:-seq_length]

                    mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

                    if len(seq) < seq_length:
                        continue

                    input_data = np.expand_dims(np.array(seq, dtype=np.float32), axis=0)

                    # verbose=0 suppresses the per-call progress bar.
                    y_pred = model.predict(input_data, verbose=0).squeeze()

                    i_pred = int(np.argmax(y_pred))
                    conf = y_pred[i_pred]

                    if conf < min_confidence:
                        continue

                    action = actions[i_pred]
                    action_seq.append(action)
                    # Only the most recent predictions are ever compared.
                    del action_seq[:-n_consecutive]

                    if len(action_seq) < n_consecutive:
                        continue

                    # Accept the action only if the last n_consecutive results agree.
                    this_action = "I don't know what you say:("
                    if len(set(action_seq)) == 1:
                        this_action = action

                    cv2.putText(img, this_action.upper(),
                                org=(int(res.landmark[0].x * width), int(res.landmark[0].y * height + 30)),
                                fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0,0,0), thickness=2)

                    if this_action in replies:
                        text, x_lm, x_off, y_lm, y_off, colour = replies[this_action]
                        cv2.putText(img, text,
                                    org=(int(res.landmark[x_lm].x * width + x_off),
                                         int(res.landmark[y_lm].y * height + y_off)),
                                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=colour, thickness=2)

            cv2.imshow('img', img)

            # Pressing q stops the camera. Mask to the low byte because some
            # platforms set higher bits in the waitKey return value.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        # destroyWindow() requires a window name; close every window instead.
        cv2.destroyAllWindows()









KeyboardInterrupt: 