Tutorial video on [YouTube](https://youtu.be/doDUihpj6ro).

In [7]:
# Import dependencies
import cv2
import numpy as np
import os
import time
import mediapipe as mp

### Initializing functions and mediapipe model

In [8]:
# Keypoints using MP Holistic
# Short aliases for the two MediaPipe modules used by the detection,
# drawing, collection and real-time cells of this notebook.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [2]:
def mediapipe_detection(image, model):
    """Run ``model`` on a single BGR frame.

    Args:
        image: Frame in BGR channel order; it is converted to RGB before
            being handed to the model.
        model: Object exposing ``process(rgb_image)`` (the notebook passes a
            MediaPipe Holistic instance).

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The buffer is flagged read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [3]:
def draw_landmarks(image, results):
    """Draw the left- and right-hand landmarks from ``results`` onto ``image``.

    Only the hands are drawn; the face-mesh and pose overlays are disabled
    (their calls are kept below as comments). Nothing is returned.

    Args:
        image: Frame passed to ``mp_drawing.draw_landmarks`` as the canvas.
        results: Detection result exposing ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION)
    # mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

    # Both hands share the landmark style; only the connection colour differs.
    landmark_style = dict(color=(255, 0, 0), thickness=1, circle_radius=4)
    hands = (
        (results.left_hand_landmarks, (0, 255, 0)),
        (results.right_hand_landmarks, (0, 255, 100)),
    )
    for hand_landmarks, connection_color in hands:
        mp_drawing.draw_landmarks(
            image,
            hand_landmarks,
            mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(**landmark_style),
            mp_drawing.DrawingSpec(color=connection_color, thickness=1, circle_radius=2),
        )

In [5]:
# cap = cv2.VideoCapture(0)

# #Set mediapipe model
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():

#         #Read feed
#         ret, frame = cap.read()                     
        
#         #Make detection
#         image, results = mediapipe_detection(frame, holistic)
#         #Show to screen
#         draw_landmarks(image, results)
#         cv2.imshow('OpenCV Feed', image)

#         if cv2.waitKey(10) & 0xFF == ord('q'):
#             break
        

# cap.release()
# cv2.destroyAllWindows()

In [4]:
#Extract Keypoint Values
def extract_keypoints(results):
    """Flatten pose, face and hand landmarks into one 1-D feature vector.

    Every landmark contributes its ``x``, ``y`` and ``z`` values. A landmark
    group that is missing (falsy) on ``results`` is replaced by a zero block
    of 99 (pose), 1404 (face) or 63 (each hand) values.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` iterable.

    Returns:
        ``np.ndarray`` concatenated in the order pose, face, left hand,
        right hand.
    """
    def _flatten(group, size):
        # Zero placeholder when the group was not detected.
        if not group:
            return np.zeros(size)
        return np.array([[lm.x, lm.y, lm.z] for lm in group.landmark]).flatten()

    pose = _flatten(results.pose_landmarks, 99)
    face = _flatten(results.face_landmarks, 1404)
    left_hand = _flatten(results.left_hand_landmarks, 63)
    right_hand = _flatten(results.right_hand_landmarks, 63)
    return np.concatenate([pose, face, left_hand, right_hand])

### The setup for collection

In [44]:
# Root folder that holds the recorded keypoint files.
DATA_PATH = 'MP_data'

# Signs to recognise. Letters that are not enabled yet:
# "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
actions = np.array(list("abc"))

# Number of videos recorded per action, and number of frames kept per video.
no_sequences, sequence_length = 30, 30

In [45]:
# Create one folder per (action, video) pair: DATA_PATH/<action>/<sequence>.
# exist_ok=True keeps the cell safe to re-run while still surfacing real
# failures (e.g. permission errors) that a bare `except: pass` would hide.
for action in actions:
    for sequence in range(no_sequences):
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

### Data feeding

In [48]:
cap = cv2.VideoCapture(0)

# This cell records a single action per run; the loading cell further down
# expects files for every entry of `actions`, so change the value and re-run
# the cell for each remaining action.
action = 'c'

try:
    #Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        stop_requested = False

        #Loop for feeding
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):

                #Read feed
                ret, frame = cap.read()
                if not ret:
                    # Fail with a clear message instead of crashing inside
                    # cv2.cvtColor on a None frame.
                    raise RuntimeError(
                        'Could not read a frame from the camera '
                        '(action {}, video {}, frame {})'.format(action, sequence, frame_num)
                    )

                #Make detection
                image, results = mediapipe_detection(frame, holistic)
                draw_landmarks(image, results)

                #Overlay the status text
                if frame_num == 0:
                    cv2.putText(image, 'Starting collection', (120, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, 'Collecting for {} Video Number {}'.format(action, sequence), (15,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1, cv2.LINE_AA)

                #Show to screen
                cv2.imshow('OpenCV Feed', image)
                if frame_num == 0:
                    # Pause only after the frame is shown, so the
                    # 'Starting collection' banner is visible during the
                    # 2 second break between videos.
                    cv2.waitKey(2000)

                #Store the keypoints of this frame as <DATA_PATH>/<action>/<sequence>/<frame_num>.npy
                keypoint = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoint)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    # Remember the request so the outer loop stops as well;
                    # a plain `break` would only end the current video.
                    stop_requested = True
                    break

            if stop_requested:
                break
finally:
    # Always free the camera and close the preview window, even on errors.
    cap.release()
    cv2.destroyAllWindows()

### Preprocessing Data, Create Labels and Features

In [16]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [49]:
# Map every action name to its integer class index.
label_map = dict(zip(actions, range(len(actions))))

# sequences[i] holds the per-frame keypoint arrays of one recorded video,
# labels[i] the class index of that video.
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = [
            np.load(os.path.join(DATA_PATH, action, str(sequence), f"{frame_num}.npy"))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [76]:
# Stack all recorded videos into one feature array.
X = np.array(sequences)
# One-hot encode the integer class labels and store them as integers.
y = to_categorical(labels).astype(int)
# Show the shape of the feature array as the cell output.
X.shape

(90, 30, 1629)

In [77]:
# Hold out 5% of the videos for testing. The fixed random_state makes the
# split, and therefore the metrics reported below, reproducible on re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

### Model training

In [60]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [61]:
# Folder passed to the TensorBoard callback as its log directory
logdir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=logdir)

In [84]:
# Three stacked LSTM layers followed by a dense classification head.
# Input: windows of 30 frames with 1629 keypoint values each.
# Output: one softmax probability per entry of `actions`.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1629)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

In [86]:
# Multi-class setup: categorical cross-entropy loss on the one-hot labels,
# tracked with categorical accuracy.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [87]:
# Train for 30 epochs on the training split; tb_callback is attached so the
# run is logged through the TensorBoard callback.
model.fit(X_train, y_train, epochs = 30, callbacks=[tb_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1ce196edb10>

In [88]:
# Model output (one softmax vector per test sample) for the held-out split.
res = model.predict(X_test)



In [32]:
# Spot check: (predicted action, true action) for the test sample at index 3.
actions[np.argmax(res[3])], actions[np.argmax(y_test[3])]

('b', 'b')

In [89]:
# Save the trained model to the file 'action.h5'
model.save('action.h5')

In [None]:
# Load the weights stored in 'action.h5' into the existing `model` object
# (the model must already be built by the cells above).
model.load_weights('action.h5')

In [90]:
# Evaluating
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

# Reduce the predicted probabilities and the one-hot targets to class indices.
y_hat = np.argmax(model.predict(X_test), axis=1).tolist()
y_true = np.argmax(y_test, axis=1).tolist()

# Cell output: (multilabel confusion matrices, accuracy) on the test split.
multilabel_confusion_matrix(y_true, y_hat), accuracy_score(y_true, y_hat)



(array([[[3, 0],
         [2, 0]],
 
        [[0, 2],
         [0, 3]]], dtype=int64),
 0.6)

### Test on real time

In [91]:
# Rolling window of the most recent keypoint vectors, oldest first, i.e. the
# same frame order that the training windows were loaded in.
sequence = []
# NOTE(review): `sentence` and `threshold` are defined but not used in this cell.
sentence  = []
threshold = 0.4

cap = cv2.VideoCapture(0)
try:
    #Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            #Read feed
            ret, frame = cap.read()
            if not ret:
                # No frame delivered: stop instead of crashing in cv2.cvtColor.
                break

            #Make detection
            image, results = mediapipe_detection(frame, holistic)
            #Show to screen
            draw_landmarks(image, results)

            # Append the newest frame at the END and keep the last
            # `sequence_length` entries. Inserting at the front would feed the
            # model time-reversed windows compared with the training data.
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            # Predict only once a full window is available.
            if len(sequence) == sequence_length:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                print(actions[np.argmax(res)])

            cv2.imshow('OpenCV Feed', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even on errors.
    cap.release()
    cv2.destroyAllWindows()

a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
b
b
b
b
b
b
b
b
b
b
b
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
c
c
c
c
c
c
c
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
b
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
c
b
b
b
b
b
b
b
b


### Comment

The tutor's collected data contains different pose, face and hand postures for the signs:
- I love you
- Hello
- Thanks

Mine: 

1st try:  
- no pose, with 2 hands and face
- 26 letters in the alphabet  
    <10% accuracy  

2nd try:   
- no pose, with 2 hands and face
- 3 letters a, b, c  
    ~30% accuracy  
    
3rd try:  
- no left hand, with pose, face and right hand
- 3 letters a, b, c  
    ~40% accuracy  

### Conclusion
1. The data feeding step is very important
2. Binary data make the model train much faster
3. Need to understand the output of mediapipe