In [1]:
import mediapipe as mp
import cv2
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import time
import os

In [2]:
# Short aliases for the MediaPipe Holistic solution module and the drawing
# utilities module; the cells below reference both by these names.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
def mediapip_detection(img,model):
    """Run ``model`` on one BGR frame and hand back the frame plus the result.

    Args:
        img: Frame in BGR channel order (as delivered by OpenCV capture).
        model: Object exposing ``process(image)``; it receives the RGB frame.

    Returns:
        Tuple ``(img, results)``: the frame converted back to BGR, and
        whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # The frame is flagged read-only only for the duration of the model call
    # (presumably so the model can use the buffer without copying it).
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detection

In [4]:
def draw_landmarks(img,results):
    """Overlay face, pose and both hand landmark sets from ``results`` on ``img``.

    Drawing happens in place through ``mp_drawing.draw_landmarks``; nothing is
    returned. Layers are drawn in the order face, pose, left hand, right hand.

    Args:
        img: Image to draw on.
        results: Object with ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` attributes.
    """
    # One row per layer:
    #   (attribute on `results`, connection set on `mp_holistic`,
    #    first DrawingSpec, second DrawingSpec)
    # where each DrawingSpec is given as (color, thickness, circle_radius).
    layers = (
        ('face_landmarks', 'FACEMESH_TESSELATION', ((136,201,3), 1, 1), ((136,220,3), 1, 1)),
        ('pose_landmarks', 'POSE_CONNECTIONS', ((80,22,10), 2, 2), ((80,44,121), 1, 1)),
        ('left_hand_landmarks', 'HAND_CONNECTIONS', ((121,22,76), 2, 2), ((121,44,250), 2, 2)),
        ('right_hand_landmarks', 'HAND_CONNECTIONS', ((245,117,66), 2, 2), ((245,66,230), 2, 2)),
    )

    for landmarks_attr, connections_attr, first_style, second_style in layers:
        mp_drawing.draw_landmarks(
            img,
            getattr(results, landmarks_attr),
            getattr(mp_holistic, connections_attr),
            mp_drawing.DrawingSpec(color=first_style[0], thickness=first_style[1], circle_radius=first_style[2]),
            mp_drawing.DrawingSpec(color=second_style[0], thickness=second_style[1], circle_radius=second_style[2]),
        )

In [5]:
def extract_keypoints(results):
    """Flatten all landmark groups in ``results`` into one 1-D feature vector.

    Layout of the returned array (total length 1662):
        pose       - 33 landmarks * (x, y, z, visibility)
        left hand  - 21 landmarks * (x, y, z)
        right hand - 21 landmarks * (x, y, z)
        face       - 468 landmarks * (x, y, z)

    A group whose attribute on ``results`` is falsy (e.g. ``None``) is
    replaced by zeros of the size listed above.
    """
    def _flatten(group, n_landmarks, fields):
        # Missing group -> zero block of the expected size.
        if not group:
            return np.zeros(n_landmarks * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')

    pose = _flatten(results.pose_landmarks, 33, xyz + ('visibility',))
    lh = _flatten(results.left_hand_landmarks, 21, xyz)
    rh = _flatten(results.right_hand_landmarks, 21, xyz)
    face = _flatten(results.face_landmarks, 468, xyz)

    return np.concatenate([pose, lh, rh, face])

In [6]:
# extract_keypoints(results).shape

Set Up Directories

In [6]:
# Root folder of the saved keypoint arrays; files are read from
# <data_path>/<action>/<sequence index>/<frame index>.npy.
data_path = 'mp_data'
# Number of recorded sequences per action.
no_sequences = 30
# Number of frames in every sequence.
sequence_len = 30
# Action names; their order in this list determines the integer class ids.
actions = ['hello','iloveyou','thanks']

In [8]:
# for ac in actions:
#     for i in range(no_sequences):
#         try: os.makedirs(os.path.join(data_path,ac,str(i)))
#         except: pass

Collecting Data

In [21]:
# cap = cv2.VideoCapture(0)

# with mp_holistic.Holistic() as holistic:
    
#     for action in actions:
#         for seq in range(no_sequences):
#             for frm_no in range(sequence_len):
#                 ret,img = cap.read()
#                 img,results = mediapip_detection(img,holistic)
#                 draw_landmarks(img,results) 

#                 if frm_no == 0:
#                     cv2.putText(img,'STARTING COLLECTION',(120,200),
#                                 cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),1,cv2.LINE_AA)
#                     cv2.putText(img,'Collecting frames for {} Video Number {}'.format(action,seq),
#                                 (15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)
#                     cv2.imshow("Image",img)
#                     cv2.waitKey(2000)
#                 else:
#                     cv2.putText(img,'Collecting frames for {} Video Number {}'.format(action,seq),
#                                 (15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)
#                     cv2.imshow("Image",img)
                
#                 keypnts = extract_keypoints(results)
#                 npy_path = os.path.join('mp_data',action,str(seq),str(frm_no))
#                 np.save(npy_path,keypnts)
                
#                 if cv2.waitKey(1) & 0xFF == ord('q'): break
#     cap.release()
#     cv2.destroyAllWindows()

In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [8]:
# Map every action name to its position in `actions`, used as the class id.
label_map = dict(zip(actions, range(len(actions))))
print(label_map)

{'hello': 0, 'iloveyou': 1, 'thanks': 2}


In [27]:
labels = []
sequences = []

# Read <data_path>/<action>/<sequence index>/<frame index>.npy for every
# action and sequence; each sequence becomes a list of per-frame arrays and
# gets the integer label of its action.
for action in actions:
    for seq in range(no_sequences):
        seq_dir = os.path.join(data_path, action, str(seq))
        frames = [
            np.load(os.path.join(seq_dir, str(frm) + ".npy"))
            for frm in range(sequence_len)
        ]
        sequences.append(frames)
        labels.append(label_map[action])


In [31]:
# Sanity check of the loaded dataset. The label count is printed first and the
# shape expression is placed last: a notebook only renders a bare expression
# when it is the final statement of the cell, so in the previous order the
# shape was computed and silently discarded.
print(len(labels))
np.array(sequences).shape

90


In [38]:
# Stack the sequences into one feature array and one-hot encode the labels.
X = np.array(sequences)
y = to_categorical(labels).astype(int)  # one-hot encoding
# Report the shape of `y` (the name defined above); the former `Y` was never
# defined in this notebook and raised NameError on a fresh kernel.
print(y.shape,len(labels))

(90, 3) 90


In [39]:
# Hold out 5% of the sequences for testing. The fixed random_state makes the
# split - and therefore every result reported below - reproducible after a
# kernel restart.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.05,random_state=42)

In [40]:
# Shape of the training features: (sequences, frames per sequence, values per frame).
X_train.shape

(85, 30, 1662)

Build LSTM Model

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [45]:
# Three stacked LSTM layers (the first two emit the full sequence, the third
# only its final output) followed by a dense head whose softmax layer has one
# unit per action.
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(actions), activation='softmax'))

In [46]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_7 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_8 (LSTM)               (None, 64)                49408     
                                                                 
 dense_5 (Dense)             (None, 64)                4160      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 3)                 99        
                                                                 
Total params: 596,675
Trainable params: 596,675
Non-trai

In [47]:
# Categorical cross-entropy matches the one-hot encoded labels and the softmax
# output layer; categorical accuracy is tracked during training.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [48]:
# Train on the training split for 200 epochs (no validation data is passed).
model.fit(X_train, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x1ed9b656b60>

In [49]:
# Persist the trained model to 'lstmmodel.h5' so it can be reloaded later
# without retraining.
model.save('lstmmodel.h5')

In [62]:
# Predict class probabilities for the held-out sequences and print them next
# to the one-hot ground truth for a manual, row-by-row comparison.
y_pred = model.predict(X_test)
# To compare class indices instead, take np.argmax(..., axis=1) of both arrays.
print(y_pred)
print(y_test)

[[9.9940312e-01 5.9680414e-04 6.7778615e-08]
 [2.2588654e-06 1.9836156e-03 9.9801409e-01]
 [4.1308926e-08 7.9586440e-05 9.9992037e-01]
 [3.5455322e-01 6.3996631e-01 5.4804925e-03]
 [4.2956229e-03 9.2359227e-01 7.2112121e-02]]
[[1 0 0]
 [0 0 1]
 [0 0 1]
 [0 1 0]
 [0 1 0]]


In [9]:
# Reload the saved model from 'lstmmodel.h5'; this rebinds `model`, so the
# real-time demo below runs on the reloaded weights.
model = tf.keras.models.load_model('lstmmodel.h5')

Real-Time Action Detection

In [10]:
from scipy import stats

In [11]:
# Bar colours, one per class; reused cyclically when there are more classes
# than colours.
colors = [(245,117,16),(117,245,16),(16,117,245)]

def prob_viz(res,actions,img):
    """Draw one horizontal probability bar per class onto ``img`` in place.

    Bar ``ind`` occupies the 30-pixel-high band starting at ``y = 60 + ind*40``
    and is ``int(prob*100)`` pixels wide; the class name is written over it.

    Args:
        res: Iterable of per-class probabilities. An empty iterable draws
            nothing.
        actions: Class names, indexed in the same order as ``res``.
        img: Image to draw on.

    Returns:
        None. ``img`` is modified in place.
    """
    for ind,prob in enumerate(res):
        top = 60 + ind*40
        # Modulo keeps the lookup valid for any number of classes; indexing
        # `colors[ind]` directly raised IndexError past the third class.
        bar_color = colors[ind % len(colors)]
        cv2.rectangle(img,(0,top),(int(prob*100),top+30),bar_color,-1)
        cv2.putText(img,actions[ind],(0,top+25),cv2.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2,cv2.LINE_AA)

In [12]:
# State shared across frames of the live demo.
res = []          # latest class probabilities; stays empty until 30 frames are buffered
sequence = []     # sliding window of the most recent keypoint vectors (at most 30)
threshold = 0.7   # confidence cut-off for the (currently commented-out) prediction filter
preds = []        # argmax class index of every prediction made so far

cam = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic() as holistic:

        while cam.isOpened():
            ret,img = cam.read()
            # A failed grab returns ret=False and no frame; stop here instead
            # of passing None into cv2.cvtColor inside mediapip_detection.
            if not ret:
                break

            img,results = mediapip_detection(img,holistic)
            draw_landmarks(img,results)
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            if len(sequence) == 30:
                # verbose=0 stops Keras from printing a progress bar per frame.
                res = model.predict(np.expand_dims(sequence,axis=0),verbose=0)[0]
                preds.append(np.argmax(res))

            # if np.unique(preds[-10:])[0] == np.argmax(res):
            #     if res[np.argmax(res)] > threshold:

            prob_viz(res,actions,img)
            cv2.imshow("Image - ",img)

            if cv2.waitKey(1) & 0xFF == ord('q'): break
finally:
    # Always free the camera and close the preview window, including when a
    # frame raises or the cell is interrupted.
    cam.release()
    cv2.destroyAllWindows()

