In [1]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp

In [2]:
mp_holistic=mp.solutions.holistic
mp_drawing=mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh  # for FACE_CONNECTIONS

In [3]:
def mediapipe_detection(image,model):
    image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image.flags.writeable=False
    results=model.process(image)
    image.flags.writeable=True
    image=cv2.cvtColor(image,cv2.COLOR_RGB2BGR)

    return image,results

In [4]:
def draw_landmarks(image,results):
    mp_drawing.draw_landmarks(image,results.face_landmarks,mp_face_mesh.FACEMESH_TESSELATION)
    mp_drawing.draw_landmarks(image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS)
    

In [5]:
def draw_styled_landmarks(image,results):
    mp_drawing.draw_landmarks(
        image,results.face_landmarks,mp_face_mesh.FACEMESH_TESSELATION,
        mp_drawing.DrawingSpec(color=(80,110,1),thickness=1,circle_radius=1),
        mp_drawing.DrawingSpec(color=(80,255,121),thickness=1,circle_radius=1),
    )
    
    mp_drawing.draw_landmarks(
        image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(80,22,1),thickness=2,circle_radius=2),
        mp_drawing.DrawingSpec(color=(80,50,121),thickness=2,circle_radius=2)
    )
    mp_drawing.draw_landmarks(
        image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(121,80,110),thickness=2,circle_radius=3),
        mp_drawing.DrawingSpec(color=(121,44,250),thickness=4,circle_radius=4)
    )
    mp_drawing.draw_landmarks(
        image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(140,200,120),thickness=2,circle_radius=3),
        mp_drawing.DrawingSpec(color=(200,50,80),thickness=4,circle_radius=4),
    )

In [382]:
cap=cv2.VideoCapture(0)
cap.set(3,1240)
cap.set(4,980)
with mp_holistic.Holistic(min_detection_confidence=0.7,min_tracking_confidence=0.7) as holistic:
    while cap.isOpened():
    
        #read feed
        ret,frame=cap.read()
    
        #model_detection
        image,results=mediapipe_detection(frame,holistic)
        print(results)

        draw_styled_landmarks(image,results)
        
        #show feed
        cv2.imshow("opencv feed",image)
        if cv2.waitKey(10) and 0xFF==ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 

In [383]:
cap.release()
cv2.destroyAllWindows()

In [None]:
len(results.face_landmarks.landmark)

In [384]:
pose=np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark ]).flatten() if results.pose_landmarks else np.zeros(132)
face=np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark ]).flatten() if results.face_landmarks else np.zeros(1404)
left_hand=np.array([[res.x,res.y,res.z] for res in results.left_hand_landmarks.landmark ]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
right_hand=np.array([[res.x,res.y,res.z] for res in results.right_hand_landmarks.landmark ]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

In [385]:
face

array([ 0.49404901,  0.51509923, -0.02440509, ...,  0.54187423,
        0.39950138,  0.02921136])

In [6]:
def extract_keypoints(results):
    pose=np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark ]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face=np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark ]).flatten() if results.face_landmarks else np.zeros(468*3)
    left_hand=np.array([[res.x,res.y,res.z] for res in results.left_hand_landmarks.landmark ]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    right_hand=np.array([[res.x,res.y,res.z] for res in results.right_hand_landmarks.landmark ]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

    return np.concatenate([pose,face,left_hand,right_hand])

In [7]:
DATA_PATH=os.path.join('MP_data')
actions=np.array(['hello','thankyou','iloveyou','no'])

no_sequences=30
sequence_length=30

In [8]:
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH,action,str(sequence)))
        except:
            pass

In [74]:
actions=np.array(['iloveyou'])

In [75]:
cap=cv2.VideoCapture(0)
cap.set(3,1240)
cap.set(4,1080)
with mp_holistic.Holistic(min_detection_confidence=0.7,min_tracking_confidence=0.7) as holistic:
    for action in actions:
        for seq in range(no_sequences):
            for frame_no in range(sequence_length):
                #read feed
                ret,frame=cap.read()
            
                #model_detection
                image,results=mediapipe_detection(frame,holistic)
        
                draw_styled_landmarks(image,results)


                #collection logic here

                if frame_no==0:
                    cv2.putText(image,"starting Collection",(int(1240/2),int(1080/2)),
                                cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),4,cv2.LINE_AA)
                    cv2.putText(image,"Collecting frames for {} Video num {}".format(action,seq),(40,40),
                                cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2,cv2.LINE_AA)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(image,"Collecting frames for {} Video num {}".format(action,seq),(40,40),
                                cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2,cv2.LINE_AA)


                keypoints=extract_keypoints(results)
                frame_path=os.path.join(DATA_PATH,action,str(seq),str(frame_no))
                np.save(frame_path,keypoints)
                
                    
                
                #show feed
                cv2.imshow("opencv feed",image)
                if cv2.waitKey(10) and 0xFF==ord('q'):
                    break
    cap.release()
    cv2.destroyAllWindows()

In [80]:
cap.release()
cv2.destroyAllWindows()

In [9]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [10]:
label_map={label:num for num,label in enumerate(actions)}
label_map

{'hello': 0, 'thankyou': 1, 'iloveyou': 2, 'no': 3}

In [11]:
sequences,labels=[],[]

for action in actions:
    for sequence in range(no_sequences):
        window=[]
        for frame_no in range(sequence_length):
            res=np.load(os.path.join(DATA_PATH,action,str(sequence),"{}.npy".format(frame_no)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [12]:
np.array(sequences).shape

(120, 30, 1662)

In [13]:
np.array(labels).shape

(120,)

In [14]:
X=np.array(sequences)
y=to_categorical(np.array(labels))

In [15]:
X.shape

(120, 30, 1662)

In [16]:
y.shape

(120, 4)

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=26)

In [18]:
X_val,X_test2,y_val,y_test2=train_test_split(X_test,y_test,test_size=0.5,random_state=26)

In [19]:
y_test.shape

(36, 4)

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM,Dense,Input,Dropout
from tensorflow.keras.callbacks import TensorBoard,ModelCheckpoint,EarlyStopping

In [21]:
log_dir=os.path.join("Logs")
tb_callback=TensorBoard(log_dir=log_dir)
# Save the best model (lowest val_loss)
checkpoint = ModelCheckpoint(
    filepath='best_model2.keras',   # or use .keras for newer format
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)



In [22]:
model=Sequential()

In [23]:
model.add(Input(shape=(30,1662)))
model.add(LSTM(64,return_sequences=True,activation='relu'))
model.add(LSTM(128,return_sequences=True,activation='relu'))
model.add(LSTM(64,return_sequences=False,activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(64,activation="relu"))
model.add(Dense(32,activation="relu"))
model.add(Dense(4,activation="softmax"))

In [24]:
model.summary()

In [25]:
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=['accuracy'])

In [26]:
model.fit(
    X_train,y_train,validation_data=(X_test2,y_test2),epochs=450,callbacks=[tb_callback,checkpoint]
)

Epoch 1/450
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.2564 - loss: 2.9900
Epoch 1: val_loss improved from inf to 1.48865, saving model to best_model2.keras
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 679ms/step - accuracy: 0.2519 - loss: 3.1181 - val_accuracy: 0.1667 - val_loss: 1.4886
Epoch 2/450
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.3584 - loss: 1.6839
Epoch 2: val_loss did not improve from 1.48865
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.3462 - loss: 1.8346 - val_accuracy: 0.1111 - val_loss: 3.3967
Epoch 3/450
[1m2/3[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 64ms/step - accuracy: 0.2812 - loss: 2.5912 
Epoch 3: val_loss did not improve from 1.48865
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.2775 - loss: 2.3580 - val_accuracy: 0.2778 - val_loss: 1.4904
Epoch 4/450
[1m3/3[0

<keras.src.callbacks.history.History at 0x212caefb050>

In [27]:
model.save("lstm_model.keras")

In [28]:
from tensorflow.keras.models import load_model
lstm_model=load_model("lstm_model.keras")

In [29]:
y_pred=[np.argmax(value) for value in lstm_model.predict(X_test)]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671ms/step


In [30]:
y_true=[np.argmax(value) for value in y_test]

In [31]:
from sklearn.metrics import accuracy_score,multilabel_confusion_matrix,f1_score,classification_report

In [32]:
print("accueracy:",accuracy_score(y_true,y_pred))
print("f1_score:",f1_score(y_true,y_pred,average="macro"))

accueracy: 0.9166666666666666
f1_score: 0.9090579710144927


In [33]:
multilabel_confusion_matrix(y_true,y_pred)

array([[[25,  0],
        [ 0, 11]],

       [[29,  1],
        [ 1,  5]],

       [[28,  1],
        [ 0,  7]],

       [[23,  1],
        [ 2, 10]]], dtype=int64)

In [34]:
res=lstm_model.predict(X_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


In [35]:
actions[np.argmax(res[27])]

'thankyou'

In [36]:
actions[np.argmax(y_test[27])]

'thankyou'

In [37]:
from scipy import stats
colors = [(245,117,16), (117,245,16), (16,117,245),(28,150,200)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [None]:
#test in real time

sequence30 = []
sentence = []
predictions = []
threshold = 0.5

cap = cv2.VideoCapture(0)
cap.set(3,1080)
cap.set(4,980)
# Set mediapipe model 

with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence30.append(keypoints)
        sequence30 = sequence30[-30:]
        
        if len(sequence30) == 30:
            res = lstm_model.predict(np.expand_dims(sequence30, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
iloveyou
[1m1/1[0m [32m━━━━━━━━━━━

In [39]:
cap.release()
cv2.destroyAllWindows()