 # Libraries

In [1]:
import os
import cv2
import numpy as np
import mediapipe as mp
import math

 # Keypoints using Mediapipe Holistic Model

In [2]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
def mediapipe_detection(frame, model):
    """Run ``model.process`` on a BGR frame and return ``(bgr_frame, results)``.

    Args:
        frame: Image in OpenCV's BGR channel order.
        model: Object exposing ``process(image)`` (the notebook passes a
            MediaPipe Holistic instance).

    Returns:
        A tuple of the frame converted back to BGR and whatever
        ``model.process`` returned.
    """
    # The model is fed RGB, so swap the channel order first.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model runs (performance hint),
    # then make it writeable again afterwards.
    rgb.flags.writeable = False
    results = model.process(rgb)
    rgb.flags.writeable = True

    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR), results

In [4]:
def draw_landmarks(frame, model):  # BGR
    """Draw pose, face and hand landmarks from a detection result onto ``frame``.

    Args:
        frame: BGR image to draw on.
        model: Despite its name, this is the detection *results* object
            (second value returned by ``mediapipe_detection``). The parameter
            name is kept so existing callers keep working.

    Returns:
        None. Drawing is delegated to ``mp_drawing.draw_landmarks``.
    """
    # Use the argument instead of silently depending on a global `results`.
    results = model

    white = (255, 255, 255)
    # (landmarks, connections, point colour, connection thickness,
    #  point radius, connection radius)
    layers = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (80, 200, 100), 2, 3, 2),
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, (255, 0, 0), 1, 1, 1),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (0, 0, 255), 2, 3, 2),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (0, 0, 255), 2, 3, 2),
    )
    for landmarks, connections, point_color, thickness, point_radius, line_radius in layers:
        mp_drawing.draw_landmarks(
            frame, landmarks, connections,
            mp_drawing.DrawingSpec(color=point_color, thickness=thickness, circle_radius=point_radius),  # Points
            mp_drawing.DrawingSpec(color=white, thickness=thickness, circle_radius=line_radius),  # Connections
        )

In [5]:
def extract_keypoints(results):
    """Flatten the (x, y) coordinates of the detected landmarks into one vector.

    The output is the concatenation, in this order, of:
      * pose landmarks 11..14 (zero block of 8 when no pose was detected),
      * all face landmarks (zero block of 936 when no face was detected),
      * all left-hand landmarks (zero block of 42 when missing),
      * all right-hand landmarks (zero block of 42 when missing).

    Args:
        results: Object with ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` attributes;
            each is either falsy or exposes a ``landmark`` sequence whose
            items have ``x`` and ``y``.

    Returns:
        1-D ``numpy.ndarray`` of coordinates.
    """
    def xy_vector(group, placeholder_size, window=None):
        # A missing body part contributes a fixed-size block of zeros.
        if not group:
            return np.zeros(placeholder_size)
        points = group.landmark if window is None else group.landmark[window]
        return np.array([[pt.x, pt.y] for pt in points]).flatten()

    parts = [
        xy_vector(results.pose_landmarks, 8, slice(11, 15)),
        xy_vector(results.face_landmarks, 936),
        xy_vector(results.left_hand_landmarks, 42),
        xy_vector(results.right_hand_landmarks, 42),
    ]
    return np.concatenate(parts)

 # Get Distance

In [6]:
def getDistance(frame, results):
    """Return the pixel distance between pose landmarks 7 and 8.

    The landmark coordinates are scaled by the frame width and height before
    the Euclidean distance is taken, and the result is truncated to ``int``.
    NOTE(review): indices 7 and 8 are presumably the two ears in MediaPipe
    Pose numbering - confirm against the MediaPipe landmark table.

    Args:
        frame: Image whose ``shape`` starts with ``(height, width)``; a
            trailing channel dimension is optional.
        results: Detection result with a ``pose_landmarks`` attribute.

    Returns:
        The truncated distance in pixels, or ``0`` when no pose was detected.
    """
    if not results.pose_landmarks:
        return 0

    # Only height and width matter, so 2-D (grayscale) frames work too.
    h, w = frame.shape[:2]
    marks = results.pose_landmarks.landmark
    dy = marks[8].y * h - marks[7].y * h
    dx = marks[8].x * w - marks[7].x * w
    return int(math.sqrt(dy ** 2 + dx ** 2))

 # For Training

 # Create Folders for Actions

In [69]:
# Root folder that holds one sub-folder per action (word).
DATA_PATH = os.path.join('Data')

actions = np.array(['باذنجان'])  # Words to record in this session

no_videos = 30  # Number of videos recorded per action

video_length = 30  # Video is 30 frames in length

# Create Data/<action>/<video index>/ for every video to be recorded.
# exist_ok=True makes re-running the cell harmless while still surfacing real
# failures (permissions, invalid path) instead of swallowing them.
for action in actions:
    for video in range(no_videos):
        os.makedirs(os.path.join(DATA_PATH, action, str(video)), exist_ok=True)

 # Create Data

In [70]:
# Record `no_videos` clips of `video_length` frames for every action and save
# the extracted keypoints of each frame as Data/<action>/<video>/<frame>.npy.
#
# Keys (read once per frame): Space pauses the recording, Esc stops it.
# NOTE: stopping in the middle of a clip leaves that clip's folder incomplete.
cap = cv2.VideoCapture(1)
stop_recording = False  # set when the camera stops delivering frames or on Esc
try:
    with mp_holistic.Holistic() as holistic:
        for action in actions:
            print(action)
            for video in range(no_videos):
                for frame_num in range(video_length):
                    ret, frame = cap.read()
                    if not ret:
                        # No frame means nothing more can be recorded, so
                        # stop everything rather than only this clip.
                        print('Camera read failed; stopping recording')
                        stop_recording = True
                        break

                    # Wait between each video
                    if frame_num == 0:
                        cv2.waitKey(2000)

                    frame, results = mediapipe_detection(frame, holistic)
                    draw_landmarks(frame, results)

                    # Flip frame (mirror view for the person recording)
                    frame = cv2.flip(frame, 1)

                    # Distance
                    distance = getDistance(frame, results)
                    cv2.putText(frame, f'Distance {distance}', (530, 20),
                                cv2.FONT_ITALIC, 0.5, (0, 0, 255), 2)

                    # Video Number
                    cv2.putText(frame, f'{action} video number {video}', (15, 20),
                                cv2.FONT_ITALIC, 0.5, (0, 0, 255), 2)

                    # Export Keypoints (np.save appends the .npy extension)
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(video), str(frame_num))
                    np.save(npy_path, keypoints)

                    cv2.imshow('Esm3ni', frame)

                    # Read the key first, then react to it; previously `key`
                    # was tested before it had ever been assigned.
                    key = cv2.waitKey(1)

                    # Pause
                    if key == 32:
                        cv2.waitKey(7000)
                        print('redy')
                        cv2.waitKey(3000)

                    # Esc: leave all three loops, cleanup happens in `finally`
                    if key == 27:
                        stop_recording = True
                        break

                if stop_recording:
                    break
            if stop_recording:
                break
finally:
    # Always free the camera and close the window, even after an error or
    # a notebook interrupt.
    cap.release()
    cv2.destroyAllWindows()

طعمية


 ### Test before creating the data

In [67]:
# Live preview used to check camera placement before recording data.
# Keys: Space pauses until Enter is pressed in the console, Esc quits.
cap = cv2.VideoCapture(1)
try:
    with mp_holistic.Holistic() as holistic:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame, results = mediapipe_detection(frame, holistic)
            # print(results)  # <class 'mediapipe.python.solution_base.SolutionOutputs'>

            # Draw landmarks
            draw_landmarks(frame, results)

            # Flip frame (mirror view)
            frame = cv2.flip(frame, 1)

            # Distance
            distance = getDistance(frame, results)
            cv2.putText(frame, f'Distance {distance}', (530, 20),
                        cv2.FONT_ITALIC, 0.5, (0, 0, 255), 2)

            cv2.imshow('Esm3ni', frame)

            key = cv2.waitKey(1)
            # Pause
            if key == 32:
                input()
                cv2.waitKey(2000)

            if key == 27:
                break
finally:
    # Release the camera even if the loop raises or the cell is interrupted;
    # otherwise the device stays locked until cleaned up by hand.
    cap.release()
    cv2.destroyAllWindows()

In [119]:
# Manual cleanup cell: frees the capture device and closes any OpenCV windows,
# e.g. after a capture cell above was interrupted before reaching its cleanup.
cap.release()
cv2.destroyAllWindows()

 # Works on DL Model

In [33]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [34]:
DATA_PATH = os.path.join('Data')
no_videos = 30      # videos per action
video_length = 30   # frames per video

# One class per sub-folder of DATA_PATH. os.listdir returns entries in
# arbitrary order and also lists plain/hidden files, so keep only visible
# directories and sort them: this makes the label indices reproducible.
actions = np.array(sorted(
    name for name in os.listdir(DATA_PATH)
    if not name.startswith('.') and os.path.isdir(os.path.join(DATA_PATH, name))
))
actions

array(['اناناس', 'برتقان', 'بطيخ', 'بلح', 'تفاح', 'عنب', 'فراولة'],
      dtype='<U6')

In [36]:
# Map every action name to its position in `actions` (used as the class label).
action_map = dict(zip(actions, range(len(actions))))
action_map

{'اناناس': 0,
 'برتقان': 1,
 'بطيخ': 2,
 'بلح': 3,
 'تفاح': 4,
 'عنب': 5,
 'فراولة': 6}

In [37]:
# Load every saved frame: `videos` gets one list of per-frame keypoint arrays
# per recorded video, `labels` gets the matching class index.
videos = []
labels = []
for action in actions:
    class_index = action_map[action]
    action_dir = os.path.join(DATA_PATH, action)
    for video in range(no_videos):
        video_dir = os.path.join(action_dir, str(video))
        temp = []
        for frame_num in range(video_length):
            # Frames are stored as <frame_num>.npy inside the video folder
            res = np.load(os.path.join(video_dir, f'{frame_num}.npy'))
            temp.append(res)
        videos.append(temp)
        labels.append(class_index)

In [38]:
# Stack the loaded videos into one array.
# Presumably (number of videos, 30, 1028), matching the model's input_shape - confirm.
x = np.array(videos)
# One-hot encode the integer class labels as integers.
y = to_categorical(labels, dtype='int')

In [39]:
# Hold out 40% of the videos for testing. A fixed seed makes the split (and
# therefore the reported metrics) reproducible, and stratifying on the integer
# labels keeps every action equally represented in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42, stratify=labels
)

 # NN

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

In [41]:
# Fully connected classifier: each video (30 frames x 1028 keypoint values) is
# flattened and passed through three ReLU layers; the softmax output has one
# unit per action.
model = Sequential([
    Flatten(input_shape=(30, 1028)),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(len(actions), activation='softmax'),
])

In [42]:
# Categorical cross-entropy pairs with the one-hot labels in `y` and the softmax output layer.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [43]:
# Train for 50 epochs on the training split (no callbacks are attached).
model.fit(x_train, y_train, epochs=50) #, callbacks=[tb_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x13821e4f0d0>

In [44]:
# Metrics on the held-out split are printed explicitly; the metrics on the
# training split are shown as the cell's last expression.
test_metrics = model.evaluate(x_test, y_test)
print(test_metrics)
model.evaluate(x_train, y_train)

[0.17831891775131226, 0.976190447807312]


[0.13077093660831451, 0.9841269850730896]

 # Test in Real Time

In [46]:
# Real-time test: classify the most recent 30 frames on every new frame and
# show the last recognised words in a banner at the top of the window.
video = []       # sliding window of per-frame keypoint vectors
sentence = []    # recognised words, without consecutive duplicates
threshold = 0.5  # minimum softmax score required to accept a prediction

cap = cv2.VideoCapture(0)
try:
    with mp_holistic.Holistic() as holistic:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame, results = mediapipe_detection(frame, holistic)
            draw_landmarks(frame, results)

            keypoints = extract_keypoints(results)

            video.append(keypoints)
            video = video[-30:]  # keep only the latest 30 frames

            # Predict only once a full window is available. Previously `res`
            # was read before it existed and the resulting NameError was
            # hidden by a bare `except: pass`, which also masked real errors.
            if len(video) == 30:
                res = model.predict(np.expand_dims(video, axis=0))[0]
                best = np.argmax(res)
                if res[best] > threshold:
                    word = actions[best]
                    # Append only when the word differs from the last one
                    if not sentence or word != sentence[-1]:
                        sentence.append(word)

                # Show at most the five most recent words
                sentence = sentence[-5:]

            # Flip frame (mirror view)
            frame = cv2.flip(frame, 1)

            # Distance
            distance = getDistance(frame, results)
            cv2.putText(frame, f'Distance {distance}', (530, 70),
                        cv2.FONT_ITALIC, 0.5, (0, 0, 255), 2)

            # White banner with the recognised words.
            # NOTE(review): cv2.putText only renders characters available in
            # its built-in fonts, so the Arabic labels will probably not
            # display correctly here - confirm and switch renderer if needed.
            cv2.rectangle(frame, (0, 0), (640, 40), (255, 255, 255), -1)
            cv2.putText(frame, ' '.join(sentence), (3, 30), cv2.FONT_ITALIC, 1, (0, 0, 0), 2, cv2.LINE_AA)

            print(sentence)

            cv2.imshow('OpenCV', frame)
            key = cv2.waitKey(1)
            if key == 27:  # Esc
                break
finally:
    # Always free the camera and close the window, even after an error or
    # a notebook interrupt.
    cap.release()
    cv2.destroyAllWindows()

['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان']
['برتقان', 'بطيخ']
['برتقان', 'بطيخ']
['برتقان', 'بطيخ']
['برتقان', 'بطيخ']
['برتقان', 'بطيخ']
['برتقان', 'بطيخ', 'برتقان']
['برتقان', 'بطيخ', 'برتقان']
['برتقان', 'بطيخ', 'برتقان']
['برتقان', 'بطيخ', 'برتقان']
['برتقان', 'بطيخ', 'برتقان']
['برتقان', 'بطيخ', 'برتقان']


In [65]:
# Manual cleanup cell: frees the capture device and closes any OpenCV windows
# if the real-time cell was stopped before it reached its own cleanup.
cap.release()
cv2.destroyAllWindows()

In [79]:
# Scratch check: has at least one word been recognised?
bool(sentence)

True

In [75]:
# Scratch check: print the newest word when it differs from the one before it.
# The length guard avoids an IndexError when fewer than two words exist.
if len(sentence) >= 2 and sentence[-1] != sentence[-2]:
    print(sentence[-1])

أنت


 # Save & Load Model

In [174]:
# Save the trained model to 'test.h5' in the current working directory.
model.save('test.h5')

In [176]:
# Load with the same Keras distribution (tensorflow.keras) that built and
# saved the model; mixing it with the standalone `keras` package can fail or
# deserialize differently depending on the installed versions.
from tensorflow.keras.models import load_model

loaded_model = load_model('test.h5')
# evaluate returns [loss, categorical_accuracy], as configured in model.compile
loss, accuracy = loaded_model.evaluate(x_test, y_test)

