# Computer Vision Gesture Recognition Program
# **Introduction:**
This computer-vision gesture recognition program uses a pre-trained MediaPipe Holistic model to predict landmark (feature) points in an image. A classifier built from LSTM and dense layers then assigns those landmarks to a gesture class. Users can teach the model new gestures by uploading a video of each gesture along with its label.

In [None]:
!pip install mediapipe

In [None]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [None]:
# Short aliases for the two MediaPipe solution modules used by the cells below.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [None]:
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame together with the results.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)``.

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    # The model is fed RGB, so swap the channel order first.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Lock the buffer while the model runs, then unlock it again.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand the frame back in BGR so it can be used with OpenCV again.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [None]:
def draw_landmarks(image, results):
    """Draw face, pose and both hand landmark sets on ``image`` with default styling.

    Args:
        image: Image array passed to ``mp_drawing.draw_landmarks``.
        results: Holistic output exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # (landmarks, connection set) pairs, drawn in this order: face, pose, left hand, right hand.
    layers = (
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmarks, connections in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections)

In [None]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks on ``image`` with a distinct colour per body part.

    Args:
        image: Image array passed to ``mp_drawing.draw_landmarks``.
        results: Holistic output exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # Draw face connections.
    # Uses FACEMESH_TESSELATION, the same face connection set draw_landmarks() uses.
    # NOTE(review): the previous FACE_CONNECTIONS constant appears to have been
    # removed from newer MediaPipe releases — confirm against the installed version.
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                             # Channel value capped at 255: 256 is outside the 8-bit colour range.
                             mp_drawing.DrawingSpec(color=(80,255,121), thickness=1, circle_radius=1)
                             )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [None]:
# Load the sample image (OpenCV returns it in BGR order).
# cv2.imread returns None instead of raising when the file is missing or
# unreadable, so fail here rather than deep inside the detection call.
image_path = "/content/drive/MyDrive/Junk for transfer/junk/Photo on 31-03-24 at 2.54 PM.jpg"
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError("Could not read image: " + image_path)

In [None]:
# Create the Holistic model with 0.5 detection and tracking confidence thresholds.
# This instance is reused by the video-processing cells further down.
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)#load model
# Run the model on the loaded image: image_ is the BGR frame returned by
# mediapipe_detection, results is the model output.
image_, results = mediapipe_detection(image, holistic)#predict result

In [None]:
# Draw the detected landmarks onto `image`; draw_landmarks returns nothing,
# and the next cell displays `image`, so the drawing is expected to happen in place.
draw_landmarks(image, results)#draw result

In [None]:
# Display the rendered image. The image was loaded with OpenCV (BGR), so it is
# converted to RGB before being handed to matplotlib.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(image_rgb)
plt.axis('off')  # hide the axes around the picture
plt.show()


In [None]:
#function to extract data points
def extract_keypoints(results):
    """Flatten the landmarks in ``results`` into one fixed-length feature vector.

    The vector is laid out as pose (33 x [x, y, z, visibility]), face
    (468 x [x, y, z]), left hand (21 x [x, y, z]) and right hand
    (21 x [x, y, z]). A landmark group that is missing from ``results`` is
    replaced by zeros of the matching length.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` sequence.

    Returns:
        1-D NumPy array with the four groups concatenated in the order above.
    """
    def _flatten(group, count, fields):
        # Missing detections become an all-zero block of the same length.
        if not group:
            return np.zeros(count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    parts = [
        _flatten(results.pose_landmarks, 33, xyz + ("visibility",)),
        _flatten(results.face_landmarks, 468, xyz),
        _flatten(results.left_hand_landmarks, 21, xyz),
        _flatten(results.right_hand_landmarks, 21, xyz),
    ]
    return np.concatenate(parts)

In [None]:
# Create and save a rendered video (landmarks drawn on every frame) from an input video file.
import cv2

# Open the video file
video_capture = cv2.VideoCapture('/content/drive/MyDrive/Junk for transfer/junk/hand_desture.mov')
if not video_capture.isOpened():
    # cv2.VideoCapture does not raise on a bad path; stop here instead of
    # silently writing an empty output file.
    raise IOError("Could not open the input video file")

# Get the video properties
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: int() would truncate e.g. 29.97 to 29 and
# change the playback speed. Fall back to 30 when no rate is reported (0).
fps = video_capture.get(cv2.CAP_PROP_FPS) or 30.0

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_video = cv2.VideoWriter('output_video.avi', fourcc, fps, (frame_width, frame_height))

try:
    # Iterate through each frame
    while True:
        ret, frame = video_capture.read()  # Read a frame

        if not ret:
            break  # Break the loop if no frame is captured

        image, results = mediapipe_detection(frame, holistic)
        draw_landmarks(image, results)

        # Write the frame to the output video
        output_video.write(image)
finally:
    # Release the video capture and writer objects even if a frame fails to process
    video_capture.release()
    output_video.release()


# Dataset creation
Using the methods demonstrated above, the code converts each video frame into data points and saves them in a designated array.

# 1 creating data for gesture 1

In [None]:
# Extract the data points from every frame of the gesture-1 video and store them in a list.
import cv2
hand_gesture = []

# Open the video file
video_capture = cv2.VideoCapture('/content/drive/MyDrive/Junk for transfer/junk/hand_desture.mov')
if not video_capture.isOpened():
    # cv2.VideoCapture does not raise on a bad path; fail loudly instead of
    # silently building an empty dataset for this gesture.
    raise IOError("Could not open the video file for gesture 1")

try:
    # Iterate through each frame
    while True:
        ret, frame = video_capture.read()  # Read a frame
        if not ret:
            break  # Break the loop if no frame is captured

        image, results = mediapipe_detection(frame, holistic)
        h = extract_keypoints(results)
        hand_gesture.append(h)
finally:
    # Release the capture even if a frame fails to process
    video_capture.release()


In [None]:
# Number of keypoint vectors collected for gesture 1 (one is appended per frame read).
print(len(hand_gesture))

In [None]:
import pandas as pd

# Gesture-1 keypoint vectors: one DataFrame row per frame, every row tagged
# with class label 0.
data = hand_gesture
df = pd.DataFrame(data).assign(label=0)


In [None]:
# Preview the first rows of the gesture-1 frame.
df.head()

# 2 creating data for gesture 2

In [None]:
# Extract the data points from every frame of the gesture-2 video and store them in a list.
import cv2
head_gesture = []

# Open the video file
video_capture = cv2.VideoCapture('/content/drive/MyDrive/Junk for transfer/junk/head_geasture.mov')
if not video_capture.isOpened():
    # cv2.VideoCapture does not raise on a bad path; fail loudly instead of
    # silently building an empty dataset for this gesture.
    raise IOError("Could not open the video file for gesture 2")

try:
    # Iterate through each frame
    while True:
        ret, frame = video_capture.read()  # Read a frame
        if not ret:
            break  # Break the loop if no frame is captured

        image, results = mediapipe_detection(frame, holistic)
        h = extract_keypoints(results)
        head_gesture.append(h)
finally:
    # Release the capture even if a frame fails to process
    video_capture.release()


In [None]:
# Number of keypoint vectors collected for gesture 2, followed by the vectors
# themselves (the second print produces very long output).
print(len(head_gesture))
print(head_gesture)

1829
[array([ 0.52930433,  0.45623344, -0.63678718, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.53304231,  0.46539289, -0.64234114, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.52871335,  0.46204859, -0.64713764, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.52571607,  0.46058464, -0.69474727, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.52385497,  0.45922437, -0.685211  , ...,  0.        ,
        0.        ,  0.        ]), array([ 0.52265012,  0.46058115, -0.70233011, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.52178055,  0.46194309, -0.69631535, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.52158839,  0.46137169, -0.69290322, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.520805  ,  0.46132872, -0.68493766, ...,  0.        ,
        0.        ,  0.        ]), array([ 0.51921052,  0.46119097, -0.69959027, ...,  0.        ,
        0.        ,  0.        ]), arra

In [None]:
import pandas as pd

# Gesture-2 keypoint vectors: one DataFrame row per frame, every row tagged
# with class label 1.
data1 = head_gesture
df1 = pd.DataFrame(data1).assign(label=1)


In [None]:
# Preview the first rows of the gesture-2 frame.
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1653,1654,1655,1656,1657,1658,1659,1660,1661,label
0,0.529304,0.456233,-0.636787,0.999997,0.554814,0.413186,-0.579969,0.999993,0.563732,0.417493,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.533042,0.465393,-0.642341,0.999995,0.560576,0.424421,-0.585743,0.999989,0.569862,0.429506,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.528713,0.462049,-0.647138,0.999991,0.557094,0.422368,-0.591588,0.999982,0.567171,0.428533,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.525716,0.460585,-0.694747,0.999989,0.553884,0.422195,-0.642345,0.999978,0.564287,0.428906,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.523855,0.459224,-0.685211,0.999986,0.551782,0.421491,-0.632879,0.999973,0.562429,0.428658,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Summarise the gesture-2 frame (row count, column count, dtypes, memory usage).
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1829 entries, 0 to 1828
Columns: 1663 entries, 0 to label
dtypes: float64(1662), int64(1)
memory usage: 23.2 MB


# 3 creating data for gesture 3

In [None]:
# Extract the data points from every frame of the gesture-3 video and store them in a list.
import cv2
_gesture = []

# Open the video file
video_capture = cv2.VideoCapture('/content/drive/MyDrive/Junk for transfer/junk/side.mov')
if not video_capture.isOpened():
    # cv2.VideoCapture does not raise on a bad path; fail loudly instead of
    # silently building an empty dataset for this gesture.
    raise IOError("Could not open the video file for gesture 3")

try:
    # Iterate through each frame
    while True:
        ret, frame = video_capture.read()  # Read a frame
        if not ret:
            break  # Break the loop if no frame is captured

        image, results = mediapipe_detection(frame, holistic)
        h = extract_keypoints(results)
        _gesture.append(h)
finally:
    # Release the capture even if a frame fails to process
    video_capture.release()


In [None]:
# Number of keypoint vectors collected for gesture 3, followed by the vectors
# themselves (the second print produces very long output).
print(len(_gesture))
print(_gesture)

In [None]:
import pandas as pd

# Gesture-3 keypoint vectors: one DataFrame row per frame, every row tagged
# with class label 2. (data1 is rebound here; it previously held the gesture-2 list.)
data1 = _gesture
df2 = pd.DataFrame(data1).assign(label=2)


# 4 creating data for gesture 4

In [None]:
# Extract the data points from every frame of the gesture-4 video and store them in a list.
import cv2
handface_gesture = []

# Open the video file
video_capture = cv2.VideoCapture('/content/drive/MyDrive/Junk for transfer/junk/hand_face.mov')
if not video_capture.isOpened():
    # cv2.VideoCapture does not raise on a bad path; fail loudly instead of
    # silently building an empty dataset for this gesture.
    raise IOError("Could not open the video file for gesture 4")

try:
    # Iterate through each frame
    while True:
        ret, frame = video_capture.read()  # Read a frame
        if not ret:
            break  # Break the loop if no frame is captured

        image, results = mediapipe_detection(frame, holistic)
        h = extract_keypoints(results)
        handface_gesture.append(h)
finally:
    # Release the capture even if a frame fails to process
    video_capture.release()


In [None]:
# Number of keypoint vectors collected for gesture 4, followed by the vectors
# themselves (the second print produces very long output).
print(len(handface_gesture))
print(handface_gesture)

In [None]:
import pandas as pd

# Gesture-4 keypoint vectors: one DataFrame row per frame, every row tagged
# with class label 3.
data3 = handface_gesture
df3 = pd.DataFrame(data3).assign(label=3)

In [None]:
# Stack the four per-gesture frames into one dataset; ignore_index=True
# renumbers the rows instead of keeping each frame's own index.
merged_df = pd.concat([df, df1, df2, df3], ignore_index=True)

In [None]:
# Data preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Separate the class label (target) from the keypoint columns (features).
X = merged_df.drop(columns=['label'])
y = merged_df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): rows are individual video frames, so neighbouring frames of one
# video can land on both sides of a random split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of train and test data
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + " shape:", split.shape)

In [None]:
# The data is now split into train and test sets.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
# Shape of the training features as (samples, features).
# Stored as train_shape rather than `input`, which would shadow the builtin
# input() for the rest of the notebook.
train_shape = X_train.shape
print(train_shape)

# converting data to required shape for training

In [None]:
# Imported from tensorflow.keras so it comes from the same Keras package as the
# Sequential/LSTM/Dense imports used for the model.
from tensorflow.keras.utils import to_categorical

# Four gesture classes (labels 0-3, one per recorded gesture video)
num_classes = 4

# One-hot encode the training labels
y_train_encoded = to_categorical(y_train, num_classes=num_classes)


In [None]:
import numpy as np

# Training features as a plain NumPy array of shape (samples, features).
X_train_array = X_train.values

# Insert a length-1 time-step axis so the data becomes
# (samples, 1, features), the 3-D layout the LSTM input_shape below expects.
X_train_reshaped = np.expand_dims(X_train_array, axis=1)
print("Shape of X_train_array:", X_train_reshaped.shape)


In [None]:
import numpy as np

# Convert the X_test DataFrame to a NumPy array of shape (samples, features)
X_test_array = X_test.values

# Add a length-1 time-step axis: (samples, features) -> (samples, 1, features),
# the same layout as the reshaped training data
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0], 1, X_test_array.shape[1])
print("Shape of X_test_array:", X_test_reshaped.shape)


In [None]:
# One-hot encode the test labels with the same number of classes as the training labels.
y_test_encoded = to_categorical(y_test, num_classes=num_classes)


In [None]:
# Per-sample input shape for the first LSTM layer: 1 time step x 1662 features
# (33*4 pose + 468*3 face + 21*3 + 21*3 hand values produced by extract_keypoints).
input_shape = (1, 1662)


In [None]:
# Delete a previously built model, if there is one, before defining a new one.
# A bare `del model` raises NameError on a fresh kernel where no model exists
# yet, so the name is removed only when it is present.
globals().pop("model", None)

NameError: name 'model' is not defined

In [None]:
# Sequential classifier: three stacked LSTM layers followed by a dense head
# whose softmax output has one unit per gesture class.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=input_shape),
    LSTM(128, return_sequences=True, activation='relu'),
    # The last LSTM returns only its final output so the dense layers get a flat vector.
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Adam optimizer with categorical cross-entropy, which pairs with the one-hot
# labels and the softmax output layer; accuracy is reported during training.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training the model on preprocessed data

In [None]:
# Train for 20 epochs, using the held-out test split as validation data.
model.fit(X_train_reshaped, y_train_encoded, epochs=20, validation_data=(X_test_reshaped, y_test_encoded)) #training model


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7935ea688520>

In [None]:
# Save the trained model to gesture_update.h5 (relative path, so it is written to the current working directory).
model.save("gesture_update.h5") #saving the model

In [None]:
# Reload the saved model from disk.
# NOTE(review): the model was saved under the relative path "gesture_update.h5";
# this absolute path only matches if the working directory was /content — confirm.
from tensorflow.keras.models import load_model
model = load_model('/content/gesture_update.h5')

In [None]:
#function to predict class
import cv2
import pandas as pd
def predict(image, show = True):
  """Classify the gesture shown in an image file and print its label.

  Args:
    image: Path of the image file to classify.
    show: When true, display the image with the detected landmarks drawn on it.

  Raises:
    FileNotFoundError: If the image file cannot be read.
  """
  frame = cv2.imread(image)
  if frame is None:
    # cv2.imread returns None for a missing or unreadable file instead of raising.
    raise FileNotFoundError("Could not read image: " + str(image))

  # A fresh model is created per call; the with-block closes it afterwards so
  # repeated calls do not leak model instances. static_image_mode=True because
  # each call handles a single, unrelated still image.
  with mp_holistic.Holistic(static_image_mode=True,
                            min_detection_confidence=0.5,
                            min_tracking_confidence=0.5) as holistic:
    image_, results = mediapipe_detection(frame, holistic)

  # draw_landmarks returns nothing, so its result is not assigned.
  draw_landmarks(image_, results)
  if show:
    image_rgb = cv2.cvtColor(image_, cv2.COLOR_BGR2RGB)
    plt.imshow(image_rgb)
    plt.axis('off')
    plt.show()

  # Shape the keypoint vector into a batch of one sample with one time step,
  # the same (samples, 1, features) layout the model was trained on.
  res = extract_keypoints(results)
  X1 = res.reshape(1, 1, -1)
  prediction = model.predict(X1)
  a = int(np.argmax(prediction))
  # `dict` here is the notebook's module-level index-to-label mapping.
  print(dict.get(a))

In [None]:
# Maps a predicted class index to its gesture name; predict() looks labels up here.
# NOTE(review): this name shadows the builtin dict type for the rest of the
# notebook — consider renaming it together with its use in predict().
dict = {0:"gesture_L_hand", 1:"face_at_angle", 2:"body_turned",3:"mouth_hand"} #reverse dictionary to map prediction to label

# Testing on real world data

In [None]:
# Classify a single photo; show defaults to True, so the annotated image is displayed as well.
predict("/content/Photo on 31-03-24 at 6.36 PM.jpg")

In [None]:
# Predict and display the label and annotated image for each test photo.
# The paths are kept in test_images rather than `list`, which would shadow the
# builtin list type for the rest of the notebook.
test_images = ["/content/Photo on 31-03-24 at 10.09 PM.jpg","/content/Photo on 31-03-24 at 4.53 PM #2.jpg","/content/Photo on 31-03-24 at 6.36 PM.jpg","//content/body.jpg"]
for image_path in test_images:
  predict(image_path)