In [1]:
# import data manipulation libraries
import pandas as pd
import numpy as np
# import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# import CNN libraries
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras import regularizers
from keras.models import load_model
# import other libraries
import os
import warnings
# import computer vision libraries
import cv2
from PIL import Image
import mediapipe as mp
warnings.filterwarnings('ignore')

In [2]:
# Read both dataset splits from disk.
train = pd.read_csv("data/sign-train.csv")
test = pd.read_csv("data/sign-test.csv")
# Zero-based class id -> uppercase letter (0 -> 'A', ..., 25 -> 'Z').
labels = dict(enumerate(map(chr, range(ord("A"), ord("Z") + 1))))

In [3]:
def show_observation(i=None):
    """Display one training image together with its letter label.

    Parameters
    ----------
    i : int, optional
        Positional row index into ``train``. When omitted, a new random row
        is drawn on every call.
    """
    # The random index must be drawn at call time. A default such as
    # ``i=np.random.randint(...)`` in the signature is evaluated only once,
    # when the function is defined, so every argument-less call would show
    # the very same image.
    if i is None:
        i = np.random.randint(0, len(train))

    row = train.iloc[i]
    # Explicit positional access: first column is the label, the rest pixels.
    label_id = int(row.iloc[0])
    pixels = row.iloc[1:].to_numpy()

    plt.imshow(pixels.reshape(28, 28), cmap='gray')
    # Fall back to the raw id so an unmapped label cannot break the title.
    plt.title("Label: " + str(labels.get(label_id, label_id)))

In [4]:
# Build, train, persist and evaluate a CNN that maps 28x28 single-channel
# images to sign-language letter classes.

# Width of the softmax output. The one-hot targets below are pinned to the
# same width: without ``num_classes`` each ``to_categorical`` call infers its
# width from the largest label present in that split, so train, test and the
# output layer could silently disagree.
NUM_CLASSES = 25

cnn = Sequential()
cnn.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(28, 28, 1)))
cnn.add(Conv2D(32, kernel_size=3, activation='relu'))
cnn.add(Flatten())
cnn.add(Dense(NUM_CLASSES, activation='softmax'))
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Column 0 is the class id; the remaining columns are reshaped into one
# 28x28x1 image per row. Pixel values are passed to the network exactly as
# stored in the CSV (no rescaling), so inference code has to feed the same
# value range.
X_train = train.iloc[:, 1:].values.reshape(-1, 28, 28, 1)
y_train = to_categorical(train.iloc[:, 0], num_classes=NUM_CLASSES)
X_test = test.iloc[:, 1:].values.reshape(-1, 28, 28, 1)
y_test = to_categorical(test.iloc[:, 0], num_classes=NUM_CLASSES)

# Train the model. No separate validation split is carved out of ``train``:
# the test split is what gets reported as validation data during fitting.
cnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=4)

# Persist the model, then reload it from disk so the evaluation below runs
# against the saved artifact.
cnn.save('asl_model.h5')
cnn = load_model('asl_model.h5')

# Last expression of the cell: [loss, accuracy] on the test split.
cnn.evaluate(X_test, y_test)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[2.244319200515747, 0.7582264542579651]

In [5]:
# Rebuild the test features and one-hot targets from the raw test frame,
# then run the current ``cnn`` model over them.
X_test = test.iloc[:, 1:].to_numpy().reshape(-1, 28, 28, 1)
y_test = to_categorical(test.iloc[:, 0])
y_pred = cnn.predict(X_test)
# Collapse one-hot targets and class probabilities to class ids and compare.
accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1))



0.7582264361405465

In [4]:
# Live demo setup: reload the trained classifier and create the MediaPipe
# hand tracker used to locate the hand in each webcam frame.
# The webcam is deliberately NOT opened here: it is opened immediately before
# the capture loop further down in this cell. Opening it here as well only
# created an extra device handle that was overwritten without being released.
cnn = load_model('asl_model.h5')
# Font constant for on-frame text
font = cv2.FONT_HERSHEY_SIMPLEX
# MediaPipe hands solution and a tracker instance with default settings
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Helper used to draw detected landmarks onto a frame
mp_drawing = mp.solutions.drawing_utils

def preprocess_image(image, normalize=False):
    """Turn a BGR image into a single-sample input batch for the CNN.

    The image is converted to grayscale, resized to 28x28 and reshaped to
    ``(1, 28, 28, 1)``.

    Parameters
    ----------
    image : numpy.ndarray
        BGR image; callers are expected to pass a non-empty array.
    normalize : bool, default False
        When True, divide the pixel values by 255. Leave False for a model
        that was fitted on unscaled pixel values.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(1, 28, 28, 1)``.
    """
    # Convert the image from BGR to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Resize to the 28x28 resolution the network was built for
    gray_resized = cv2.resize(gray, (28, 28))

    # Add batch and channel axes
    batch = gray_resized.reshape(1, 28, 28, 1).astype(np.float32)

    # The training cell passes the CSV pixel columns to the network without
    # any scaling, so dividing by 255 unconditionally would feed the model
    # values in a range it never saw during fitting. Scaling is opt-in.
    # NOTE(review): assumes the CSV stores 0-255 intensities - confirm.
    if normalize:
        batch = batch / 255.0

    return batch

def get_prediction(image):
    """Return the index of the class the CNN scores highest for `image`.

    Parameters
    ----------
    image : numpy.ndarray
        BGR image of the hand region; it is passed through
        ``preprocess_image`` before being handed to the model.

    Returns
    -------
    Integer index of the largest entry in the model output (``np.argmax``).
    """
    preprocessed_image = preprocess_image(image)
    # verbose=0 keeps Keras from printing a progress bar for every single
    # webcam frame, which would flood the notebook output.
    prediction = cnn.predict(preprocessed_image, verbose=0)
    return np.argmax(prediction)

def segment_hand(frame):
    """Crop the largest skin-coloured region out of a BGR frame.

    The frame is thresholded in HSV space against a fixed colour range, and
    the bounding rectangle of the largest external contour of the resulting
    mask is returned as a slice of `frame`. Returns None when the mask yields
    no contour at all.
    """
    # Fixed HSV bounds used as the skin-colour range
    skin_low = np.array([0, 20, 70], dtype=np.uint8)
    skin_high = np.array([20, 255, 255], dtype=np.uint8)

    # Binary mask of the pixels that fall inside the range
    skin_mask = cv2.inRange(cv2.cvtColor(frame, cv2.COLOR_BGR2HSV), skin_low, skin_high)

    found, _ = cv2.findContours(skin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not found:
        return None

    # Bounding box of the contour with the largest area
    left, top, box_w, box_h = cv2.boundingRect(max(found, key=cv2.contourArea))
    return frame[top:top + box_h, left:left + box_w]

def get_hand_roi(frame, landmarks, padding=20):
    """Crop the padded bounding box of a set of hand landmarks out of `frame`.

    Parameters
    ----------
    frame : numpy.ndarray
        Image of shape (height, width, channels). A green rectangle marking
        the box is drawn onto it in place.
    landmarks : sequence
        Landmarks whose first two items are the x and y position expressed
        as fractions of the frame width and height.
    padding : int, default 20
        Number of pixels added on every side of the tight landmark box.

    Returns
    -------
    numpy.ndarray
        Copy of the pixels inside the box, taken before the rectangle is
        drawn. It is empty (``size == 0``) when there are no landmarks or
        the box lies completely outside the frame.
    """
    height, width, _ = frame.shape

    # Nothing to bound: hand back an empty crop instead of letting min()/max()
    # raise on an empty sequence.
    if len(landmarks) == 0:
        return frame[0:0, 0:0].copy()

    # Convert relative landmark coordinates to absolute pixel coordinates
    xs = [int(landmark[0] * width) for landmark in landmarks]
    ys = [int(landmark[1] * height) for landmark in landmarks]

    # Pad the box, then clamp BOTH ends into the frame. An unclamped negative
    # upper bound would be read by slicing as "count from the end" and return
    # an unrelated part of the image.
    x_min = min(max(min(xs) - padding, 0), width)
    x_max = min(max(max(xs) + padding, 0), width)
    y_min = min(max(min(ys) - padding, 0), height)
    y_max = min(max(max(ys) + padding, 0), height)

    # Copy the crop before drawing, so the green outline never ends up in the
    # pixels handed to the classifier.
    hand_roi = frame[y_min:y_max, x_min:x_max].copy()

    # Draw a rectangle around the detected hand on the displayed frame
    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

    return hand_roi


# Open the webcam and classify the detected hand frame by frame until 'q' is
# pressed in the preview window.
cap = cv2.VideoCapture(0)

# try/finally guarantees that the camera and the preview window are released
# even when the loop is left through an exception or a kernel interrupt.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        # A failed grab yields ret == False and no usable frame; stop instead
        # of passing it on to cvtColor.
        if not ret:
            break

        # Convert the BGR image to RGB
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process the frame and get the hand landmarks
        results = hands.process(image_rgb)

        # If hand landmarks are found, process and display them
        if results.multi_hand_landmarks:
            for landmarks in results.multi_hand_landmarks:
                # Relative (x, y, z) coordinates of every landmark of this hand
                landmark_list = [[lm.x, lm.y, lm.z] for lm in landmarks.landmark]

                # Crop first, draw second. get_hand_roi may hand back a view
                # into `frame`, so take a copy: overlays drawn afterwards must
                # not leak into the pixels given to the classifier.
                hand_roi = get_hand_roi(frame, landmark_list).copy()

                # Drawing hand landmarks on the frame
                mp_drawing.draw_landmarks(frame, landmarks, mp_hands.HAND_CONNECTIONS)

                if hand_roi.size > 0:
                    # `prediction` is the numeric class index from the model
                    prediction = get_prediction(hand_roi)
                    cv2.putText(frame, f'Prediction: {prediction}', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        cv2.imshow('Hand Tracking', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.




KeyboardInterrupt: 

: 