# ASL-Alphabet Classification with CNN Model

### Author: Augusto Perin

##### Model Summary:
* 3 2D-Convolutional Layers with 2D-MaxPooling
* 1 Flatten Layer
* 3 Dense Layers

In [9]:
# Import of the Libraries
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Rescaling
from keras.losses import SparseCategoricalCrossentropy
from keras.utils import image_dataset_from_directory
import tensorflow as tf
import cv2
import numpy as np

ModuleNotFoundError: No module named 'cv2'

In [None]:
# Uncomment below if you need to install Kaggle
#!pip install kaggle

In [None]:
# Download my Dataset from Kaggle
!kaggle datasets download -w --unzip -d augustoperin/asl-alphabet-classification

In [2]:
# The Train and Test Data
train_ds = image_dataset_from_directory(
    directory='asl_alphabet_train',
    subset='training',
    seed=2102,
    validation_split=0.2,
    image_size=(128,128),
    batch_size=64,
    color_mode='rgb'
)

test_ds = image_dataset_from_directory(
    directory='asl_alphabet_test',
    subset='validation',
    seed=2102,
    validation_split=0.2,
    image_size=(128,128),
    batch_size=64,
    color_mode='rgb'
)

Found 87000 files belonging to 29 classes.
Using 69600 files for training.
Found 29 files belonging to 29 classes.
Using 5 files for validation.


In [3]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [4]:
# The Sequential Model
model = Sequential()

model.add(Rescaling(1./255))

model.add(Conv2D(32, 3, activation='relu'))
model.add(MaxPooling2D())

model.add(Conv2D(32, 3, activation='relu'))
model.add(MaxPooling2D())

model.add(Flatten())

model.add(Dense(128, activation='relu'))

model.add(Dense(29))

In [5]:
model.compile(optimizer='adam', loss=SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [8]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling (Rescaling)       (None, 128, 128, 3)       0         
                                                                 
 conv2d (Conv2D)             (None, 126, 126, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 63, 63, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 32)        9248      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 30, 30, 32)       0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 28800)             0

In [6]:
model.fit(train_ds, validation_data=test_ds, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x19648b8f690>

In [7]:
# Save the entire model
model.save('asl_model.keras')

In [None]:
def predict_from_webcam(model):

  # Open webcam
  cap = cv2.VideoCapture(0)

  # Initialize an empty string to hold the recognized text
  recognized_text = ""

  # Mapping of predicted labels to corresponding ASL signs
  labels = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "del", "nothing", "space"]

  while True:
      # Capture frame-by-frame
      ret, frame = cap.read()
      
      if not ret:
          break
      
      # Preprocess the frame
      img = cv2.resize(frame, (128, 128))
      img = img / 255.0
      img = np.expand_dims(img, axis=0)
      
      # Predict the sign
      predictions = model.predict(img)
      predicted_label = np.argmax(predictions[0])
      predicted_sign = labels[predicted_label]
      
      # Update the recognized text
      if predicted_sign == "del":
          recognized_text = recognized_text[:-1]  # Delete the last character if 'del' is predicted
      elif predicted_sign == "space":
          recognized_text += " "  # Add a space if 'space' is predicted
      elif predicted_sign != "nothing":
          recognized_text += predicted_sign  # Add the predicted sign to the recognized text
      
      # Display the resulting frame with the recognized text
      cv2.putText(frame, recognized_text, (10, 450), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
      cv2.putText(frame, f"Predicted: {predicted_sign}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
      cv2.imshow('ASL Translator', frame)
      
      # Break the loop on 'q' key press
      if cv2.waitKey(1) & 0xFF == ord('q'):
          break

  # When everything done, release the capture
  cap.release()
  cv2.destroyAllWindows()

In [None]:
# Load the saved model
model = tf.keras.models.load_model('asl_model.keras')

# Call the prediction function
predict_from_webcam(model)