In [None]:
import os
import time

import arabic_reshaper
import cv2
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from PIL import ImageFont, ImageDraw, Image
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical

In [2]:
# Path to your dataset.
# The folder is expected to contain one subfolder per letter/gesture; the
# loader below uses each subfolder name as the class label.
dataset_path = 'RGB ArSL dataset'  # Main folder with subfolders for each letter

In [3]:
def load_dataset(folder, image_size=(64, 64)):
    """Load a folder-per-class image dataset into NumPy arrays.

    Every immediate subfolder of ``folder`` is treated as one class and its
    name is used as the label of all images inside it.  Directory entries are
    visited in sorted order so the sample order does not depend on the file
    system, which keeps a seeded ``train_test_split`` reproducible.

    Args:
        folder: Path of the dataset root directory.
        image_size: ``(width, height)`` every image is resized to.  The
            default matches the ``(64, 64, 3)`` input of the CNN built in
            this notebook.

    Returns:
        Tuple ``(images, labels)``: the resized images exactly as decoded by
        ``cv2.imread`` (no colour conversion is applied) and the matching
        array of subfolder names.  Files OpenCV cannot decode are skipped.
    """
    images = []
    labels = []

    for label in sorted(os.listdir(folder)):
        label_folder = os.path.join(folder, label)
        if not os.path.isdir(label_folder):
            continue  # Ignore stray files lying next to the class folders

        for img_file in sorted(os.listdir(label_folder)):
            img = cv2.imread(os.path.join(label_folder, img_file))
            if img is None:  # Not an image, or unreadable
                continue
            images.append(cv2.resize(img, image_size))
            labels.append(label)

    return np.array(images), np.array(labels)

In [4]:
# Load the dataset: X holds the image array, y the matching subfolder names
X, y = load_dataset(dataset_path)

In [5]:
# Map every class name to an integer id; the fitted encoder is kept because
# later cells use its class order to build the label map and size the output layer.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# One-hot encode the integer ids (one column per class)
y_categorical = to_categorical(y_encoded)

In [6]:
# Save the label classes.
# Persisting the class order lets a later session map a prediction index back
# to its label without re-fitting the encoder.
np.save('label_classes.npy', label_encoder.classes_)  # Save classes for later use

In [7]:
# Index -> English class name, following the encoder's class order
label_map = dict(enumerate(label_encoder.classes_))
print(f"Label Map: {label_map}")  # Ensure it matches your expectations

Label Map: {0: 'Ain', 1: 'Al', 2: 'Alef', 3: 'Beh', 4: 'Dad', 5: 'Dal', 6: 'Feh', 7: 'Ghain', 8: 'Hah', 9: 'Heh', 10: 'Jeem', 11: 'Kaf', 12: 'Khah', 13: 'Laa', 14: 'Lam', 15: 'Meem', 16: 'Noon', 17: 'Qaf', 18: 'Reh', 19: 'Sad', 20: 'Seen', 21: 'Sheen', 22: 'Tah', 23: 'Teh', 24: 'Teh_Marbuta', 25: 'Thal', 26: 'Theh', 27: 'Waw', 28: 'Yeh', 29: 'Zah', 30: 'Zain', 31: 'masafa', 32: 'mash'}


In [8]:
# Define the mapping from English labels to Arabic letters.
# Keys are the dataset's class (subfolder) names; values are the text emitted
# for that class.  Two classes are control gestures rather than letters:
# 'masafa' maps to a single space and 'mash' maps to the empty string.
english_to_arabic = {
    'Ain': 'ع', 'Al': 'ال', 'Alef': 'ا', 'Beh': 'ب', 'Dad': 'ض', 'Dal': 'د',
    'Feh': 'ف', 'Ghain': 'غ', 'Hah': 'ح', 'Heh': 'ه', 'Jeem': 'ج', 'Kaf': 'ك',
    'Khah': 'خ', 'Laa': 'لا', 'Lam': 'ل', 'Meem': 'م', 'Noon': 'ن', 'Qaf': 'ق',
    'Reh': 'ر', 'Sad': 'ص', 'Seen': 'س', 'Sheen': 'ش', 'Tah': 'ط', 'Teh': 'ت',
    'Teh_Marbuta': 'ة', 'Thal': 'ذ', 'Theh': 'ث', 'Waw': 'و', 'Yeh': 'ي',
    'Zah': 'ظ', 'Zain': 'ز', 'masafa': ' ', 'mash': ''  # 'mash' represents delete
}

In [9]:
# Index -> Arabic text, following the encoder's class order.
# This replaces the English label map built earlier.
label_map = dict(
    enumerate(english_to_arabic[name] for name in label_encoder.classes_)
)
print(f"Label Map: {label_map}")  # Ensure it matches your expectations

Label Map: {0: 'ع', 1: 'ال', 2: 'ا', 3: 'ب', 4: 'ض', 5: 'د', 6: 'ف', 7: 'غ', 8: 'ح', 9: 'ه', 10: 'ج', 11: 'ك', 12: 'خ', 13: 'لا', 14: 'ل', 15: 'م', 16: 'ن', 17: 'ق', 18: 'ر', 19: 'ص', 20: 'س', 21: 'ش', 22: 'ط', 23: 'ت', 24: 'ة', 25: 'ذ', 26: 'ث', 27: 'و', 28: 'ي', 29: 'ظ', 30: 'ز', 31: ' ', 32: ''}


In [10]:
# Normalize images: scale pixel values from [0, 255] to [0, 1].
# The dtype guard makes this cell idempotent: re-running it does not divide
# already-scaled data by 255 a second time.  Casting to float32 (Keras'
# default float type) also avoids the float64 copy `X / 255.0` would create.
if X.dtype == np.uint8:
    X = X.astype(np.float32) / 255.0

In [11]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_categorical,
    test_size=0.2,
    random_state=42,
)

In [12]:
def create_cnn_model(input_shape=(64, 64, 3), num_classes=None):
    """Build the (uncompiled) CNN used to classify sign images.

    Architecture: four Conv2D(3x3, ReLU) + MaxPooling2D(2x2) blocks with
    32, 64, 128 and 256 filters, then Flatten, Dense(256) and Dense(128)
    layers (each followed by 50% dropout) and a softmax output layer.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D
            layer.  Defaults to the 64x64 3-channel images used for training.
        num_classes: Number of output units.  When ``None`` (the default) it
            is read from the notebook-level ``label_encoder``, which is what
            the original zero-argument version always did.

    Returns:
        A ``Sequential`` model that still has to be compiled.
    """
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    model = Sequential()

    # Convolution + MaxPooling blocks: extract spatial features and shrink the
    # feature map.  Only the first layer is told the input shape.
    for block_index, filters in enumerate((32, 64, 128, 256)):
        first_layer_kwargs = {'input_shape': input_shape} if block_index == 0 else {}
        model.add(Conv2D(filters, (3, 3), activation='relu', **first_layer_kwargs))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # Flatten the final 2D feature maps into a 1D vector for the dense layers
    model.add(Flatten())

    # Fully connected layers, each followed by dropout to limit overfitting
    for units in (256, 128):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(0.5))

    # Output layer: one probability per class
    model.add(Dense(num_classes, activation='softmax'))

    return model

In [13]:
# Build the network and configure training: Adam (learning rate 0.001)
# minimising categorical cross-entropy, with accuracy reported as a metric.
model = create_cnn_model()
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Data augmentation: small random rotations and shifts plus mirroring
# NOTE(review): horizontal_flip mirrors the hand — confirm that no pair of
# signs differs only by left/right orientation.
augmentation_options = dict(
    horizontal_flip=True,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
)
datagen = ImageDataGenerator(**augmentation_options)

In [15]:
# Train on augmented batches of 32 images for 100 epochs, validating on the
# (un-augmented) hold-out split after every epoch.
train_batches = datagen.flow(X_train, y_train, batch_size=32)
history = model.fit(
    train_batches,
    epochs=100,
    validation_data=(X_val, y_val),
)

# Persist the trained model so the evaluation and real-time sections can reload it
model.save('asl_cnn2_model.h5')

Epoch 1/100


  self._warn_if_super_not_called()


[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 44ms/step - accuracy: 0.0310 - loss: 3.5001 - val_accuracy: 0.0369 - val_loss: 3.4941
Epoch 2/100
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 41ms/step - accuracy: 0.0401 - loss: 3.4854 - val_accuracy: 0.0393 - val_loss: 3.4004
Epoch 3/100
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 56ms/step - accuracy: 0.0560 - loss: 3.4031 - val_accuracy: 0.0811 - val_loss: 3.2198
Epoch 4/100
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 57ms/step - accuracy: 0.1032 - loss: 3.1761 - val_accuracy: 0.1852 - val_loss: 2.8043
Epoch 5/100
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 81ms/step - accuracy: 0.1548 - loss: 2.8976 - val_accuracy: 0.2149 - val_loss: 2.6098
Epoch 6/100
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 97ms/step - accuracy: 0.1993 - loss: 2.6691 - val_accuracy: 0.2875 - val_loss: 2.2776
Epoch 7/100
[1m207/207



In [16]:
# Loss and accuracy of the trained model on the hold-out validation split
val_loss, val_acc = model.evaluate(
    x=X_val,
    y=y_val,
    verbose=2,
)
print(f'Validation Accuracy: {val_acc * 100:.2f}%')

52/52 - 1s - 12ms/step - accuracy: 0.8299 - loss: 0.6160
Validation Accuracy: 82.99%


# Load the saved model for evaluation

In [None]:
# Load label encoder and define label map dynamically.
# The encoder is not re-fitted: its class order is restored from the file
# written by the training section, so prediction indices keep their meaning.
label_encoder = LabelEncoder()
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects —
# only load a 'label_classes.npy' that this notebook produced.
label_encoder.classes_ = np.load('label_classes.npy', allow_pickle=True)  # Ensure to save and load classes
# Index -> Arabic text; requires `english_to_arabic` to be defined already
label_map = {i: english_to_arabic[label] for i, label in enumerate(label_encoder.classes_)}

In [None]:
# (Re)load your evaluation/test dataset with the same preprocessing the model was trained with.
def load_dataset(folder, image_size=(64, 64)):
    """Load a folder-per-class image dataset for evaluating the saved CNN.

    The images must be prepared exactly like the training data: the network
    takes ``(64, 64, 3)`` inputs and was trained on images as decoded by
    ``cv2.imread`` (no colour conversion).  The previous version of this
    loader resized to 224x224 and converted to RGB, which does not match the
    model's input.

    Args:
        folder: Dataset root; every immediate subfolder is one class and its
            name is used as the label.
        image_size: ``(width, height)`` every image is resized to.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays.  Files OpenCV cannot
        decode are skipped.
    """
    images = []
    labels = []
    # Sorted traversal keeps the sample order independent of the file system
    for label in sorted(os.listdir(folder)):
        label_folder = os.path.join(folder, label)
        if not os.path.isdir(label_folder):
            continue
        for img_file in sorted(os.listdir(label_folder)):
            img = cv2.imread(os.path.join(label_folder, img_file))
            if img is None:
                continue
            images.append(cv2.resize(img, image_size))
            labels.append(label)
    return np.array(images), np.array(labels)

In [None]:
# Specify the test or evaluation dataset path (it could be a dedicated folder)
# NOTE(review): this is the same folder the model was trained on, so the
# "test" metrics below include training samples — point this at held-out data
# when it is available.
test_dataset_path = 'RGB ArSL dataset'  # adjust as needed
X_test, y_test = load_dataset(test_dataset_path)

In [None]:
# Preprocess the test images exactly like the training images: scale pixels
# to [0, 1].  (`preprocess_input` was never imported, and the model was
# trained on data divided by 255.)  The dtype guard keeps the cell idempotent
# when it is re-run.
if X_test.dtype == np.uint8:
    X_test = X_test.astype(np.float32) / 255.0

# Encode labels with the restored class order, then one-hot encode them.
# `to_categorical` is already imported at the top of the notebook; the `tf`
# alias used previously is not.
y_test_encoded = label_encoder.transform(y_test)
y_test_cat = to_categorical(y_test_encoded, num_classes=len(label_encoder.classes_))

In [None]:
# Load the saved model (the file written at the end of the training section)
model = load_model('asl_cnn2_model.h5')

In [None]:
# Loss and accuracy of the reloaded model on the evaluation images
test_loss, test_acc = model.evaluate(
    x=X_test,
    y=y_test_cat,
    verbose=2,
)
print(f'Test Accuracy: {test_acc * 100:.2f}%')

In [None]:
# Class probabilities for every evaluation image
y_pred_probs = model.predict(X_test)

# Collapse probabilities / one-hot rows to integer class ids for the metrics
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_test_cat.argmax(axis=1)

In [None]:
# Per-class precision / recall / F1, labelled with the class names
print("\nClassification Report:")
report = classification_report(
    y_true,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)


In [None]:
# Confusion Matrix over *all* known classes.
# Passing `labels` explicitly keeps the matrix the same size as the tick
# labels even when some class never occurs in `y_true` / `y_pred`; without it
# the rows and columns would be silently mislabelled.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# Explicit figure/axes instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_,
            ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix')
plt.show()

# Load the saved model for real-time recognition

In [None]:
# Load the saved model for real-time inference
# (same file as written by the training section)
model = load_model('asl_cnn2_model.h5')

In [None]:
# Load label encoder and define label map dynamically.
# The encoder is not re-fitted: its class order is restored from the file
# written by the training section, so prediction indices keep their meaning.
label_encoder = LabelEncoder()
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects —
# only load a 'label_classes.npy' that this notebook produced.
label_encoder.classes_ = np.load('label_classes.npy', allow_pickle=True)  # Ensure to save and load classes
# Index -> Arabic text; requires `english_to_arabic` to be defined already
label_map = {i: english_to_arabic[label] for i, label in enumerate(label_encoder.classes_)}

In [None]:
# Initialize webcam and the state shared with the capture loop
cap = cv2.VideoCapture(0)
captured_letters = []  # Store the formed sentence
last_predicted_label = None  # Track the previous prediction
frames_with_same_letter = 0  # Counter for consistent frames (a counter, so it starts at 0, not 5)
cooldown_threshold = 20  # Consecutive frames with the same prediction needed to register it
last_activity_time = time.time()  # Track time for inactivity
idle_timeout = 15  # Inactivity time limit (seconds)

def format_arabic_text(letters):
    """Join the captured letters and reshape them for display.

    The letters are concatenated in order and passed through
    ``arabic_reshaper.reshape``; the reshaped string is returned.
    """
    joined = ''.join(letters)
    reshaped = arabic_reshaper.reshape(joined)
    return reshaped

# Fonts already loaded by draw_text, keyed by (font_path, font_size).
# draw_text runs once per video frame; without this cache the TrueType file
# would be re-read from disk on every frame.
_FONT_CACHE = {}


def draw_text(frame, text, position, font_path="arial.ttf", font_size=35):
    """Draw ``text`` on a video frame and return the frame as a new array.

    The text is rendered through Pillow (``ImageDraw``) rather than with
    OpenCV's drawing functions, using a TrueType font.

    Args:
        frame: Image array to draw on; it is converted with ``Image.fromarray``.
        text: String to render.
        position: ``(x, y)`` position passed to ``ImageDraw.text``.
        font_path: TrueType font file to use (default: the original
            hard-coded ``"arial.ttf"``).
        font_size: Font size passed to ``ImageFont.truetype`` (default: 35).

    Returns:
        A NumPy array of the frame with the text drawn in ``(0, 255, 0)``.

    Raises:
        OSError: If the font file cannot be loaded.
    """
    cache_key = (font_path, font_size)
    font = _FONT_CACHE.get(cache_key)
    if font is None:
        font = ImageFont.truetype(font_path, font_size)
        _FONT_CACHE[cache_key] = font

    img_pil = Image.fromarray(frame)
    draw = ImageDraw.Draw(img_pil)
    draw.text(position, text, font=font, fill=(0, 255, 0))
    return np.array(img_pil)

# Real-time recognition loop.
# Reads webcam frames, classifies the region of interest, and registers a
# letter once the same confident prediction has been seen for
# `cooldown_threshold` consecutive frames.  The loop ends when the camera
# stops delivering frames, when 'q' is pressed, or when no confident sign has
# been seen for `idle_timeout` seconds.  The camera and windows are released
# in `finally`, so an error inside the loop does not leave the webcam locked.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Define Region of Interest (ROI) and prepare the model input BEFORE
        # drawing the guide rectangle: `roi` is a view into `frame`, so drawing
        # first would paint the green border into the pixels fed to the model.
        roi = frame[100:400, 100:400]
        roi_resized = cv2.resize(roi, (64, 64)) / 255.0
        roi_resized = np.expand_dims(roi_resized, axis=0)
        cv2.rectangle(frame, (100, 100), (400, 400), (0, 255, 0), 2)

        # Predict the letter/gesture
        prediction = model.predict(roi_resized, verbose=0)

        # Confidence filtering to avoid random predictions
        if np.max(prediction) < 0.7:
            predicted_class = None   # English class name (used for control gestures)
            predicted_label = None   # Arabic text to append
        else:
            predicted_id = int(np.argmax(prediction))
            predicted_class = str(label_encoder.classes_[predicted_id])
            predicted_label = label_map.get(predicted_id, '')
            # A confident sign counts as activity.  (Previously the timer was
            # refreshed on frames WITHOUT a sign, so the idle timeout could
            # never fire.)
            last_activity_time = time.time()

        # Check if the prediction is consistent
        if predicted_label == last_predicted_label:
            frames_with_same_letter += 1
        else:
            frames_with_same_letter = 0  # Reset if prediction changes

        # Register the letter only if it remains consistent for the threshold
        if frames_with_same_letter >= cooldown_threshold:
            # Control gestures are recognised by their English class name:
            # `label_map` holds the Arabic text (' ' and ''), so comparing
            # `predicted_label` with 'masafa'/'mash' could never match.
            if predicted_class == 'masafa':
                captured_letters.append(' ')  # Space
            elif predicted_class == 'mash':
                if captured_letters:
                    captured_letters.pop()  # Remove last letter for delete
            elif predicted_label:
                captured_letters.append(predicted_label)

            # Reset tracking variables after registering the letter
            last_predicted_label = None
            frames_with_same_letter = 0
        else:
            last_predicted_label = predicted_label  # Update the last prediction

        # Check for inactivity timeout
        if time.time() - last_activity_time > idle_timeout:
            print(f"Final Sentence: {format_arabic_text(captured_letters)}")
            break  # Exit the loop if idle for too long

        # Format and display the text on the frame
        sentence = format_arabic_text(captured_letters)
        frame = draw_text(frame, sentence, (10, 30))

        # Show the frame
        cv2.imshow('ASL Recognition', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            print(f"Final Sentence: {format_arabic_text(captured_letters)}")
            break
finally:
    cap.release()
    cv2.destroyAllWindows()



Final Sentence: ﺑﺎﺏ


## 4 Convolutional Layers:

- 1st Conv Layer: 32 filters, 3x3 kernel.
- 2nd Conv Layer: 64 filters, 3x3 kernel.
- 3rd Conv Layer: 128 filters, 3x3 kernel.
- 4th Conv Layer: 256 filters, 3x3 kernel.

## 4 MaxPooling Layers:

- After each convolutional layer, a 2x2 MaxPooling layer reduces the feature map size.

## 1 Flatten Layer:

- Converts the 2D output of the final MaxPooling layer into a 1D vector.

## 2 Dense (Fully Connected) Layers:

- First Dense Layer: 256 neurons with ReLU activation.
- Second Dense Layer: 128 neurons with ReLU activation.

## 1 Dropout Layer (rate 0.5) after each of the two Dense layers:

- Prevents overfitting by randomly disabling neurons during training.

## 1 Output Layer:

- Uses softmax activation to output class probabilities (one unit per class).