In [None]:
import time
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import regularizers
from tensorflow.keras.applications.efficientnet import preprocess_input  # EfficientNet preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import arabic_reshaper
from PIL import ImageFont, ImageDraw, Image
import matplotlib.pyplot as plt
import seaborn as sns
from bidi.algorithm import get_display

In [None]:
# ------------------------------
# 1. DATA LOADING & PREPROCESSING
# ------------------------------
dataset_path = 'RGB ArSL dataset'
IMG_SIZE = (224, 224)

In [None]:
def load_dataset(folder, img_size=IMG_SIZE):
    images = []
    labels = []
    for label in os.listdir(folder):
        label_folder = os.path.join(folder, label)
        if os.path.isdir(label_folder):
            for img_file in os.listdir(label_folder):
                img_path = os.path.join(label_folder, img_file)
                img = cv2.imread(img_path)
                if img is not None:
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                    img = cv2.resize(img, img_size)
                    images.append(img)
                    labels.append(label)
    return np.array(images), np.array(labels)

In [None]:
# Load images and labels
X, y = load_dataset(dataset_path)

In [None]:
# Encode labels to integers and then to one-hot vectors
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

In [None]:
# Save label classes for later inference
np.save('label_classes.npy', label_encoder.classes_)

In [None]:
# Create a dynamic label map based on the encoder's order (English labels)
label_map = {i: label for i, label in enumerate(label_encoder.classes_)}
print(f"Original Label Map: {label_map}")

Label Map (English): {0: 'Ain', 1: 'Al', 2: 'Alef', 3: 'Beh', 4: 'Dad', 5: 'Dal', 6: 'Feh', 7: 'Ghain', 8: 'Hah', 9: 'Heh', 10: 'Jeem', 11: 'Kaf', 12: 'Khah', 13: 'Laa', 14: 'Lam', 15: 'Meem', 16: 'Noon', 17: 'Qaf', 18: 'Reh', 19: 'Sad', 20: 'Seen', 21: 'Sheen', 22: 'Tah', 23: 'Teh', 24: 'Teh_Marbuta', 25: 'Thal', 26: 'Theh', 27: 'Waw', 28: 'Yeh', 29: 'Zah', 30: 'Zain', 31: 'masafa', 32: 'mash'}


In [None]:
# Mapping from English label names to Arabic letters
english_to_arabic = {
    'Ain': 'ع', 'Al': 'ال', 'Alef': 'ا', 'Beh': 'ب', 'Dad': 'ض', 'Dal': 'د',
    'Feh': 'ف', 'Ghain': 'غ', 'Hah': 'ح', 'Heh': 'ه', 'Jeem': 'ج', 'Kaf': 'ك',
    'Khah': 'خ', 'Laa': 'لا', 'Lam': 'ل',
    'masafa': '<space>',
    'mash': '<delete>',
    'Meem': 'م', 'Noon': 'ن', 'Qaf': 'ق',
    'Reh': 'ر', 'Sad': 'ص', 'Seen': 'س', 'Sheen': 'ش', 'Tah': 'ط', 'Teh': 'ت',
    'Teh_Marbuta': 'ة', 'Thal': 'ذ', 'Theh': 'ث', 'Waw': 'و', 'Yeh': 'ي',
    'Zah': 'ظ', 'Zain': 'ز'
}

In [None]:
# Create a dynamic label map converting English labels to Arabic letters
label_map = {i: english_to_arabic.get(label, label) for i, label in enumerate(label_encoder.classes_)}
print(f"Arabic Label Map: {label_map}")

Label Map (Arabic): {0: 'ع', 1: 'ال', 2: 'ا', 3: 'ب', 4: 'ض', 5: 'د', 6: 'ف', 7: 'غ', 8: 'ح', 9: 'ه', 10: 'ج', 11: 'ك', 12: 'خ', 13: 'لا', 14: 'ل', 15: 'م', 16: 'ن', 17: 'ق', 18: 'ر', 19: 'ص', 20: 'س', 21: 'ش', 22: 'ط', 23: 'ت', 24: 'ة', 25: 'ذ', 26: 'ث', 27: 'و', 28: 'ي', 29: 'ظ', 30: 'ز', 31: ' ', 32: ''}


In [None]:
# EfficientNet's preprocess_input scales the pixel values appropriately
X = preprocess_input(X)

In [None]:
# Split dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

In [None]:
# ------------------------------
# 2. BUILD THE EFFICIENTNET-BASED MODEL
# ------------------------------

def create_efficientnet_model():
    """
    Create an EfficientNetB0-based model for Arabic Sign Language recognition.
    Uses ImageNet pre-trained weights with custom top layers.
    """
    # Load the EfficientNetB0 base model without the top layers
    base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    
    # Initially, freeze all layers in the base model
    for layer in base_model.layers:
        layer.trainable = False
    
    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01), use_bias=False),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(512, activation='relu', use_bias=False),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(256, activation='relu', use_bias=False),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(128, activation='relu', use_bias=False),
        BatchNormalization(),
        Dropout(0.3),
        
        # Final softmax output for multi-class classification
        Dense(len(label_encoder.classes_), activation='softmax')
    ])
    return model

In [12]:
# Create and compile the model
model = create_efficientnet_model()
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1us/step


In [13]:
# ------------------------------
# 3. DATA AUGMENTATION & MODEL TRAINING
# ------------------------------

# Data augmentation configuration
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    brightness_range=[0.9, 1.1],
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

In [14]:
# Compute balanced class weights based on the training data
y_train_int = np.argmax(y_train, axis=1)
class_weights_array = compute_class_weight('balanced', classes=np.unique(y_train_int), y=y_train_int)
class_weights = dict(enumerate(class_weights_array))
print("Class weights:", class_weights)

Class weights: {0: 1.0477550372838331, 1: 0.9055258467023173, 2: 0.8894276094276095, 3: 0.8004848484848485, 4: 0.9351458510337015, 5: 1.093558536181487, 6: 0.9906990699069907, 7: 1.0107131925313744, 8: 1.0210265924551638, 9: 0.9667691406821841, 10: 1.1702994860889597, 11: 0.9809863339275104, 12: 0.9762010347376201, 13: 0.897404538660144, 14: 0.9762010347376201, 15: 1.005634231764885, 16: 1.0532695374800638, 17: 1.1911976911976911, 18: 1.064474532559639, 19: 0.9264870931537598, 20: 0.9667691406821841, 21: 0.9307963354474983, 22: 1.1771836007130125, 23: 0.7609171563544187, 24: 1.0006060606060605, 25: 1.2746574020459371, 26: 0.8201689021361153, 27: 0.9906990699069907, 28: 0.8933982683982684, 29: 1.0532695374800638, 30: 1.3706932337069324, 31: 1.220251293422025, 32: 1.2055494706097116}


In [15]:
# Set up callbacks for training
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('best_asl_efficientnet_model.keras', monitor='val_loss', save_best_only=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
]

In [16]:
# ----------- Phase 1: Train with Frozen Base -----------
initial_epochs = 30
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    validation_data=(X_val, y_val),
    epochs=initial_epochs,
    class_weight=class_weights,
    callbacks=callbacks
)

  self._warn_if_super_not_called()


Epoch 1/30
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 917ms/step - accuracy: 0.0452 - loss: 15.6244 - val_accuracy: 0.1186 - val_loss: 14.1480 - learning_rate: 1.0000e-04
Epoch 2/30
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 905ms/step - accuracy: 0.1034 - loss: 14.4290 - val_accuracy: 0.2052 - val_loss: 13.1536 - learning_rate: 1.0000e-04
Epoch 3/30
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 887ms/step - accuracy: 0.1377 - loss: 13.5150 - val_accuracy: 0.2893 - val_loss: 12.2547 - learning_rate: 1.0000e-04
Epoch 4/30
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 893ms/step - accuracy: 0.1870 - loss: 12.6928 - val_accuracy: 0.3535 - val_loss: 11.4383 - learning_rate: 1.0000e-04
Epoch 5/30
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 897ms/step - accuracy: 0.2375 - loss: 11.8475 - val_accuracy: 0.3983 - val_loss: 10.7181 - learning_rate: 1.0000e-04
Epoch 6/30
[1m207/2

KeyboardInterrupt: 

In [None]:
# (Optional) Inspect the base model's layers
base_model = model.layers[0]
for idx, layer in enumerate(base_model.layers):
    print(f"Layer {idx}: {layer.name}")

In [None]:
# ----------- Phase 2: Fine-Tuning -----------
# Unfreeze the last few layers of the base model for fine-tuning.
# Here, we freeze all layers first and then unfreeze the last 20 layers.
base_model = model.layers[0]
for layer in base_model.layers:
    layer.trainable = False  # Freeze all layers initially

In [None]:
for layer in base_model.layers[-20:]:
    layer.trainable = True   # Unfreeze the last 20 layers

In [None]:
# Recompile with a lower learning rate for fine-tuning
model.compile(optimizer=Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
fine_tune_epochs = 25
total_epochs = initial_epochs + fine_tune_epochs
history_fine = model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    validation_data=(X_val, y_val),
    epochs=total_epochs,
    initial_epoch=initial_epochs,
    class_weight=class_weights,
    callbacks=callbacks
)

In [None]:
# Save the final EfficientNet model
model.save('asl_efficientnet_model.keras')

In [None]:
# Evaluate the model on the validation set
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=2)
print(f'Validation Accuracy: {val_acc * 100:.2f}%')

# Load saved model for evaluations

In [None]:
test_dataset_path = 'RGB ArSL dataset'


In [None]:
X_test, y_test = load_dataset(test_dataset_path)


In [None]:
X_test = preprocess_input(X_test)


In [None]:
y_test_encoded = label_encoder.transform(y_test)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

In [None]:
model = load_model('asl_efficientnet_model.keras')


In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=2)
print(f'Test Accuracy: {test_acc * 100:.2f}%')

In [None]:
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_cat, axis=1)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()


# Load saved model for real time

In [None]:
model = load_model('asl_efficientnet_model.keras')
label_classes = np.load('label_classes.npy', allow_pickle=True)

In [None]:
label_encoder.classes_ = label_classes
label_map = {i: label for i, label in enumerate(label_encoder.classes_)}
label_map = {i: english_to_arabic.get(label, label) for i, label in label_map.items()}
print("Loaded Label Map for Inference:", label_map)

In [None]:
cap = cv2.VideoCapture(0)
captured_letters = []
last_predicted_label = None
frames_with_same_letter = 0
cooldown_threshold = 7
idle_timeout = 15
last_activity_time = time.time()

def format_arabic_text(letters):
    return arabic_reshaper.reshape(''.join(letters))

def draw_text(frame, text, position):
    font_path = "arial.ttf"
    font = ImageFont.truetype(font_path, 35)
    img_pil = Image.fromarray(frame)
    draw = ImageDraw.Draw(img_pil)
    draw.text(position, text, font=font, fill=(0, 255, 0))
    return np.array(img_pil)

while True:
    ret, frame = cap.read()
    if not ret:
        break
    height, width, _ = frame.shape
    roi_size = min(height, width) // 2
    center_x, center_y = width // 2, height // 2
    start_x = max(center_x - roi_size // 2, 0)
    end_x = start_x + roi_size
    start_y = max(center_y - roi_size // 2, 0)
    end_y = start_y + roi_size
    roi = frame[start_y:end_y, start_x:end_x]
    cv2.rectangle(frame, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)
    roi_resized = cv2.resize(roi, IMG_SIZE)
    roi_resized = preprocess_input(roi_resized)
    roi_resized = np.expand_dims(roi_resized, axis=0)
    prediction = model.predict(roi_resized, verbose=0)
    confidence = np.max(prediction)
    predicted_id = np.argmax(prediction)
    predicted_label = label_map.get(predicted_id, '')
    if confidence < 0.7:
        predicted_label = None
    if predicted_label == last_predicted_label:
        frames_with_same_letter += 1
    else:
        frames_with_same_letter = 0
    if frames_with_same_letter >= cooldown_threshold:
        if predicted_label == '<space>':
            captured_letters.append(' ')
        elif predicted_label == '<delete>' and captured_letters:
            captured_letters.pop()
        elif predicted_label and predicted_label not in ['<space>', '<delete>']:
            captured_letters.append(predicted_label)
        last_predicted_label = None
        frames_with_same_letter = 0
        last_activity_time = time.time()
    else:
        last_predicted_label = predicted_label
    if time.time() - last_activity_time > idle_timeout:
        print("Final Sentence:", format_arabic_text(captured_letters))
        break
    sentence = format_arabic_text(captured_letters)
    frame = draw_text(frame, sentence, (10, 30))
    cv2.imshow('ASL Recognition - EfficientNet', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        print("Final Sentence:", format_arabic_text(captured_letters))
        break
cap.release()
cv2.destroyAllWindows()