# Two Approaches 
##### 1 - Using mediapipe to get hand coordinates and then using these as features to train the model and get classes
##### 2 - Without mediapipe directly training the images and getting output as different classes 


### without mediapipe

In [8]:
import os
from PIL import Image

In [29]:
from sklearn.model_selection import train_test_split
from torchvision.transforms import Resize
from PIL import Image
import os
import numpy as np

# Path to the directory
directory_path = "D:/sign_language_recognition/dataset sign language/"

images = []
labels = []

resize_transform = Resize((224, 224))

for root, dirs, files in os.walk(directory_path):
    if root == directory_path:
        continue

    label = int(os.path.basename(root))

    # Iterate over all files in the directory
    for file in files:
        if file.endswith(".jpg"):
            # Load the image
            image_path = os.path.join(root, file)
            image = Image.open(image_path)
            image = image.convert("L")

            # Resize the image
            resized_image = resize_transform(image)

            # Append the image and label to the arrays
            images.append(resized_image)
            labels.append(label)

# Convert the lists to numpy arrays
images = np.array(images)
labels = np.array(labels)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(images, labels, test_size=0.2, random_state=42)

# Print the shapes of the arrays
print("Training set - Images:", X_train.shape, "Labels:", y_train.shape)
print("Validation set - Images:", X_val.shape, "Labels:", y_val.shape)



Training set - Images: (800, 224, 224) Labels: (800,)
Validation set - Images: (200, 224, 224) Labels: (200,)


In [34]:
X_train = np.expand_dims(X_train, axis=-1)  # Assuming grayscale images
X_train = np.repeat(X_train, 3, axis=-1)  # Repeat the single channel to match the expected 3 channels

X_val = np.expand_dims(X_val, axis=-1)  # Assuming grayscale images
X_val = np.repeat(X_val, 3, axis=-1)

In [48]:
from tensorflow.keras.applications import ResNet50, EfficientNetB0

# Load pre-trained ResNet50 model without the top (classification) layer
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Load pre-trained EfficientNetB0 model without the top (classification) layer
# base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))


In [50]:
from tensorflow.keras import layers, models

# Freeze the base model layers
base_model.trainable = False

# Create new classification layers
flatten_layer = layers.Flatten()(base_model.output)
dense_layer = layers.Dense(512, activation='relu')(flatten_layer)
output_layer = layers.Dense(10, activation='softmax')(dense_layer)  # Assuming 10 classes for the output

# Combine the base model with custom classification layers
model = models.Model(inputs=base_model.input, outputs=output_layer)

In [55]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [59]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical

# Create an ImageDataGenerator with augmentation settings
datagen = ImageDataGenerator(
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Flow from directory with augmentation
train_generator = datagen.flow_from_directory(
    'D:/sign_language_recognition/dataset sign language/train',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

# Flow from directory without augmentation (for validation)
val_generator = ImageDataGenerator().flow_from_directory(
    'D:/sign_language_recognition/dataset sign language/valid',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)


Found 800 images belonging to 10 classes.
Found 200 images belonging to 10 classes.


In [62]:
train_generator.image_shape

(224, 224, 3)

In [61]:


# Use these generators for training
model.fit(
    train_generator,
    epochs=10,
    validation_data=val_generator
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c1c3cfb2e0>

In [83]:
import cv2
from tensorflow.keras.applications.efficientnet import preprocess_input

# Load the test image using OpenCV
test_image = cv2.imread("C:/Users/pytho/Desktop/maxresdefault.jpg")
# Resize the test image
resized_test_image = cv2.resize(test_image, (224, 224))  # Resize to model's input shape
# Preprocess the input image
preprocessed_test_image = preprocess_input(resized_test_image)

# Reshape the image for prediction (add batch dimension)
input_image = preprocessed_test_image.reshape((1,) + preprocessed_test_image.shape)

# Make prediction
prediction = model.predict(input_image)

print("Prediction:", prediction)
# prediction

Prediction: [[4.1400328e-09 9.9204820e-01 2.1029218e-32 3.1539818e-10 7.5203981e-03
  5.8461647e-15 3.7913790e-04 1.3975905e-34 5.2328538e-05 1.4788157e-19]]


In [72]:
import cv2
# Initialize the camera
# Initialize the camera
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Preprocess the frame
    frame1 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
    frame_resized = cv2.resize(frame1, (224, 224))  # Resize to model's input shape
    frame_for_prediction = np.expand_dims(frame_resized, axis=0)  # Add batch dimension
    #frame_for_prediction = preprocess_input(frame_for_prediction)  # Preprocess input for EfficientNet

    # Make prediction
    prediction = model.predict(frame_for_prediction)

    # Resize frame back to 1024x1024 for display
    frame_display = cv2.resize(frame1, (1024, 1024))

    # Convert frame back to BGR for display
    frame_display_bgr = cv2.cvtColor(frame_display, cv2.COLOR_RGB2BGR)

    # Display the frame and prediction
    cv2.imshow('frame', frame)
    cv2.putText(frame, 'Prediction: {}'.format(prediction), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the camera and close OpenCV windows
cap.release()
cv2.destroyAllWindows()



In [68]:
cap.release()
cv2.destroyAllWindows()