# Creating Datasets

In [None]:
import os  # Importing the OS module for directory handling
import cv2  # Importing OpenCV for image capturing

# Collect a webcam image dataset: for every class, wait until the user presses
# "q", then save `dataset_size` consecutive frames into ./data/<class index>/.

# Define the directory where the dataset will be stored
DATA_DIR = './data'
# exist_ok avoids the check-then-create race of exists() + makedirs()
os.makedirs(DATA_DIR, exist_ok=True)

# Define the number of classes (different sign language gestures) and dataset size per class
number_of_classes = 38  # Total number of different signs to be captured
dataset_size = 100  # Number of images per sign

# Open the webcam for capturing images
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Fail early instead of looping over every class without capturing anything
    raise RuntimeError('Could not open the webcam (device index 0).')

try:
    # Loop through each class to collect images
    for j in range(number_of_classes):
        # Create a directory for the current class if it doesn't exist
        class_dir = os.path.join(DATA_DIR, str(j))
        os.makedirs(class_dir, exist_ok=True)

        print(f'Collecting data for class {j}')

        # Show a live preview until the user signals that they are ready
        while True:
            ret, frame = cap.read()  # Capture a frame from the webcam
            if not ret:
                # A dead camera would otherwise silently skip every remaining class
                raise RuntimeError(f'Failed to read a frame from the webcam (class {j}).')

            # Display a message on the frame to inform the user
            cv2.putText(frame, 'Ready? Press "Q" ! :)', (100, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3, cv2.LINE_AA)
            cv2.imshow('frame', frame)  # Show the frame

            # Mask to the low byte: some platforms set extra high bits in the key code
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break

        # Capture and store `dataset_size` frames for this class
        for counter in range(dataset_size):
            ret, frame = cap.read()  # Capture a frame
            if not ret:
                raise RuntimeError(
                    f'Failed to read a frame from the webcam (class {j}, image {counter}).')

            cv2.imshow('frame', frame)  # Display the frame
            cv2.waitKey(25)  # Wait briefly before capturing the next frame

            # Save the captured frame as an image in the respective class folder
            image_path = os.path.join(class_dir, f'{counter}.jpg')
            if not cv2.imwrite(image_path, frame):
                # imwrite reports failure through its return value, not an exception
                raise OSError(f'Could not write image {image_path}')
finally:
    # Always release the webcam and close all OpenCV windows, even on error or Ctrl-C
    cap.release()
    cv2.destroyAllWindows()


The code above captures and stores images for each sign class using OpenCV. Now that you have your dataset, what's the next step?

# Processing Datasets

In [None]:
import os
import pickle

import mediapipe as mp
import cv2
import warnings
# Convert the captured images into hand-landmark feature vectors and store
# them, together with their class labels, in data.pickle.
warnings.filterwarnings("ignore", category=UserWarning)

# Initialize Mediapipe Hands solution
mp_hands = mp.solutions.hands  # Load Mediapipe Hands module
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)  # Set up hand detection model

# Define dataset directory and expected number of features per sample
DATA_DIR = './data'  # Directory where dataset images are stored
EXPECTED_FEATURES = 42  # Expected number of features (21 hand landmarks × 2 coordinates)

# Lists to store processed data and corresponding labels
data = []
labels = []

# Loop through each class directory in the dataset (sorted for a reproducible order)
for dir_ in sorted(os.listdir(DATA_DIR)):
    class_path = os.path.join(DATA_DIR, dir_)
    if not os.path.isdir(class_path):
        # Stray files in DATA_DIR (e.g. .DS_Store) are not class folders
        continue

    for img_path in sorted(os.listdir(class_path)):
        # Read the image; cv2.imread returns None instead of raising on failure
        img = cv2.imread(os.path.join(class_path, img_path))
        if img is None:
            print(f"Skipped image {img_path} in {dir_}: could not be read as an image.")
            continue
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB (required by Mediapipe)

        # Process the image using Mediapipe Hands to extract hand landmarks
        results = hands.process(img_rgb)
        if not results.multi_hand_landmarks:
            print(f"Skipped image {img_path} in {dir_}: no hand detected.")
            continue

        data_aux = []  # Landmark features collected for this image
        for hand_landmarks in results.multi_hand_landmarks:
            xs = [landmark.x for landmark in hand_landmarks.landmark]
            ys = [landmark.y for landmark in hand_landmarks.landmark]

            # Normalize by subtracting the minimum x and y values; the minima
            # are computed once per hand rather than once per landmark
            min_x, min_y = min(xs), min(ys)
            for x, y in zip(xs, ys):
                data_aux.append(x - min_x)  # Store normalized x-coordinate
                data_aux.append(y - min_y)  # Store normalized y-coordinate

        # Keep only samples with the expected feature count; images where more
        # than one hand was detected produce extra features and are skipped
        if len(data_aux) == EXPECTED_FEATURES:
            data.append(data_aux)  # Add processed data
            labels.append(dir_)  # Add corresponding label
        else:
            print(f"Skipped image {img_path} in {dir_}: incomplete data with {len(data_aux)} features.")

# Save the dataset as a pickle file for later use
with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)

print(f"Dataset saved. Total samples: {len(data)}")


This script processes your collected sign language images and extracts hand landmark features using Mediapipe. Here's a breakdown of what it does:

1. **Initialize Mediapipe Hands**:  
   - Loads the Mediapipe Hands module to detect and extract hand landmarks.
   - Sets it to process static images with a minimum confidence of 0.3.

2. **Iterate through the dataset directory**:  
   - Loops through each class (gesture) folder and its images.

3. **Process each image**:  
   - Reads the image and converts it to RGB.
   - Uses Mediapipe to detect hand landmarks.

4. **Extract and normalize hand landmarks**:  
   - Stores x and y coordinates of 21 hand landmarks.
   - Normalizes them by subtracting the minimum x and y values.

5. **Filter valid data**:  
   - Ensures the extracted features match the expected 42 (21 landmarks × 2).
   - If valid, appends the data and its corresponding class label.

6. **Save the processed dataset**:  
   - Stores the extracted hand landmark data and labels in a pickle file (`data.pickle`) for later model training.


### Why Pickle datasets?

Using a pickle file is a common practice for the following reasons:

1. **Efficiency in Loading Processed Data**:  
   Once you extract the hand landmark features from your images, saving them as a pickle file allows you to quickly reload this structured data without having to re-run the computationally expensive image processing step every time you train or test your model.

2. **Structured Data Format**:  
   The processed data consists of numerical feature vectors (e.g., normalized landmark coordinates) that are much more manageable for machine learning models compared to raw image data. Models like Random Forests or other classical ML algorithms work best with these structured numerical arrays.

3. **Storage and Reusability**:  
   Pickle files can store complex Python objects (like dictionaries, lists, numpy arrays, etc.) in a serialized binary format. This makes it convenient to save and share your preprocessed dataset across different parts of your project or even with other collaborators. Note that unpickling can execute arbitrary code, so only load pickle files that come from a source you trust.

4. **Separation of Concerns**:  
   By converting images into feature vectors and storing them separately, you separate the heavy-lifting of image processing from model training. This modular approach allows you to experiment with different machine learning models without having to repeatedly process raw images.

While you can use images directly, that approach is more common with deep learning models (e.g., convolutional neural networks) that learn features from raw pixel data. In your case, since you've already extracted and normalized hand landmarks, using a pickle file to store this data is both efficient and practical.

# Train Model

In [None]:
import os
import cv2
import pickle
import mediapipe as mp
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Rebuild the landmark dataset from the images in ./data, save it to
# data.pickle, then train and evaluate a Random Forest classifier and save it
# to model.p.

# Initialize Mediapipe Hands solution
mp_hands = mp.solutions.hands  # Load Mediapipe Hands module
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)  # Set up hand detection model

# Define dataset directory and expected number of features per sample
DATA_DIR = './data'  # Directory where dataset images are stored
EXPECTED_FEATURES = 42  # Expected number of features (21 hand landmarks × 2 coordinates)

# Lists to store processed data and corresponding labels
data = []
labels = []

# Loop through each class directory in the dataset (sorted for a reproducible order)
for dir_ in sorted(os.listdir(DATA_DIR)):
    class_path = os.path.join(DATA_DIR, dir_)
    if not os.path.isdir(class_path):
        # Stray files in DATA_DIR (e.g. .DS_Store) are not class folders
        continue

    for img_path in sorted(os.listdir(class_path)):
        # Read the image; cv2.imread returns None instead of raising on failure
        img = cv2.imread(os.path.join(class_path, img_path))
        if img is None:
            print(f"Skipped image {img_path} in {dir_}: could not be read as an image.")
            continue
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB (required by Mediapipe)

        # Process the image using Mediapipe Hands to extract hand landmarks
        results = hands.process(img_rgb)
        if not results.multi_hand_landmarks:
            print(f"Skipped image {img_path} in {dir_}: no hand detected.")
            continue

        data_aux = []  # Landmark features collected for this image
        for hand_landmarks in results.multi_hand_landmarks:
            xs = [landmark.x for landmark in hand_landmarks.landmark]
            ys = [landmark.y for landmark in hand_landmarks.landmark]

            # Normalize by subtracting the minimum x and y values; the minima
            # are computed once per hand rather than once per landmark
            min_x, min_y = min(xs), min(ys)
            for x, y in zip(xs, ys):
                data_aux.append(x - min_x)  # Store normalized x-coordinate
                data_aux.append(y - min_y)  # Store normalized y-coordinate

        # Keep only samples with the expected feature count; images where more
        # than one hand was detected produce extra features and are skipped
        if len(data_aux) == EXPECTED_FEATURES:
            data.append(data_aux)  # Add processed data
            labels.append(dir_)  # Add corresponding label
        else:
            print(f"Skipped image {img_path} in {dir_}: incomplete data with {len(data_aux)} features.")

# Save the dataset as a pickle file for later use
with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)

print(f"Dataset saved. Total samples: {len(data)}")

# Load the dataset from the pickle file. The file was written just above, so
# it is trusted; never unpickle data from an untrusted source.
with open('./data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

data = np.asarray(data_dict['data'])  # Convert data to numpy array
labels = np.asarray(data_dict['labels'])  # Convert labels to numpy array

if len(data) == 0:
    # Give a clear message instead of an opaque error from train_test_split
    raise ValueError(f'No usable samples were extracted from {DATA_DIR}; cannot train a model.')

# Split the dataset into training and testing sets, keeping class proportions
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels)

# Initialize the machine learning model (Random Forest Classifier)
model = RandomForestClassifier()

# Train the model on the training data
model.fit(x_train, y_train)

# Make predictions on the test data
y_predict = model.predict(x_test)

# Calculate the accuracy of the model (scikit-learn's order is y_true, y_pred)
score = accuracy_score(y_test, y_predict)

# Print the accuracy result
print(f'{score * 100}% of samples were classified correctly!')

# Save the trained model as a pickle file for later use
with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)
