# AI Accessibility Enhancements for the Disabled

### Import necessary dependencies

In [1]:
import speech_recognition as sr  # For voice command recognition
import time  # To handle timing for gesture detection
from gtts import gTTS  # Google Text-to-Speech for converting text to speech
from playsound import playsound  # To play the generated speech
import os  # For handling directory and file operations
import pickle  # For saving data and labels to a file
import mediapipe as mp  # For hand landmark detection
import cv2  # OpenCV for image processing
from sklearn.ensemble import RandomForestClassifier  # For building the classification model
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.metrics import accuracy_score  # For evaluating the model's accuracy
import pickle  # For loading and saving serialized objects
import numpy as np  # For numerical operations and array handling
import pyautogui  # For automating GUI interactions like taking screenshots
import webbrowser  # For opening web pages

## Sign language recognition

### Collect images

In [10]:
# Define the directory where the dataset will be stored
DATA_DIR = './data'

# Create the data directory if it does not exist yet
os.makedirs(DATA_DIR, exist_ok=True)

# List of class labels, representing the different gestures or characters to be captured
class_labels = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5',
    '6', '7', '8', '9', '-', '!'
]  # Example class labels

# Set the number of images to capture per class
dataset_size = 300

# Open a connection to the default camera (webcam)
cap = cv2.VideoCapture(0)

# Becomes True when the user presses 'x' or the camera stops delivering frames.
# A plain `break` only leaves the innermost loop, so this flag is what actually
# ends the whole collection run instead of moving on to the next class.
stop_capture = False

try:
    # Loop through each class label to capture images for each one
    for class_name in class_labels:
        # Create a directory for each class if it doesn't already exist
        class_dir = os.path.join(DATA_DIR, class_name)
        os.makedirs(class_dir, exist_ok=True)

        # Notify the user which class is being captured
        print('Collecting data for class {}'.format(class_name))

        # Wait for the user to press 'r' to start capturing images for the current class
        while True:
            ret, frame = cap.read()  # Capture a frame from the camera
            if not ret:
                print("Failed to grab frame while waiting to start")
                stop_capture = True
                break

            # Display a message on the video feed to prompt the user to start capturing
            cv2.putText(
                frame, "Ready? Press 'r' to start capturing!",
                (25, 60), cv2.FONT_HERSHEY_COMPLEX, 0.9, (0, 255, 255), 2, cv2.LINE_AA
            )
            cv2.imshow("Capturing datasets", frame)

            # Mask to the low byte so the comparison with ord() is not affected
            # by extra high bits some platforms set in the key code
            key = cv2.waitKey(25) & 0xFF

            if key == ord('r'):
                print('Starting image capture for class {}'.format(class_name))
                break  # Start capturing images for this class
            if key == ord('x'):
                stop_capture = True  # Abort the whole collection run
                break

        if stop_capture:
            break

        # Image capture process: capture images until the desired dataset size is reached
        counter = 0
        while counter < dataset_size:
            ret, frame = cap.read()  # Capture a frame from the camera
            if not ret:
                print("Failed to grab frame during capture")
                stop_capture = True
                break

            # Display the current frame
            cv2.imshow("Capturing datasets", frame)
            cv2.waitKey(25)

            # Save the captured frame to the class directory
            cv2.imwrite(os.path.join(class_dir, '{}.jpg'.format(counter)), frame)
            counter += 1

        if stop_capture:
            break
finally:
    # Release the camera and close all OpenCV windows, even if an error occurred
    cap.release()
    cv2.destroyAllWindows()

Collecting data for class h
Starting image capture for class h
Collecting data for class e
Starting image capture for class e
Collecting data for class l
Starting image capture for class l
Collecting data for class d
Starting image capture for class d
Collecting data for class o
Starting image capture for class o
Collecting data for class w
Starting image capture for class w


### Create the dataset

In [11]:
# Initialize MediaPipe hands module
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Define the directory where the image data is stored
DATA_DIR = './data'

# Initialize lists to store data and labels
data = []
labels = []

# Set up the Hands object for static image mode with a minimum detection confidence
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)
try:
    # Loop through each sub-directory in the data directory
    for sub_dir in os.listdir(DATA_DIR):
        sub_dir_path = os.path.join(DATA_DIR, sub_dir)
        if not os.path.isdir(sub_dir_path):
            continue  # Skip if it's not a directory

        # Loop through each image in the sub-directory
        for img_path in os.listdir(sub_dir_path):
            img_path_full = os.path.join(sub_dir_path, img_path)

            # Read the image using OpenCV
            img = cv2.imread(img_path_full)
            if img is None:
                continue  # Skip if the image cannot be loaded

            # Convert the image to RGB format for processing with MediaPipe
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)  # Process the image to detect hand landmarks

            # Skip images in which no hand was detected
            if not results.multi_hand_landmarks:
                continue

            # Every detected hand becomes its own sample
            for hand_landmarks in results.multi_hand_landmarks:
                # Collect x and y coordinates of the hand landmarks
                x_ = [landmark.x for landmark in hand_landmarks.landmark]
                y_ = [landmark.y for landmark in hand_landmarks.landmark]

                # Compute the minima once instead of once per landmark
                min_x = min(x_)
                min_y = min(y_)

                # Normalize the coordinates by subtracting the minimum values,
                # interleaved as x0, y0, x1, y1, ...
                data_aux = []
                for x, y in zip(x_, y_):
                    data_aux.append(x - min_x)
                    data_aux.append(y - min_y)

                # Append the normalized data to the data list
                data.append(data_aux)
                # Append the corresponding label (sub-directory name) to the labels list
                labels.append(sub_dir)
finally:
    # Free the MediaPipe resources even if processing fails part-way
    hands.close()

# Save the data and labels to a pickle file; the context manager closes the
# file even if serialization raises
with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)


### Train the model

In [12]:
# Load the data from a pickle file (the context manager closes the file handle).
# NOTE: pickle can execute arbitrary code while loading; only load files that
# were produced by the dataset creation step above.
with open('./data.pickle', 'rb') as data_file:
    data_dict = pickle.load(data_file)

# Fail with a clear message instead of an opaque max() error on an empty dataset
if not data_dict['data']:
    raise ValueError("data.pickle contains no samples; run the dataset creation step first")

# Determine the maximum sequence length in the data
max_len = max(len(x) for x in data_dict['data'])
print(f"Maximum sequence length: {max_len}")

# Pad the sequences in the data to ensure they all have the same length
padded_data = np.array([np.pad(x, (0, max_len - len(x)), 'constant') for x in data_dict['data']])

# Convert labels to a NumPy array for consistency
labels = np.asarray(data_dict['labels'])

# Split the data into training and testing sets (80% training, 20% testing)
x_train, x_test, y_train, y_test = train_test_split(padded_data, labels, test_size=0.2, shuffle=True, stratify=labels)

# Initialize a Random Forest Classifier model
model = RandomForestClassifier()

# Train the model on the training data
model.fit(x_train, y_train)

# Predict the labels for the test data
y_predict = model.predict(x_test)

# Calculate the accuracy of the model's predictions.
# scikit-learn metrics take (y_true, y_pred) in that order.
score = accuracy_score(y_test, y_predict)

# Print the accuracy of the model
print('{}% of samples were classified correctly!'.format(score * 100))

# Save the trained model to a pickle file
with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)

Maximum sequence length: 42
100.0% of samples were classified correctly!


### Make predictions & detect hand gestures with the model

In [13]:
# Load the pre-trained model from a file (the context manager closes the handle).
# NOTE: pickle can execute arbitrary code while loading; only load model files
# produced by the training step above.
with open('./model.p', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']  # Extract the model from the loaded dictionary

# Initialize the webcam for video capture
cap = cv2.VideoCapture(0)

# Initialize MediaPipe Hands for hand detection and landmarks
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
# static_image_mode=False treats the frames as a video stream, which is what
# MediaPipe documents for webcam input (same setting as gesture_to_text_and_speech)
hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.3)

# Dictionary mapping numeric labels to characters.
# NOTE: not used below -- the model is trained on the directory names, so it
# already predicts the character strings directly.
labels_dict = {
    0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G',
    7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L', 12: 'M',
    13: 'N', 14: 'O', 15: 'P', 16: 'Q', 17: 'R', 18: 'S',
    19: 'T', 20: 'U', 21: 'V', 22: 'W', 23: 'X', 24: 'Y', 25: 'Z',
    26: '1', 27: '2', 28: '3', 29: '4', 30: '5', 31: '6',
    32: '7', 33: '8', 34: '9', 35: '0', 36: '-', 37: '!'
}

try:
    # Continuously capture frames from the webcam
    while True:
        ret, frame = cap.read()  # Capture a frame
        if not ret:  # If the frame is not captured correctly, exit the loop
            break

        # Convert the frame to RGB format for MediaPipe processing
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)  # Process the frame for hand landmarks

        if results.multi_hand_landmarks:  # If hand landmarks are detected
            for hand_landmarks in results.multi_hand_landmarks:
                # Build the coordinate lists per hand. Sharing them across hands
                # would make the feature vector grow past the model's input size
                # and stretch the bounding box over both hands.
                x_ = [landmark.x for landmark in hand_landmarks.landmark]
                y_ = [landmark.y for landmark in hand_landmarks.landmark]
                min_x, min_y = min(x_), min(y_)

                # Prepare the feature vector by subtracting minimum coordinates
                data_aux = []
                for x, y in zip(x_, y_):
                    data_aux.append(x - min_x)
                    data_aux.append(y - min_y)

                # Only predict when the feature count matches what the model was fitted on
                if len(data_aux) == model.n_features_in_:
                    prediction = model.predict([np.asarray(data_aux)])
                    predicted_character = str(prediction[0])  # The model predicts the label string

                    # Draw bounding box and predicted character on the frame
                    x1, y1 = int(min_x * frame.shape[1]), int(min_y * frame.shape[0])
                    x2, y2 = int(max(x_) * frame.shape[1]), int(max(y_) * frame.shape[0])
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (25, 32, 48), 4)
                    cv2.putText(frame, predicted_character, (x1, y1 - 10), cv2.FONT_HERSHEY_COMPLEX, 1.3, (25, 32, 48), 3, cv2.LINE_AA)

                # Draw hand landmarks on the frame
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style()
                )

        # Display the frame with the drawn landmarks and predictions
        cv2.imshow('Sign Language Detector', frame)
        key = cv2.waitKey(1) & 0xFF  # Wait for 1 ms for a key press

        if key == ord('x'):  # Exit the loop if 'x' is pressed
            break
finally:
    # Release the webcam, close all OpenCV windows and free MediaPipe resources,
    # even if an error interrupted the loop
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

### Convert Hand Gestures into text & speech

In [25]:
def text_to_speech(text):
    """
    Converts text to speech using Google Text-to-Speech (gTTS) and plays it.

    Nothing is spoken (and None is returned) when `text` is None, empty or
    only whitespace. The audio is written to a temporary 'speech.mp3' file in
    the current directory, which is removed again even if saving or playback
    fails.
    """
    if not text or not text.strip():  # Nothing to say
        print("No text to convert to speech.")
        return

    # tld='us' is the value the gTTS documentation lists for the US English
    # accent (the Nigerian accent would be tld='com.ng')
    speech = gTTS(text, tld='us', lang='en', slow=False)
    speech_file = 'speech.mp3'  # Temporary filename for the speech audio
    try:
        speech.save(speech_file)  # Save the speech to the file
        playsound(speech_file)  # Play the speech audio
    finally:
        # Do not leave the temporary file behind when saving/playback raises
        if os.path.exists(speech_file):
            os.remove(speech_file)

def _hand_features(hand_landmarks):
    """Return one hand's landmarks as [x0, y0, x1, y1, ...], each offset by that hand's minimum x / y."""
    xs = [landmark.x for landmark in hand_landmarks.landmark]
    ys = [landmark.y for landmark in hand_landmarks.landmark]
    min_x, min_y = min(xs), min(ys)

    features = []
    for x, y in zip(xs, ys):
        features.append(x - min_x)
        features.append(y - min_y)
    return features


# Function to get the predicted gesture from the frame
def get_predicted_gesture(frame, hands, model, mp_drawing, mp_hands, mp_drawing_styles):
    """
    Detects hands in a BGR frame and classifies the gesture.

    Features are built per hand. Previously the coordinates of all hands were
    accumulated into one vector, so any frame showing two hands had the wrong
    feature count and was skipped. The prediction now comes from the first
    detected hand whose feature count matches `model.n_features_in_`.

    When a prediction is made, the landmarks of every detected hand are drawn
    onto `frame` in place.

    Returns the predicted label, or None when no hand is detected or no hand
    yields a feature vector of the expected length.
    """
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert frame to RGB for processing
    results = hands.process(frame_rgb)  # Process the frame to detect hand landmarks

    if not results.multi_hand_landmarks:
        return None  # No hand landmarks detected

    predicted_gesture = None
    for hand_landmarks in results.multi_hand_landmarks:
        data_aux = _hand_features(hand_landmarks)
        # Check if the number of features matches the model's expectation
        if len(data_aux) == model.n_features_in_:
            prediction = model.predict([np.asarray(data_aux)])  # Predict the gesture
            predicted_gesture = prediction[0]
            break  # Use the first hand that can be classified

    if predicted_gesture is None:
        print("Skipped frame due to incorrect number of features.")
        return None

    for hand_landmarks in results.multi_hand_landmarks:
        # Draw the hand landmarks on the frame
        mp_drawing.draw_landmarks(
            frame,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style()
        )
    return predicted_gesture

# Function to capture gestures, convert them to text, and then to speech
def gesture_to_text_and_speech(hold_seconds=2):
    """
    Reads webcam frames, turns recognized gestures into text and speaks the text.

    Controls (each acts once the timer described below has elapsed):
      * '!' gesture toggles capturing; when capturing stops, the collected
        text is printed, passed to text_to_speech() and then cleared.
      * '-' gesture appends a space while capturing.
      * any other gesture is appended to the text while capturing.
      * pressing 'x' in the window ends the loop.

    Timing: the first relevant gesture starts a timer. The first relevant
    gesture seen at least `hold_seconds` later is the one acted upon, and the
    timer is then cleared. The timer is not restarted when the gesture changes
    in between.

    Args:
        hold_seconds: Seconds that must elapse between the timer start and
            the action. Defaults to 2.
    """
    captured_text = ""  # String to store the captured text
    capturing = False  # Boolean to track if capturing is ongoing
    capture_start_time = None  # Start of the current timing window, or None

    # NOTE: pickle can execute arbitrary code while loading; only load model
    # files produced by the training step.
    with open('./model.p', 'rb') as model_file:
        model = pickle.load(model_file)['model']

    # MediaPipe components for hand detection and drawing
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils
    mp_drawing_styles = mp.solutions.drawing_styles

    cap = cv2.VideoCapture(0)  # Start video capture from the default camera
    hands = None
    try:
        hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.3)

        while True:
            ret, frame = cap.read()  # Capture a frame from the video feed
            if not ret:  # Break if frame capture fails
                break

            predicted_gesture = get_predicted_gesture(frame, hands, model, mp_drawing, mp_hands, mp_drawing_styles)

            # While idle only '!' is relevant; while capturing every gesture is
            if predicted_gesture and (predicted_gesture == '!' or capturing):
                # Monotonic clock: the interval is unaffected by system clock changes
                now = time.monotonic()
                if capture_start_time is None:
                    capture_start_time = now
                elif now - capture_start_time >= hold_seconds:
                    capture_start_time = None
                    if predicted_gesture == '!':
                        if capturing:
                            capturing = False
                            print("Capturing stopped.")
                            print("Captured text:", captured_text)
                            text_to_speech(captured_text)  # Convert captured text to speech
                            captured_text = ""  # Reset captured text
                        else:
                            capturing = True
                            print("Capturing started...")
                    elif predicted_gesture == '-':
                        captured_text += " "
                        print("Added space.")
                    else:
                        captured_text += str(predicted_gesture)  # Append the captured letter
                        print("Captured letter:", predicted_gesture)

            # Display the captured text on the frame
            cv2.putText(frame, captured_text, (25, 60), cv2.FONT_HERSHEY_COMPLEX, 0.9, (0, 0, 0), 2, cv2.LINE_AA)
            cv2.imshow("Gesture-Based Text and Speech", frame)  # Show the frame with the captured text

            key = cv2.waitKey(1)  # Check for key press
            if key == ord('x'):  # Break the loop if 'x' is pressed
                break

    finally:
        if hands is not None:
            hands.close()  # Free MediaPipe resources
        cap.release()  # Release the video capture object
        cv2.destroyAllWindows()  # Close all OpenCV windows

# Run the gesture to text and speech conversion.
# This opens the default webcam and loops until 'x' is pressed in the preview
# window or a frame can no longer be read.
gesture_to_text_and_speech()


Skipped frame due to incorrect number of features.
Capturing started...
Captured letter: m
Captured letter: y
Added space.
Captured letter: n
Captured letter: u
Captured letter: m
Captured letter: b
Captured letter: e
Captured letter: r
Added space.
Captured letter: i
Captured letter: s
Added space.
Captured letter: 0
Captured letter: 9
Captured letter: 0
Captured letter: 4
Captured letter: 8
Captured letter: 2
Captured letter: 9
Captured letter: 5
Captured letter: 4
Captured letter: 5
Captured letter: 9
Capturing stopped.
Captured text: my number is 09048295459


## Voice command recognition & Execution

### Capture voice input

In [3]:
# Shared recognizer instance used by the voice capture and transcription helpers
recognizer = sr.Recognizer()

def capture_voice_input():
    """
    Records one utterance from the microphone.

    The recognizer is first calibrated against ambient noise, then listens
    with timeout=20 and phrase_time_limit=12.

    Returns the captured audio, or None when listening timed out before any
    speech was heard.
    """
    with sr.Microphone() as mic:
        print("Listening...")
        # Calibrate the recognizer against the current background noise
        recognizer.adjust_for_ambient_noise(mic)

        try:
            return recognizer.listen(mic, timeout=20, phrase_time_limit=12)
        except sr.WaitTimeoutError:
            print("Listening timed out while waiting for you to speak")
            return None

### Convert text to speech

In [8]:
def text_to_speech(text):
    """
    Converts text to speech using Google Text-to-Speech (gTTS) and plays it.

    Nothing is spoken (and None is returned) when `text` is None, empty or
    only whitespace. This definition replaces the earlier one in this file, so
    it keeps the same guard: callers such as gesture_to_text_and_speech() may
    pass an empty capture.

    The speech is saved to a temporary 'speech.mp3' file, which is deleted
    after playback and also when saving or playback fails.
    """
    if not text or not text.strip():  # Nothing to say
        print("No text to convert to speech.")
        return

    speech = gTTS(text, tld='us', lang='en', slow=False)  # Create speech object
    speech_file = 'speech.mp3'  # Define the filename for the audio file
    try:
        speech.save(speech_file)  # Save the speech to the file
        playsound(speech_file)  # Play the file that was just written
    finally:
        # Do not leave the temporary file behind when saving/playback raises
        if os.path.exists(speech_file):
            os.remove(speech_file)

### Convert Voice to Text

In [9]:
def convert_voice_to_text(audio):
    """
    Transcribes captured audio with Google's speech recognition.

    Returns the recognized text. An empty string is returned when `audio` is
    None, when the speech could not be understood (this is also announced
    aloud), or when the recognition request failed (the error is printed).
    """
    if audio is None:
        return ""  # Nothing was captured

    try:
        transcript = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        # The audio did not contain intelligible speech
        print("Sorry I didn't understand that.")
        text_to_speech("Sorry I didn't understand that")
        return ""
    except sr.RequestError as err:
        # The recognition request itself failed
        print("Error: {0}".format(err))
        return ""

    print("You said: " + transcript)
    return transcript

### Process Voice Command

In [10]:
def process_voice_command(text):
    """
    Matches the recognized text against the known commands and runs the first match.

    Matching is a case-insensitive substring test, checked in the order the
    commands appear below. Returns True when the user asked to stop
    (the program should end) and False otherwise.
    """
    spoken = text.lower()

    def mentions(*phrases):
        # True when any of the phrases occurs somewhere in the spoken text
        return any(phrase in spoken for phrase in phrases)

    if mentions("hello"):
        print("Hello! How can I help you?")
        text_to_speech("Hello! How can I help you")
        return False

    if mentions("what is your name"):
        print("My name is Jarvis")
        text_to_speech("My name is Jarvis")
        return False

    if mentions("how are you doing today", "how are you", "how are you doing"):
        print("I'm doing alright, thank you very much")
        text_to_speech("I'm doing alright, thank you very much")
        return False

    if mentions("take a screenshot", "take screenshot", "screenshot"):
        pyautogui.screenshot("screenshot.png")  # Saved to the current directory
        print("I took a screenshot for you")
        text_to_speech("I took a screenshot for you")
        return False

    if mentions("open youtube"):
        print("Opening YouTube")
        webbrowser.open("https://www.youtube.com/")
        text_to_speech("Opening YouTube")
        return False

    if mentions("read the news", "the news", "news", "open news"):
        print("Opening the news")
        webbrowser.open("https://punchng.com/")
        text_to_speech("Opening the news")
        return False

    if mentions("alright, goodbye", "alright", "goodbye", "all right", "stop"):
        print("Goodbye! Have a nice day")
        text_to_speech("Goodbye! Have a nice day")
        return True  # Tell the caller to end the program

    # No known command matched
    print("I didn't understand that command. Please try again.")
    text_to_speech("I didn't understand that command. Please try again.")
    return False

### Main Function

In [11]:
def main():
    """
    Runs the voice assistant loop: listen, transcribe, execute the command.

    The loop ends when process_voice_command() returns True, or after three
    consecutive failed attempts (no audio captured or nothing transcribed).
    Any successfully transcribed input resets the failure counter.
    """
    max_attempts = 3  # Consecutive failures tolerated before giving up
    attempts = 0  # Consecutive failures so far

    while attempts < max_attempts:
        audio = capture_voice_input()
        if audio is None:
            attempts += 1
            print(f"Retrying... ({attempts}/{max_attempts})")
            time.sleep(1)  # Short pause so retries do not spin
            continue

        text = convert_voice_to_text(audio)
        if text == "":
            attempts += 1
            print(f"Retrying... ({attempts}/{max_attempts})")
            continue

        should_exit = process_voice_command(text)
        attempts = 0  # Input was transcribed, so start counting failures afresh
        if should_exit:
            break

    if attempts >= max_attempts:
        print("Too many failed attempts due to timeout. Exiting program.")
        text_to_speech(f"Too many failed attempts due to timeout.")

if __name__ == "__main__":
    main()  # Run the voice assistant

Listening...
You said: hello how are you doing
Hello! How can I help you?
Listening...
You said: open the news
Opening the news
Listening...
You said: alright goodbye
Goodbye! Have a nice day
