In [1]:
import cv2
import speech_recognition as sr
from transformers import pipeline
from PIL import Image
import numpy as np

In [2]:
# Load the Hugging Face object-detection pipeline once, at module level, so the
# same model instance is reused by every call that references `object_detector`.
# The checkpoint is "facebook/detr-resnet-50"; loading it emitted the
# unused-weights (`num_batches_tracked`) notice captured in this cell's output.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using a slow image processor as `use_fast` is unset and a slow processor was saved w

In [None]:
# Convert OpenCV frame to PIL and run detection
def detect_objects(frame, score_threshold=0.7):
    """Run the object detector on one OpenCV frame and return the confident labels.

    Args:
        frame: Image array in OpenCV's BGR channel order; it is converted to
            RGB before being handed to the detector.
        score_threshold: Detections are kept only when their score is strictly
            greater than this value. Defaults to 0.7, the cut-off that was
            previously hard-coded.

    Returns:
        list[str]: Lower-cased labels of the retained detections, in the order
        the detector returned them. One entry is produced per retained
        detection, so the same label can appear more than once.
    """
    # The detector is fed a PIL image built from RGB data, hence the conversion.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(rgb)
    results = object_detector(pil_img)
    detected_labels = [
        res['label'].lower() for res in results if res['score'] > score_threshold
    ]
    print(" Detected objects:", detected_labels)
    return detected_labels

In [None]:
# Simple speech-to-text and token splitting
def listen_and_understand(timeout=None, phrase_time_limit=None):
    """Record one spoken command from the microphone and transcribe it.

    Args:
        timeout: Seconds to wait for speech to start, forwarded to
            ``Recognizer.listen``. ``None`` (the default) waits indefinitely,
            which is the previous behaviour.
        phrase_time_limit: Maximum seconds of speech to record, forwarded to
            ``Recognizer.listen``. ``None`` (the default) applies no limit.

    Returns:
        tuple[str, list[str]]: The recognised text and its lower-cased,
        whitespace-split tokens. ``("", [])`` is returned when no speech
        starts within ``timeout``, when the audio cannot be understood, or
        when the recognition request fails.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()

    try:
        with mic as source:
            print(" Listening for command...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(
                source, timeout=timeout, phrase_time_limit=phrase_time_limit
            )
    except sr.WaitTimeoutError:
        # Only reachable when a timeout was supplied; treated like any other
        # "nothing usable was heard" outcome so callers need no extra handling.
        print(" No speech detected before the timeout.")
        return "", []

    try:
        command = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print(" Could not understand audio")
        return "", []
    except sr.RequestError as e:
        print(f" Speech error: {e}")
        return "", []

    print(f" You said: {command}")
    tokens = command.lower().split()  # Avoid using nltk.word_tokenize
    return command, tokens

In [None]:
# Scripted instruction lines for every object that has a dedicated action.
_TASK_STEPS = {
    "bottle": (
        "- Take the bottle that is far away.",
        "- Use your hand to lift the bottle.",
        "- Pick it up and show it in the camera.",
    ),
    "plate": (
        "- Move toward the plate on the table.",
        "- Slide the plate gently to the right.",
        "- Lift the plate and place it at the desired location.",
    ),
    "cup": (
        "- Reach for the cup carefully.",
        "- Grip the cup handle with your fingers.",
        "- Lift the cup and place it near the bottle.",
    ),
    "book": (
        "- Identify the book on the surface.",
        "- Pick up the book from the top.",
        "- Hold the book up to show the cover to the camera.",
    ),
    "remote": (
        "- Locate the remote beside the book.",
        "- Grab the remote using your fingers.",
        "- Point it towards the screen and press the power button.",
    ),
    "person": (
        "- Approach the person carefully.",
        "- Wave your hand to get their attention.",
        "- Communicate the necessary instructions.",
    ),
    "tie": (
        "- Identify the tie on the person.",
        "- Adjust it gently if needed.",
        "- Ensure it is visible to the camera.",
    ),
}


def _label_in_tokens(label, tokens):
    """Return True when the words of ``label`` occur consecutively in ``tokens``."""
    words = label.split()
    if not words:
        return False
    span = len(words)
    return any(tokens[i:i + span] == words for i in range(len(tokens) - span + 1))


def decide_task(detected, tokens):
    """Pick the first detected object mentioned in the command and print its steps.

    Args:
        detected: Iterable of detected object labels, checked in order.
        tokens: Sequence of words from the spoken command.

    Returns:
        str: ``"<label> matched"`` for the first detected label whose words
        appear consecutively in ``tokens``; ``"No action taken."`` when no
        label is mentioned.

    A plain ``label in tokens`` test can never succeed for multi-word labels
    such as ``"cell phone"`` because the command is split into single words,
    so labels are matched as a run of consecutive tokens instead. Single-word
    labels match exactly as before.
    """
    tokens = list(tokens)  # allow slicing/comparison whatever sequence type was passed
    for obj in detected:
        if not _label_in_tokens(obj, tokens):
            continue

        # NOTE(review): the "n = " prefix is kept verbatim; it looks like a
        # garbled newline or icon - confirm the intended text.
        print(f"n = {obj} recognized and matched with the command")
        print("Action:")

        steps = _TASK_STEPS.get(obj)
        if steps is None:
            # No scripted action for this label: fall back to a generic line.
            print(f"- Perform an action related to the {obj}.")
        else:
            for step in steps:
                print(step)
        return f"{obj} matched"

    print(" No relevant object found in the command.")
    return "No action taken."


In [None]:
# Main loop
def main():
    """Run the interactive webcam loop.

    Frames from camera index 0 are shown in a window. Pressing 'c' runs object
    detection on the current frame, records a spoken command, and prints the
    decision produced by matching the two; pressing 'q' (or a failed frame
    read) ends the loop.

    The camera and the preview window are released in a ``finally`` block so
    they are cleaned up even when detection or speech capture raises.
    """
    cap = cv2.VideoCapture(0)
    print(" Press 'c' to capture and speak a command, 'q' to quit.")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print(" Failed to read from webcam.")
                break

            cv2.imshow("Live Feed - Press 'c' to capture", frame)
            # Mask to the low byte so the key code compares cleanly with ord().
            key = cv2.waitKey(1) & 0xFF

            if key == ord('q'):
                break
            elif key == ord('c'):
                # Step 1: Object Detection
                detected = detect_objects(frame)

                # Step 2: Voice Command Recognition (only the tokens are needed)
                _, tokens = listen_and_understand()

                # Step 3: Match & Respond
                decision = decide_task(detected, tokens)
                print(" Decision:", decision)
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Start the interactive loop when this code is executed as the top-level
# program; the recorded output of this cell shows the guard passing and
# main() running.
if __name__ == "__main__":
    main()

📸 Press 'c' to capture and speak a command, 'q' to quit.
🤖 Detected objects: ['person', 'tie', 'person', 'cell phone', 'person', 'person']
🎙️ Listening for command...
🗣️ You said: take the notebook
🤖 No relevant object found in the command.
🤖 Decision: No action taken.
