# In [None]:
import cv2
import pyttsx3
import torch
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from PIL import Image
# Object detection with a pre-trained torchvision Faster R-CNN model.

# Category names indexed directly by the label id the detector emits.
# torchvision's COCO detection models keep the original, sparse COCO ids
# (index 0 is the background class and unused ids are "N/A"), so the table
# has 91 slots rather than 80 consecutive names.
_COCO_CATEGORY_NAMES = (
    "__background__", "person", "bicycle", "car", "motorcycle", "airplane",  # 0-5
    "bus", "train", "truck", "boat", "traffic light", "fire hydrant",        # 6-11
    "N/A", "stop sign", "parking meter", "bench", "bird", "cat", "dog",      # 12-18
    "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",         # 19-25
    "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie",           # 26-32
    "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",       # 33-38
    "baseball bat", "baseball glove", "skateboard", "surfboard",             # 39-42
    "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork", "knife",  # 43-49
    "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",    # 50-56
    "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",         # 57-63
    "potted plant", "bed", "N/A", "dining table", "N/A", "N/A", "toilet",    # 64-70
    "N/A", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",      # 71-77
    "microwave", "oven", "toaster", "sink", "refrigerator", "N/A", "book",   # 78-84
    "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",   # 85-90
)

# Lazily created detector shared by every detect_objects() call.
_DETECTION_MODEL = None


def _get_detection_model():
    """Return the shared Faster R-CNN model, building it on first use."""
    global _DETECTION_MODEL
    if _DETECTION_MODEL is None:
        model = fasterrcnn_resnet50_fpn(pretrained=True)
        model.eval()
        _DETECTION_MODEL = model
    return _DETECTION_MODEL


def detect_objects(frame, threshold=0.5):
    """Detect objects in a single video frame.

    Args:
        frame: Image to analyse. A 3-channel array is treated as BGR (the
            layout OpenCV's ``VideoCapture.read`` produces) and converted
            to RGB before inference.
        threshold: Detections whose confidence score is not strictly
            greater than this value are discarded.

    Returns:
        A list of ``(label, distance)`` tuples, one per retained detection.
        ``distance`` is currently a fixed placeholder of ``1.0``.

    Raises:
        ValueError: If ``frame`` is ``None`` (e.g. a failed camera read).
    """
    if frame is None:
        raise ValueError("detect_objects() requires an image frame, got None")

    # The detector was trained on RGB images; OpenCV delivers BGR. The copy
    # makes the reversed view contiguous so it can be converted safely.
    if (
        not torch.is_tensor(frame)
        and getattr(frame, "ndim", 0) == 3
        and frame.shape[2] == 3
    ):
        frame = frame[:, :, ::-1].copy()

    # Preprocess the frame into a (1, C, H, W) float tensor
    transform = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
    input_tensor = transform(frame).unsqueeze(0)

    # Reuse the cached model instead of reloading the weights for every frame
    model = _get_detection_model()
    with torch.no_grad():
        predictions = model(input_tensor)

    # Keep only the labels of sufficiently confident detections
    scores = predictions[0]['scores']
    labels = predictions[0]['labels']
    kept_label_ids = labels[scores > threshold].tolist()

    detected_objects = []
    for label_id in kept_label_ids:
        if 0 <= label_id < len(_COCO_CATEGORY_NAMES):
            label = _COCO_CATEGORY_NAMES[label_id]
        else:
            # Defensive: never crash on an id outside the known table
            label = "unknown"
        distance = 1.0  # You may need to implement actual distance estimation logic
        detected_objects.append((label, distance))

    return detected_objects

# Function to convert text to speech
def text_to_speech(text):
    """Speak ``text`` aloud through a pyttsx3 engine.

    The utterance is queued with ``say`` and then processed by
    ``runAndWait`` before the function returns.
    """
    speaker = pyttsx3.init()
    speaker.say(text)
    speaker.runAndWait()

# Main function for real-time processing
def main():
    """Run the capture -> detect -> announce loop.

    Frames are read from the video source, passed to ``detect_objects``,
    and every detection is printed and spoken. The loop ends when 'q' is
    pressed in the preview window or the source stops delivering frames.

    Raises:
        RuntimeError: If the video source cannot be opened.
    """
    # Replace '0' with the camera index or video file path
    cap = cv2.VideoCapture(0)

    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open video source 0")

        while True:
            ret, frame = cap.read()
            if not ret:
                # Camera disconnected or end of the video file: there is no
                # frame to analyse, so stop instead of crashing on None.
                break

            # Object detection
            detected_objects = detect_objects(frame)

            for obj_name, distance in detected_objects:
                # Concatenate object name and distance information
                text = f"{obj_name} at {distance:.2f} meters"
                print(text)  # Print for debugging purposes

                # Text-to-speech
                text_to_speech(text)

            # Display the frame with detected objects (you can skip this part in the final app)
            cv2.imshow('Object Detection', frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always give the camera and the preview window back, even when
        # detection or speech raises part-way through the loop.
        cap.release()
        cv2.destroyAllWindows()

# In [None]:
# Start the capture loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()