# In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
from gtts import gTTS
import os

# Preprocessing pipeline applied to every input image before detection
transform = transforms.Compose([
    transforms.ToTensor(),
])

# Pre-trained Faster R-CNN detector, put into evaluation (inference) mode.
# `eval()` returns the module itself, so the call can be chained.
model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True).eval()

# Mapping from the detector's integer labels to human-readable object names.
# The torchvision COCO detection models emit the original COCO category ids
# (1-90), which contain unused ids. Those slots are filled with 'N/A' so that
# `class_names[label]` lines up with the label the model actually returns.
class_names = [
    'background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
    'N/A', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
    'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 'handbag', 'tie',
    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A',
    'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]

# Function to perform object detection
def detect_objects(image_path, score_threshold=0.0):
    """Run the detector on one image file and return its boxes and labels.

    Args:
        image_path: Path of the image to analyse. The image is converted
            to RGB before being passed through ``transform``.
        score_threshold: Minimum confidence score a detection must have to
            be returned. The default of 0.0 applies no filtering, so every
            detection the model emits is kept.

    Returns:
        A ``(boxes, labels)`` tuple taken from the model's prediction for
        the image, optionally filtered by ``score_threshold``.
    """
    # Open inside a context manager so the file handle is released as soon
    # as the RGB copy has been made, rather than whenever the GC runs.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # The model expects a batch, so add a leading batch dimension of size 1.
    image_tensor = transform(image).unsqueeze(0)

    with torch.no_grad():
        prediction = model(image_tensor)

    # Only one image was submitted, so only the first result is relevant.
    result = prediction[0]
    boxes = result['boxes']
    labels = result['labels']

    # Optional confidence filtering; skipped entirely for the default value
    # so existing callers receive exactly the tensors they did before.
    if score_threshold > 0:
        keep = result['scores'] >= score_threshold
        boxes, labels = boxes[keep], labels[keep]

    return boxes, labels

# Function to read detected objects aloud
def read_aloud(labels):
    """Speak the names of the detected objects.

    The labels are translated to names through ``class_names``, turned into
    an English sentence, synthesised to an MP3 file with gTTS, and the file
    is then handed to the Windows ``start`` command for playback.

    Args:
        labels: Iterable of integer class labels (e.g. the label tensor
            returned by ``detect_objects``).
    """
    # int() makes the lookup work for tensor elements as well as plain ints.
    object_names = [class_names[int(label)] for label in labels]
    text = "I found the following objects: " + ", ".join(object_names)
    tts = gTTS(text=text, lang='en')

    output_path = os.path.join("Outputs Audio", "image_to_audio_output.mp3")
    # The output folder may not exist yet; saving would fail without it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    tts.save(output_path)

    # Play the file that was just written. The empty "" is the window-title
    # argument of `start`; it is required because the path is quoted (the
    # folder name contains a space).
    os.system(f'start "" "{output_path}"')

# Specify the path to your input image.
# A raw string keeps the Windows backslashes literal; without the `r` prefix
# sequences such as `\G` and `\T` are invalid escape sequences.
image_path = r"F:\Graduation Project\Test\cars.jpg"

# Perform object detection
boxes, labels = detect_objects(image_path)

# Read detected objects aloud
read_aloud(labels)

