# Importing Libraries

In [2]:
# Install the third-party packages this notebook relies on (-q keeps pip quiet).
# transformers, gradio and inflect are imported below; timm is not imported
# directly -- presumably it is needed by the detection model's backbone (TODO confirm).
!pip install -q transformers gradio timm inflect

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m109.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import io, json, hashlib, requests
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import inflect
import gradio as gr
from transformers import pipeline

2025-11-04 18:19:50.406242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762280390.577916      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762280390.629415      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Model settings

In [4]:
# Model identifiers handed to transformers.pipeline(...) by the functions below.
DETECTION_MODEL = "facebook/detr-resnet-101"      # used by process() for "object-detection"
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-ar"  # used by translate() for "translation"
TTS_MODEL = "facebook/mms-tts-ara"                # used by text_to_speech_ar() for "text-to-speech"

# Detections whose score is below this value are dropped in process().
THRESHOLD = 0.7

# Processing functions

# Prepare images

In [5]:
def load_image(url_or_pil):
    """Return an RGB PIL image from an in-memory image or an image URL.

    Args:
        url_or_pil: Either a ``PIL.Image.Image`` or a string URL to download.

    Returns:
        The image converted to RGB mode.

    Raises:
        ValueError: If the input is neither a PIL image nor a string.
        requests.HTTPError: If the server answers with an error status.
    """
    if isinstance(url_or_pil, Image.Image):
        return url_or_pil.convert("RGB")
    if isinstance(url_or_pil, str):
        # Read the body through `.content` instead of handing `r.raw` to PIL:
        # the raw stream is not content-decoded (gzip/deflate), and the
        # streamed response was never closed. The `with` block releases the
        # connection as soon as the bytes are in memory.
        with requests.get(url_or_pil, timeout=15) as r:
            r.raise_for_status()
            payload = r.content
        return Image.open(io.BytesIO(payload)).convert("RGB")
    raise ValueError("Unsupported input type")

# Object colors

In [6]:
def label_color(label):
    """Map a label string to a stable RGB colour from the 'tab20' colormap.

    The label is hashed with SHA-256 so the same label always gets the same
    colour, independent of Python's per-process string hash randomisation.

    Args:
        label: Label text (str).

    Returns:
        A 3-tuple of ints (R, G, B), each in the range 0-255.
    """
    # plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
    # plt.get_cmap is the supported pyplot-level lookup.
    cmap = plt.get_cmap('tab20')
    digest = int(hashlib.sha256(label.encode()).hexdigest(), 16)
    rgba = cmap(digest % 20)  # integer index selects one of the 20 palette entries
    return tuple(int(255*c) for c in rgba[:3])

# Draw detection boxes

In [7]:
def draw_boxes(image, detections):
    """Render detection boxes and captions over an image.

    Args:
        image: The image to display (passed to ``Axes.imshow``).
        detections: Iterable of dicts, each with a ``"label"``, a ``"score"``
            and a ``"box"`` dict holding ``xmin``, ``ymin``, ``xmax``, ``ymax``.

    Returns:
        A new RGB PIL image read back from the rendered PNG of the figure.
    """
    # Use an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", and close it in `finally` so a failure while drawing
    # or saving cannot leak an open figure.
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.imshow(image)
        for det in detections:
            box = det["box"]
            # Matplotlib expects colour channels in the 0-1 range.
            color = np.array(label_color(det["label"])) / 255.0
            rect = plt.Rectangle((box["xmin"], box["ymin"]),
                                 box["xmax"]-box["xmin"],
                                 box["ymax"]-box["ymin"],
                                 fill=False, linewidth=2, edgecolor=color)
            ax.add_patch(rect)
            ax.text(box["xmin"], box["ymin"],
                    f"{det['label']} {det['score']*100:.1f}%",
                    fontsize=11, weight="bold", color=color)
        ax.axis("off")
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    finally:
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf).convert("RGB")


# Generate text description

In [8]:
def describe(detections):
    """Build an English sentence summarising how many of each label were found.

    Args:
        detections: Iterable of dicts that each carry a ``"label"`` key.

    Returns:
        ``"No objects detected."`` for an empty input, otherwise a sentence of
        the form ``"In this image, there is/are <count> <label>, ... and ..."``.
        Labels appear in the order they were first seen.
    """
    counts = {}
    for det in detections:
        counts[det["label"]] = counts.get(det["label"], 0) + 1
    if not counts:
        return "No objects detected."

    # Build the engine only when there is something to describe.
    p = inflect.engine()
    # plural_noun(word, count) leaves the word unchanged for a count of 1 and
    # otherwise returns its plural form, replacing the previous naive "+ 's'".
    parts = [
        f"{p.number_to_words(count)} {p.plural_noun(label, count)}"
        for label, count in counts.items()
    ]
    if len(parts) == 1:
        # A single label can still have count > 1 ("two dogs"), so the verb
        # must follow the count rather than the number of distinct labels.
        (only_count,) = counts.values()
        verb = "is" if only_count == 1 else "are"
        return f"In this image, there {verb} {parts[0]}."
    return "In this image, there are " + ", ".join(parts[:-1]) + f" and {parts[-1]}."


# Translation

In [9]:
_TRANSLATORS = {}  # model id -> already-built translation pipeline


def translate(text):
    """Translate ``text`` with the model named by ``TRANSLATION_MODEL``.

    The pipeline is built on first use and then reused, so the model is not
    reloaded on every call. The cache is keyed by model id, so changing
    ``TRANSLATION_MODEL`` builds a pipeline for the new model.

    Args:
        text: Text to translate.

    Returns:
        The ``'translation_text'`` field of the first pipeline result.
    """
    trans = _TRANSLATORS.get(TRANSLATION_MODEL)
    if trans is None:
        trans = pipeline("translation", model=TRANSLATION_MODEL)
        _TRANSLATORS[TRANSLATION_MODEL] = trans
    return trans(text)[0]['translation_text']

# Arabic text-to-speech

In [10]:
_TTS_PIPELINES = {}  # model id -> already-built text-to-speech pipeline


def text_to_speech_ar(text):
    """Synthesise speech for ``text`` with the model named by ``TTS_MODEL``.

    The pipeline is built on first use and then reused, so the model is not
    reloaded on every call. The cache is keyed by model id.

    Args:
        text: Text to synthesise.

    Returns:
        A tuple ``(audio, sampling_rate)`` where ``audio`` is the first entry
        of the pipeline's ``"audio"`` output and ``sampling_rate`` is its
        ``"sampling_rate"`` value.
    """
    tts = _TTS_PIPELINES.get(TTS_MODEL)
    if tts is None:
        tts = pipeline("text-to-speech", model=TTS_MODEL)
        _TTS_PIPELINES[TTS_MODEL] = tts
    audio_data = tts(text)
    return (audio_data["audio"][0], audio_data["sampling_rate"])

# Processing

In [11]:
_DETECTORS = {}  # model id -> already-built object-detection pipeline


def process(image_or_url):
    """Run detection, description, translation and speech for one image.

    Args:
        image_or_url: A PIL image or an image URL, as accepted by ``load_image``.

    Returns:
        A 5-tuple ``(labeled_img, desc_en, desc_ar, (sampling_rate, audio), results_json)``:
        the annotated image (or the unmodified input image when nothing passes
        the threshold), the English and translated descriptions, the audio in
        ``(rate, samples)`` order, and the kept detections as a JSON string.
    """
    img = load_image(image_or_url)

    # Build the detector once per model id instead of on every request.
    od_pipe = _DETECTORS.get(DETECTION_MODEL)
    if od_pipe is None:
        od_pipe = pipeline("object-detection", model=DETECTION_MODEL)
        _DETECTORS[DETECTION_MODEL] = od_pipe

    # Forward THRESHOLD to the pipeline as well: it applies its own score
    # cutoff during post-processing, so without this a THRESHOLD lower than
    # that built-in default would silently have no effect.
    outputs = od_pipe(img, threshold=THRESHOLD)
    detections = [o for o in outputs if o['score'] >= THRESHOLD]

    labeled_img = draw_boxes(img, detections) if detections else img
    desc_en = describe(detections)
    desc_ar = translate(desc_en)
    audio_ar, sr = text_to_speech_ar(desc_ar)
    results_json = json.dumps(detections, indent=2, ensure_ascii=False)
    return labeled_img, desc_en, desc_ar, (sr, audio_ar), results_json

# Gradio

In [None]:
# Gradio UI: two ways to supply an image (upload or URL), both routed to process().
with gr.Blocks(title="High-Accuracy Object Detection with Voice") as demo:
    with gr.Row():
        img_in = gr.Image(type="pil", label="Upload image")
        url_in = gr.Textbox(label="Post an image link")
    btn_img = gr.Button("Play on uploaded image")
    btn_url = gr.Button("Run on the link")

    out_img = gr.Image(label="Image after education")
    out_en = gr.Textbox(label="Description in English")
    out_ar = gr.Textbox(label="Description in Arabic")
    out_audio = gr.Audio(label="Arabic pronunciation")
    # process() returns five values; the fifth (detections as JSON text) had
    # no component to receive it, leaving the handler's return count out of
    # step with the declared outputs.
    out_json = gr.Textbox(label="Detections (JSON)", lines=10)

    all_outputs = [out_img, out_en, out_ar, out_audio, out_json]

    # process() already takes exactly one argument, so no lambda wrapper is needed.
    btn_img.click(fn=process, inputs=img_in, outputs=all_outputs)
    btn_url.click(fn=process, inputs=url_in, outputs=all_outputs)

# share=True requests a publicly reachable link for the app.
demo.launch(debug=False, share=True)
