# Install Required packages

In [1]:
pip install gtts

Collecting gtts
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.3-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.3


In [2]:
pip install gradio

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

# Import Libraries

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
from IPython.display import Audio
import gradio as gr

# Create APP Classes

In [4]:
class Translate_en_to_ar:
    """Translate short English texts into Arabic with a chat-tuned causal LM.

    The model and tokenizer are loaded once in ``__init__``; each call to
    ``get_translation`` builds a chat prompt (system prompt + user request),
    generates a reply and returns only the newly generated text.
    """

    def __init__(self,model_id="silma-ai/SILMA-9B-Instruct-v1.0"):
        """Load the model and tokenizer and define the system prompt.

        Args:
            model_id: Identifier passed to both ``from_pretrained`` calls.
        """
        # Load Model (device placement is delegated to device_map="auto")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        # Load Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Define System Prompt (Arabic instruction: output only the Arabic translation)
        self.system_prompt = {
            "role": "system",
            "content": (
                "أنت نموذج مختص بترجمة الأوصاف القصيرة من اللغة الإنجليزية إلى اللغة العربية الفصحى. "
                "عليك أن تنتج الترجمة العربية فقط، وتحرص على أن تكون الترجمة صحيحة ودقيقة مع بعض المرونة لتبدو طبيعية في العربية."
            )
        }

    def get_translation(self,text):
        """Return the model's Arabic translation of ``text``.

        Args:
            text: English text to translate.

        Returns:
            The generated reply with surrounding whitespace stripped, or an
            empty string when ``text`` is empty or whitespace-only.
        """
        # Nothing to translate: skip the (expensive) generation call entirely
        if not text or not text.strip():
            return ""

        messages = [
            self.system_prompt,
            {"role": "user", "content": f"Translate the following message: '{text}'"},
        ]

        # add_generation_prompt=True ends the prompt with the assistant-turn
        # header, so everything generated afterwards is the reply itself.
        # Inputs follow the model's device instead of a hard-coded "cuda",
        # because the model was loaded with device_map="auto".
        inputs = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(self.model.device)

        outputs = self.model.generate(**inputs, max_new_tokens=256)

        # generate() returns prompt + continuation; drop the prompt tokens so
        # the result never depends on splitting the decoded text on the word
        # "model" (which breaks whenever the caption itself contains "model").
        prompt_length = inputs["input_ids"].shape[-1]
        reply_ids = outputs[0][prompt_length:]

        return self.tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

In [5]:
class ImageCaption:
    """Generate a text caption for an image with a BLIP captioning model."""

    def __init__(self,model_id="Salesforce/blip-image-captioning-large"):
        """Load the captioning model and its processor.

        Args:
            model_id: Identifier passed to both ``from_pretrained`` calls, so
                the model and processor always come from the same checkpoint.
        """
        # Use the GPU with half precision when available; otherwise fall back
        # to CPU with float32 instead of crashing on a hard-coded "cuda".
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
        # Load model
        self.model = BlipForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=self.dtype
        ).to(self.device)
        # Load Processor
        self.processor = BlipProcessor.from_pretrained(model_id)

    def get_caption(self,img):
        """Return the decoded caption generated for ``img``.

        Args:
            img: Image in any form the processor accepts.

        Returns:
            The first generated sequence decoded with special tokens skipped.
        """
        # Get input tensors on the same device/dtype as the model
        inputs = self.processor(img, return_tensors="pt").to(self.device, self.dtype)
        # Generate output token ids
        token_ids = self.model.generate(**inputs)
        # Decode the first (only) sequence
        return self.processor.decode(token_ids[0], skip_special_tokens=True)

In [6]:
class TextToAudio:
    """Turn Arabic text into an MP3 file using gTTS."""

    def __init__(self):
        """No state is needed; the synthesizer is created per call."""

    def get_audio(self,text,output_file_name="output.mp3"):
        """Synthesize ``text`` as Arabic speech and save it to disk.

        Args:
            text: Text to speak.
            output_file_name: Path of the MP3 file to write.

        Returns:
            ``output_file_name`` on success, or ``None`` if anything raised
            (the error is printed rather than propagated).
        """
        try:
            # Build the Arabic speech object and write it straight to the file
            gTTS(text=text, lang='ar').save(output_file_name)
        except Exception as err:
            print(f"Error generating audio: {err}")
            return None
        # Reached only when synthesis and saving both succeeded
        return output_file_name

# Create APP Class

In [8]:
class APP:
    """Pipeline that turns an image into an Arabic audio description.

    Steps: caption the image, translate the caption, synthesize the
    translation to an audio file.
    """

    def __init__(self):
        """Create the three pipeline stages."""
        self.image_captioner  = ImageCaption()
        self.text_to_audio = TextToAudio()
        self.translator  = Translate_en_to_ar()

    def Run(self,image):
        """Run the full pipeline on one image.

        Args:
            image: The image to describe, or ``None`` when no image was
                supplied (e.g. the form was submitted empty).

        Returns:
            The value returned by the text-to-audio stage (the audio file
            path, or ``None`` if synthesis failed), or ``None`` when
            ``image`` is ``None``.
        """
        # Guard with `is None`: array-like images must not be truth-tested
        if image is None:
            return None
        caption = self.image_captioner.get_caption(image)
        translation = self.translator.get_translation(caption)
        return self.text_to_audio.get_audio(translation,'image_descripton.mp3')

In [9]:
# Instantiate the application once; constructing APP builds the captioning,
# text-to-audio and translation stages, which loads the pretrained models
# (see the download progress in the output below).
app = APP()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/46.9k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

# Create Gradio interface

In [10]:
# Components: the pipeline receives the upload as a NumPy array and hands
# back a path to the generated audio file.
image_input = gr.Image(type="numpy")
audio_output = gr.Audio(type="filepath")

# Wire the pipeline entry point to the components
iface = gr.Interface(
    fn=app.Run,
    inputs=image_input,
    outputs=audio_output,
    title="Image Caption Translator",
    description="Upload an image to generate an Arabic audio description.",
)

# Start serving the Gradio app
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e965f38ffa3c3960b6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


