# Step 1: Install Dependencies

In [None]:
!pip install transformers sentencepiece torch torchvision  # NLP & Deep Learning libraries (models + backend)
!pip install gTTS                                          # Google Text-to-Speech (convert text → audio)
!pip install gradio                                        # Build interactive web UI for your AI app


Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.3.0
    Uninstalling click-8.3.0:
      Successfully uninstalled click-8.3.0
Successfully installed click-8.1.8 gTTS-2.5.4


# Step 2: Import Libraries

In [None]:
import tempfile                                                       # For creating a unique audio file per request

from transformers import BlipProcessor, BlipForConditionalGeneration  # For image captioning using BLIP model
from gtts import gTTS                                                 # For converting text captions into speech
import gradio as gr                                                   # For creating an interactive web app interface
from PIL import Image                                                 # For loading and handling image files
import torch                                                          # For running deep learning models (PyTorch backend)


# Step 3: Load BLIP Model for Image Captioning

In [None]:
# Single checkpoint identifier shared by the preprocessor and the model weights
# of BLIP (Bootstrapping Language-Image Pretraining), so the two cannot drift apart.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Preprocessor: turns an image into model inputs and decodes generated token ids.
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
# Captioning model: generates caption token ids from the preprocessed image.
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

# Step 4: Define Function (Image → Text → Speech)

In [None]:
def image_to_speech(image):
    """Caption an image with BLIP and synthesize the caption as an MP3 file.

    Args:
        image: The picture to describe; it is passed unchanged to ``processor``.
            May be ``None`` (e.g. the interface was submitted without an
            upload), in which case no captioning is attempted.

    Returns:
        A ``(caption, audio_path)`` tuple. ``audio_path`` is the path of a
        freshly created MP3 file, or ``None`` when there is nothing to speak
        (no image was supplied, or the generated caption is empty).
    """
    # Guard: without an image the processor call below would raise.
    if image is None:
        return "No image provided.", None

    # Step 1: Generate a more detailed caption
    inputs = processor(image, return_tensors="pt")
    out = model.generate(
        **inputs,
        max_length=90,          # Allow longer, more detailed sentences
        num_beams=8,            # Use beam search to improve quality
        repetition_penalty=1.2, # Prevent repeating words
        length_penalty=1.0,     # Balanced caption length
        early_stopping=True
    )
    caption = processor.decode(out[0], skip_special_tokens=True)

    # gTTS refuses empty text, so skip synthesis instead of crashing.
    if not caption.strip():
        return caption, None

    # Step 2: Convert caption to speech
    # Each call gets its own file: a fixed name such as "output.mp3" would be
    # overwritten when several requests are served at the same time.
    # delete=False keeps the file on disk after the handle closes so the caller
    # can read it; the file is NOT cleaned up by this function.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_path = audio_file.name
    gTTS(caption).save(audio_path)

    return caption, audio_path



# Step 5: Build Gradio Interface

In [None]:
# Input widget: delivers the uploaded picture to the callback as a PIL image.
image_input = gr.Image(type="pil")

# Output widgets, listed in the order image_to_speech returns its values.
caption_output = gr.Textbox(label="Image Caption")
audio_output = gr.Audio(label="Spoken Caption")

# Wire the callback to its widgets.
interface = gr.Interface(
    fn=image_to_speech,
    inputs=image_input,
    outputs=[caption_output, audio_output],
)

# Start the web app.
interface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5e97f7207ed2278f4d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


