# 🚀 Image Captioning Deployment (with Voice Output)
This notebook loads the trained captioning model, its tokenizer, and the maximum caption length, then launches a Gradio app that generates a caption for an uploaded image and reads it aloud with gTTS.

In [None]:
!pip install gradio gTTS tensorflow



In [39]:
# Mount Google Drive at /content/drive; the tokenizer and max-length files are read
# from /content/drive/MyDrive further down in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Upload the trained model file through the Colab file picker.
# NOTE(review): the recorded output of this cell shows the file being saved as
# "image_captioning_model (1).keras", i.e. a file with the original name already existed.
# The following load_model("image_captioning_model.keras") call would then read the older
# copy, not the one just uploaded — confirm which file is intended.
from google.colab import files
uploaded = files.upload()

Saving image_captioning_model.keras to image_captioning_model (1).keras


In [36]:
# Load the trained captioning model from a path relative to the current working directory.
# NOTE(review): the identical load_model call is repeated in the next code cell, so one of
# the two is redundant.
from tensorflow.keras.models import load_model
model = load_model("image_captioning_model.keras")

In [49]:
from tensorflow.keras.models import load_model
import pickle
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import Model
from PIL import Image
import numpy as np
# --- Load inference artifacts -------------------------------------------------
# Captioning model (path is relative to the current working directory).
model = load_model("image_captioning_model.keras")

# Tokenizer fitted at training time.
# SECURITY: pickle.load can execute arbitrary code; only load a tokenizer file you created
# yourself or otherwise trust.
with open("/content/drive/MyDrive/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Maximum caption length, stored as a single integer in a text file.
with open("/content/drive/MyDrive/max_length.txt", "r") as max_length_file:
    max_length = int(max_length_file.read())
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import img_to_array
from PIL import Image
import numpy as np

# Lazily-built InceptionV3 feature extractor, shared by every call to extract_features.
_FEATURE_EXTRACTOR = None


def extract_features(image):
    """Encode a PIL image into an InceptionV3 feature array.

    The extractor is InceptionV3 (ImageNet weights) truncated at its second-to-last
    layer. It is built on first use and then reused, because constructing the network
    on every call is by far the most expensive step.

    Args:
        image: a PIL image. It is converted to RGB, so RGBA or grayscale inputs are
            accepted as well.

    Returns:
        The array returned by ``predict`` for a batch containing this single image.
    """
    global _FEATURE_EXTRACTOR
    if _FEATURE_EXTRACTOR is None:
        base_model = InceptionV3(weights='imagenet')
        _FEATURE_EXTRACTOR = Model(
            inputs=base_model.inputs,
            outputs=base_model.layers[-2].output,
        )

    # InceptionV3 expects a 3-channel 299x299 input; convert() also normalises
    # palette / alpha / grayscale images that would otherwise fail in predict().
    resized = image.convert('RGB').resize((299, 299))
    batch = np.expand_dims(img_to_array(resized), axis=0)

    # NOTE: this must be the inception_v3 preprocess_input. The cell also imports the
    # VGG16 variant earlier; the later inception_v3 import is the one bound to the name.
    batch = preprocess_input(batch)

    return _FEATURE_EXTRACTOR.predict(batch, verbose=0)

def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    The vocabulary is scanned in its iteration order and the first matching word is
    returned; ``None`` is returned when no word has that index.
    """
    matching_words = (
        token
        for token, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    return next(matching_words, None)
def generate_caption(model, tokenizer, photo, max_length, stop_on_repeat=True):
    """Greedily decode a caption for one image.

    Starting from ``'startseq'``, the text generated so far is tokenised and padded to
    ``max_length``, the model is asked for the next-token scores, and the highest-scoring
    word is appended. Decoding stops when ``'endseq'`` is produced, when the predicted id
    has no word in the vocabulary, or after ``max_length`` steps.

    Args:
        model: captioning model; called as ``model.predict([photo, sequence])``.
        tokenizer: object exposing ``word_index`` and ``texts_to_sequences``.
        photo: image features passed through unchanged as the model's first input.
        max_length: padding length and the maximum number of decoding steps.
        stop_on_repeat: when True (the default, and the original behaviour) decoding
            stops as soon as the model predicts a word that already appears in the
            caption. This guards against repetition loops but also cuts captions at the
            first legitimate repeat (e.g. a second "a"); pass False to disable it.

    Returns:
        The caption as a space-separated string with ``startseq`` / ``endseq`` removed.
    """
    # Build the id -> word map once instead of scanning the whole vocabulary on every
    # decoding step. setdefault keeps the first word seen for an id, matching the
    # first-match semantics of a linear scan over word_index.
    id_to_word = {}
    for token, token_id in tokenizer.word_index.items():
        id_to_word.setdefault(token_id, token)

    in_text = 'startseq'
    seen_words = set()

    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)

        scores = model.predict([photo, sequence], verbose=0)
        # int() turns the NumPy integer from argmax into a plain dict key.
        word = id_to_word.get(int(np.argmax(scores)))

        if word is None:
            break
        if stop_on_repeat and word in seen_words:
            break
        seen_words.add(word)

        in_text += ' ' + word

        if word == 'endseq':
            break

    return ' '.join(w for w in in_text.split() if w not in ('startseq', 'endseq'))


In [50]:
import gradio as gr
from gtts import gTTS
import os

def caption_and_speak(image):
    """Gradio callback: caption an image and synthesise the caption as speech.

    Args:
        image: the PIL image from the upload component, or None when nothing has
            been uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the caption and
        ``audio_path`` points to an MP3 file. On any failure ``text`` is a message
        starting with "❌ Error:" and ``audio_path`` is None, so the UI shows the
        problem instead of raising.
    """
    # Clicking the button with no image would otherwise surface a cryptic
    # "'NoneType' object has no attribute ..." message.
    if image is None:
        return "❌ Error: please upload an image first.", None

    try:
        import tempfile  # local import: only this callback needs it

        # Extract features
        features = extract_features(image)

        # Generate caption
        caption = generate_caption(model, tokenizer, features, max_length)

        # gTTS refuses empty text; report that case explicitly.
        if not caption.strip():
            return "❌ Error: the model did not produce a caption for this image.", None

        # Convert to speech. A unique file per request prevents concurrent users
        # (the app is served on a public share link) from overwriting each other's audio.
        tts = gTTS(text=caption, lang='en')
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            audio_path = audio_file.name
        tts.save(audio_path)

        return caption, audio_path
    except Exception as e:
        # Deliberate catch-all at the UI boundary: show the error in the caption box.
        return f"❌ Error: {str(e)}", None

# Two-column Blocks UI: image upload + button on the left, caption text and audio on the right.
app_description = (
    "Upload an image to generate a natural language description using a deep learning model. "
    "The caption will also be read aloud using Google Text-to-Speech (gTTS)."
)

with gr.Blocks(title="AI Captioning App with Voice Output") as demo:
    # Header and short usage description.
    gr.Markdown("## 🖼️ Image Captioning with Voice Output")
    gr.Markdown(app_description)

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("🔍 Generate Caption")

        # Right column: outputs.
        with gr.Column():
            caption_output = gr.Textbox(label="📝 Generated Caption", lines=2)
            audio_output = gr.Audio(label="🔊 Caption Voice")

    # Wire the button to the captioning callback; its two return values fill the
    # textbox and the audio player, in that order.
    generate_btn.click(
        fn=caption_and_speak,
        inputs=[image_input],
        outputs=[caption_output, audio_output],
    )

# Start the app.
demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://00b7d3a5b11d2f6133.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


