In [1]:
!pip install torch torchvision transformers gradio openai-whisper gtts Pillow

Collecting gradio
  Downloading gradio-5.28.0-py3-none-any.whl.metadata (16 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

In [None]:
import whisper
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
import torch
import gradio as gr
from PIL import Image
import tempfile
import os

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Whisper for ASR (Speech-to-text)
asr_model = whisper.load_model("small").to(device)



# Load BLIP-2 for Image Question-Answering
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
def speech_to_text(audio_file):
    if audio_file is None or not os.path.exists(audio_file):
        return "No audio received or file not found."

    # Whisper ASR transcription
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(device)

    options = whisper.DecodingOptions(language="en")
    result = whisper.decode(asr_model, mel, options)

    return result.text


def image_qa(image, question):
    print("\n🔍 [INPUT to VQA model]")
    print("Question:", question)

    # Use a properly formatted prompt
    prompt = f"Question: {question.strip()} Answer:"

    # ✅ FIX: pass text as a list
    inputs = processor(images=image, text=[prompt], return_tensors="pt").to(
        device, torch.float16 if device == "cuda" else torch.float32
    )

    print("Tokenized input IDs:", inputs["input_ids"])

    generated_ids = model.generate(**inputs, max_new_tokens=50)

    print("\n🧠 [Raw Output token IDs from model]")
    print(generated_ids)

    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("\n✅ [Final Decoded Answer]")
    print("Answer:", answer)

    return answer





def text_to_speech(text):
    if not text or text.strip() == "":
        text = "No answer was generated."

    tts = gTTS(text=text, lang='en')
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(temp_file.name)
    return temp_file.name


def multimodal_pipeline(audio, image):
    # Speech-to-text
    question = speech_to_text(audio)

    # Image-QA
    img = Image.open(image).convert("RGB")
    answer = image_qa(img, question)

    # Text-to-speech
    audio_response = text_to_speech(answer)

    return question, answer, audio_response

# Gradio Interface
iface = gr.Interface(
    fn=multimodal_pipeline,
    inputs=[

        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Ask or Upload your Question (audio)"),
        gr.Image(type="filepath", label="Upload an Image")
    ],
    outputs=[
        gr.Textbox(label="Transcribed Question"),
        gr.Textbox(label="Answer"),
        gr.Audio(label="Spoken Answer")
    ],
    title="🎙️🖼️ Ask-the-Image Mini-App",
    description="Record a spoken question and upload an image. The AI answers your question about the image."
)

iface.launch(debug=True)


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://16654a94aec2d1ad8b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



🔍 [INPUT to VQA model]
Question: How many dogs in this image?
Tokenized input IDs: tensor([[50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265,
         50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265,
         50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265,
         50265, 50265,     2, 45641,    35,  1336,   171,  3678,    11,    42,
          2274,   116, 31652,    35]], device='cuda:0')

🧠 [Raw Output token IDs from model]
tensor([[50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265,
         50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265,
         50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265,
         50265, 50265,     2, 45641,    35,  1336,   171,  3678,    11,    42,
          2274,   116, 31652,    35,   112, 50118]], device='cuda:0')

✅ [Final Decoded Answer]
Answer: Question: How many dogs in this image? Answer: 1


🔍 [INPUT to VQA model]
Question: How

