In [None]:
pip install gradio transformers torch torchvision pillow

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [None]:
import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint ids, named once so each processor/model pair stays in sync.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
GPT2_CHECKPOINT = "gpt2"

# BLIP turns an uploaded image into a caption.
caption_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
caption_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# GPT-2 writes the chatbot's text reply.
text_tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)
text_model = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)

# Generate image caption using BLIP
def get_image_caption(image):
    """Caption ``image`` with the module-level BLIP processor and model.

    Args:
        image: The image to describe. It is handed directly to
            ``caption_processor`` (the UI in this cell supplies a PIL image).

    Returns:
        The decoded caption string, with special tokens removed.
    """
    model_inputs = caption_processor(image, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated = caption_model.generate(**model_inputs)

    first_sequence = generated[0]
    return caption_processor.decode(first_sequence, skip_special_tokens=True)

# Generate chatbot response
def multimodal_chatbot(user_input, image):
    """Answer ``user_input`` with GPT-2, optionally grounded in a caption of ``image``.

    Args:
        user_input: Text typed by the user.
        image: Optional uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        The text GPT-2 generated *after* the prompt, stripped of surrounding
        whitespace. The prompt itself is not echoed back.
    """
    image_caption = ""
    # Explicit None check: do not rely on the truthiness of the image object.
    if image is not None:
        image_caption = get_image_caption(image)

    # Construct prompt for GPT-2
    prompt = f"User said: '{user_input}'"
    if image_caption:
        prompt += f"\nBased on the image, it looks like: {image_caption}."

    # Budget the generation in *new* tokens. The previous `max_length=200` also
    # counted the prompt, so a long prompt left no room to generate anything.
    max_new_tokens = 200
    context_window = text_model.config.max_position_embeddings
    encoded = text_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        # Leave room in the model's context window for the reply.
        max_length=max(1, context_window - max_new_tokens),
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = text_model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
            pad_token_id=text_tokenizer.eos_token_id,
        )

    # `generate` returns prompt + continuation; keep only the continuation.
    prompt_length = encoded["input_ids"].shape[-1]
    response = text_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Gradio UI: a text box plus an optional image upload, wired to multimodal_chatbot.
# The title's emoji were mis-encoded (mojibake) and are restored here.
gr.Interface(
    fn=multimodal_chatbot,
    inputs=[
        gr.Textbox(label="Ask Something"),
        gr.Image(type="pil", label="Upload Image (optional)")
    ],
    outputs=gr.Textbox(label="Chatbot Response"),
    title="🧠📷 Smart Multimodal Chatbot",
    description="Understands any question and any image. Combines BLIP for image understanding and GPT-2 for text response."
).launch()


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1835e539badb63885d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
pip install torchaudio



In [None]:
import gradio as gr
import torch
import os
from PIL import Image
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM,
    pipeline
)
import torchaudio
import torchaudio.transforms as transforms

# BLIP (image captioning): the processor and the model come from the same checkpoint.
caption_processor, caption_model = (
    loader.from_pretrained("Salesforce/blip-image-captioning-base")
    for loader in (BlipProcessor, BlipForConditionalGeneration)
)

# GPT-2 (reply generation and audio summarization): tokenizer first, then the model.
text_tokenizer, text_model = (
    loader.from_pretrained("gpt2")
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

# Whisper (speech-to-text), wrapped in a transformers pipeline.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
)

# Generate image caption using BLIP
def get_image_caption(image):
    """Return a BLIP-generated caption for ``image``.

    Args:
        image: The image to caption; passed unchanged to ``caption_processor``.

    Returns:
        The caption decoded from the first generated sequence, without
        special tokens.
    """
    # No gradients are needed for captioning.
    with torch.no_grad():
        blip_inputs = caption_processor(image, return_tensors="pt")
        token_ids = caption_model.generate(**blip_inputs)

    return caption_processor.decode(token_ids[0], skip_special_tokens=True)

# Generate transcription from audio using Whisper
def get_audio_transcription(audio_path):
    """Transcribe the audio file at ``audio_path`` with the Whisper pipeline.

    The file is loaded with torchaudio, averaged down to a single channel when
    it has several, resampled to 16 kHz when needed, and cut to at most
    30 seconds before being passed to ``asr_pipeline``.

    Args:
        audio_path: Filesystem path of the audio file.

    Returns:
        The transcribed text. Failures are reported rather than raised: a
        missing file, a result without a ``"text"`` entry, or any exception
        during processing yields a string beginning with ``"Error"``.
    """
    target_rate = 16000
    sample_limit = target_rate * 30  # 30 seconds at the target rate

    try:
        if not os.path.exists(audio_path):
            return "Error: Audio file not found."

        signal, source_rate = torchaudio.load(audio_path)

        # Several channels: average them into one.
        channel_count = signal.shape[0]
        if channel_count > 1:
            signal = signal.mean(dim=0, keepdim=True)

        if source_rate != target_rate:
            signal = transforms.Resample(orig_freq=source_rate, new_freq=target_rate)(signal)

        samples = signal.squeeze().numpy()

        if samples.shape[-1] > sample_limit:
            samples = samples[:sample_limit]

        asr_output = asr_pipeline(samples, return_timestamps=False)
        return asr_output.get("text", "Error: Unable to transcribe audio.")
    except Exception as exc:
        return f"Error processing audio: {exc}"

# Generate GPT-2 output (also used for summarizing audio)
def generate_response(prompt, max_new_tokens=200):
    """Sample a GPT-2 continuation of ``prompt``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt. The previous ``max_length=200`` also counted the
            prompt, so a long prompt (for example one that embeds an earlier
            summary) left no room to generate.

    Returns:
        Only the newly generated text, stripped. The prompt is not echoed
        back, so a "summary" no longer starts with its own instruction.
    """
    context_window = text_model.config.max_position_embeddings
    encoded = text_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        # Keep prompt + reply inside the model's context window.
        max_length=max(1, context_window - max_new_tokens),
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = text_model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
            pad_token_id=text_tokenizer.eos_token_id,
        )

    # `generate` returns prompt + continuation; drop the prompt tokens.
    prompt_length = encoded["input_ids"].shape[-1]
    return text_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

# Main chatbot function
def multimodal_chatbot(user_input, image, audio):
    """Combine text, an optional image and optional audio into one GPT-2 reply.

    Args:
        user_input: Text typed by the user.
        image: Optional uploaded image, or ``None``.
        audio: Optional uploaded audio, or ``None``. The UI in this cell is
            declared with ``gr.Audio(type="filepath")``; an object exposing
            the path as ``.name`` (what this function originally assumed) is
            still accepted.

    Returns:
        The string returned by ``generate_response`` for the combined prompt.
    """
    image_caption = ""
    audio_summary = ""

    if image is not None:
        image_caption = get_image_caption(image)

    if audio is not None:
        # A plain path has no `.name`; unconditionally reading `audio.name`
        # raised AttributeError for every upload that arrived as a string.
        if isinstance(audio, (str, os.PathLike)):
            audio_path = audio
        else:
            audio_path = audio.name
        # NOTE: get_audio_transcription reports failures as "Error..." strings,
        # which are summarized like any other transcript.
        audio_text = get_audio_transcription(audio_path)
        audio_summary = generate_response(f"Summarize this in a simple way:\n{audio_text}")

    # Build final prompt
    prompt = f"User said: '{user_input}'"
    if image_caption:
        prompt += f"\nImage description: {image_caption}."
    if audio_summary:
        prompt += f"\nSummary of the audio: {audio_summary}"

    # Generate chatbot response
    return generate_response(prompt)

# Launch Gradio UI: text box, optional image upload and optional audio upload.
# The title's emoji were mis-encoded (mojibake) and are restored here.
gr.Interface(
    fn=multimodal_chatbot,
    inputs=[
        gr.Textbox(label="Ask Something"),
        gr.Image(type="pil", label="Upload Image (optional)"),
        gr.Audio(type="filepath", label="Upload Audio (optional)")
    ],
    outputs=gr.Textbox(label="Chatbot Response"),
    title="🧠📷🎧 Smart Multimodal Chatbot",
    description="Understands text, images (with BLIP), and audio (with Whisper). Combines all inputs to generate intelligent responses."
).launch()


Device set to use cpu


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://239747f5e7767eb531.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
import torch
import os
import numpy as np
from PIL import Image
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM,
    pipeline
)
import torchaudio
import torchaudio.transforms as transforms

# Checkpoint ids used by this cell, one per task.
CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-base"
TEXT_MODEL_ID = "gpt2"
ASR_MODEL_ID = "openai/whisper-base"

# Image captioning (BLIP)
caption_processor = BlipProcessor.from_pretrained(CAPTION_MODEL_ID)
caption_model = BlipForConditionalGeneration.from_pretrained(CAPTION_MODEL_ID)

# Reply generation and audio summarization (GPT-2)
text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_ID)
text_model = AutoModelForCausalLM.from_pretrained(TEXT_MODEL_ID)

# Audio transcription (Whisper)
asr_pipeline = pipeline("automatic-speech-recognition", model=ASR_MODEL_ID)

# Image Captioning Function
def get_image_caption(image):
    """Caption ``image`` with BLIP, logging progress to stdout.

    Args:
        image: Image object supporting ``.convert("RGB")`` (the UI in this
            cell supplies a PIL image).

    Returns:
        The decoded caption, or the literal string ``"Error processing image."``
        when anything in the captioning path raises.
    """
    try:
        # Log glyphs were mis-encoded (mojibake) in this cell; restored here.
        print("🔍 Processing image...")
        image = image.convert("RGB")  # Ensure correct format
        inputs = caption_processor(image, return_tensors="pt")

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            caption_ids = caption_model.generate(**inputs)

        caption = caption_processor.decode(caption_ids[0], skip_special_tokens=True)
        print(f"✅ Image Caption: {caption}")
        return caption
    except Exception as e:
        # Best-effort by design: report the failure and keep the chatbot running.
        print(f"❌ Image Processing Error: {e}")
        return "Error processing image."

# Audio Transcription Function
def get_audio_transcription(audio_path):
    """Transcribe the audio file at ``audio_path`` with Whisper, logging progress.

    The file is loaded with torchaudio, averaged to one channel when it has
    several, resampled to 16 kHz when needed, converted to a float32 NumPy
    array and cut to at most 30 seconds before transcription.

    Args:
        audio_path: Filesystem path of the audio file.

    Returns:
        The transcribed text. Failures are reported rather than raised: a
        missing (or ``None``) path, a result without a ``"text"`` entry, or
        any exception yields a string beginning with ``"Error"``.
    """
    target_rate = 16000
    max_length = target_rate * 30  # 30 seconds at the target rate

    try:
        # `not audio_path` gives None/"" the same clear message as a missing file,
        # instead of surfacing os.path.exists' TypeError text.
        if not audio_path or not os.path.exists(audio_path):
            return "Error: Audio file not found."

        # Log glyphs were mis-encoded (mojibake) in this cell; restored here.
        print("🔉 Processing audio...")

        # Load and preprocess audio
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono

        # Resample to 16kHz
        if sample_rate != target_rate:
            resampler = transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
            waveform = resampler(waveform)

        waveform = waveform.squeeze().numpy().astype(np.float32)  # Ensure correct format

        # Clip to 30 seconds max
        if waveform.shape[-1] > max_length:
            waveform = waveform[:max_length]

        print("🎙️ Running Whisper model for transcription...")
        result = asr_pipeline(waveform)

        transcription = result.get("text", "Error: Unable to transcribe audio.")
        print(f"✅ Transcription: {transcription}")
        return transcription
    except Exception as e:
        # Best-effort by design: report the failure and keep the chatbot running.
        print(f"❌ Audio Processing Error: {e}")
        return f"Error processing audio: {e}"

# Text Generation Function
def generate_response(prompt, max_new_tokens=200):
    """Sample a GPT-2 continuation of ``prompt``, logging progress to stdout.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt. The previous ``max_length=200`` also counted the
            prompt, so a long prompt (for example one that embeds an earlier
            summary) left no room to generate.

    Returns:
        Only the newly generated text, stripped; the prompt is not echoed
        back. Returns the literal string ``"Error generating response."``
        when generation raises.
    """
    try:
        # Log glyphs were mis-encoded (mojibake) in this cell; restored here.
        print(f"🤖 Generating response for: {prompt[:100]}...")  # Limit log size

        context_window = text_model.config.max_position_embeddings
        encoded = text_tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            # Keep prompt + reply inside the model's context window.
            max_length=max(1, context_window - max_new_tokens),
        )

        with torch.no_grad():
            outputs = text_model.generate(
                encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_new_tokens=max_new_tokens,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.2,
                do_sample=True,
                pad_token_id=text_tokenizer.eos_token_id,
            )

        # `generate` returns prompt + continuation; drop the prompt tokens.
        prompt_length = encoded["input_ids"].shape[-1]
        response = text_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        print(f"✅ GPT-2 Response: {response}")
        return response
    except Exception as e:
        # Best-effort by design: report the failure and keep the chatbot running.
        print(f"❌ GPT-2 Error: {e}")
        return "Error generating response."

# Main Chatbot Function
def multimodal_chatbot(user_input, image, audio):
    """Combine text, an optional image and optional audio into one GPT-2 reply.

    Args:
        user_input: Text typed by the user.
        image: Optional uploaded image, or ``None``.
        audio: Optional uploaded audio, or ``None``. The UI in this cell is
            declared with ``gr.Audio(type="filepath")``; an object exposing
            the path as ``.name`` (what this function originally assumed) is
            still accepted.

    Returns:
        The string returned by ``generate_response`` for the combined prompt,
        or ``"Error: <message>"`` when anything in this function raises.
    """
    try:
        image_caption = ""
        audio_summary = ""

        # Log glyphs were mis-encoded (mojibake) in this cell; restored here.
        print(f"\n💬 User input: {user_input}")

        if image is not None:
            image_caption = get_image_caption(image)

        if audio is not None:
            # A plain path has no `.name`; reading `audio.name` unconditionally
            # raised AttributeError, which the handler below turned into an
            # error reply for every audio upload that arrived as a string.
            if isinstance(audio, (str, os.PathLike)):
                audio_path = audio
            else:
                audio_path = audio.name
            audio_text = get_audio_transcription(audio_path)

            # Cap the transcript at 500 characters to bound the summarization prompt.
            summary_prompt = f"Summarize this in a simple way:\n{audio_text[:500]}"
            audio_summary = generate_response(summary_prompt)

        # Build final chatbot prompt
        prompt = f"User said: '{user_input}'"
        if image_caption:
            prompt += f"\nImage description: {image_caption}."
        if audio_summary:
            prompt += f"\nSummary of the audio: {audio_summary}"

        print("🚀 Final Prompt to GPT-2:", prompt)
        return generate_response(prompt)
    except Exception as e:
        # Last-resort handler so the UI shows a message instead of a stack trace.
        print(f"❌ Error in chatbot function: {e}")
        return f"Error: {e}"

# Launch Gradio UI: text box, optional image upload and optional audio upload.
# The title's emoji were mis-encoded (mojibake) and are restored here.
gr.Interface(
    fn=multimodal_chatbot,
    inputs=[
        gr.Textbox(label="Ask Something"),
        gr.Image(type="pil", label="Upload Image (optional)"),
        gr.Audio(type="filepath", label="Upload Audio (optional)")
    ],
    outputs=gr.Textbox(label="Chatbot Response"),
    title="🧠📷🎧 Smart Multimodal Chatbot",
    description="Understands text, images (with BLIP), and audio (with Whisper). Combines all inputs to generate intelligent responses."
).launch()


Device set to use cpu


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://03c791900a15ce1b6a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


