In [3]:
!pip install --upgrade gradio
!pip install accelerate
!pip install SpeechRecognition
!pip install diffusers
!pip install torch
!pip install pillow



In [4]:
import gradio as gr
import speech_recognition as sr
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image

# Choose the compute device up front: CUDA when available, CPU otherwise
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Stable Diffusion pipeline a single time at startup and move it
# onto the chosen device so every request reuses the same instance
model_id = "CompVis/stable-diffusion-v1-4"
pipe = StableDiffusionPipeline.from_pretrained(model_id).to(device)

# Function to recognize speech
def recognize_speech(audio_file_path):
    """Transcribe an audio file with the Google Speech Recognition service.

    Args:
        audio_file_path: Path to the recorded audio file. May be ``None`` or
            empty when the user submitted the form without recording.

    Returns:
        The recognized text, or ``None`` when no audio was supplied, the file
        could not be read, the speech was unintelligible, or the recognition
        request failed. The reason is printed in every failure case.
    """
    # Nothing was recorded/uploaded: bail out before touching the recognizer,
    # which would otherwise fail on a non-path argument.
    if not audio_file_path:
        print("No audio file was provided")
        return None

    recognizer = sr.Recognizer()

    # Reading the file is kept separate from the network call so a file
    # format problem is reported as such.
    try:
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)
    except ValueError as e:
        # Raised when the file is not a format the library can decode.
        print(f"Could not read audio file; {e}")
        return None

    try:
        text = recognizer.recognize_google(audio)
        print(f"Recognized: {text}")
        return text
    except sr.UnknownValueError:
        print("Speech recognition could not understand audio")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
    return None

# Function to generate image based on text using Stable Diffusion
def generate_image(text):
    """Generate an image from a text prompt with the module-level pipeline.

    Args:
        text: Prompt passed to the Stable Diffusion pipeline ``pipe``.

    Returns:
        The first entry of the pipeline output's ``images`` collection.
    """
    # Local stdlib import keeps this block self-contained.
    from contextlib import nullcontext

    # Mixed precision is only used on CUDA; on CPU the pipeline runs as-is.
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
    if device == "cuda":
        precision_ctx = torch.autocast(device_type="cuda")
    else:
        precision_ctx = nullcontext()

    with precision_ctx:
        outputs = pipe(text)

    # The pipeline returns a batch; only the first image is used.
    return outputs.images[0]

# Function to recognize speech and generate image for Gradio interface
def recognize_and_generate(audio_file_path):
    """Gradio callback: transcribe the recording, then render it as an image.

    Args:
        audio_file_path: Path of the audio file handed over by the interface.

    Returns:
        A 3-tuple ``(text, image, audio_path)``. When nothing was recognized
        the tuple is ``("No speech recognized", None, None)``.
    """
    recognized = recognize_speech(audio_file_path)

    # Guard clause: without a transcript there is nothing to generate.
    if not recognized:
        return "No speech recognized", None, None

    picture = generate_image(recognized)
    return recognized, picture, audio_file_path

# Gradio interface
def gradio_interface():
    """Build the voice-to-image Gradio interface (without launching it).

    The interface feeds one audio input, delivered to the callback as a file
    path, into ``recognize_and_generate`` and shows its three results: the
    recognized text, the generated image, and the recorded audio.

    Returns:
        The constructed ``gr.Interface``; call ``.launch()`` on it to serve.
    """
    # Build the interface directly. Wrapping an already-constructed Interface
    # in a `with` block only to return it re-enters its Blocks context for no
    # benefit; plain construction is the documented usage.
    return gr.Interface(
        fn=recognize_and_generate,
        inputs=gr.Audio(type="filepath", label="Record your voice"),
        outputs=[
            gr.Textbox(label="Recognized Text"),
            gr.Image(label="Generated Image"),
            gr.Audio(label="Recorded Audio"),
        ],
        title="Acoustic Artistry-Voice to Image Generator",
        description="Use the buttons below to record your voice. The app will display the recognized text and generate an image based on your speech.",
    )

# Entry point: build the interface and start serving it. The guard keeps the
# app from launching if this code is imported rather than executed directly.
if __name__ == "__main__":
    gradio_interface().launch()


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1c92217376d1916c35.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
