In [2]:
!pip install SpeechRecognition
!pip install googletrans==4.0.0-rc1
!pip install pydub
!pip install Pillow
!pip install requests
!pip install torch torchvision torchaudio
!pip install diffusers transformers accelerate

from io import BytesIO
from base64 import b64decode
from google.colab import output
from IPython.display import Javascript, display, Markdown, Image
import speech_recognition as sr
from pydub import AudioSegment
import tempfile
import ipywidgets as widgets
from diffusers import StableDiffusionPipeline
import torch
from googletrans import Translator

# Fetch the pretrained Stable Diffusion v1.5 pipeline from Hugging Face, then
# place it on the GPU when CUDA is usable and on the CPU otherwise.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
if torch.cuda.is_available():
    pipe.to("cuda")
else:
    pipe.to("cpu")

# JavaScript injected into the notebook front end to capture microphone audio.
# It defines `record(ms)`, which records for `ms` milliseconds and resolves to
# a base64 data URL of the captured audio/webm blob.
#
# All working variables are block-scoped (no implicit globals), the microphone
# tracks are stopped once the take is finished, and a failure to obtain the
# microphone rejects the returned promise instead of leaving it pending.
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = async time => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  try {
    const recorder = new MediaRecorder(stream)
    const chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    const stopped = new Promise(resolve => { recorder.onstop = resolve })
    recorder.start()
    await sleep(time)
    recorder.stop()
    await stopped
    const blob = new Blob(chunks, { type: 'audio/webm' })
    return await b2text(blob)
  } finally {
    // Release the microphone so the browser stops capturing after each take.
    stream.getTracks().forEach(track => track.stop())
  }
}
"""

def record(sec=3):
    """Record microphone audio in the browser and return the raw bytes.

    Injects the ``RECORD`` JavaScript into the output area, then calls its
    ``record`` function for ``sec + 1`` seconds (expressed in milliseconds).
    The JavaScript call returns a data URL; the base64 payload after the
    first comma is decoded and wrapped in an in-memory stream.

    Args:
        sec: Requested recording length in seconds; one second is added
            before the value is passed to the browser.

    Returns:
        A ``BytesIO`` holding the decoded recording.
    """
    print("Speak Now...")
    display(Javascript(RECORD))
    duration_ms = (sec + 1) * 1000
    data_url = output.eval_js('record(%d)' % duration_ms)
    print("Done Recording!")
    encoded_audio = data_url.split(',')[1]
    return BytesIO(b64decode(encoded_audio))

def convert_webm_to_wav(webm_stream):
    """Decode a WebM audio stream and return it as an open WAV temporary file.

    Args:
        webm_stream: File-like object containing WebM audio.

    Returns:
        An open ``tempfile.NamedTemporaryFile`` with a ``.wav`` suffix,
        rewound to the start of the WAV data. The file is removed from disk
        automatically when the handle is closed or garbage-collected.
    """
    audio = AudioSegment.from_file(webm_stream, format="webm")
    # Use the default auto-deleting temp file: with delete=False nothing ever
    # removed the file, so every recording left a stray .wav on disk.
    wav_stream = tempfile.NamedTemporaryFile(suffix='.wav')
    # Export through the already-open handle instead of reopening by name,
    # so no second, never-closed file object is created.
    audio.export(wav_stream, format="wav")
    wav_stream.seek(0)
    return wav_stream

def audio_to_text(audio_stream, language_code="en-US"):
    """Transcribe an audio source with the Google speech recognizer.

    Args:
        audio_stream: Audio source passed to ``sr.AudioFile``.
        language_code: Language passed to ``recognize_google``.

    Returns:
        The recognized text on success. On failure a message string is
        returned instead of raising: ``"Could not understand audio"`` for
        ``sr.UnknownValueError`` and ``"Speech API request error: ..."`` for
        ``sr.RequestError``.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_stream) as source:
        captured = recognizer.record(source)

    print("Transcribing audio...")
    try:
        transcript = recognizer.recognize_google(captured, language=language_code)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as err:
        return f"Speech API request error: {err}"
    return transcript

# Translate text from a regional language into English
def translate_to_english(text, src_language):
    """Return the English translation of ``text``.

    Args:
        text: The text to translate.
        src_language: Source language code passed to the translator as ``src``.

    Returns:
        The ``text`` attribute of the translation result.
    """
    result = Translator().translate(text, src=src_language, dest='en')
    return result.text

def generate_art(prompt, num_inference_steps=50, width=512, height=512):
    """Render ``prompt`` with the module-level pipeline and return JPEG bytes.

    Args:
        prompt: Text prompt for the image; a falsy prompt skips generation.
        num_inference_steps: Forwarded to the pipeline call.
        width: Forwarded to the pipeline call.
        height: Forwarded to the pipeline call.

    Returns:
        The first generated image encoded as JPEG bytes, or ``None`` when
        ``prompt`` is falsy.
    """
    print("Generating image with Stable Diffusion...")
    if prompt:
        result = pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            height=height,
            width=width,
        )
        buffer = BytesIO()
        result.images[0].save(buffer, format='JPEG')
        return buffer.getvalue()
    return None

# Input-language picker (Indian regional languages). Each entry pairs the
# label shown in the dropdown with the language code used as its value.
_LANGUAGE_CHOICES = [
    ("Hindi", "hi"),
    ("Bengali", "bn"),
    ("Telugu", "te"),
    ("Marathi", "mr"),
    ("Tamil", "ta"),
    ("Gujarati", "gu"),
    ("Kannada", "kn"),
    ("Malayalam", "ml"),
    ("Punjabi", "pa"),
]
language_selector = widgets.Dropdown(
    options=_LANGUAGE_CHOICES,
    description='Input Language:',
    style={'description_width': 'initial'},
)

# Button that starts the record -> transcribe -> translate -> generate flow.
translate_button = widgets.Button(
    description='Transcribe & Generate Art',
    button_style='success',
)

# Output area that receives everything the click handler displays.
output_widget = widgets.Output()

def on_translate_button_clicked(b):
    """Handle a button click: record, transcribe, translate, and render art.

    Everything is displayed inside ``output_widget``, which is cleared first.
    The flow records 10 seconds of audio, converts it to WAV, transcribes it
    in the language selected in ``language_selector``, translates the
    transcription to English, and uses the translation as the image prompt.

    Args:
        b: The button instance supplied by the click callback (unused).
    """
    with output_widget:
        output_widget.clear_output()
        audio_webm_stream = record(sec=10)
        audio_wav_stream = convert_webm_to_wav(audio_webm_stream)
        input_language = language_selector.value
        try:
            transcription = audio_to_text(audio_wav_stream, input_language)
        finally:
            # The WAV handle is only needed for transcription; release it.
            audio_wav_stream.close()

        # audio_to_text reports failures as plain strings. Both failure
        # messages must be recognised, otherwise a speech-API error message
        # would be translated and used as the image prompt. Exact/prefix
        # matching also avoids rejecting real speech that merely contains
        # the failure phrase.
        transcription_failed = (
            transcription == "Could not understand audio"
            or transcription.startswith("Speech API request error:")
        )
        if transcription_failed:
            display(Markdown(transcription))
            return

        display(Markdown(f"### Transcription: {transcription}"))

        # Translate the transcription to English
        translated_text = translate_to_english(transcription, input_language)
        display(Markdown(f"### Translated Text (to English): {translated_text}"))

        # Generate the image using the translated text as the prompt
        art_image = generate_art(translated_text, num_inference_steps=50, width=512, height=512)
        if art_image:
            display(Markdown("### AI-Generated Art:"))
            display(Image(data=art_image, format='jpeg'))
        else:
            display(Markdown("### Image generation failed. Please try again."))

# Register the click handler, then render the controls in order:
# language picker, trigger button, results area.
translate_button.on_click(on_translate_button_clicked)

for _control in (language_selector, translate_button, output_widget):
    display(_control)




Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Dropdown(description='Input Language:', options=(('Hindi', 'hi'), ('Bengali', 'bn'), ('Telugu', 'te'), ('Marat…

Button(button_style='success', description='Transcribe & Generate Art', style=ButtonStyle())

Output()