In [None]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install pyaudio pydub
!pip install --quiet --upgrade diffusers transformers accelerate cohere openai openai-whisper

In [None]:
# realtime audio to image with a christmas theme using Stable Diffusion Turbo and Whisper
# by Ed Fries
# public domain

import random, sys, torch
from diffusers import AutoPipelineForText2Image
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
from io import BytesIO
from pydub import AudioSegment

# JavaScript helper injected into the Colab output frame by record().
# It defines a global `record(ms)` that captures `ms` milliseconds of microphone
# audio and resolves to a base64 data URL ("data:<mime>;base64,<payload>").
# `record` is deliberately declared with `var` so it stays reachable from
# output.eval_js(); everything else is block-scoped.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = async time => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  const stopped = new Promise(resolve => { recorder.onstop = resolve })
  recorder.start()
  await sleep(time)
  recorder.stop()
  await stopped
  // release the microphone so the capture does not stay open between calls
  stream.getTracks().forEach(track => track.stop())
  return await b2text(new Blob(chunks))
}
"""

def record(sec=3):
  """Capture audio from the browser microphone and return it as raw bytes.

  Injects the RECORD JavaScript into the cell output, then asks the browser
  to run `record(<milliseconds>)`. One extra second is added to `sec` before
  the call. The browser answers with a data URL; only the base64 payload
  after the first comma is decoded and returned.

  sec: requested recording length in seconds.
  Returns: the decoded bytes of the recorded audio blob.
  """
  display(Javascript(RECORD))
  duration_ms = (sec + 1) * 1000
  data_url = output.eval_js('record(%d)' % duration_ms)
  payload = data_url.split(',')[1]
  return b64decode(payload)

# User-tunable settings: change these variables to customize your experience.
bonusPrompt = "a christmas themed " # this string is prepended to the prompt sent to SD. Change it to anything you want to give your pictures a consistent theme
X=512       # image width passed to the SD pipeline; you can use a larger size but SD Turbo makes better images at 512x512 resolution
Y=512       # image height passed to the SD pipeline
whisperModel = "openai/whisper-large-v3"  # model id loaded for speech recognition
whisperDevice = 'cuda' #'cpu' or 'cuda'
runLocal = False    # Passed as local_files_only when the SD pipeline is loaded. Set this to True to run without needing to be connected to the internet.
timeoutLength = 5   # This changes how long it collects audio information before passing it to Whisper. Try 5 for short phrases, 15 for longer phrases.

big=False  #set big=True for 16gb graphics cards
# NOTE(review): both branches below currently assign identical values, so `big`
# has no effect as written -- confirm whether the else branch was meant to select
# a smaller model. `overscale` is not read anywhere in the code visible here.
if big:
    overscale=2 #adjust to fill your screen
    sdModel = "stabilityai/sdxl-turbo"
else:
    overscale=2
    sdModel = "stabilityai/sdxl-turbo"

def InitRender():
    """Load the text-to-image pipeline and publish it as the global `pipe`.

    The model named by `sdModel` is loaded in half precision (fp16 variant,
    safetensors weights) and moved to the CUDA device. `runLocal` is forwarded
    as `local_files_only`, so when it is True only already-downloaded files
    are used.
    """
    # Only `pipe` is assigned here; the former `font`, `scrn` and `info`
    # global declarations were never assigned and have been dropped.
    global pipe
    pipe = AutoPipelineForText2Image.from_pretrained(
        sdModel,
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
        local_files_only=runLocal,
    )
    pipe = pipe.to("cuda")

def RenderImage(prompt):
    """Generate one image for `prompt` and write it to 'output.jpg'.

    A fresh random seed is drawn on every call, so repeating the same prompt
    gives a different picture. The pipeline is run for 4 inference steps with
    guidance disabled (guidance_scale 0.0) at the configured X by Y size.

    prompt: text handed to the global `pipe`.
    """
    seed = random.randint(0, sys.maxsize)
    rng = torch.Generator("cuda").manual_seed(seed)

    result = pipe(
        prompt=prompt,
        guidance_scale=0.0,
        width=X,
        height=Y,
        num_inference_steps=4,
        generator=rng,
    )
    first_image = result.images[0]
    first_image.save("output.jpg")

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image

def ShowImage(caption):
    """Display the image stored in 'output.jpg' inline with matplotlib.

    caption: accepted for the caller's convenience but not used here.
    """
    picture = Image.open('output.jpg')
    plt.imshow(picture)
    plt.axis('off')  # hide ticks and frame so only the picture is shown
    plt.show()

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import whisper
from tempfile import NamedTemporaryFile
import io

def init_hear_text():
    """Load Whisper and set up the globals that hear_text() relies on.

    Sets three module-level names:
      whisperPipe   -- automatic-speech-recognition pipeline built from
                       `whisperModel`, placed on `whisperDevice`
      temp_file     -- path of a scratch file that receives each recording
      transcription -- initialised to ['']
    """
    global temp_file
    global transcription
    global whisperPipe

    # Half precision is only appropriate on the GPU; fall back to float32 when
    # the configured device is 'cpu' so that option actually works.
    dtype = torch.float16 if str(whisperDevice).startswith('cuda') else torch.float32

    # Load / Download model
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        whisperModel,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(whisperDevice)
    processor = AutoProcessor.from_pretrained(whisperModel)
    whisperPipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=dtype,
        device=whisperDevice)

    # Reserve a real file on disk instead of taking the name of a temporary
    # object that is deleted as soon as it is garbage-collected.
    with NamedTemporaryFile(delete=False) as scratch:
        temp_file = scratch.name
    transcription = ['']

def hear_text(sec=None):
    """Record from the microphone and return Whisper's transcription.

    The recording is written to the scratch file `temp_file` and that path is
    handed to `whisperPipe`.

    sec: recording length in seconds passed to record(). When omitted, the
         `timeoutLength` setting is used, so the documented configuration
         value controls how long audio is collected.
    Returns: the transcribed text with surrounding whitespace removed
             (an empty string when nothing was recognised).
    """
    if sec is None:
        sec = timeoutLength
    audio = record(sec)
    with open(temp_file, 'wb') as f:
        f.write(audio)
    result = whisperPipe(temp_file)
    return result['text'].strip()

# main program starts here
InitRender()
init_hear_text()

# Listen -> transcribe -> render -> show, until the user says "terminate".
while True:
    prompt = ""
    # keep recording until Whisper returns some text
    while prompt == "":
        prompt = hear_text()

    # Compare spoken commands without regard to case or trailing punctuation,
    # since the transcription may come back as "Terminate", "terminate." etc.
    spoken = prompt.strip().rstrip(".!?").strip().lower()

    if spoken == "thank you": # Whisper likes to return this when it's quiet
        continue
    if spoken == "terminate":
        break
    print(prompt)
    RenderImage(bonusPrompt+prompt)
    ShowImage(prompt)
print('Bye for now!')