In [1]:
import azure.cognitiveservices.speech as speechsdk
import simpleaudio as sa
import io
from pydub import AudioSegment, silence
import gradio as gr
import tempfile
from dotenv import load_dotenv
import wave
from datetime import datetime
import threading
import asyncio
import os

from utils.config_defaults import get_default_prompts, get_supported_languages, get_supported_themes
from utils.logger import log_event, get_all_events
from utils.human_response import human_speaking, get_human_response
from utils.bot_response import get_bot_response
from utils.conv_summary import get_conv_summary
from utils.audio_queue import get_final_segment

# Load variables from a local .env file; override=True is passed so values
# from .env take precedence over variables already set in the environment.
load_dotenv(override=True)

output_folder = "output"          # run_script writes conversation .wav files here
avatar_folder = "assets/avatars"  # get_avatars builds persona image paths under here
stop_audio = False                # set True by stop_main_loop; polled by main() each turn

# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)



In [2]:
def get_avatars(theme):
    """Build the avatar image paths for the given theme.

    Args:
        theme: Theme name used as the file-name prefix.

    Returns:
        A ``(bot_file_name, human_file_name)`` tuple, each of the form
        ``<avatar_folder>/<theme>_<persona>.png``.  The paths are only
        formatted here; nothing checks that the files exist.
    """
    bot_file_name, human_file_name = (
        f"{avatar_folder}/{theme}_{persona}.png" for persona in ("bot", "human")
    )
    return bot_file_name, human_file_name

In [3]:
async def main(language, bot_msgs, human_msgs, response_mode, voice_mode,
               max_turns=5, farewell_after_turns=5):
    """Run the bot/human conversation, appending to both histories in place.

    Each turn asks ``get_bot_response`` for the bot's line, records it in both
    histories, then obtains the human side either from ``get_human_response``
    (when ``response_mode == "Customer Speaks"``) or from a second
    ``get_bot_response`` call using the "human" persona.

    Args:
        language: Passed through to the response helpers.
        bot_msgs: Chat history from the bot's point of view (list of
            ``{"role", "content"}`` dicts); mutated in place.
        human_msgs: Chat history from the simulated human's point of view;
            mutated in place.
        response_mode: ``"Customer Speaks"`` uses ``get_human_response``;
            any other value generates the human reply with the bot helper.
        voice_mode: ``"Custom"`` passes ``True`` as the last argument of the
            bot-persona ``get_bot_response`` call; anything else passes
            ``False``.
        max_turns: Maximum number of bot/human exchanges (default 5, the
            value that was previously hard-coded).
        farewell_after_turns: Zero-based turn index from which a bot reply
            containing a thank-you marker ends the conversation.  The default
            of 5 reproduces the original ``i > 4`` test, which can never be
            true within the default 5 turns, so by default the farewell check
            does not fire.  Pass a smaller value to enable it.

    The module-level ``stop_audio`` flag is checked at the top of every turn
    and ends the loop when set.
    """
    farewell_markers = ("Thank", "thank", "धन्यवाद")
    # Loop-invariant, so computed once instead of on every turn.
    use_custom_voice = voice_mode == "Custom"

    log_event("Start")
    for turn in range(max_turns):
        if stop_audio:
            break

        log_event("Start Get Bot Response")
        bot_reply = await get_bot_response(bot_msgs, "bot", language, use_custom_voice)
        bot_msgs.append({"role": "assistant", "content": bot_reply})
        human_msgs.append({"role": "user", "content": bot_reply})

        if turn >= farewell_after_turns:
            if any(marker in bot_reply for marker in farewell_markers):
                print("Conversation over!")
                break
            print("Calling customer conversation")

        log_event("Start Get Human Response")
        if response_mode == "Customer Speaks":
            customer_response, audio_segment = get_human_response(language)
            # The return value was never used by the original code, so it is
            # not bound here.  The call itself is kept: presumably it queues
            # the segment for the later argument-less get_final_segment()
            # call in run_script - TODO confirm against utils.audio_queue.
            get_final_segment(audio_segment)
            bot_msgs.append({"role": "user", "content": customer_response})
        else:
            human_reply = await get_bot_response(human_msgs, "human", language, False)
            human_msgs.append({"role": "assistant", "content": human_reply})
            bot_msgs.append({"role": "user", "content": human_reply})
    



In [4]:
def run_script(language, bot_prompt, human_prompt, response_mode, voice_mode):
    """Run one full conversation and export its audio.

    Clears the module-level ``stop_audio`` flag, seeds both chat histories
    with their system prompts, runs ``main`` to completion on a fresh event
    loop, summarises the bot-side history, and exports the final audio
    segment to ``<output_folder>/conversation_<YYYYmmdd_HHMMSS>.wav``.

    Args:
        language: Passed through to ``main`` and ``get_conv_summary``.
        bot_prompt: System prompt for the bot history.
        human_prompt: System prompt for the simulated-human history.
        response_mode: Passed through to ``main``.
        voice_mode: Passed through to ``main``.

    Returns:
        ``(events, summary, output_path)`` where ``events`` is the result of
        ``get_all_events()``, ``summary`` the result of ``get_conv_summary``
        and ``output_path`` the path of the exported .wav file.

    Side effects:
        Rebinds the module globals ``stop_audio`` and ``final_segment``.
    """
    global final_segment
    global stop_audio
    stop_audio = False  # clear any earlier Stop request before starting
    print("run main loop")
    bot_msgs = [{"role": "system", "content": bot_prompt}]
    human_msgs = [{"role": "system", "content": human_prompt}]

    # Best effort: nest_asyncio patches asyncio so run_until_complete can be
    # used while another loop is already running (e.g. inside a notebook
    # kernel).  If the package is not installed we carry on unpatched.
    try:
        import nest_asyncio
        nest_asyncio.apply()
    except ImportError:
        pass

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main(language, bot_msgs, human_msgs, response_mode, voice_mode))

        summary = get_conv_summary(language, bot_msgs)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"{output_folder}/conversation_{timestamp}.wav"
        final_segment = get_final_segment()
        final_segment.export(output_path, format="wav")

        return get_all_events(), summary, output_path
    finally:
        # Previously the loop was never closed, leaking one loop per run.
        # It is released only at the very end so that everything above still
        # sees it as the current loop.  Clearing the current loop first
        # mirrors asyncio.run() and avoids leaving a closed loop installed.
        asyncio.set_event_loop(None)
        loop.close()


def run_main_loop(language, bot_prompt, human_prompt, response_mode, voice_mode):
    """Click handler for the Run button: delegate unchanged to ``run_script``.

    Returns whatever ``run_script`` returns for the same five arguments.
    """
    return run_script(
        language=language,
        bot_prompt=bot_prompt,
        human_prompt=human_prompt,
        response_mode=response_mode,
        voice_mode=voice_mode,
    )


In [5]:
def set_prompt(language, theme):
    """Click handler for the Set Prompts button.

    Looks up the default prompts for ``(theme, language)`` and the avatar
    image paths for ``theme``.

    Returns:
        ``(bot_prompt, human_prompt, bot_avatar, human_avatar)``.
    """
    prompts = get_default_prompts(theme, language)  # note: theme comes first
    bot_prompt, human_prompt = prompts
    return (bot_prompt, human_prompt, *get_avatars(theme))


def stop_main_loop():
    """Click handler for the Stop button: request that the conversation end.

    Sets the notebook-level ``stop_audio`` flag, which ``main`` polls at the
    start of each turn, and raises ``human_speaking`` both here and on the
    ``utils.human_response`` module.
    """
    global stop_audio
    global human_speaking
    # Already loaded by the top-of-notebook import; imported here as a module
    # so its attribute can be assigned.
    import utils.human_response as human_response_module

    stop_audio = True
    human_speaking = True
    # ``from utils.human_response import human_speaking`` only copied the
    # value into this namespace, so the assignment above never reached the
    # module that owns the flag.  Set it there as well.
    # NOTE(review): assumes human_speaking is a plain flag that
    # utils.human_response reads - confirm against that module.
    human_response_module.human_speaking = True

# Build the Gradio UI. Components are created in display order inside each
# Row/Column, so the statement order below is the on-screen layout.
with gr.Blocks() as iface:
    # Row 1: choose language and theme, then load the default prompts.
    with gr.Row():
        language = gr.Dropdown(label="Select Language", choices=get_supported_languages())
        theme = gr.Dropdown(label="Select Theme", choices=get_supported_themes())
        lang_button = gr.Button("Set Prompts")

    # Row 2: editable system prompts for the two personas.
    with gr.Row():
        bot_prompt = gr.Textbox(label="Bot Prompt", placeholder="Bot Prompt", lines=5)
        human_prompt = gr.Textbox(label="Human Prompt", placeholder="Human Prompt", lines=5)

    # Row 3: run options and the Run / Stop controls.
    with gr.Row():
        response_mode = gr.Dropdown(
            label="Response Mode",
            choices=["Customer Speaks", "OpenAI Generates"],
        )
        voice_mode = gr.Dropdown(label="Voice Mode", choices=["Default", "Custom"])
        run_button = gr.Button("Run")
        stop_button = gr.Button("Stop")

    # Row 4: avatar | conversation summary | avatar.
    with gr.Row():
        with gr.Column():
            person1_image = gr.Image(label="Person 1 Image")
        with gr.Column():
            summary = gr.HTML(label="Conversation Summary")
        with gr.Column():
            person2_image = gr.Image(label="Person 2 Image")

    # Row 5: exported conversation audio.
    with gr.Row():
        final_audio = gr.Audio()

    # Row 6: logged events returned by the run.
    with gr.Row():
        time_details = gr.Textbox(lines=10)

    # Event wiring. Output lists follow the order of each handler's return tuple.
    prompt_outputs = [bot_prompt, human_prompt, person1_image, person2_image]
    run_inputs = [language, bot_prompt, human_prompt, response_mode, voice_mode]
    run_outputs = [time_details, summary, final_audio]

    lang_button.click(fn=set_prompt, inputs=[language, theme], outputs=prompt_outputs)
    run_button.click(fn=run_main_loop, inputs=run_inputs, outputs=run_outputs)
    stop_button.click(fn=stop_main_loop)

iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




run main loop
{'human': 'en-IN-KavyaNeural', 'bot': 'en-IN-KunalNeural'}
en-IN-KunalNeural
Speech synthesized for text [Good morning,]
Speech synthesized for text [and thank you for joining us today.]
Speech synthesized for text [Our company is a leading player in the manufacturing sector,]
Speech synthesized for text [and we are looking for a Head of Data Science to lead our data initiatives.]
Speech synthesized for text [To start,]
Speech synthesized for text [could you please tell me about your current role and your experience in managing data science teams?]
{'human': 'en-IN-KavyaNeural', 'bot': 'en-IN-KunalNeural'}
en-IN-KavyaNeural
Speech synthesized for text [Good morning,]
Speech synthesized for text [and thank you for having me.]
Speech synthesized for text [Currently,]
Speech synthesized for text [I am leading a data science team of 30 people at a retail company where we focus on customer analytics and predictive modeling.]
Speech synthesized for text [I've been responsible f