# Gradio 

In [9]:
!pip install gradio langchain langchain_openai

[0m^C


# 基本使用

In [1]:
import gradio as gr

from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain_openai import OpenAI,ChatOpenAI

# Conversation memory backed by a ChatOpenAI model, capped at 2048 tokens.
# (Presumably older turns get summarized once the cap is exceeded — confirm
# against the LangChain ConversationSummaryBufferMemory docs.)
memory = ConversationSummaryBufferMemory(llm=ChatOpenAI(), max_token_limit=2048)
# Chain used by predict(): an OpenAI completion model (up to 2048 output tokens,
# temperature 0.5) sharing the memory object above.
# NOTE(review): 2048 memory tokens + 2048 completion tokens may exceed the
# model's context window — confirm against the model actually used.
conversation = ConversationChain(
    llm=OpenAI(max_tokens=2048, temperature=0.5),
    memory=memory,
)

def predict(input, history=None):
    """Send one user message through the conversation chain.

    Args:
        input: The user's message text.
        history: Flat list alternating user message / bot reply. It is
            extended in place when supplied; a fresh list is used when omitted.

    Returns:
        A ``(pairs, history)`` tuple where ``pairs`` is a list of
        ``(user_message, bot_reply)`` tuples for the Chatbot component and
        ``history`` is the updated flat list.
    """
    # A mutable default ([]) is created once and shared by every call that
    # omits `history`, so turns would leak between unrelated calls.
    if history is None:
        history = []
    history.append(input)
    response = conversation.predict(input=input)
    history.append(response)
    # Even indices are user turns, odd indices are the matching replies.
    pairs = list(zip(history[::2], history[1::2]))
    return pairs, history


# The CSS id selector has to match the Chatbot's elem_id ("chatbot_base");
# with the previous "#chatbot" selector the height rule never applied.
with gr.Blocks(css="#chatbot_base{height:800px} .overflow-y-auto{height:800px}") as demo:
    chatbot = gr.Chatbot(elem_id="chatbot_base")
    # Per-session flat history list handed to and returned from predict().
    state = gr.State([])

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Enter text and press enter",
            container=False,
        )
    # Pressing Enter sends (textbox value, history) to predict and writes
    # its two return values back to the chatbot and the state.
    txt.submit(predict, [txt, state], [chatbot, state])

# server_name="0.0.0.0" listens on every network interface of the host.
demo.launch(server_name="0.0.0.0", server_port=7860)

Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.




# 增加语音功能（支持服务器没有声卡）

In [1]:
import openai, os
import gradio as gr

from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain_openai import OpenAI,ChatOpenAI

import azure.cognitiveservices.speech as speechsdk
from IPython.display import Audio
import tempfile, base64

# OpenAI client used by transcribe(). base_url comes from OPENAI_API_BASE;
# os.getenv returns None when the variable is unset.
client = openai.Client(
    base_url=os.getenv("OPENAI_API_BASE"),
)

# Conversation memory backed by a ChatOpenAI model, capped at 2048 tokens.
memory = ConversationSummaryBufferMemory(llm=ChatOpenAI(), max_token_limit=2048)
# Chain used by predict(): an OpenAI completion model (up to 2048 output tokens,
# temperature 0.5) sharing the memory object above.
conversation = ConversationChain(
    llm=OpenAI(max_tokens=2048, temperature=0.5),
    memory=memory,
)

# Azure Speech configuration used by play_voice(). os.environ[...] raises
# KeyError if AZURE_SPEECH_KEY or AZURE_SPEECH_REGION is missing.
speech_config = speechsdk.SpeechConfig(subscription=os.environ['AZURE_SPEECH_KEY'], region=os.environ['AZURE_SPEECH_REGION'])
# Voice used for synthesis.
speech_config.speech_synthesis_voice_name='zh-CN-XiaohanNeural'

def convert_to_base64(audio_file_path):
    """Return the bytes of the file at ``audio_file_path`` as a base64 string."""
    with open(audio_file_path, 'rb') as handle:
        encoded = base64.b64encode(handle.read())
    # b64encode yields bytes; decode so the result can be embedded in text.
    return encoded.decode('utf-8')
   
def play_voice(text):
    """Synthesize ``text`` with Azure Speech and return HTML for the browser.

    The audio is written to a temporary WAV file instead of a local audio
    device, then embedded as a base64 data URI.

    Args:
        text: The text to speak.

    Returns:
        str: An autoplaying ``<audio>`` element on success; otherwise a
        message describing why synthesis did not complete.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
        # Route the synthesizer output to the temp file rather than a speaker.
        audio_config = speechsdk.audio.AudioOutputConfig(filename=tmpfile.name)
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = speech_synthesizer.speak_text(text)

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            # Read the file back while the temp file still exists.
            audio_base64 = convert_to_base64(tmpfile.name)
            audio_html = f'''
                <audio controls autoplay>
                    <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
                    Your browser does not support the audio element.
                </audio>
            '''
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            audio_html = f"合成被取消: {cancellation_details.reason}\n"
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                audio_html += f"详细错误信息: {cancellation_details.error_details}"
        else:
            # Any other reason used to leave audio_html unbound and raise
            # UnboundLocalError at the return below.
            audio_html = f"语音合成未完成: {result.reason}"
        return audio_html
    
def predict(input, history=None):
    """Answer one user message and synthesize the reply as speech.

    Args:
        input: The user's message text, or None when there is nothing to send.
        history: Flat list alternating user message / bot reply. It is
            extended in place when supplied; a fresh list is used when omitted.

    Returns:
        A ``(pairs, audio_html, history)`` tuple: the ``(user, bot)`` pairs
        for the Chatbot, the HTML produced by ``play_voice`` (empty string
        when ``input`` is None), and the updated flat history.
    """
    # A mutable default ([]) is shared between calls and would leak turns.
    if history is None:
        history = []
    if input is not None:
        history.append(input)
        response = conversation.predict(input=input)
        history.append(response)
        audio_html = play_voice(response)
    else:
        audio_html = ""
    # Even indices are user turns, odd indices are the matching replies.
    pairs = list(zip(history[::2], history[1::2]))
    return pairs, audio_html, history

def transcribe(audio):
    """Convert the audio file at path ``audio`` to text with the whisper-1 model.

    Args:
        audio: Path of the recorded audio file.

    Returns:
        The ``text`` attribute of the API response.
    """
    # NOTE(review): `audio.translations` looks like OpenAI's translate-to-English
    # endpoint; if same-language text is wanted, `audio.transcriptions` may be
    # what was intended — confirm.
    # The context manager closes the file handle, which used to leak.
    with open(audio, "rb") as audio_file:
        translation = client.audio.translations.create(model="whisper-1", file=audio_file)
    return translation.text

def process_audio(audio, history=None):
    """Handle a change of the microphone component.

    Args:
        audio: Path of the recorded audio file, or None when there is no
            recording.
        history: Flat conversation history; a fresh list is used when omitted.

    Returns:
        Whatever ``predict`` returns for the recognized text (``predict`` is
        called with None when there is no audio).
    """
    # A mutable default ([]) is shared between calls and would leak turns.
    if history is None:
        history = []
    text = transcribe(audio) if audio is not None else None
    return predict(text, history)

# The CSS id selector has to match the Chatbot's elem_id ("chatbot_audio");
# with the previous "#chatbot" selector the height rule never applied.
with gr.Blocks(css="#chatbot_audio{height:800px} .overflow-y-auto{height:800px}") as demo:
    chatbot = gr.Chatbot(elem_id="chatbot_audio")
    # Per-session flat history list handed to and returned from the handlers.
    state = gr.State([])

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Enter text and press enter",
            container=False,
        )

    with gr.Row():
        # type="filepath": handlers receive the recording as a file path.
        audio = gr.Audio(sources=["microphone"], type="filepath")

    with gr.Row():
        # Receives the <audio> snippet (or status message) from play_voice().
        audio_html = gr.HTML('')

    txt.submit(predict, [txt, state], [chatbot, audio_html, state])
    audio.change(process_audio, [audio, state], [chatbot, audio_html, state])

# Restart the kernel or change the port (7860 is also used by the other demos).
demo.launch(server_name="0.0.0.0", server_port=7860)

Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.




# 用D-ID给语音对口型

In [6]:
import requests
import os

def generate_talk(input, avatar_url,
                  voice_type = "microsoft",
                  voice_id = "zh-CN-XiaomoNeural",
                  api_key = os.environ.get('DID_API_KEY'),
                  timeout = 30):
    """Create a D-ID talk that speaks ``input`` with the avatar at ``avatar_url``.

    Args:
        input: Text the avatar should speak.
        avatar_url: URL of the source image (sent as ``source_url``).
        voice_type: Voice provider type placed in the script's ``provider``.
        voice_id: Voice identifier placed in the script's ``provider``.
        api_key: Value appended to the ``Basic `` authorization header.
            Falls back to the ``DID_API_KEY`` environment variable.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the POST response.

    Raises:
        ValueError: If no API key is available.
    """
    # The signature default is evaluated once, when the function is defined;
    # re-read the environment so a key exported afterwards is still picked up.
    if api_key is None:
        api_key = os.environ.get('DID_API_KEY')
    if not api_key:
        # Previously this surfaced as an opaque TypeError from "Basic " + None.
        raise ValueError("DID_API_KEY is not set; pass api_key or set the environment variable")

    url = "https://api.d-id.com/talks"
    payload = {
        "script": {
            "type": "text",
            "provider": {
                "type": voice_type,
                "voice_id": voice_id
            },
            "ssml": "false",
            "input": input
        },
        "config": {
            "fluent": "false",
            "pad_audio": "0.0"
        },
        "source_url": avatar_url
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": "Basic " + api_key
    }

    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    return response.json()

# Source image passed to generate_talk() below, which sends it as the
# payload's "source_url".
avatar_url = "https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png"

def get_a_talk(id, api_key = os.environ.get('DID_API_KEY'), timeout = 30):
    """Fetch the current state of a D-ID talk.

    Args:
        id: Talk identifier (the ``id`` field of the creation response).
        api_key: Value appended to the ``Basic `` authorization header.
            Falls back to the ``DID_API_KEY`` environment variable.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the GET response.

    Raises:
        ValueError: If no API key is available.
    """
    # The signature default is evaluated once, when the function is defined;
    # re-read the environment so a key exported afterwards is still picked up.
    if api_key is None:
        api_key = os.environ.get('DID_API_KEY')
    if not api_key:
        # Previously this surfaced as an opaque TypeError from "Basic " + None.
        raise ValueError("DID_API_KEY is not set; pass api_key or set the environment variable")

    url = "https://api.d-id.com/talks/" + id
    headers = {
        "accept": "application/json",
        "authorization": "Basic "+api_key
    }
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.json()

def play_mp4_video(url):
    """Build a notebook-displayable HTML ``<video>`` player for ``url``.

    Args:
        url: Address of the MP4 file.

    Returns:
        An ``IPython.display.HTML`` object wrapping the video tag.
    """
    # This cell never imported HTML, so the function only worked when another
    # cell had leaked the name into the kernel; import it where it is used.
    from html import escape
    from IPython.display import HTML

    # Escape the URL so characters such as & or " cannot break the attribute.
    safe_url = escape(url, quote=True)
    video_tag = f"""
    <video width="640" height="480" controls>
        <source src="{safe_url}" type="video/mp4">
    Your browser does not support the video tag.
    </video>
    """
    return HTML(video_tag)

import time

text = "今天天气真不错呀。"

response = generate_talk(input=text, avatar_url=avatar_url)
print(response)

# A freshly created talk has status 'created' and no 'result_url' yet, so
# poll (at most ~30 s) until the rendered video is available.
talk = get_a_talk(response['id'])
for _ in range(30):
    if 'result_url' in talk:
        break
    time.sleep(1)
    talk = get_a_talk(response['id'])
print(talk)
result_url = talk['result_url']
play_mp4_video(result_url)

{'id': 'tlk_rh3vkTQ7484UM1zHnLtlg', 'created_at': '2024-05-12T09:54:20.803Z', 'created_by': 'auth0|664091516f819b79cafd862a', 'status': 'created', 'object': 'talk'}


In [1]:
import openai, os
import gradio as gr

from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain_openai import OpenAI,ChatOpenAI

import azure.cognitiveservices.speech as speechsdk
from IPython.display import Audio

from gradio import HTML
import tempfile, base64
import requests,time

# OpenAI client used by transcribe(). base_url comes from OPENAI_API_BASE;
# os.getenv returns None when the variable is unset.
client = openai.Client(
    base_url=os.getenv("OPENAI_API_BASE"),
)

# Conversation memory backed by a ChatOpenAI model, capped at 2048 tokens.
memory = ConversationSummaryBufferMemory(llm=ChatOpenAI(), max_token_limit=2048)
# Chain used by predict(): an OpenAI completion model (up to 2048 output tokens,
# temperature 0.5) sharing the memory object above.
conversation = ConversationChain(
    llm=OpenAI(max_tokens=2048, temperature=0.5),
    memory=memory,
)

# Azure Speech configuration. os.environ[...] raises KeyError if
# AZURE_SPEECH_KEY or AZURE_SPEECH_REGION is missing.
# NOTE(review): nothing visible in this cell reads speech_config — confirm it is still needed.
speech_config = speechsdk.SpeechConfig(subscription=os.environ['AZURE_SPEECH_KEY'], region=os.environ['AZURE_SPEECH_REGION'])
speech_config.speech_synthesis_voice_name='zh-CN-XiaohanNeural'

# Source image for the talking avatar; also shown as the initial placeholder in the UI.
avatar_url = "https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png"

def generate_talk(input, avatar_url,
                  voice_type = "microsoft",
                  voice_id = "zh-CN-YunyeNeural",
                  api_key = os.environ.get('DID_API_KEY'),
                  timeout = 30):
    """Create a D-ID talk that speaks ``input`` with the avatar at ``avatar_url``.

    Args:
        input: Text the avatar should speak.
        avatar_url: URL of the source image (sent as ``source_url``).
        voice_type: Voice provider type placed in the script's ``provider``.
        voice_id: Voice identifier placed in the script's ``provider``.
        api_key: Value appended to the ``Basic `` authorization header.
            Falls back to the ``DID_API_KEY`` environment variable.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the POST response.

    Raises:
        ValueError: If no API key is available.
    """
    # The signature default is evaluated once, when the function is defined;
    # re-read the environment so a key exported afterwards is still picked up.
    if api_key is None:
        api_key = os.environ.get('DID_API_KEY')
    if not api_key:
        # Previously this surfaced as an opaque TypeError from "Basic " + None.
        raise ValueError("DID_API_KEY is not set; pass api_key or set the environment variable")

    url = "https://api.d-id.com/talks"
    payload = {
        "script": {
            "type": "text",
            "provider": {
                "type": voice_type,
                "voice_id": voice_id
            },
            "ssml": "false",
            "input": input
        },
        "config": {
            "fluent": "false",
            "pad_audio": "0.0"
        },
        "source_url": avatar_url
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": "Basic " + api_key
    }

    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    return response.json()

def get_a_talk(id, api_key = os.environ.get('DID_API_KEY'), timeout = 30):
    """Fetch the current state of a D-ID talk.

    Args:
        id: Talk identifier (the ``id`` field of the creation response).
        api_key: Value appended to the ``Basic `` authorization header.
            Falls back to the ``DID_API_KEY`` environment variable.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the GET response.

    Raises:
        ValueError: If no API key is available.
    """
    # The signature default is evaluated once, when the function is defined;
    # re-read the environment so a key exported afterwards is still picked up.
    if api_key is None:
        api_key = os.environ.get('DID_API_KEY')
    if not api_key:
        # Previously this surfaced as an opaque TypeError from "Basic " + None.
        raise ValueError("DID_API_KEY is not set; pass api_key or set the environment variable")

    url = "https://api.d-id.com/talks/" + id
    headers = {
        "accept": "application/json",
        "authorization": "Basic "+api_key
    }
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.json()

def get_mp4_video(input, avatar_url=avatar_url, max_attempts=30, poll_interval=1):
    """Create a D-ID talk for ``input`` and wait for its rendered video URL.

    Args:
        input: Text the avatar should speak.
        avatar_url: URL of the source image passed to ``generate_talk``.
        max_attempts: Maximum number of status checks before giving up.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The talk's ``result_url``, or an empty string when the talk could not
        be created or no URL appeared within ``max_attempts`` checks.
    """
    response = generate_talk(input=input, avatar_url=avatar_url)
    # A failed creation response carries no 'id'; this used to raise KeyError.
    talk_id = response.get('id')
    if not talk_id:
        return ""

    for attempt in range(max_attempts):
        talk = get_a_talk(talk_id)
        if 'result_url' in talk:
            return talk['result_url']
        # NOTE(review): 'error' / 'rejected' are assumed to be D-ID's terminal
        # failure statuses — confirm against the API reference. If they are
        # not, this check never matches and polling continues as before.
        if talk.get('status') in ('error', 'rejected'):
            break
        # Skip the sleep after the last check: its result would never be read.
        if attempt < max_attempts - 1:
            time.sleep(poll_interval)
    return ""
    
def predict(input, history=None):
    """Answer one user message and render the reply as a talking-avatar video.

    Args:
        input: The user's message text, or None when there is nothing to send.
        history: Flat list alternating user message / bot reply. It is
            extended in place when supplied; a fresh list is used when omitted.

    Returns:
        A ``(pairs, video_html, history)`` tuple: the ``(user, bot)`` pairs
        for the Chatbot, an autoplaying ``<video>`` tag for the reply (empty
        string when ``input`` is None), and the updated flat history.
    """
    # A mutable default ([]) is shared between calls and would leak turns.
    if history is None:
        history = []
    if input is not None:
        history.append(input)
        response = conversation.predict(input=input)
        history.append(response)
        # get_mp4_video returns "" when no video became available; the tag is
        # then emitted with an empty src.
        video_url = get_mp4_video(input=response, avatar_url=avatar_url)
        video_html = f"""<video width="320" height="240" controls autoplay><source src="{video_url}" type="video/mp4"></video>"""
    else:
        video_html = ""
    # Even indices are user turns, odd indices are the matching replies.
    pairs = list(zip(history[::2], history[1::2]))
    return pairs, video_html, history

def transcribe(audio):
    """Convert the audio file at path ``audio`` to text with the whisper-1 model.

    Args:
        audio: Path of the recorded audio file.

    Returns:
        The ``text`` attribute of the API response.
    """
    # NOTE(review): `audio.translations` looks like OpenAI's translate-to-English
    # endpoint, yet the prompt asks for Simplified Chinese; `audio.transcriptions`
    # may be what was intended — confirm.
    # The context manager closes the file handle, which used to leak.
    with open(audio, "rb") as audio_file:
        translation = client.audio.translations.create(model="whisper-1", file=audio_file, prompt="这是一段简体中文的问题。")
    return translation.text

def process_audio(audio, history=None):
    """Handle a change of the microphone component.

    Args:
        audio: Path of the recorded audio file, or None when there is no
            recording.
        history: Flat conversation history; a fresh list is used when omitted.

    Returns:
        Whatever ``predict`` returns for the recognized text (``predict`` is
        called with None when there is no audio).
    """
    # A mutable default ([]) is shared between calls and would leak turns.
    if history is None:
        history = []
    text = transcribe(audio) if audio is not None else None
    return predict(text, history)

# The CSS id selector has to match the Chatbot's elem_id
# ("chatbot_digital_person"); with the previous "#chatbot" selector the
# height rule never applied.
with gr.Blocks(css="#chatbot_digital_person{height:800px} .overflow-y-auto{height:800px}") as demo:
    chatbot = gr.Chatbot(elem_id="chatbot_digital_person")
    # Per-session flat history list handed to and returned from the handlers.
    state = gr.State([])

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Enter text and press enter",
            container=False,
        )

    with gr.Row():
        # type="filepath": handlers receive the recording as a file path.
        audio = gr.Audio(sources=["microphone"], type="filepath")

    with gr.Row():
        # Starts as a still image of the avatar; predict() replaces it with a <video> tag.
        # NOTE(review): the alt text looks like a leftover from another avatar — confirm.
        video = gr.HTML(f'<img src="{avatar_url}" width="320" height="240" alt="John Carmack">')

    txt.submit(predict, [txt, state], [chatbot, video, state])
    audio.change(process_audio, [audio, state], [chatbot, video, state])

# Restart the kernel or change the port (7860 is also used by the other demos).
demo.launch(server_name="0.0.0.0", server_port=7860)

Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.


