In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import gradio as gr
import librosa
import openai
import numpy as np

### OpenAI API Test

In [None]:
# Smoke test for the OpenAI chat endpoint: send one single-turn question and
# keep the raw response in `completion` so the next cell can inspect it.
prompt = "What kind of probabilistic distribution is most suited for modeling the number of cars that arrive to a toll station in one hour? Answer only with the name of the distribution."

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
)

In [3]:
# The reply text lives in the "content" field of the first choice's message.
reply_message = completion.choices[0].message
generated_text = reply_message["content"]
print(generated_text)

Poisson distribution.


### GPT-Siri

In [2]:
# Speech recognition (ASR): the Whisper processor and model are loaded from
# the same checkpoint.
checkpoint_asr = "openai/whisper-base"
processor_asr = WhisperProcessor.from_pretrained(checkpoint_asr)
model_asr = WhisperForConditionalGeneration.from_pretrained(checkpoint_asr)

# Text-to-speech (TTS): SpeechT5 processor + model, plus a separate HiFi-GAN
# vocoder checkpoint. The vocoder checkpoint name gets its own variable so
# that `vocoder_tts` only ever refers to the loaded vocoder model (it used to
# hold the checkpoint string first and the model afterwards).
checkpoint_tts = "microsoft/speecht5_tts"
checkpoint_vocoder = "microsoft/speecht5_hifigan"
processor_tts = SpeechT5Processor.from_pretrained(checkpoint_tts)
model_tts = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_tts)
vocoder_tts = SpeechT5HifiGan.from_pretrained(checkpoint_vocoder)

In [3]:
def process_audio(sampling_rate, waveform):
    """Turn raw int16 PCM audio into a mono, 16 kHz floating-point waveform.

    Args:
        sampling_rate: Sample rate of `waveform`, in Hz.
        waveform: int16 samples, shaped (frames,) for mono audio or
            (frames, channels) for multi-channel audio.

    Returns:
        A 1-D numpy float array sampled at 16 kHz, with the int16 range
        mapped onto [-1.0, 1.0).
    """
    # int16 spans [-32768, 32767]; dividing by 32768 maps it onto [-1.0, 1.0).
    # (The previous divisor, 32678, was a digit transposition that let
    # full-scale samples exceed 1.0 in magnitude.)
    waveform = np.asarray(waveform) / 32768.0

    # convert to mono if multi-channel; librosa wants (channels, frames),
    # hence the transpose
    if waveform.ndim > 1:
        waveform = librosa.to_mono(waveform.T)

    # resample to 16 kHz if necessary
    if sampling_rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)

    return np.asarray(waveform)

In [4]:
def transcript(audio):
    """Transcribe a recorded audio clip to text with the Whisper ASR model.

    Args:
        audio: Either None, or a tuple (sample_rate, frames) /
            (sample_rate, (frames, channels)).

    Returns:
        The decoded transcription string, or the placeholder
        "(please provide audio)" when `audio` is None.
    """
    # Nothing to transcribe: answer with the placeholder right away.
    if audio is None:
        return "(please provide audio)"

    rate, raw_samples = audio
    samples = process_audio(rate, raw_samples)

    # process_audio already resampled to 16 kHz, which is the rate passed on here.
    features = processor_asr(audio=samples, sampling_rate=16000, return_tensors="pt").input_features
    token_ids = model_asr.generate(features)
    decoded = processor_asr.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

In [5]:
def textToSpeech(text):
    """Synthesize `text` with SpeechT5 and return it as 16 kHz int16 PCM.

    Args:
        text: The text to speak. Blank / whitespace-only text yields an
            empty clip without running the model.

    Returns:
        A tuple (16000, samples) where `samples` is a 1-D numpy int16 array.
    """
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    # Fixed speaker voice, loaded from a path relative to the working directory.
    speaker_embedding = np.load("./speaker_embeddings/cmu_us_clb_arctic-wav-arctic_a0144.npy")
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    inputs = processor_tts(text=text, return_tensors="pt")
    # Limit the input length: GPT answers can be long, and the text encoder
    # only supports `max_text_positions` tokens.
    input_ids = inputs["input_ids"][..., :model_tts.config.max_text_positions]

    speech = model_tts.generate_speech(input_ids, speaker_embedding, vocoder=vocoder_tts)
    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around when cast to int16.
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    speech = (samples * 32767).astype(np.int16)
    return (16000, speech)

In [6]:
def chat(openAI_key, audio):
    """Run one voice-assistant turn: speech -> text -> GPT answer -> speech.

    Args:
        openAI_key: OpenAI API key entered by the user. Surrounding whitespace
            is stripped; a blank key leaves the library's current key untouched.
        audio: None, or the (sample_rate, samples) tuple accepted by
            `transcript`.

    Returns:
        A tuple (transcription, gpt_answer, speech) where `speech` is the
        (sample_rate, int16 samples) tuple from `textToSpeech`, or None when
        there was nothing to answer (no audio, or an empty transcription).
    """
    # No recording: show the placeholder but do not send it to GPT as if it
    # were the user's question.
    if audio is None:
        return "(please provide audio)", "", None

    # The library reads `openai.api_key`; the previous `openai.key` was an
    # unrelated attribute, so the supplied key was never actually used.
    key = (openAI_key or "").strip()
    if key:
        openai.api_key = key

    # Automatic Speech Recognition
    prompt = transcript(audio)
    if not prompt.strip():
        # Nothing intelligible was recognized; skip the API call.
        return prompt, "", None

    # GPT gives an answer
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    generated_text = completion.choices[0].message["content"]

    # Text to Speech
    answer = textToSpeech(generated_text)
    return prompt, generated_text, answer

In [7]:
# HTML blurb shown in the UI: names the two speech models and lists references.
description = """
Your GPT-based vocal assistant. Speech recognition is performed with the <b>openai/whisper-base model</b>, while Text-to-Speech with <b>microsoft/speecht5_tts</b>.
<br>
<br>
References:<br>
<a href="https://huggingface.co/openai/whisper-base">OpenAI Whisper-base</a><br>
<a href="https://huggingface.co/microsoft/speecht5_tts">Microsoft SpeechT5_tts</a><br>
<a href="https://huggingface.co/blog/speecht5">Matthijs, Huggingface - Speech Synthesis, Recognition, and More With SpeechT5</a><br>
<a href="https://huggingface.co/docs/transformers/tasks/asr">Huggingface - ASR with Transformers</a>.<br>
<a href="https://platform.openai.com">OpenAI API Reference</a><br>
"""

# Inputs map positionally onto chat(openAI_key, audio).
input_components = [
    gr.Text(label="OpenAI API Key"),
    gr.Audio(label="Record", source="microphone", type="numpy"),
]

# Outputs map positionally onto chat's (prompt, generated_text, answer) tuple.
output_components = [
    gr.Text(label="Transcription"),
    gr.Text(label="GPT Answer"),
    gr.Audio(label="Speech Answer", type="numpy"),
]

demo = gr.Interface(
    fn=chat,
    inputs=input_components,
    outputs=output_components,
    title="GIVA - GPT-based Interactive Vocal Agent",
    description=description,
)
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




