# Voice Interactive System

In this section, we put everything together to build a system which simulates calls to the bank.

In particular, the system takes as input a recorded audio question from the user (the caller) and runs it through a pipeline composed of the ASR model, the chatbot, and the text-to-speech model to generate a recorded audio response.

The interaction stops when the user stops asking questions or when the chatbot predicts the end of the conversation (usually when it says 'bye' or 'have a great day').

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the project folder used below lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os

# Name of the project folder inside "MyDrive"; the relative paths used by later cells resolve against it.
path = 'NLP-Project'

# Switch the working directory to the project folder, then echo it as the cell output.
project_dir = '/content/drive/MyDrive/' + path
os.chdir(project_dir)
os.getcwd()

'/content/drive/MyDrive/NLP-Project'

## Install packages and initial imports

In [None]:
!pip install transformers
!pip install --upgrade accelerate
!pip install datasets
!pip install soundfile wave torchaudio
!pip install ffmpeg-python

In [None]:
!pip install speechbrain

In [5]:
import numpy as np
import torch
import datasets

In [6]:
from torch.nn.utils.rnn import pad_sequence
from IPython.display import display, HTML, Audio
import IPython.display as ipd
from speechbrain.inference.classifiers import EncoderClassifier
import soundfile as sf
from google.colab.output import eval_js
from base64 import b64decode
from scipy.io.wavfile import read as wav_read
import io
import torchaudio
import librosa
import ffmpeg

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

## Load the models

In [7]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): none of the cells shown here move a model or tensor to `device`.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load the wav2vec 2.0 ASR model and its processor from the project folder
wav2vec_model_name = './ASR_fine_tune_HarperValleyBank'
wav2vec_processor = Wav2Vec2Processor.from_pretrained(wav2vec_model_name)
wav2vec_model = Wav2Vec2ForCTC.from_pretrained(wav2vec_model_name)

# Load the DialoGPT chatbot and its tokenizer from the project folder.
# AutoModelForCausalLM replaces AutoModelWithLMHead, which is deprecated in transformers.
dialoggpt_model_name = './chatbot-3'
dialoggpt_tokenizer = AutoTokenizer.from_pretrained(dialoggpt_model_name)
dialoggpt_model = AutoModelForCausalLM.from_pretrained(dialoggpt_model_name)

# Load the SpeechT5 text-to-speech model from the project folder.
# The processor comes from the base "microsoft/speecht5_tts" checkpoint, not from the local folder.
# NOTE(review): presumably the processor was not saved with the local checkpoint -- confirm.
speecht5_model_name = './text-to-speech_fine_tune_HarperValleyBank'
speecht5_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
speecht5_model = SpeechT5ForTextToSpeech.from_pretrained(speecht5_model_name)

# Load the vocoder that turns the spectrogram produced by SpeechT5 into an audio waveform
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")



## Create the pipeline stages

Record the question

In [9]:
# HTML/JavaScript recorder widget that get_audio() injects into the cell output.
# It asks for microphone access, starts recording as soon as access is granted and
# stops when the button is pressed. The global JavaScript promise `data` resolves
# with the recording as a base64 data URL once the browser has finished encoding it
# (no fixed-delay guess). If the microphone cannot be used, `data` resolves with the
# initial placeholder value "0" so the Python side does not wait forever.
# `var` declarations are used on purpose: the script is injected again on every call,
# and `let`/`const` would fail on redeclaration.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;
var resolveRecording;

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.onloadend = function() {
      base64data = reader.result;
      // Hand the recording over as soon as it is fully encoded.
      resolveRecording(String(base64data));
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
};

var handleFailure = function(err) {
  // Microphone denied or unavailable: report it and release the waiting caller.
  recordButton.innerText = "Microphone unavailable: " + err;
  resolveRecording(String(base64data));
};

recordButton.innerText = "Recording... press to stop";

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

var data = new Promise(resolve => {
  resolveRecording = resolve;
  recordButton.onclick = toggleRecording;
});

if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
  navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess).catch(handleFailure);
} else {
  handleFailure("getUserMedia is not supported in this context");
}

</script>
"""

def get_audio():
  """Record audio from the browser microphone and return it as samples.

  Displays the recorder widget from ``AUDIO_HTML``, waits for the JavaScript
  ``data`` promise to resolve with a base64 data URL, converts the recording
  to WAV with ffmpeg and parses the result with ``scipy.io.wavfile.read``.

  Returns:
      tuple: ``(audio, sr)`` -- the sample array and the sample rate reported
      by ``wav_read`` (note the order is swapped relative to ``wav_read``).

  Raises:
      RuntimeError: if the browser returned no recording, or if ffmpeg did
          not produce a usable WAV stream.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A valid payload is a data URL of the form "<header>,<base64 audio>".
  parts = data.split(',') if isinstance(data, str) else []
  if len(parts) < 2 or not parts[1]:
    raise RuntimeError(
        "No audio was received from the browser; check the microphone permission and record again."
    )
  binary = b64decode(parts[1])

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Fewer than 8 bytes cannot even hold the RIFF header that is patched below.
  if len(output) < 8:
    details = (err or b'').decode(errors='replace').strip()
    raise RuntimeError(f"ffmpeg could not convert the recording to WAV: {details}")

  # Replace bytes 4:8 of the ffmpeg output with the actual size of the RIFF chunk
  # (presumably ffmpeg cannot seek back on a pipe to fill it in -- confirm).
  # The size is stored as 4 little-endian bytes; the mask keeps it within 32 bits.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

Automatic Speech Recognition

In [10]:
def transcribe_audio(waveform, sample_rate):
    """Transcribe a waveform with the wav2vec CTC model.

    Args:
        waveform: Audio samples, passed unchanged to ``wav2vec_processor``.
        sample_rate: Sampling rate of ``waveform``, forwarded to the processor.

    Returns:
        str: The decoded transcription of the single input utterance.
    """
    inputs = wav2vec_processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_values
    # Keep the input on the same device as the model so the forward pass cannot hit a device mismatch.
    inputs = inputs.to(wav2vec_model.device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = wav2vec_model(inputs).logits

    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = wav2vec_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

Chatbot

In [11]:
def sample_to_string(sample, eos_token):
    """Serialize a dialogue into the single string fed to the chatbot.

    Each utterance is rendered as ``" <speaker_role>: <text> "``; utterances
    are separated by ``eos_token`` and the result ends with one more
    ``eos_token``.

    Args:
        sample: Iterable of dicts with ``'speaker_role'`` and ``'text'`` keys.
        eos_token: Separator/terminator string.

    Returns:
        str: The flattened dialogue (just ``eos_token`` for an empty sample).
    """
    rendered_turns = []
    for utterance in sample:
        rendered_turns.append(f" {utterance['speaker_role']}: {utterance['text']} ")

    return eos_token.join(rendered_turns) + eos_token

def generate_response(input_string):
    """Generate the chatbot's next turn for a serialized dialogue.

    Args:
        input_string: Dialogue history as one string (see ``sample_to_string``).

    Returns:
        str: Only the newly generated text, with special tokens removed;
        the prompt tokens are sliced off before decoding.
    """
    input_encoding = dialoggpt_tokenizer(input_string, return_tensors="pt").to(dialoggpt_model.device)
    prompt_length = input_encoding.input_ids.size(1)

    output_ids = dialoggpt_model.generate(
        input_encoding.input_ids,
        # Pass the mask explicitly: the pad token equals the EOS token and the prompt
        # itself contains EOS separators, so the mask cannot be inferred from the ids.
        attention_mask=input_encoding.attention_mask,
        num_beams=8,
        max_new_tokens=35,
        pad_token_id=dialoggpt_tokenizer.eos_token_id,
        early_stopping=True,
    )

    chatbot_response = dialoggpt_tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)
    return chatbot_response

Text-to-Speech

In [13]:
def text_to_speech(text, speaker_embeddings):
    """Synthesize a spectrogram for ``text`` with the SpeechT5 model.

    The text is tokenized by ``speecht5_processor`` and the token ids are
    passed, together with ``speaker_embeddings``, to
    ``speecht5_model.generate_speech``. The result of that call is returned
    unchanged.
    """
    token_ids = speecht5_processor(text=text, return_tensors="pt").input_ids
    return speecht5_model.generate_speech(token_ids, speaker_embeddings)


Play response

In [14]:
def save_audio(spectrogram, output_path):
    """Run the vocoder on ``spectrogram`` and write the waveform to ``output_path``.

    The audio is written with a sample rate of 16000 Hz.
    """
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Move to the CPU before converting: .numpy() raises on a GPU tensor.
    # This matches what play_audio does.
    sf.write(output_path, speech.cpu().numpy(), samplerate=16000)

def play_audio(spectrogram):
    """Run the vocoder on ``spectrogram`` and play the result in the notebook.

    The waveform is shown as an auto-playing audio widget at 16000 Hz.
    """
    with torch.no_grad():
        waveform = vocoder(spectrogram)

    samples = waveform.cpu().numpy()
    player = Audio(samples, rate=16000, autoplay=True)
    display(player)


## Interaction Loop

In [18]:
# Conversation history, seeded with the agent's opening line.
dialogue = [{'speaker_role': 'agent', 'text': 'hello this is harper valley national bank my name is elizabeth how can i help you today'}]

# Speaker embedding that conditions the text-to-speech voice.
# map_location="cpu" lets the file load on a CPU-only runtime even if it was saved from a GPU;
# the models in the cells shown here are never moved off the CPU.
speaker_embeddings = torch.load("./speaker_embeddings", map_location="cpu")
print(speaker_embeddings.shape)

# Speak the greeting so the call starts with the agent.
spectrogram = text_to_speech(dialogue[0]['text'], speaker_embeddings)
play_audio(spectrogram)

# Scratch WAV file that main() uses to pass the recording to librosa.
output_path = "output.wav"

torch.Size([1, 512])


In [19]:
def _strip_speaker_prefix(response, speaker_role='agent'):
    """Remove a leading "<speaker_role>:" tag and surrounding whitespace from ``response``."""
    cleaned = response.strip()
    prefix = f"{speaker_role}:"
    if cleaned.startswith(prefix):
        cleaned = cleaned[len(prefix):].strip()
    return cleaned


def _is_farewell(response):
    """Return True when the reply contains 'bye', 'goodbye' or 'have a great day'."""
    words = response.lower().split()
    return 'bye' in words or 'goodbye' in words or 'have a great day' in ' '.join(words)


def main():
    """Run the voice interaction loop until the call ends.

    Each turn records a question, transcribes it, asks the chatbot for a
    reply and speaks that reply. The loop stops when the chatbot's reply is
    a farewell or when the user does not answer 'yes' (or 'y') to the
    follow-up prompt.

    Uses the module-level ``dialogue`` (appended to in place),
    ``output_path`` and ``speaker_embeddings``.
    """
    while True:
        # --- Speech-to-text ---
        waveform, sample_rate = get_audio()
        # Round-trip through a WAV file so librosa can reload the recording at 16 kHz.
        sf.write(output_path, waveform, samplerate=sample_rate)
        waveform, sample_rate = librosa.load(output_path, sr=16000)
        transcription = transcribe_audio(waveform, sample_rate)
        print(f"User: {transcription}")

        # --- Chatbot ---
        # Append message to dialogue history
        dialogue.append(
            {'speaker_role': 'caller', 'text': transcription.lower()}
        )
        # Convert dialogue to string
        input_string = sample_to_string(dialogue, dialoggpt_tokenizer.eos_token)

        response = generate_response(input_string)

        # Drop the generated speaker tag by content rather than by a fixed character
        # count, so no stray whitespace is kept and no real text is cut off.
        response = _strip_speaker_prefix(response)
        # Append chatbot response to dialogue history
        dialogue.append(
            {'speaker_role': 'agent', 'text': response}
        )
        print(f"Agent: {response}")

        # --- Text-to-speech ---
        spectrogram = text_to_speech(response, speaker_embeddings)
        play_audio(spectrogram)

        # The chatbot signalled the end of the conversation: stop without prompting.
        if _is_farewell(response):
            break

        user_input = input("Do you want to ask another question? (yes/no): ").strip().lower()
        if user_input not in ('yes', 'y'):
            break


In [None]:
# Reset the conversation history to the agent's opening line, then start a new call.
dialogue = [
    {
        'speaker_role': 'agent',
        'text': 'hello this is harper valley national bank my name is elizabeth how can i help you today',
    }
]
main()