## Importar librerías (si algo no anda, preguntenme y vemos)

In [9]:
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
import soundfile as sf
import random
from IPython.display import Audio
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import numpy as np

## Record audio

In [2]:
def record_audio():
  camera = CameraStream(constraints={'audio': True,'video':False})
  recorder = AudioRecorder(stream=camera)
  display(recorder)
  return recorder

def save_recording(recorder, audio_out_path):
  with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
  !ffmpeg -i recording.webm -ac 1 -f wav {audio_out_path} -y -hide_banner -loglevel panic -ar 48000

def load_audio(audio_path):
  wave, sample_rate = torchaudio.load(audio_path)
  return np.array(wave)[0], sample_rate

def play_audio(wave, sample_rate):
  print(wave.shape)
  display(Audio(data=wave, rate=sample_rate))


## Dialog system

In [3]:
class TTSModel():
  def __init__(self):
    self.spec_generator = FastPitchModel.from_pretrained("nvidia/tts_en_fastpitch")
    self.model = HifiGanModel.from_pretrained(model_name="nvidia/tts_hifigan")

  def generate_speech(self, text):
    parsed = self.spec_generator.parse(text)
    spectrogram = self.spec_generator.generate_spectrogram(tokens=parsed)
    audio = self.model.convert_spectrogram_to_audio(spec=spectrogram).detach().numpy()
    return audio

class ASRModel():
  def __init__(self):
    self.processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    self.model.config.forced_decoder_ids = None

  def recognize_speech(self, audio, sample_rate=48000):
    audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
    input_features = self.processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = self.model.generate(input_features)
    transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

class SpokenDialogSystem():
  def __init__(self):
    self.tts_model = TTSModel()
    self.asr_model = ASRModel()
    self.current_state = "STATE_START"

  #Inicio del sistema
  def start(self):
    start_audio = self.tts_model.generate_speech("Would you like to hear a fun fact?")
    display(Audio(start_audio, rate=22050))

  # Main dialog function
  def listen_and_respond(self, audio):
    transcription = self.asr_model.recognize_speech(audio)
    dialog_act = self.understand_transcription(transcription)
    response_code = self.advance_dialog(dialog_act)
    response_text = self.generate_language(response_code)
    response_audio = self.tts_model.generate_speech(response_text)
    display(Audio(response_audio, rate=22050))

  #Language comprehension
  
  def understand_transcription(self, transcription):
    transcription = transcription.lower()
    print(transcription)

    if "yes" in transcription:
      return "ACT_YES"
    if "no" in transcription:
      return "ACT_NO"
    else:
      return "ACT_UNKNOWN"

  #Dialog manager
  def advance_dialog(self, dialog_act):

    if self.current_state == "STATE_START":

      if dialog_act == "ACT_YES":
        return "RESPONSE_TELL_FACT"

      elif dialog_act == "ACT_NO":
        self.current_state = "STATE_FINISH"
        return "RESPONSE_OKAY"

      else:
        return "RESPONSE_REPEAT"

    else:
      return "RESPONSE_EMPTY"

  #Language generation
  def generate_language(self, response_code):
    if response_code == "RESPONSE_TELL_FACT":
      return self.get_new_fun_fact() + "... Would you like to hear another fun fact?"

    elif response_code == "RESPONSE_OKAY":
      return "Okay, I won't tell you any more fun facts."

    elif response_code == "RESPONSE_REPEAT":
      return "Sorry, I didn't understand you. Would you like to hear a fun fact? Yes or no?"

    else:
      raise ValueError(f"ERROR: Unknown response code {response_code}")

  def get_new_fun_fact(self):
    facts = ["The shortest war in history lasted 38 to 45 minutes.",
             "Bananas are berries, but strawberries aren't.",
             "A group of flamingos is called a flamboyance."]

    return random.choice(facts)


In [43]:
#Step 0. New system instance (this could take a while, be patient!)
SDS = SpokenDialogSystem()

[NeMo W 2023-10-29 18:38:23 en_us_arpabet:66] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2023-10-29 18:38:23 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /ws/LJSpeech/nvidia_ljspeech_train_clean_ngc.json
      sample_rate: 22050
      sup_data_path: /raid/LJSpeech/supplementary
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null
      

[NeMo I 2023-10-29 18:38:23 features:289] PADDING: 1
[NeMo I 2023-10-29 18:38:24 save_restore_connector:249] Model FastPitchModel was successfully restored from /home/ana/.cache/huggingface/hub/models--nvidia--tts_en_fastpitch/snapshots/2c8305b7b41b33fd6367f0635796dc3a7a33cbf9/tts_en_fastpitch.nemo.


[NeMo W 2023-10-29 18:38:33 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2023-10-29 18:38:33 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2023-10-29 18:38:33 features:289] PADDING: 0


[NeMo W 2023-10-29 18:38:33 features:266] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2023-10-29 18:38:33 features:289] PADDING: 0


    


[NeMo I 2023-10-29 18:38:35 save_restore_connector:249] Model HifiGanModel was successfully restored from /home/ana/.cache/huggingface/hub/models--nvidia--tts_hifigan/snapshots/3ba1fed954276287015654bf4c78060ffc9a4772/tts_hifigan.nemo.


In [44]:
#Step 1. Start dialog
SDS.start()

[NeMo W 2023-10-29 18:38:49 fastpitch:291] parse() is meant to be called in eval mode.
[NeMo W 2023-10-29 18:38:49 fastpitch:368] generate_spectrogram() is meant to be called in eval mode.


In [45]:
#Step 2. Record audio
recorder = record_audio()

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [50]:
#Step 3. Save audio
recording_path = "recording.wav"
save_recording(recorder, recording_path)
wave, sr = load_audio(recording_path)

In [51]:
#Step 4. Answer generation
SDS.listen_and_respond(wave)

[NeMo W 2023-10-29 18:40:28 fastpitch:291] parse() is meant to be called in eval mode.
[NeMo W 2023-10-29 18:40:28 fastpitch:368] generate_spectrogram() is meant to be called in eval mode.


 no.
