# **INSTALLING ALL THE LIBS:**

In [None]:
# Start a wall-clock timer so the total installation time can be reported
# after the install cells below have run. `time` is reused by later cells.
from time import time
start_install = time()

In [None]:
# Shell commands run through the notebook's `!` escape.
# Refresh the apt package index, then install the libsndfile1 and ffmpeg
# system packages.
!apt-get update && apt-get install -y libsndfile1 ffmpeg
# Cython is installed before the NeMo toolkit.
!pip install Cython
# NeMo toolkit with its "all" extra; provides the `nemo` package used for the
# speech-recognition and text-to-speech models loaded later.
!pip install nemo_toolkit[all]

In [None]:
# Hugging Face transformers with its "torch" extra; provides the tokenizer and
# causal language model classes imported later for DialoGPT.
!pip install transformers[torch]

In [None]:
# Report the seconds elapsed since `start_install` was recorded.
print(f"TOTAL INSTALLING TIME IS: {time() - start_install}s")

# **STARTING CONFIGURATION:**

In [None]:
# Start a timer covering the import and model-loading cells that follow.
start_importing = time()

In [None]:
import torch

In [None]:
import nemo.collections.asr as nemo_asr

# Speech-to-text model: load the pretrained "QuartzNet15x5Base-En" checkpoint
# by name. `model` is used later to transcribe the input audio files.
model = nemo_asr.models.ASRModel.from_pretrained(model_name="QuartzNet15x5Base-En")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Conversational language model: the tokenizer and the model weights must come
# from the same checkpoint, so the checkpoint name is spelled out only once.
DIALOGPT_CHECKPOINT = "microsoft/DialoGPT-large"

# `tokenizer` turns text into token ids (and back); `model2` generates the
# reply tokens in the assistant cells further down.
tokenizer = AutoTokenizer.from_pretrained(DIALOGPT_CHECKPOINT)
model2 = AutoModelForCausalLM.from_pretrained(DIALOGPT_CHECKPOINT)

In [None]:
import soundfile as sf
from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder

# Download and load the pretrained FastPitch spectrogram generator (text -> spectrogram)
spec_generator = SpectrogramGenerator.from_pretrained("tts_en_fastpitch")
# Download and load the pretrained SqueezeWave vocoder (spectrogram -> audio)
vocoder = Vocoder.from_pretrained("tts_squeezewave")

In [None]:
# Report the seconds elapsed since `start_importing` was recorded, i.e. the
# time spent on the imports and model loading above (despite the label).
print(f"TOTAL EXECUTING TIME IS: {time() - start_importing}s")

# **EXECUTING ASSISTANT**

In [None]:
# Start a timer for one full assistant round trip
# (speech -> text -> reply -> speech).
assistant_time = time()

In [None]:
# Batch size passed to the speech-recognition model's `transcribe` call.
BATCH_SIZE = 1

In [None]:
# Audio files to transcribe: a single WAV file, given as a path relative to
# the notebook's working directory.
files = ["../input/vidtimit-audiovideo-dataset/fadg0/fadg0/audio/si649.wav"]

In [None]:
# Speech-to-text: transcribe every path in `files`. The first entry of the
# result is the transcript of the first (and only) file; it is used as the
# user's question in the next cell.
asking = model.transcribe(paths2audio_files=files, batch_size=BATCH_SIZE, logprobs=False)
# Display the transcript as the cell output.
asking[0]

In [None]:
# Encode the transcribed question and terminate it with the EOS token, then
# return it as a PyTorch tensor.
new_user_input_ids = tokenizer.encode(asking[0] + tokenizer.eos_token, return_tensors='pt')
# Single-turn assistant: there is no earlier chat history to prepend, so the
# prompt is just the encoded question. (For multi-turn use, concatenate the
# previous `chat_history_ids` with `new_user_input_ids` along the last dim.)
bot_input_ids = new_user_input_ids
# Generate the reply with greedy decoding; prompt plus reply are capped at 512
# tokens in total. Sampling knobs (temperature / top_p / top_k) only take
# effect together with do_sample=True, and early_stopping only with beam
# search, so they are not passed here: they were inert and only produced
# warnings. The EOS token doubles as the padding token because the tokenizer
# has no dedicated pad token configured here.
chat_history_ids = model2.generate(
    bot_input_ids,
    max_length=512,
    pad_token_id=tokenizer.eos_token_id,
)

# Keep only the newly generated tokens (everything after the prompt) of the
# first and only sequence, and decode them to text without special tokens.
answers = tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)
# Display the reply as the cell output.
answers

In [None]:
# Text-to-speech in three steps:
#   1. the spectrogram generator parses the raw reply text into tokens,
#   2. it turns those tokens into a spectrogram,
#   3. the vocoder converts the spectrogram to audio.
audio = vocoder.convert_spectrogram_to_audio(
    spec=spec_generator.generate_spectrogram(
        tokens=spec_generator.parse(answers)
    )
)

# Save the audio to disk in a file called speech.wav.
# The vocoder returns a batch of audio; here we take the first and only sample.
# The tensor is moved to the CPU first: `.numpy()` raises for tensors living on
# a CUDA device, and `.cpu()` is a no-op when the tensor is already on the CPU.
# 22050 is the sample rate written to the file; presumably it matches the
# output rate of the pretrained TTS models (confirm if the models are swapped).
sf.write("speech.wav", audio.detach().cpu().numpy()[0], 22050)

In [None]:
# Report the seconds elapsed since `assistant_time` was recorded, i.e. the
# duration of the transcription, reply generation and speech synthesis cells.
print(f"TOTAL ASSISTANT WORK TIME IS: {time() - assistant_time}s")