## Tacotron 2 inference code 
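Edit the variables **checkpoint_path**, **waveglow_path** and **text** to match your setup, then run the entire notebook to generate plots of mel outputs and alignments and to synthesize audio from the generated mel-spectrogram. Note that the code below vocodes with WaveGlow (plus a denoiser) rather than Griffin-Lim.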

#### Import libraries and setup matplotlib

In [None]:
# Clone the DialogueBot repository and copy its contents into the working directory.
!git clone https://github.com/EdvardOlsen/DialogueBot.git
!cp -r DialogueBot/* .
# Grant full permissions (777) on the setup script, then run it.
!chmod 777 setup.sh 
!./setup.sh

In [6]:
from denoiser import Denoiser
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
import speech_recognition as sr
import transformers
import IPython
from transformers import AutoModelForCausalLM, AutoTokenizer

In [7]:
def plot_data(data, figsize=(16, 4)):
    """Show every array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of arrays accepted by ``Axes.imshow``; one subplot is
            created per element.
        figsize: Size of the whole figure, passed through to ``plt.subplots``.
    """
    # squeeze=False keeps ``axes`` two-dimensional even when there is only a
    # single panel, so a one-element ``data`` no longer breaks the indexing.
    _, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' places row 0 at the bottom of the plot. The previous value
        # 'bottom' is not a valid ``origin`` and is rejected by current
        # matplotlib releases.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams

In [None]:
# Build the hyperparameter object and set its sampling rate to 22050;
# generate_voice() later passes this value to sf.write as the sample rate.
hparams = create_hparams()
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive/; the checkpoint paths used by the
# following cells point below this mount point.
drive.mount('/content/drive/')

In [10]:
# Placeholder path: replace the "..." with the actual Tacotron 2 checkpoint file.
checkpoint_path = "/content/drive/MyDrive/tacotron/..."
model = load_model(hparams)
# The weights are read from the checkpoint's 'state_dict' entry.
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
# Move the model to the GPU, switch it to eval mode and cast it to half precision.
_ = model.cuda().eval().half()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [None]:
# Placeholder path: replace the "..." with the actual WaveGlow checkpoint file.
waveglow_path = '/content/drive/MyDrive/tacotron/...'
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. The model object is taken from the 'model' entry.
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval().half()
# Each entry of waveglow.convinv is cast back to float after the half() above.
for k in waveglow.convinv:
    k.float()
# Denoiser built from the loaded WaveGlow model; used in generate_voice().
denoiser = Denoiser(waveglow)

#### Prepare text input

In [12]:
import soundfile as sf
text = "Hi my name is DialogueBot I am happy to meet you."
def generate_voice(text):
    """Synthesize ``text`` to speech and write the result to ``audio.wav``.

    Relies on the module-level ``model``, ``waveglow``, ``denoiser`` and
    ``hparams`` objects created in the earlier cells, and on a CUDA device
    (the input sequence is moved to the GPU).

    Args:
        text: The sentence to speak; it is cleaned with ``english_cleaners``
            by ``text_to_sequence``.

    Returns:
        The path of the written file, always ``'audio.wav'``.
    """
    # [None, :] adds a leading axis so the model receives a batch of one.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Everything below is pure inference, so run all three stages without
    # autograd tracking instead of only the WaveGlow call.
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio = denoiser(audio, strength=0.01)[:, 0]

    # Convert to float32 samples before handing them to soundfile.
    audio = np.float32(audio[0].cpu().numpy())
    sf.write('audio.wav', audio, samplerate=hparams.sampling_rate)
    return 'audio.wav'

In [None]:
IPython.display.Audio('audio.wav')

In [30]:
# Load the tokenizer and the causal language model published under the name
# "microsoft/DialoGPT-large"; both are used by generate_answer_text().
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
modelgpt = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

In [19]:
def recognise(audio_file):
    """Transcribe the speech contained in ``audio_file``.

    Args:
        audio_file: Path (or file-like object) readable by ``sr.AudioFile``.

    Returns:
        The text returned by ``Recognizer.recognize_google``.

    Raises:
        Whatever ``recognize_google`` raises, e.g. ``sr.UnknownValueError``
        when the speech cannot be understood.
    """
    # Use a local recognizer so this function works regardless of whether the
    # later cell that defines the global ``r`` has been executed yet.
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    return recognizer.recognize_google(audio)

In [None]:
def generate_answer_text(text):
    """Return the reply generated by the module-level ``modelgpt`` for ``text``.

    The user text is terminated with the tokenizer's EOS token, fed to
    ``modelgpt.generate``, and only the tokens produced after the prompt are
    decoded back into a string.
    """
    prompt_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
    prompt_length = prompt_ids.shape[-1]
    generated_ids = modelgpt.generate(
        prompt_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the echoed prompt and keep the first (only) sequence in the batch.
    reply_ids = generated_ids[:, prompt_length:][0]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

generate_answer_text('hi how are you')

In [None]:
generate_answer_text('hehe')

In [22]:
def generate_answer_audio(audiofile):
    """Transcribe ``audiofile``, print the transcript and return a text reply."""
    transcript = recognise(audiofile)
    print(transcript)
    reply = generate_answer_text(transcript)
    return reply

In [None]:
def make_response(oggfile):
    """Produce a spoken reply for the voice recording stored in ``oggfile``.

    The recording is converted to WAV, transcribed (the transcript is
    printed), answered as text, synthesized to speech and finally converted
    to OGG. Returns whatever ``convert_to_ogg`` returns.
    """
    # The conversion result is picked up from the fixed path 'tmp.wav'.
    convert_to_wav(oggfile)
    user_text = recognise('tmp.wav')
    print(user_text)
    reply_wav = generate_voice(generate_answer_text(user_text))
    return convert_to_ogg(reply_wav)

make_response('new_file.ogg')

In [31]:
# Telegram bot token passed to telebot.TeleBot below; left empty here, set it
# to your own bot's token before running.
token=''
import telebot
from urllib.request import urlretrieve
import librosa
import soundfile as sf
# NOTE(review): `i` is not referenced anywhere in the visible code.
i = 0
# Module-level speech recognizer instance.
r = sr.Recognizer()

def convert_to_wav(filename):
    """Load ``filename`` with librosa and re-save the audio as ``tmp.wav``.

    Returns the output path, always ``'tmp.wav'``.
    """
    output_path = 'tmp.wav'
    samples, sample_rate = librosa.load(filename)
    sf.write(output_path, samples, sample_rate)
    return output_path

def convert_to_ogg(filename):
    """Load ``filename`` with librosa and re-save the audio as ``tmp.ogg``.

    Returns the output path, always ``'tmp.ogg'``.
    """
    output_path = 'tmp.ogg'
    samples, sample_rate = librosa.load(filename)
    sf.write(output_path, samples, sample_rate)
    return output_path

def make_response(oggfile):
    """Produce a spoken reply for the voice recording stored in ``oggfile``.

    The recording is converted to WAV, transcribed, answered as text,
    synthesized to speech and finally converted to OGG. Returns whatever
    ``convert_to_ogg`` returns.
    """
    # The conversion result is picked up from the fixed path 'tmp.wav'.
    convert_to_wav(oggfile)
    user_text = recognise('tmp.wav')
    reply_text = generate_answer_text(user_text)
    reply_wav = generate_voice(reply_text)
    return convert_to_ogg(reply_wav)

bot = telebot.TeleBot(token)
@bot.message_handler(content_types=["audio", "text", "voice"])
def handle(message, i=10):
    """Answer an incoming Telegram message with a synthesized spoken reply.

    Voice messages are downloaded to ``new_file.ogg``, run through
    ``make_response`` and answered with the generated ``audio.wav``. Messages
    without a voice payload, and recordings that cannot be processed, are
    answered with the text fallback ``'hehe'``.

    Args:
        message: The incoming telebot message.
        i: Unused; kept so the handler's signature stays unchanged.
    """
    chat_id = message.chat.id

    # The handler is also registered for "audio" and "text" messages, which
    # carry no voice payload; reply with the fallback instead of failing on
    # ``message.voice.file_id``.
    if getattr(message, 'voice', None) is None:
        bot.send_message(chat_id=chat_id, text='hehe')
        return

    try:
        file_info = bot.get_file(message.voice.file_id)
        downloaded_file = bot.download_file(file_info.file_path)
        with open('new_file.ogg', 'wb') as new_file:
            new_file.write(downloaded_file)

        # make_response converts the download to WAV itself, so no separate
        # conversion is needed beforehand. Its side effect is the synthesized
        # 'audio.wav', which is what gets sent back.
        make_response('new_file.ogg')

        # Close the file handle once the upload has finished.
        with open('audio.wav', 'rb') as reply_audio:
            bot.send_audio(chat_id=chat_id, audio=reply_audio)

    except (ValueError, sr.UnknownValueError):
        # sr.UnknownValueError (speech not understood) is not a ValueError
        # subclass, so it has to be listed explicitly to reach the fallback.
        bot.send_message(chat_id=chat_id, text='hehe')

In [None]:
bot.polling()