<a href="https://colab.research.google.com/github/AshWN23/handwriting/blob/master/DeepVoice3_single_speaker_TTS_en_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepVoice3: Single-speaker text-to-speech demo

In this notebook, you can try DeepVoice3-based single-speaker text-to-speech (en) using a model trained on the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/). The notebook is meant to be run on [Google Colab](https://colab.research.google.com), so you don't have to set up anything on your local machine.

**Estimated time to complete**: 5 minutes.

- Code: https://github.com/r9y9/deepvoice3_pytorch
- Audio samples: https://r9y9.github.io/deepvoice3_pytorch/

## Setup

### Install dependencies

In [None]:
import os
from os.path import exists, join, expanduser

# Clone the DeepVoice3 PyTorch repository into the current working directory.
# The existence check is relative to the current directory, so the clone is
# skipped when an entry called "deepvoice3_pytorch" is already there.
name = "deepvoice3_pytorch"
if not exists(name):
  # `$name` is expanded by IPython to the value of the Python variable above.
  ! git clone https://github.com/r9y9/$name

In [None]:
# Change working directory to the project dir
# os.chdir(join(expanduser("~"), name)) # commented out the old line
os.chdir(name) # Change to the current directory where the repository was cloned

!git checkout 7a10ac6763eda92595e257543494b6a95f64229b --quiet

# Install dependencices
!pip install -q -e '.[bin]'

In [None]:
# `%pylab inline` pulls the matplotlib/numpy names (figure, subplot, imshow,
# xlabel, colorbar, ...) into the global namespace; the plotting helper defined
# later in this notebook calls them unqualified.
%pylab inline
# Install the audio and text-processing packages imported below.
! pip install -q librosa nltk

import torch
import numpy as np
import librosa
import librosa.display
import IPython
from IPython.display import Audio
# need this for English text processing frontend
import nltk
# Download the `cmudict` resource through nltk's downloader.
! python -m nltk.downloader cmudict

### Download a pre-trained model

In [None]:
# File names of the pre-trained model artifacts: the hyper-parameter preset
# (JSON) and the model checkpoint. Both are resolved relative to the current
# working directory by the cells that download and load them.
preset = "20180505_deepvoice3_ljspeech.json"
checkpoint_path = "20180505_deepvoice3_checkpoint_step000640000.pth"

In [None]:
# Fetch each artifact only when it is not already present in the working
# directory. `curl -O` saves under the remote file name and `-L` follows
# redirects.
# NOTE(review): confirm these Dropbox share links still return the raw file
# rather than an HTML page; a bad download is not detected in this cell.
if not exists(preset):
  !curl -O -L "https://www.dropbox.com/s/0ck82unm0bo0rxd/20180505_deepvoice3_ljspeech.json"
if not exists(checkpoint_path):
  !curl -O -L "https://www.dropbox.com/s/5ucl9remrwy5oeg/20180505_deepvoice3_checkpoint_step000640000.pth"

## Synthesis

### Setup hyper parameters

In [None]:
import json

import hparams

# Load parameters from preset
with open(preset) as f:
  preset_content = f.read()

# Fail early with a clear message instead of carrying on with hyper-parameters
# that were never overridden (e.g. after a failed or truncated download).
if not preset_content.strip():
  raise ValueError(
      "Preset file {!r} is empty; delete it and download it again.".format(preset))
try:
  json.loads(preset_content)
except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
  raise ValueError(
      "Preset file {!r} is not valid JSON; delete it and download it again.".format(preset)
  ) from err

# Override the repository's module-level hyper-parameter object in place.
# Parsing into a freshly constructed, empty HParams object does not work: it
# declares no keys for the preset to override, and swapping it in would also
# discard every default that hparams.py defines.
hparams.hparams.parse_json(preset_content)

# Inject frontend text processor
import synthesis
import train
from deepvoice3_pytorch import frontend
synthesis._frontend = getattr(frontend, "en")
train._frontend =  getattr(frontend, "en")

# aliases used by the synthesis and plotting helpers below
fs = hparams.hparams.sample_rate
hop_length = hparams.hparams.hop_size

In [None]:
# Uninstall the current TensorFlow version
!pip uninstall -y tensorflow tensorflow-gpu

# Install a compatible TensorFlow version (e.g., 1.15)
# NOTE(review): this cell runs after the hyper-parameter setup cell has already
# executed, and two later cells reinstall TensorFlow again (1.15.0 and 1.14),
# so whichever runs last decides the installed version. A package replaced
# under a running kernel is normally only picked up after a runtime restart.
# Confirm which single version is intended and move the install into the
# setup section.
!pip install tensorflow==1.15 tensorflow-gpu==1.15

In [None]:
!cat 20180505_deepvoice3_ljspeech.json

### Define utility functions

In [None]:
def tts(model, text, p=0, speaker_id=None, fast=True, figures=True):
  """
  Run DeepVoice3 synthesis for one piece of text and play the audio inline.

  All of model, text, p, speaker_id and fast are forwarded positionally, in
  that order, to the repository's `synthesis.tts`.

  Args:
    model: The loaded DeepVoice3 model.
    text (str): The text to turn into speech.
    p (float, optional): Probability of replacing words with their
      pronunciation. Defaults to 0.
    speaker_id (int, optional): Speaker ID for multi-speaker models.
      Defaults to None.
    fast (bool, optional): Whether to use the faster synthesis mode.
      Defaults to True.
    figures (bool, optional): When true, the alignment and spectrogram are
      passed to `visualize` before the audio is shown. Defaults to True.

  Returns:
    None: The audio widget is displayed as a side effect.
  """
  import synthesis as _synthesis

  # The synthesis call yields four values; the last (mel) is not used here.
  waveform, alignment, spectrogram, _ = _synthesis.tts(
      model, text, p, speaker_id, fast)

  if figures:
    visualize(alignment, spectrogram)

  # `fs` is the notebook-level sample-rate alias.
  player = Audio(waveform, rate=fs)
  IPython.display.display(player)

def visualize(alignment, spectrogram):
  """
  Draw the attention alignment and the spectrogram as two stacked panels.

  Relies on the pylab names made global by `%pylab inline` and on the
  notebook-level `fs` / `hop_length` aliases.

  Args:
    alignment: 2-D array-like; it is transposed before being drawn as an image.
    spectrogram: 2-D array-like; it is transposed before being handed to
      `librosa.display.specshow`.
  """
  axis_label_style = dict(fontsize=16)
  figure(figsize=(16,16))

  # Top panel: attention alignment.
  subplot(2,1,1)
  imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
  xlabel("Decoder timestamp", **axis_label_style)
  ylabel("Encoder timestamp", **axis_label_style)
  colorbar()

  # Bottom panel: spectrogram on a linear frequency axis.
  subplot(2,1,2)
  specshow_options = dict(sr=fs, hop_length=hop_length,
                          x_axis="time", y_axis="linear")
  librosa.display.specshow(spectrogram.T, **specshow_options)
  xlabel("Time", **axis_label_style)
  ylabel("Hz", **axis_label_style)
  tight_layout()
  colorbar()

In [None]:
# Uninstall any existing TensorFlow installations
# NOTE(review): this notebook contains three TensorFlow reinstall cells
# (1.15, 1.15.0 and 1.14); this one repeats the 1.15 install and is overridden
# by the 1.14 cell that follows. Confirm which single version is intended.
!pip uninstall -y tensorflow tensorflow-gpu

# Install a specific TensorFlow version (e.g., 1.15.0)
!pip install tensorflow==1.15.0 tensorflow-gpu==1.15.0

In [None]:
# Uninstall any existing TensorFlow installations
# NOTE(review): on a top-to-bottom run this is the last of three TensorFlow
# reinstall cells, so 1.14 is the version left installed, replacing the 1.15
# installs from the earlier cells. Confirm that this is intended.
!pip uninstall -y tensorflow tensorflow-gpu

# Install a different compatible TensorFlow version (e.g., 1.14)
!pip install tensorflow==1.14 tensorflow-gpu==1.14

### Load the model checkpoint

In [None]:
# Import the model-building and checkpoint-loading helpers from the
# repository's train module.
from train import build_model
from train import restore_parts, load_checkpoint

# Build the model, then load the downloaded checkpoint into it.
# NOTE(review): the trailing `None, True` are positional arguments whose
# meaning is defined by train.load_checkpoint (not visible here) -- presumably
# the optimizer and a reset-optimizer flag; confirm against train.py.
model = build_model()
model = load_checkpoint(checkpoint_path, model, None, True)

In [None]:
# Debug aid: print train.py from the working directory for inspection. This
# only prints the file and changes no Python state.
!cat train.py

In [None]:
# Debug aid: print hparams.py from the working directory for inspection. This
# only prints the file and changes no Python state.
!cat hparams.py

In [None]:
# Debug aid: print deepvoice3_pytorch/hparams.py for inspection.
# NOTE(review): verify this path exists at the checked-out commit; the
# notebook itself imports the top-level `hparams` module.
!cat deepvoice3_pytorch/hparams.py

In [None]:
# NOTE(review): this install sits after the cells that import `train` and
# `synthesis`. If docopt is needed by those imports (presumably -- confirm),
# it has to be installed in the setup section instead.
!pip install docopt

### Generate speech

In [None]:
# Try your favorite sentences :)
texts = [
    "Scientists at the CERN laboratory say they have discovered a new particle.",
    "There's a way to measure the acute emotional intelligence that has never gone out of style.",
    "President Trump met with other leaders at the Group of 20 conference.",
    "The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
    "Generative adversarial network or variational auto-encoder.",
    "The buses aren't the problem, they actually provide a solution.",
    "peter piper picked a peck of pickled peppers how many peppers did peter piper pick.",
    "Some have accepted this as a miracle without any physical explanation.",
]

# Synthesize every sentence in order, printing its index and text first so
# each audio widget can be matched to its input.
for idx, text in enumerate(texts):
  print(idx, text)
  # figures=False skips the attention plot and spectrogram, so only the audio
  # player is shown for each sentence.
  tts(model, text, figures=False)

In [None]:
# NOTE(review): this install runs after speech has already been generated
# above. If lws is a dependency of the synthesis code (presumably -- confirm),
# it has to be installed in the setup section instead.
!pip install lws

In [None]:
# With attention plot: figures=True makes tts() call visualize() on the
# alignment and spectrogram before showing the audio player.
text = "Generative adversarial network or variational auto-encoder."
tts(model, text, figures=True)

In [None]:
# Replace '/path/to/your/downloaded/lws.whl' with the actual path or URL of the lws wheel file
!pip install /path/to/your/downloaded/lws.whl

For details, please visit https://github.com/r9y9/deepvoice3_pytorch