In [1]:
!python --version

Python 3.7.11


### Preparing the Data

In [2]:
trixie_data_path = "Dataset/audio/Trixie.tar"

In [3]:
# Options for Dataloading
Pony = 'Trixie' # <-- Pick a voice e.g 'Rainbow-Dash' or 'Rarity'

skip_noisy = True # Disable to train with Noisy Data included
percentage_training_data = 0.95 # 95% of Data will be used for training, 5% for Validation
allowed_emotions = """
Anxious
Angry
Annoyed
Amused
Confused
Crazy
Disgust
Exhausted
Fear
Happy
Neutral
Sad
Serious
Singing
Shouting
Surprised
Smug
Love
Sarcastic
Tired
Whispering
Whining
""".split("\n")[1:-1]

In [4]:
#=== load the repo and data (Thanks Synthbot) ===
!apt -qq install -y sox
!git clone "https://github.com/synthbot-anon/synthbot.git" /content/synthbot
!(cd /content/synthbot; git checkout experimental)
!pip install pysoundfile 
!pip install librosa

sox is already the newest version (14.4.1-5+deb8u4ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.
fatal: destination path '/content/synthbot' already exists and is not an empty directory.
Already on 'experimental'
Your branch is up-to-date with 'origin/experimental'.
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
import sys
sys.path.append('/content/synthbot/src')
from ponysynth.corpus import ClipperArchive, phoneme_transcription
import librosa
import subprocess
from tqdm.notebook import tqdm

In [6]:
archive_fn = "/home/deepankaracharyya/TTS-exp/Dataset/audio/Trixie.tar"
archive = ClipperArchive(archive_fn)

In [2]:
cd /home/deepankaracharyya/TTS-exp/Dataset/audio/

/home/deepankaracharyya/TTS-exp/Dataset/audio


In [9]:
!pip install ipywidgets --user

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [10]:
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [11]:
#=== output the data in NVIDIA/Tacotron2's required format
# Note that NVIDIA/Tacotron2 doesn't seem to use the test set, so this only
# creates the training and validation sets.
data_path = 'audio'
!mkdir {data_path}

allowed_emotions = [x.lower() for x in allowed_emotions]
!mkdir {data_path}
!mkdir {data_path+"/out"}
all_clips = []; all_clips_arpa = []; skipped_count=0; too_short_count=0; emotion_skip=0
for key in archive.keys(): # write the audio files for processing in bash terminal
  audio = archive.read_audio(key)
  audio_fn = '{}/{}.wav'.format(data_path, key)
  audio_fn_ = '{}/{}.wav'.format(data_path+"/out", key)
  with open(audio_fn, 'wb') as audio_out:
    audio_out.write(audio.read())

mkdir: cannot create directory ‘audio’: File exists
mkdir: cannot create directory ‘audio’: File exists
mkdir: cannot create directory ‘audio/out’: File exists


In [12]:
%%script bash
# trim all 48Khz files
mkdir /audio/out
cd audio;
for input in *.wav; do
  output="out/$input"
  sox "$input" "$output" silence 1 0.05 0.1% reverse silence 1 0.05 0.1% reverse;
done

mkdir: cannot create directory ‘/audio/out’: No such file or directory


In [7]:
import os

should_continue = 0 # should run "continue" command inside outer loop.
for key in archive.keys():
  label = archive.read_label(key)
  audio_fn = '{}/{}.wav'.format(data_path, key)
  audio_fn_ = '{}/{}.wav'.format(data_path+"/out", key)
  if (label['noise'] in ['Very Noisy','Noisy']) and skip_noisy: os.remove(audio_fn_); skipped_count+=1; continue

  for tag in label['tags']:
    if tag.lower() not in allowed_emotions:
        try: os.remove(audio_fn_)
        except: pass
        print(tag+" emotion not in list"); emotion_skip+=1; should_continue = 1; break # this is supposed to break the outer loop
  if should_continue: should_continue = 0; continue

  audio = archive.read_audio(key)
  transcript = label['utterance']['content']
  
  if os.stat(audio_fn_).st_size < 71602:
    #print("Skipping Audio, Duration: "+str(len(librosa.core.load(audio_fn_, sr=48000)[0])/48000))
    try: os.remove(audio_fn_)
    except: pass
    too_short_count+=1; continue # Skips files based on size.

  filelist_line = "{}|{}".format(audio_fn_, transcript)
  all_clips.append(filelist_line)
  filelist_line = "{}|{}".format(audio_fn_, phoneme_transcription(label))
  all_clips_arpa.append(filelist_line)
print(str(skipped_count)+" Files are too Noisy.")
print(str(emotion_skip)+" Files contain an emotion not in permitted emotions")
print(str(too_short_count)+" Files are too short")
print(str(len(list(archive.keys()))-(skipped_count+too_short_count+emotion_skip))+" Files kept in dataset.")

NameError: name 'archive' is not defined

In [15]:
# shuffle the training data
import random
random.seed(0)
random.shuffle(all_clips)

# get train, test, validation splits
num_clips = len(all_clips)
train_end = int(num_clips * percentage_training_data)

train = all_clips[:train_end]; validation = all_clips[train_end:]; train_arpa = all_clips_arpa[:train_end]; validation_arpa = all_clips_arpa[train_end:]

# dump the info to filelist files
with open('clipper_train_filelist.txt', 'w') as train_out:
  train_out.write('\n'.join(train)+'\n'+'\n'.join(train_arpa)+"")
with open('clipper_val_filelist.txt', 'w') as val_out:
  val_out.write('\n'.join(validation)+'\n'+'\n'.join(validation_arpa)+"")

In [16]:
BRANCH = 'r1.8.2'

In [17]:
!apt-get install sox libsndfile1 ffmpeg
!pip install wget unidecode pynini==2.1.4
!pip install hydra-core==1.1

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.25-10ubuntu0.16.04.3).
ffmpeg is already the newest version (7:2.8.17-0ubuntu0.1).
sox is already the newest version (14.4.1-5+deb8u4ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [18]:
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [19]:
!nvidia-smi

Sun Jun 12 07:18:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 495.46       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [30]:
!pip install protobuf==3.19.0

Collecting protobuf==3.19.0
  Downloading protobuf-3.19.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 7.1 MB/s eta 0:00:01
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.17.3
    Uninstalling protobuf-3.17.3:
      Successfully uninstalled protobuf-3.17.3
Successfully installed protobuf-3.19.0
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [76]:
#imports
import os
import json
import librosa

import torch
import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
from pathlib import Path

In [77]:
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import FastPitchModel

In [78]:
pwd

'/home/deepankaracharyya/TTS-exp/Dataset/audio'

In [79]:
#creating the json files for training and validation
with open('clipper_train_filelist.txt') as f :
  records = f.readlines()

print("Number of records : ",len(records))

Number of records :  948


In [80]:
import random

In [83]:
train_manifest = 'fastpitch_train.json'

train_rec = []
random.shuffle(records)
#count = 0

for i in records :
  # if count > 160 :
  #   break
  
  i = i.split('|')
  audio_filepath = i[0]
  text = i[-1].strip('\n')
  #if '{' in text :
    #print(text)
    #continue
  #count = count + 1

  duration = librosa.get_duration(filename=audio_filepath)
  r = {
       "audio_filepath" : audio_filepath,
       "text" : text,
       "duration" : round(duration,1),
       "text_no_preprocessing" : text
    }

  train_rec.append(r)

with open(train_manifest, "w") as f:
    for s in train_rec:
        f.write(json.dumps(s) + '\n')
        
print("Training Data : ", len(train_rec))

Training Data :  948


In [84]:
with open('clipper_val_filelist.txt') as f :
  records_val = f.readlines()

print("Number of records : ", len(records_val))

Number of records :  50


In [85]:
val_manifest = 'fastpitch_val.json'
#count = 0
val_rec = []
random.shuffle(records_val)

for i in records_val:
  # if count > 20 :
  #   break
    
  i = i.split('|')
  audio_filepath = i[0]
  text = i[-1].strip('\n')

  # if '{' in text :
  #   print(text)
  #   continue
  # count = count + 1
  duration = librosa.get_duration(filename=audio_filepath)
  r = {
       "audio_filepath" : audio_filepath,
       "text" : text,
       "duration" : round(duration,1),
       "text_no_preprocessing" : text
    }

  val_rec.append(r)

with open(val_manifest, "w") as f:
    for s in val_rec:
        f.write(json.dumps(s) + '\n')

In [19]:
home_path = !(echo $HOME)
home_path = home_path[0]
print(home_path)

FastPitchModel.from_pretrained("tts_en_fastpitch")
nemo_files = [p for p in Path(f"{home_path}/.cache/torch/NeMo/").glob("**/tts_en_fastpitch_align.nemo")]
print(f"Copying {nemo_files[0]} to ./")
Path("./tts_en_fastpitch_align.nemo").write_bytes(nemo_files[0].read_bytes())

/root
[NeMo I 2022-06-12 08:00:46 cloud:66] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemo to /root/.cache/torch/NeMo/NeMo_1.10.0rc0/tts_en_fastpitch_align/26d7e09971f1d611e24df90c7a9d9b38/tts_en_fastpitch_align.nemo
100% [......................................................................] 187023360 / 187023360[NeMo I 2022-06-12 08:00:52 common:789] Instantiating model from pre-trained checkpoint
[NeMo I 2022-06-12 08:00:56 tokenize_and_classify:87] Creating ClassifyFst grammars.


[NeMo W 2022-06-12 08:01:26 experimental:28] Module <class 'nemo.collections.tts.torch.g2ps.IPAG2P'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2022-06-12 08:01:27 g2ps:87] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2022-06-12 08:01:27 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /ws/LJSpeech/nvidia_ljspeech_train_clean_ngc.json
      sample_rate: 22050
      sup_data_path: /raid/LJSpeech/supplementary
      sup_data_types:
      - align_prior_matri

[NeMo I 2022-06-12 08:01:27 features:200] PADDING: 1
[NeMo I 2022-06-12 08:01:31 save_restore_connector:243] Model FastPitchModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.10.0rc0/tts_en_fastpitch_align/26d7e09971f1d611e24df90c7a9d9b38/tts_en_fastpitch_align.nemo.
Copying /root/.cache/torch/NeMo/NeMo_1.10.0rc0/tts_en_fastpitch_align/26d7e09971f1d611e24df90c7a9d9b38/tts_en_fastpitch_align.nemo to ./


187023360

In [21]:
!wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/fastpitch_finetune.py

!mkdir -p conf \
&& cd conf \
&& wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/fastpitch_align_v1.05.yaml \
&& cd ..

--2022-06-12 08:04:30--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/fastpitch_finetune.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1751 (1.7K) [text/plain]
Saving to: ‘fastpitch_finetune.py’


2022-06-12 08:04:30 (38.9 MB/s) - ‘fastpitch_finetune.py’ saved [1751/1751]

--2022-06-12 08:04:30--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/fastpitch_align_v1.05.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6692 (6.5K) [text/plain]
Saving to: ‘fastpitch_align_v1.05.yaml’


20

In [28]:
# additional files
!mkdir -p tts_dataset_files && cd tts_dataset_files \
&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \
&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/heteronyms-030921 \
&& wget https://github.com/NVIDIA/NeMo/blob/77be76349901d0dd99ad6778509ef5c0663a65c3/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv \
&& cd ..

--2022-06-12 08:11:27--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/cmudict-0.7b_nv22.01
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3721964 (3.5M) [text/plain]
Saving to: ‘cmudict-0.7b_nv22.01.1’


2022-06-12 08:11:27 (68.6 MB/s) - ‘cmudict-0.7b_nv22.01.1’ saved [3721964/3721964]

--2022-06-12 08:11:27--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/heteronyms-030921
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3132 (3.1K) [text/plain]
Saving to: ‘heteronyms

In [None]:
# TODO(oktai15): remove +model.text_tokenizer.add_blank_at=true when we update FastPitch checkpoint

!(python fastpitch_finetune.py --config-name=fastpitch_align_v1.05.yaml \
  train_dataset=./fastpitch_train.json \
  validation_datasets=./fastpitch_val.json \
  sup_data_path=./fastpitch_sup_data \
  phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \
  heteronyms_path=tts_dataset_files/heteronyms-030921 \
  whitelist_path=tts_dataset_files/whitelist.tsv \
  exp_manager.exp_dir=./trixie_logs_fp \
  +init_from_nemo_model=./tts_en_fastpitch_align.nemo \
  +trainer.max_steps=2000 ~trainer.max_epochs \
  trainer.check_val_every_n_epoch=5 \
  model.train_ds.dataloader_params.batch_size=16 model.validation_ds.dataloader_params.batch_size=16 \
  model.n_speakers=1 model.pitch_mean=130 model.pitch_std=25 \
  model.pitch_fmin=30 model.pitch_fmax=512 model.optim.lr=2e-4 \
  ~model.optim.sched model.optim.name=adam trainer.devices=1 trainer.strategy=null \
  +model.text_tokenizer.add_blank_at=true \
)

[NeMo W 2022-06-12 11:23:14 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2022-06-12 11:23:17 experimental:28] Module <class 'nemo.collections.tts.torch.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[NeMo I 2022-06-12 11:23:19 exp_manager:287] Experiments will be logged at trixie_logs_fp/FastPitch/2022-06-12_11-23-19
[NeMo I 2022-06-12 11:23:19 exp_manager:661] TensorboardLogger has been set up
      rank_zero_deprecation("`Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8.")
    
[NeMo I 2022-06-12 11:23:22 tokenize_and_classify:87] Creating ClassifyFst grammars.
[NeMo W 2022-06-12 11:23:58 experimental:28] Module <class 'nemo.collect

In [None]:
print("DONE")

In [43]:
FP_chck_pt = "./trixie_logs_fp/FastPitch/2022-06-12_08-12-40/checkpoints/FastPitch--v_loss=2.4638-epoch=24-last.ckpt"
spec_model = FastPitchModel.load_from_checkpoint(FP_chck_pt)
spec_model.eval().cuda()

[NeMo I 2022-06-12 09:12:39 tokenize_and_classify:87] Creating ClassifyFst grammars.


[NeMo W 2022-06-12 09:13:11 g2ps:87] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2022-06-12 09:13:11 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: ./fastpitch_train.json
      sample_rate: 22050
      sup_data_path: ./fastpitch_sup_data
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null
      min_duration: 0.1
      ignore_file: null
  

[NeMo I 2022-06-12 09:13:11 features:200] PADDING: 1


FastPitchModel(
  (mel_loss): MelLoss()
  (pitch_loss): PitchLoss()
  (duration_loss): DurationLoss()
  (aligner): AlignmentEncoder(
    (softmax): Softmax(dim=3)
    (log_softmax): LogSoftmax(dim=3)
    (key_proj): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(384, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      )
      (1): ReLU()
      (2): ConvNorm(
        (conv): Conv1d(768, 80, kernel_size=(1,), stride=(1,))
      )
    )
    (query_proj): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
      )
      (1): ReLU()
      (2): ConvNorm(
        (conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
      )
      (3): ReLU()
      (4): ConvNorm(
        (conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
      )
    )
  )
  (forward_sum_loss): ForwardSumLoss(
    (log_softmax): LogSoftmax(dim=3)
    (ctc_loss): CTCLoss()
  )
  (bin_loss): BinLoss()
  (preprocessor): AudioToMelSpectrogramPreprocessor(
  

In [44]:
vocoder = HifiGanModel.from_pretrained("tts_hifigan")
vocoder = vocoder.eval().cuda()

#getting the pre-trained model for hifigan
nemo_files = [p for p in Path(f"{home_path}/.cache/torch/NeMo/").glob("**/tts_hifigan.nemo")]
print(f"Copying {nemo_files[0]} to ./")
Path("./tts_hifigan.nemo").write_bytes(nemo_files[0].read_bytes())

[NeMo I 2022-06-12 09:13:40 cloud:66] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_hifigan/versions/1.0.0rc1/files/tts_hifigan.nemo to /root/.cache/torch/NeMo/NeMo_1.10.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
100% [......................................................................] 315386678 / 315386678[NeMo I 2022-06-12 09:13:49 common:789] Instantiating model from pre-trained checkpoint


[NeMo W 2022-06-12 09:13:52 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2022-06-12 09:13:52 modelPT:156] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2022-06-12 09:13:52 features:200] PADDING: 0


[NeMo W 2022-06-12 09:13:52 features:178] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2022-06-12 09:13:52 features:200] PADDING: 0
[NeMo I 2022-06-12 09:13:53 save_restore_connector:243] Model HifiGanModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.10.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
Copying /root/.cache/torch/NeMo/NeMo_1.10.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo to ./


315386678

In [45]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """
    Synthesizes spectrogram and audio from a text string given a spectrogram synthesis and vocoder model.
    
    Args:
        spec_gen_model: Spectrogram generator model (FastPitch in our case)
        vocoder_model: Vocoder model (HiFiGAN in our case)
        str_input: Text input for the synthesis
        speaker: Speaker ID
    
    Returns:
        spectrogram and waveform of the synthesized audio.
    """
    with torch.no_grad():
        parsed = spec_gen_model.parse(str_input)
        if speaker is not None:
            speaker = torch.tensor([speaker]).long().to(device=spec_gen_model.device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, speaker=speaker)
        audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)
        
    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

In [48]:
spec, audio = infer(spec_model, vocoder,"Hello Alex, how are you doing today ?")
print("\n\n\n\n")
ipd.display(ipd.Audio(audio, rate=22050))
%matplotlib inline
#imshow(spec, origin="lower", aspect="auto")
plt.show()








## Fine tuning HiFiGAN

In [49]:
#helper functions
def load_wav(audio_file, target_sr=None):
    with sf.SoundFile(audio_file, 'r') as f:
        samples = f.read(dtype='float32')
        sample_rate = f.samplerate
        if target_sr is not None and target_sr != sample_rate:
            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return samples.transpose()

In [50]:
from nemo.collections.tts.torch.helpers import BetaBinomialInterpolator
import librosa
import soundfile as sf
import numpy as np

In [51]:
#dataset preparation
# Get records from the training manifest
manifest_path = "./fastpitch_train.json"
records = []
with open(manifest_path, "r") as f:
    for i, line in enumerate(f):
        records.append(json.loads(line))

beta_binomial_interpolator = BetaBinomialInterpolator()
spec_model.eval()

device = spec_model.device

save_dir = Path("./train_mels")
save_dir.mkdir(exist_ok=True, parents=True)

# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
for i, r in enumerate(records):
    audio = load_wav(r["audio_filepath"])
    audio = torch.from_numpy(audio).unsqueeze(0).to(device)
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
    
        
    with torch.no_grad():
        if "normalized_text" in r:
            text = spec_model.parse(r["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(r['text'])
        
        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
    
        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

        # Generate attention prior and spectrogram inputs for HiFi-GAN
        attn_prior = torch.from_numpy(
          beta_binomial_interpolator(spect_len.item(), text_len.item())
        ).unsqueeze(0).to(text.device)
            
        spectrogram = spec_model.forward(
          text=text, 
          input_lens=text_len, 
          spec=spect, 
          mel_lens=spect_len, 
          attn_prior=attn_prior,
        )[0]
        
        save_path = save_dir / f"mel_{i}.npy"
        np.save(save_path, spectrogram[0].to('cpu').numpy())
        r["mel_filepath"] = str(save_path)

hifigan_manifest_path = "hifigan_train_ft.json"
with open(hifigan_manifest_path, "w") as f:
    for r in records:
        f.write(json.dumps(r) + '\n')

In [52]:
# Get records from the training manifest
manifest_path_validation = "./fastpitch_val.json"
records_val = []
with open(manifest_path_validation, "r") as f:
    for i, line in enumerate(f):
        records_val.append(json.loads(line))

save_dir_val = Path("./dev_mels")
save_dir_val.mkdir(exist_ok=True, parents=True)

# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
for i, r in enumerate(records_val):
    audio = load_wav(r["audio_filepath"])
    audio = torch.from_numpy(audio).unsqueeze(0).to(device)
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
    
       
    with torch.no_grad():
        if "normalized_text" in r:
            text = spec_model.parse(r["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(r['text'])
        
        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
    
        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

        # Generate attention prior and spectrogram inputs for HiFi-GAN
        attn_prior = torch.from_numpy(
          beta_binomial_interpolator(spect_len.item(), text_len.item())
        ).unsqueeze(0).to(text.device)
            
        spectrogram = spec_model.forward(
          text=text, 
          input_lens=text_len, 
          spec=spect, 
          mel_lens=spect_len, 
          attn_prior=attn_prior,
        )[0]
        
        save_path_val = save_dir_val / f"mel_{i}.npy"
        np.save(save_path_val, spectrogram[0].to('cpu').numpy())
        r["mel_filepath"] = str(save_path_val)

hifigan_val_manifest_path = "hifigan_val_ft.json"
with open(hifigan_val_manifest_path, "w") as f:
    for r in records_val:
        f.write(json.dumps(r) + '\n')

In [53]:
! mv hifigan.zip -t ./conf/
! cd conf && unzip hifigan.zip && cd ..
! wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/hifigan_finetune.py
! cd conf && cd hifigan && wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/hifigan/hifigan.yaml && cd .. && cd ..

Archive:  hifigan.zip
   creating: hifigan/
   creating: hifigan/model/
   creating: hifigan/model/validation_ds/
 extracting: hifigan/model/validation_ds/val_ds.yaml  
 extracting: hifigan/model/validation_ds/val_ds_finetune.yaml  
   creating: hifigan/model/train_ds/
 extracting: hifigan/model/train_ds/train_ds.yaml  
 extracting: hifigan/model/train_ds/train_ds_finetune.yaml  
   creating: hifigan/model/generator/
 extracting: hifigan/model/generator/v1.yaml  
 extracting: hifigan/model/generator/v1_44100.yaml  
 extracting: hifigan/model/generator/v2.yaml  
 extracting: hifigan/model/generator/v3.yaml  
 extracting: hifigan/hifigan.yaml    
 extracting: hifigan/hifigan_44100.yaml  
--2022-06-12 09:25:48--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/hifigan_finetune.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.19

In [54]:
del spec_model
del vocoder

In [58]:
#finetuning the hifigan model
!(python hifigan_finetune.py \
--config-name=hifigan.yaml \
model.train_ds.dataloader_params.batch_size=16 \
model.max_steps=1000 \
model.optim.lr=0.0002 \
~model.optim.sched \
train_dataset=./hifigan_train_ft.json \
validation_datasets=./hifigan_val_ft.json \
exp_manager.exp_dir=trixie_logs_fp \
+init_from_nemo_model=tts_hifigan.nemo \
trainer.check_val_every_n_epoch=2 \
)

[NeMo W 2022-06-12 09:55:15 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2022-06-12 09:55:15 experimental:28] Module <class 'nemo.collections.tts.torch.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[NeMo I 2022-06-12 09:55:16 exp_manager:287] Experiments will be logged at trixie_logs_fp/HifiGan/2022-06-12_09-55-16
[NeMo I 2022-06-12 09:55:16 exp_manager:661] TensorboardLogger has been set up
      rank_zero_deprecation("`Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8.")
    
[NeMo W 2022-06-12 09:55:16 exp_manager:896] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 1000. Please ensure that max_steps will run for at least 2 epochs t

In [59]:
print("DONE")

DONE


In [75]:
!cp ./trixie_logs_fp/FastPitch/2022-06-12_08-12-40/checkpoints/FastPitch--v_loss=2.4638-epoch=24-last.ckpt -t /home/deepankaracharyya/TTS-exp/Model-checkpoints/Iteration-01_Trixie_Whole_Data
!cp ./trixie_logs_fp/HifiGan/2022-06-12_09-55-16/checkpoints/HifiGan--val_loss=0.4848-epoch=7-last.ckpt -t  /home/deepankaracharyya/TTS-exp/Model-checkpoints/Iteration-01_Trixie_Whole_Data

In [64]:
FP_chck_pt = "./trixie_logs_fp/FastPitch/2022-06-12_08-12-40/checkpoints/FastPitch--v_loss=2.4638-epoch=24-last.ckpt"
spec_model = FastPitchModel.load_from_checkpoint(FP_chck_pt)
spec_model.eval().cuda()

HFG_chck_pt = "./trixie_logs_fp/HifiGan/2022-06-12_09-55-16/checkpoints/HifiGan--val_loss=0.4848-epoch=7-last.ckpt"
vocoder = HifiGanModel.load_from_checkpoint(HFG_chck_pt)
vocoder = vocoder.eval().cuda()

[NeMo I 2022-06-12 10:17:27 tokenize_and_classify:87] Creating ClassifyFst grammars.


[NeMo W 2022-06-12 10:18:02 g2ps:87] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2022-06-12 10:18:02 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: ./fastpitch_train.json
      sample_rate: 22050
      sup_data_path: ./fastpitch_sup_data
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null
      min_duration: 0.1
      ignore_file: null
  

[NeMo I 2022-06-12 10:18:02 features:200] PADDING: 1


[NeMo W 2022-06-12 10:18:03 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.VocoderDataset
      manifest_filepath: ./hifigan_train_ft.json
      sample_rate: 22050
      n_segments: 8192
      max_duration: null
      min_duration: 0.75
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 16
      num_workers: 4
    
[NeMo W 2022-06-12 10:18:03 modelPT:156] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.VocoderDataset
      manifest_filepath: ./hifigan_val_ft.json
      sample_rate: 22050
      n_s

[NeMo I 2022-06-12 10:18:03 features:200] PADDING: 0
[NeMo I 2022-06-12 10:18:03 features:208] STFT using exact pad
[NeMo I 2022-06-12 10:18:03 features:200] PADDING: 0
[NeMo I 2022-06-12 10:18:03 features:208] STFT using exact pad


In [68]:
def custom_infer(transcript, spec_model, vocoder):
  spec, audio = infer(spec_model, vocoder, transcript)
  print("\n\n\n\n")
  ipd.display(ipd.Audio(audio, rate=22050))
  %matplotlib inline
  plt.show()

In [70]:
custom_infer("Its a wonderful day to train some models", spec_model, vocoder)








In [72]:
!pwd

/home/deepankaracharyya/TTS-exp/Dataset/audio
