In [4]:
#imports
import os
import json
import librosa
from tqdm import tqdm

import torch
import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
from pathlib import Path

from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator
import soundfile as sf
import numpy as np

AttributeError: module 'llvmlite.binding.ffi' has no attribute 'register_lock_callback'

In [2]:
#check the performance
fastpitch_path = '/workspace/nemo/vol/TTS_voices/FP_checkpoints/MilitaryMale_MilitaryMaleFastPitch--val_loss=0.9569-epoch=729.ckpt'
spec_model = FastPitchModel.load_from_checkpoint(fastpitch_path)
spec_model.eval().cuda()

[NeMo W 2023-06-16 06:07:38 en_us_arpabet:66] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2023-06-16 06:07:38 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.dataset.TTSDataset
      manifest_filepath: fastpitch_train.json
      sample_rate: 22050
      sup_data_path: fastpitch_sup_data
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null
      min_duration: 0.1
      ignore_file: 

[NeMo I 2023-06-16 06:07:38 features:289] PADDING: 1


FastPitchModel(
  (mel_loss_fn): MelLoss()
  (pitch_loss_fn): PitchLoss()
  (duration_loss_fn): DurationLoss()
  (energy_loss_fn): EnergyLoss()
  (aligner): AlignmentEncoder(
    (cond_input): ConditionalInput()
    (softmax): Softmax(dim=3)
    (log_softmax): LogSoftmax(dim=3)
    (key_proj): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(384, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      )
      (1): ReLU()
      (2): ConvNorm(
        (conv): Conv1d(768, 80, kernel_size=(1,), stride=(1,))
      )
    )
    (query_proj): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
      )
      (1): ReLU()
      (2): ConvNorm(
        (conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
      )
      (3): ReLU()
      (4): ConvNorm(
        (conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
      )
    )
  )
  (forward_sum_loss_fn): ForwardSumLoss(
    (log_softmax): LogSoftmax(dim=-1)
    (ctc_loss): CTCLoss()


In [3]:
#helper functions
def load_wav(audio_file, target_sr=None):
    with sf.SoundFile(audio_file, 'r') as f:
        samples = f.read(dtype='float32')
        sample_rate = f.samplerate
        if target_sr is not None and target_sr != sample_rate:
            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return samples.transpose()

In [4]:
from tqdm import tqdm

In [7]:
import random

voice = "military_male"
base_path = f'/workspace/nemo/vol/TTS_voices/chunks/{voice}_processed'
with open(f'/workspace/nemo/vol/TTS_voices/texts/{voice}_train_filelist.txt') as f :
    records = f.readlines()
print("Number of records : ",len(records))

train_manifest = f'./fastpitch_train.json'

train_rec = []
random.shuffle(records)
count = 0

for i in records :  
    i = i.split('|')
    audio_filepath = os.path.join(base_path,i[0])
    text = i[-1].strip('\n')
    if '{' in text :
        continue
    count = count + 1
    duration = librosa.get_duration(filename=audio_filepath)
    r = {
       "audio_filepath" : audio_filepath,
       "text" : text,
       "duration" : round(duration,1),
       "text_no_preprocessing" : text
    }
    train_rec.append(r)

with open(train_manifest, "w") as f:
    for s in train_rec:
        f.write(json.dumps(s) + '\n')
        
print("Training Data : ", len(train_rec))

with open(f'/workspace/nemo/vol/TTS_voices/texts/{voice}_val_filelist.txt') as f :
    records = f.readlines()
print("Number of records : ",len(records))

train_manifest = f'./fastpitch_val.json'

train_rec = []
random.shuffle(records)
count = 0

for i in records :  
    i = i.split('|')
    audio_filepath = os.path.join(base_path,i[0])
    text = i[-1].strip('\n')
    if '{' in text :
        continue
    count = count + 1
    duration = librosa.get_duration(filename=audio_filepath)
    r = {
       "audio_filepath" : audio_filepath,
       "text" : text,
       "duration" : round(duration,1),
       "text_no_preprocessing" : text
    }
    train_rec.append(r)

with open(train_manifest, "w") as f:
    for s in train_rec:
        f.write(json.dumps(s) + '\n')
        
print("Val Data : ", len(train_rec))


Number of records :  216
Training Data :  216
Number of records :  25
Val Data :  25


In [5]:
#dataset preparation
# Get records from the training manifest
manifest_path = "./fastpitch_train.json"
records = []
with open(manifest_path, "r") as f:
    for i, line in enumerate(f):
        records.append(json.loads(line))

beta_binomial_interpolator = BetaBinomialInterpolator()
spec_model.eval()

device = spec_model.device

save_dir = Path("/workspace/nemo/vol/TTS_voices/train_mels/female_child")
save_dir.mkdir(exist_ok=True, parents=True)

# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
for i, r in tqdm(enumerate(records)):
    audio = load_wav(r["audio_filepath"])
    audio = torch.from_numpy(audio).unsqueeze(0).to(device)
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
    
        
    with torch.no_grad():
        if "normalized_text" in r:
            text = spec_model.parse(r["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(r['text'])
        
        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
    
        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

        # Generate attention prior and spectrogram inputs for HiFi-GAN
        attn_prior = torch.from_numpy(
          beta_binomial_interpolator(spect_len.item(), text_len.item())
        ).unsqueeze(0).to(text.device)
            
        spectrogram = spec_model.forward(
          text=text, 
          input_lens=text_len, 
          spec=spect, 
          mel_lens=spect_len, 
          attn_prior=attn_prior,
        )[0]
        
        save_path = save_dir / f"mel_{i}.npy"
        np.save(save_path, spectrogram[0].to('cpu').numpy())
        r["mel_filepath"] = str(save_path)

hifigan_manifest_path = "./hifigan_train_ft.json"
with open(hifigan_manifest_path, "w") as f:
    for r in records:
        f.write(json.dumps(r) + '\n')

3it [00:01,  2.86it/s][NeMo W 2023-06-13 13:55:06 tts_tokenizers:477] Text: [ほ待ってあー] contains unknown char/phoneme: [ほ].Original text: [ほ待ってあー]. Symbol will be skipped.
[NeMo W 2023-06-13 13:55:06 tts_tokenizers:477] Text: [ほ待ってあー] contains unknown char/phoneme: [待].Original text: [ほ待ってあー]. Symbol will be skipped.
[NeMo W 2023-06-13 13:55:06 tts_tokenizers:477] Text: [ほ待ってあー] contains unknown char/phoneme: [っ].Original text: [ほ待ってあー]. Symbol will be skipped.
[NeMo W 2023-06-13 13:55:06 tts_tokenizers:477] Text: [ほ待ってあー] contains unknown char/phoneme: [て].Original text: [ほ待ってあー]. Symbol will be skipped.
[NeMo W 2023-06-13 13:55:06 tts_tokenizers:477] Text: [ほ待ってあー] contains unknown char/phoneme: [あ].Original text: [ほ待ってあー]. Symbol will be skipped.
[NeMo W 2023-06-13 13:55:06 tts_tokenizers:477] Text: [ほ待ってあー] contains unknown char/phoneme: [ー].Original text: [ほ待ってあー]. Symbol will be skipped.
915it [00:31, 29.02it/s]


In [7]:
# Get records from the training manifest
manifest_path_validation = "./fastpitch_val.json"
records_val = []
with open(manifest_path_validation, "r") as f:
    for i, line in enumerate(f):
        records_val.append(json.loads(line))

save_dir_val = Path("/workspace/nemo/vol/TTS_voices/dev_mels/female_child")
save_dir_val.mkdir(exist_ok=True, parents=True)

# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
for i, r in enumerate(records_val):
    audio = load_wav(r["audio_filepath"])
    audio = torch.from_numpy(audio).unsqueeze(0).to(device)
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
    
       
    with torch.no_grad():
        if "normalized_text" in r:
            text = spec_model.parse(r["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(r['text'])
        
        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
    
        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

        # Generate attention prior and spectrogram inputs for HiFi-GAN
        attn_prior = torch.from_numpy(
          beta_binomial_interpolator(spect_len.item(), text_len.item())
        ).unsqueeze(0).to(text.device)
            
        spectrogram = spec_model.forward(
          text=text, 
          input_lens=text_len, 
          spec=spect, 
          mel_lens=spect_len, 
          attn_prior=attn_prior,
        )[0]
        
        save_path_val = save_dir_val / f"mel_{i}.npy"
        np.save(save_path_val, spectrogram[0].to('cpu').numpy())
        r["mel_filepath"] = str(save_path_val)

hifigan_val_manifest_path = "./hifigan_val_ft.json"
with open(hifigan_val_manifest_path, "w") as f:
    for r in records_val:
        f.write(json.dumps(r) + '\n')

In [9]:
! cd conf && unzip hifigan.zip
! cd conf && cd hifigan && wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/hifigan/hifigan.yaml && cd .. 

Archive:  hifigan.zip
   creating: hifigan/
   creating: hifigan/model/
   creating: hifigan/model/validation_ds/
 extracting: hifigan/model/validation_ds/val_ds.yaml  
 extracting: hifigan/model/validation_ds/val_ds_finetune.yaml  
   creating: hifigan/model/train_ds/
 extracting: hifigan/model/train_ds/train_ds.yaml  
 extracting: hifigan/model/train_ds/train_ds_finetune.yaml  
   creating: hifigan/model/generator/
 extracting: hifigan/model/generator/v1.yaml  
 extracting: hifigan/model/generator/v1_44100.yaml  
 extracting: hifigan/model/generator/v2.yaml  
 extracting: hifigan/model/generator/v3.yaml  
 extracting: hifigan/hifigan.yaml    
 extracting: hifigan/hifigan_44100.yaml  
--2023-06-13 14:00:04--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/hifigan/hifigan.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|

In [10]:
from nemo.collections.tts.models import HifiGanModel
vocoder = HifiGanModel.from_pretrained("tts_en_hifigan")
vocoder = vocoder.eval().cuda()

[NeMo I 2023-06-13 14:00:14 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_hifigan/versions/1.0.0rc1/files/tts_hifigan.nemo to /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
100% [......................................................................] 315386678 / 315386678[NeMo I 2023-06-13 14:00:21 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-06-13 14:00:22 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2023-06-13 14:00:22 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2023-06-13 14:00:23 features:291] PADDING: 0


[NeMo W 2023-06-13 14:00:23 features:268] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2023-06-13 14:00:23 features:291] PADDING: 0
[NeMo I 2023-06-13 14:00:24 save_restore_connector:249] Model HifiGanModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


In [11]:
home_path = !(echo $HOME)
home_path = home_path[0]
print(home_path)

nemo_files = [p for p in Path(f"{home_path}/.cache/torch/NeMo/").glob("**/tts_hifigan.nemo")]
print(f"Copying {nemo_files[0]} to ./")
Path("./tts_hifigan.nemo").write_bytes(nemo_files[0].read_bytes())

/root
Copying /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo to ./


315386678

In [12]:
!pip install wandb -qU
import wandb
wandb.login()

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


E0613 14:00:48.160705 140293351618368 jupyter.py:231] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mconvdev[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [13]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mconvdev[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [1]:
wandb_api_key = "108bad0089a140932fbbe1c9e2ae182a1a228ffe"

In [13]:
!wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/hifigan_finetune.py

--2023-06-13 14:01:01--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/hifigan_finetune.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1192 (1.2K) [text/plain]
Saving to: ‘hifigan_finetune.py.1’


2023-06-13 14:01:01 (149 MB/s) - ‘hifigan_finetune.py.1’ saved [1192/1192]



In [None]:
!(HYDRA_FULL_ERROR=1 python hifigan_finetune.py \
--config-name=hifigan.yaml \
model.train_ds.dataloader_params.batch_size=32 \
model.max_steps=100000 \
model.optim.lr=1e-4 \
~model.optim.sched \
train_dataset=hifigan_train_ft.json \
validation_datasets=hifigan_val_ft.json \
exp_manager.exp_dir=hifigan_ft \
exp_manager.create_wandb_logger=true \
exp_manager.wandb_logger_kwargs.name='Child_Female_HF' \
exp_manager.wandb_logger_kwargs.project="TTS_convai"  \
+init_from_pretrained_model=nvidia/tts_hifigan \
trainer.check_val_every_n_epoch=5 \
trainer.log_every_n_steps=3 \
)
# model/train_ds=train_ds_finetune \
# model/validation_ds=val_ds_finetune \

[NeMo W 2023-06-13 14:20:01 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-06-13 14:20:01 experimental:27] Module <class 'nemo.collections.tts.parts.utils.callbacks.LoggingCallback'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-06-13 14:20:01 experimental:27] Module <class 'nemo.collections.tts.models.fastpitch_ssl.FastPitchModel_SSL'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-06-13 14:20:01 experimental:27] Module <class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-06-13 14:20:01 experimental:27] Module <class 'nemo.collections.tts.models.radtts.RadTTSModel'> 

In [None]:
sudo docker run -it -v /mnt/files-storage/:/workspace/nemo/vol/ --net=host -p 8888:8888 --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nemo /bin/bash

In [None]:
 http://216.153.50.131:8888/?token=add9bb6aba138a61bcd40ecd00faa290d575a5477cf4fab7