In [27]:
import json
import os
import subprocess
import sys
from pathlib import Path
from typing import List

import IPython.display as ipd
import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
from scipy.io import wavfile

In [31]:
# Python wrapper that runs the given Python script with the given input args
# through the IPython shell escape.
# NOTE(review): a second `run_script` is defined in the next cell; when the cells
# are run top to bottom, that later definition replaces this one.
def run_script(script, args):
    """Print and execute ``python <script> <args...>`` in a shell.

    Args:
        script: Path of the Python script to run.
        args: Sequence of command-line argument strings.
    """
    # Join the arguments so that each one is preceded by a space and a backslash.
    args = ' \\'.join(args)
    cmd = f"python {script} \\{args}"

    # Show the command with one argument per line.
    print(cmd.replace(" \\", "\n"))
    print()
    # IPython-only syntax: expands `cmd` and runs it as a shell command.
    !$cmd

In [32]:
def run_script(script, args:List[str]):
    """Run a Python script in a subprocess and fail loudly if it exits with an error.

    Args:
        script: Path (``str`` or ``Path``) of the Python script to execute.
        args: Command-line arguments for the script, one list item per argument.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    # Use the kernel's own interpreter instead of whichever "python" is first on PATH,
    # so the script runs in the same environment as the notebook.
    cmd = [sys.executable, str(script)] + [str(arg) for arg in args]
    # check=True: stop here instead of silently carrying on after a failed step.
    subprocess.run(cmd, check=True)

In [3]:
# Root of the local NeMo checkout. Set the NEMO_ROOT_DIR environment variable to run
# the notebook on another machine; the previously hard-coded location is the default.
NEMO_ROOT_DIR = Path(os.environ.get('NEMO_ROOT_DIR', '/home/antonio/Documents/Mestrado/NeMo'))
# Dataset locations (created by the next cell when missing).
DATA_ROOT = NEMO_ROOT_DIR/'data'
DATA_DIR = DATA_ROOT/'cml-portuguese'
AUDIO_DIR = DATA_DIR/'audio'

# Locations inside the NeMo repository used by the TTS recipes.
NEMO_DIR = NEMO_ROOT_DIR
NEMO_EXAMPLES_DIR = NEMO_DIR / "examples" / "tts"
NEMO_CONFIG_DIR = NEMO_EXAMPLES_DIR / "conf"
NEMO_SCRIPT_DIR = NEMO_DIR / "scripts" / "dataset_processing" / "tts"

print(
    NEMO_DIR,
    NEMO_EXAMPLES_DIR,
    NEMO_CONFIG_DIR,
    NEMO_SCRIPT_DIR,
    sep='\n'
)

# Fail early and name exactly which required directory is missing.
missing_paths = [
    str(path)
    for path in (NEMO_DIR, NEMO_EXAMPLES_DIR, NEMO_CONFIG_DIR, NEMO_SCRIPT_DIR)
    if not path.exists()
]
assert not missing_paths, f"Required paths do not exist: {missing_paths}"

/home/antonio/Documents/Mestrado/NeMo
/home/antonio/Documents/Mestrado/NeMo/examples/tts
/home/antonio/Documents/Mestrado/NeMo/examples/tts/conf
/home/antonio/Documents/Mestrado/NeMo/scripts/dataset_processing/tts


In [4]:
# Create the data directories. `exist_ok=True` already makes `mkdir` a no-op for an
# existing directory, so no separate `exists()` check is needed; a path that exists
# but is not a directory now raises instead of being skipped silently.
for directory in (DATA_ROOT, DATA_DIR, AUDIO_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [5]:
# Load the "portuguese" configuration of the `ylacombe/cml-tts` dataset.
cml_ds = load_dataset("ylacombe/cml-tts", "portuguese")

Resolving data files:   0%|          | 0/373 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

In [6]:
# TODO: confirm which sample dtype is expected (e.g. `int16`); the array is
# written exactly as received, no conversion happens here.
def ndarray_to_wav(array:np.ndarray, filename:str, filepath:Path, sample_rate:int) -> None:
    """Save an audio array as a WAV file.

    Args:
        array: Audio samples to write.
        filename: Name of the output file, relative to ``filepath``.
        filepath: Directory the file is written into.
        sample_rate: Sampling rate passed to the WAV writer.
    """
    destination = filepath / filename
    wavfile.write(destination, sample_rate, array)

# Creating test manifest

```python
    features: [
        'audio', 'wav_filesize', 'text', 
        'transcript_wav2vec', 'levenshtein', 'duration', 'num_words', 'speaker_id'],

    DatasetDict({
        train: Dataset({
            features,
            num_rows: 34265
        })
        dev: Dataset({
            features,
            num_rows: 1134
        })
        test: Dataset({
            features,
            num_rows: 1297
        })
    })
```

In [10]:
def create_manifest(dataset:DatasetDict, split:str, max_samples:int=10):
    """Export the audio of one dataset split to WAV files and write its manifest.

    Every exported sample is written into ``AUDIO_DIR`` and described by one JSON
    line in ``DATA_DIR/<split>.json`` with the keys ``audio_filepath``,
    ``duration``, ``text`` (lower-cased) and ``speaker``.

    Args:
        dataset: Dataset dictionary containing ``split``.
        split: Name of the split to export (e.g. ``'train'`` or ``'dev'``).
        max_samples: Maximum number of leading rows to export. Defaults to 10,
            the previously hard-coded value; larger values are clamped to the
            split size, so a big number exports the whole split.
    """
    split_ds = dataset[split]
    # Clamp so that splits with fewer rows than requested do not raise.
    num_samples = min(max_samples, len(split_ds))

    # Explicit UTF-8: `ensure_ascii=False` writes accented characters verbatim, so the
    # file must not depend on the platform's default encoding.
    with open(DATA_DIR/f'{split}.json', 'w', encoding='utf-8') as f:
        for sample in split_ds.select(range(num_samples)):
            audio = sample['audio']

            ndarray_to_wav(
                array=audio['array'],
                filename=audio['path'],
                filepath=AUDIO_DIR,
                sample_rate=audio['sampling_rate']
            )

            # One manifest entry per line:
            # {"audio_filepath": str, "duration": float, "text": str, "speaker": 225}
            json.dump({
                "audio_filepath": audio['path'],
                "duration": sample['duration'],
                "text": sample['text'].lower(),
                "speaker": sample['speaker_id']
            }, f, ensure_ascii=False)

            f.write('\n')

In [11]:
# Export audio and write the manifests for the train and dev splits.
create_manifest(cml_ds, 'train')
create_manifest(cml_ds, 'dev')
# The test split is not exported yet.
# create_manifest(cml_ds, 'test')

# Manifest Processing

In [13]:
def update_metadata(data_type):
    """Rewrite the ``<data_type>.json`` manifest as ``<data_type>_raw.json``.

    For every entry, each "audio/" occurrence is removed from ``audio_filepath``
    and the speaker ID is prefixed with the dataset name ("cml_").

    Args:
        data_type: Manifest name without extension, e.g. "train" or "dev".
    """
    source_manifest = DATA_DIR / f"{data_type}.json"
    target_manifest = DATA_DIR / f"{data_type}_raw.json"

    manifest_entries = read_manifest(source_manifest)
    for item in manifest_entries:
        # Drop the "audio/" directory component from the stored path.
        stripped_path = item["audio_filepath"].replace("audio/", "")
        # Namespace the speaker ID with the dataset name: 'cml'.
        tagged_speaker = f"cml_{item['speaker']}"

        item["audio_filepath"] = stripped_path
        item["speaker"] = tagged_speaker

    write_manifest(
        output_path=target_manifest,
        target_manifest=manifest_entries,
        ensure_ascii=False,
    )

In [14]:
# Produce dev_raw.json and train_raw.json from the manifests written earlier.
update_metadata("dev")
update_metadata("train")
# The test split is not processed yet.
# update_metadata("test")

# Normalize Text

The script used to normalize the `text` does not support Portuguese, which may be a problem. 

[Reference](https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize.py)

# Audio Processing

Vad configs:

```yaml
_target_: nemo.collections.tts.parts.preprocessing.audio_trimming.VadAudioTrimmer

model_name: "vad_multilingual_marblenet"
vad_sample_rate: 16000
vad_threshold: 0.5
device: "cpu"
speech_frame_threshold: 3
trim_win_length: 4096
trim_hop_length: 1024
pad_seconds: 0.2
```

In [15]:
# Settings for NeMo's `preprocess_audio.py`; consumed by `preprocess_audio` below.
audio_preprocessing_script = NEMO_SCRIPT_DIR / "preprocess_audio.py"

# Directory with raw audio data
input_audio_dir = DATA_DIR / "audio"
# Directory to write preprocessed audio to
output_audio_dir = DATA_DIR / "audio_preprocessed"
# Whether to overwrite existing audio, if it exists in the output directory
overwrite_audio = True
# Whether to overwrite output manifest, if it exists
overwrite_manifest = True
# Number of threads to parallelize audio processing across
num_workers = 4
# Sample rate (Hz) of the processed audio.
# NOTE(review): the original comment spoke of downsampling from 48 kHz; confirm the
# native sample rate of the CML-TTS audio.
output_sample_rate = 44100
# Format of output audio files. Use "flac" to compress to a smaller file size.
output_format = "flac"
# Method for silence trimming. Can use "energy.yaml" or "vad.yaml".
# NOTE(review): the original rationale for VAD referred to VCTK background noise;
# confirm it also applies to this dataset.
trim_config_path = NEMO_CONFIG_DIR / "trim" / "vad.yaml"
# Volume level (0, 1] to normalize audio to
volume_level = 0.95
# Filter out audio shorter than min_duration or longer than max_duration seconds.
# We set these bounds relatively low/high, as we can place stricter limits at training time
min_duration = 0.25
max_duration = 30.0
# Output file with entries that are filtered out based on duration
filter_file = DATA_DIR / "filtered.json"

def preprocess_audio(data_type):
    """Run the audio preprocessing script for one manifest.

    Reads ``DATA_DIR/<data_type>_raw.json`` and asks the script to write
    ``DATA_DIR/<data_type>_manifest.json``; all other options come from the
    module-level settings defined in this cell.

    Args:
        data_type: Manifest name without suffix, e.g. "train" or "dev".
    """
    raw_manifest = DATA_DIR / f"{data_type}_raw.json"
    processed_manifest = DATA_DIR / f"{data_type}_manifest.json"

    cli_args = [
        f"--input_manifest={raw_manifest}",
        f"--output_manifest={processed_manifest}",
        f"--input_audio_dir={input_audio_dir}",
        f"--output_audio_dir={output_audio_dir}",
        f"--num_workers={num_workers}",
        f"--output_sample_rate={output_sample_rate}",
        f"--output_format={output_format}",
        f"--trim_config_path={trim_config_path}",
        f"--volume_level={volume_level}",
        f"--min_duration={min_duration}",
        f"--max_duration={max_duration}",
        f"--filter_file={filter_file}",
    ]

    # Boolean switches are appended only when enabled, manifest flag first.
    optional_flags = (
        ("--overwrite_manifest", overwrite_manifest),
        ("--overwrite_audio", overwrite_audio),
    )
    cli_args.extend(flag for flag, enabled in optional_flags if enabled)

    run_script(audio_preprocessing_script, cli_args)

In [29]:
# Preprocess the audio referenced by train_raw.json and dev_raw.json.
preprocess_audio("train")
preprocess_audio("dev")
# The test split is not processed yet.
# preprocess_audio("test")

/home/antonio/Documents/Mestrado/NeMo/scripts/dataset_processing/tts/preprocess_audio.py
['--input_manifest=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/train_raw.json', '--output_manifest=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/train_manifest.json', '--input_audio_dir=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/audio', '--output_audio_dir=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/audio_preprocessed', '--num_workers=4', '--output_sample_rate=44100', '--output_format=flac', '--trim_config_path=/home/antonio/Documents/Mestrado/NeMo/examples/tts/conf/trim/vad.yaml', '--volume_level=0.95', '--min_duration=0.25', '--max_duration=30.0', '--filter_file=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/filtered.json', '--overwrite_manifest', '--overwrite_audio']
[NeMo I 2025-02-08 21:24:34 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilin

[NeMo W 2025-02-08 21:24:36 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2025-02-08 21:24:36 save_restore_connector:275] Model EncDecClassificationModel was successfully restored from /home/antonio/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.


100%|██████████| 10/10 [00:01<00:00,  5.88it/s]


[NeMo I 2025-02-08 21:24:38 preprocess_audio:262] Duration of original audio: 0.03 hours
[NeMo I 2025-02-08 21:24:38 preprocess_audio:263] Duration of processed audio: 0.03 hours
/home/antonio/Documents/Mestrado/NeMo/scripts/dataset_processing/tts/preprocess_audio.py
['--input_manifest=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/dev_raw.json', '--output_manifest=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/dev_manifest.json', '--input_audio_dir=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/audio', '--output_audio_dir=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/audio_preprocessed', '--num_workers=4', '--output_sample_rate=44100', '--output_format=flac', '--trim_config_path=/home/antonio/Documents/Mestrado/NeMo/examples/tts/conf/trim/vad.yaml', '--volume_level=0.95', '--min_duration=0.25', '--max_duration=30.0', '--filter_file=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/filtered.json', '--overwrite_manifest', '--overwrite

[NeMo W 2025-02-08 21:24:45 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2025-02-08 21:24:45 save_restore_connector:275] Model EncDecClassificationModel was successfully restored from /home/antonio/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.


100%|██████████| 10/10 [00:01<00:00,  9.53it/s]


[NeMo I 2025-02-08 21:24:47 preprocess_audio:262] Duration of original audio: 0.03 hours
[NeMo I 2025-02-08 21:24:47 preprocess_audio:263] Duration of processed audio: 0.03 hours


Validating the process

In [13]:
# Listen to one sample before and after preprocessing.
audio_file = "2961_3118_001027.wav"
audio_filepath = input_audio_dir / audio_file
# Derive the processed file name from `output_format` instead of hard-coding ".flac",
# so this check keeps working when the preprocessing output format is changed.
processed_audio_filepath = output_audio_dir / Path(audio_file).with_suffix(f".{output_format}")

print("Original audio.")
ipd.display(ipd.Audio(audio_filepath))

print("Processed audio.")
ipd.display(ipd.Audio(processed_audio_filepath))

Original audio.


Processed audio.


# Speaker Mapping

In [30]:
# Build the speaker map from the processed train and dev manifests.
speaker_map_script = NEMO_SCRIPT_DIR / "create_speaker_map.py"

train_manifest_filepath = DATA_DIR / "train_manifest.json"
dev_manifest_filepath = DATA_DIR / "dev_manifest.json"
speaker_filepath = DATA_DIR / "speakers.json"

# One --manifest_path option per manifest, followed by the output location.
args = [
    f"--manifest_path={manifest}"
    for manifest in (train_manifest_filepath, dev_manifest_filepath)
]
args.append(f"--speaker_map_path={speaker_filepath}")

run_script(speaker_map_script, args)

/home/antonio/Documents/Mestrado/NeMo/scripts/dataset_processing/tts/create_speaker_map.py
['--manifest_path=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/train_manifest.json', '--manifest_path=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/dev_manifest.json', '--speaker_map_path=/home/antonio/Documents/Mestrado/NeMo/data/cml-portuguese/speakers.json']


# Feature Computation

In [33]:
# Settings for the feature computation script; consumed by `compute_features` below.
feature_script = NEMO_SCRIPT_DIR / "compute_features.py"

sample_rate = 44100

# Feature config file for each supported sampling rate.
feature_config_filename = {
    22050: "feature_22050.yaml",
    44100: "feature_44100.yaml",
}.get(sample_rate)
if feature_config_filename is None:
    raise ValueError(f"Unsupported sampling rate {sample_rate}")

feature_config_path = NEMO_CONFIG_DIR / "feature" / feature_config_filename
# Features are computed from the preprocessed audio and stored next to it.
audio_dir = DATA_DIR / "audio_preprocessed"
feature_dir = DATA_DIR / "features"
num_workers = 4

def compute_features(data_type):
    """Run the feature computation script for ``DATA_DIR/<data_type>_manifest.json``.

    The feature config, audio directory, feature directory and worker count
    come from the module-level settings; existing features are overwritten.

    Args:
        data_type: Manifest name without suffix, e.g. "train" or "dev".
    """
    manifest_path = DATA_DIR / f"{data_type}_manifest.json"

    cli_args = [
        f"--feature_config_path={feature_config_path}",
        f"--manifest_path={manifest_path}",
        f"--audio_dir={audio_dir}",
        f"--feature_dir={feature_dir}",
        f"--num_workers={num_workers}",
        "--overwrite",
    ]

    run_script(feature_script, cli_args)

Statistics:
- Computing features for `200` samples, split between `dev` and `train`, took `5:33 min`. 

In [34]:
# Compute features for the dev and train manifests.
compute_features("dev")
compute_features("train")

[NeMo I 2025-02-08 21:58:59 features:305] PADDING: 1
[NeMo I 2025-02-08 21:59:00 features:305] PADDING: 1
[NeMo I 2025-02-08 21:59:00 features:305] PADDING: 1
Computing: pitch


100%|██████████| 10/10 [00:24<00:00,  2.40s/it]


Computing: energy


100%|██████████| 10/10 [00:00<00:00, 108.01it/s]


[NeMo I 2025-02-08 21:59:41 features:305] PADDING: 1
[NeMo I 2025-02-08 21:59:42 features:305] PADDING: 1
[NeMo I 2025-02-08 21:59:42 features:305] PADDING: 1
Computing: pitch


100%|██████████| 10/10 [00:08<00:00,  1.12it/s]


Computing: energy


100%|██████████| 10/10 [00:00<00:00, 376.81it/s]


# Feature Statistics

In [35]:
# Compute feature statistics over the train and dev manifests.
feature_stats_script = NEMO_SCRIPT_DIR / "compute_feature_stats.py"

# Input manifests and the file the statistics are written to.
train_manifest_filepath = DATA_DIR / "train_manifest.json"
dev_manifest_filepath = DATA_DIR / "dev_manifest.json"
output_stats_path = DATA_DIR / "feature_stats.json"

# NOTE(review): `--audio_dir` and `--feature_dir` are passed twice, presumably one
# per `--manifest_path` (train, dev) — confirm against compute_feature_stats.py.
args = [
    f"--feature_config_path={feature_config_path}",
    f"--manifest_path={train_manifest_filepath}",
    f"--manifest_path={dev_manifest_filepath}",
    f"--audio_dir={audio_dir}",
    f"--audio_dir={audio_dir}",
    f"--feature_dir={feature_dir}",
    f"--feature_dir={feature_dir}",
    f"--stats_path={output_stats_path}",
    f"--overwrite"
]

run_script(feature_stats_script, args)

[NeMo I 2025-02-08 22:01:40 features:305] PADDING: 1
[NeMo I 2025-02-08 22:01:41 features:305] PADDING: 1
[NeMo I 2025-02-08 22:01:41 features:305] PADDING: 1
Found featurizers for ['pitch', 'energy'].


100%|██████████| 10/10 [00:00<00:00, 260.63it/s]
100%|██████████| 10/10 [00:00<00:00, 3058.41it/s]


# HiFi-GAN Training

In [47]:
# Names shared by the HiFi-GAN and FastPitch training cells.
# `dataset_name` is the key used in the `*_ds_meta.<name>` overrides.
dataset_name = "cml"
audio_dir = DATA_DIR / "audio_preprocessed"
train_manifest_filepath = DATA_DIR / "train_manifest.json"
dev_manifest_filepath = DATA_DIR / "dev_manifest.json"

In [58]:
hifigan_training_script = NEMO_EXAMPLES_DIR / "hifigan.py"

# Training length: total steps = epochs * steps_per_epoch.
epochs = 10
steps_per_epoch = 10

sample_rate = 44100

# Directory holding the HiFi-GAN configs, one file per supported sampling rate.
hifigan_config_dir = NEMO_CONFIG_DIR / "hifigan_dataset"

hifigan_config_filename = {
    22050: "hifigan_22050.yaml",
    44100: "hifigan_44100.yaml",
}.get(sample_rate)
if hifigan_config_filename is None:
    raise ValueError(f"Unsupported sampling rate {sample_rate}")

# The run id determines where the experiment is saved under `exp_dir`.
run_id = "test_run"
exp_dir = DATA_DIR / "exps"
hifigan_exp_output_dir = exp_dir / "HifiGan" / run_id
# Passed to the training script as `log_dir`.
hifigan_log_dir = hifigan_exp_output_dir / "logs"

# Larger batches on GPU, small ones on CPU.
accelerator, batch_size = ("gpu", 16) if torch.cuda.is_available() else ("cpu", 2)

# Command-line overrides handed to the training script.
args = [
    f"--config-path={hifigan_config_dir}",
    f"--config-name={hifigan_config_filename}",
    f"max_epochs={epochs}",
    f"weighted_sampling_steps_per_epoch={steps_per_epoch}",
    f"batch_size={batch_size}",
    f"log_dir={hifigan_log_dir}",
    f"exp_manager.exp_dir={exp_dir}",
    f"+exp_manager.version={run_id}",
    f"trainer.accelerator={accelerator}",
    f"+train_ds_meta.{dataset_name}.manifest_path={train_manifest_filepath}",
    f"+train_ds_meta.{dataset_name}.audio_dir={audio_dir}",
    f"+val_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}",
    f"+val_ds_meta.{dataset_name}.audio_dir={audio_dir}",
    f"+log_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}",
    f"+log_ds_meta.{dataset_name}.audio_dir={audio_dir}",
]

In [49]:
# If an error occurs, log the entire stacktrace.
# The variable is set in this process's environment, which subprocesses started
# afterwards inherit by default.
os.environ["HYDRA_FULL_ERROR"] = "1"

In [50]:
# Launch HiFi-GAN training.
# NOTE(review): `args` is reassigned by several cells (speaker map, feature stats,
# HiFi-GAN, FastPitch); run the HiFi-GAN configuration cell right before this one.
run_script(hifigan_training_script, args)

python /home/antonio/NeMo/examples/tts/hifigan.py
--config-path=/home/antonio/NeMo/examples/tts/conf/hifigan_dataset
--config-name=hifigan_22050.yaml
max_epochs=10
weighted_sampling_steps_per_epoch=10
batch_size=16
log_dir=/home/antonio/NeMo/data/cml-portuguese/exps/HifiGan/test_run/logs
exp_manager.exp_dir=/home/antonio/NeMo/data/cml-portuguese/exps
+exp_manager.version=test_run
trainer.accelerator=gpu
+train_ds_meta.cml.manifest_path=/home/antonio/NeMo/data/cml-portuguese/train_manifest.json
+train_ds_meta.cml.audio_dir=/home/antonio/NeMo/data/cml-portuguese/audio_preprocessed
+val_ds_meta.cml.manifest_path=/home/antonio/NeMo/data/cml-portuguese/dev_manifest.json
+val_ds_meta.cml.audio_dir=/home/antonio/NeMo/data/cml-portuguese/audio_preprocessed
+log_ds_meta.cml.manifest_path=/home/antonio/NeMo/data/cml-portuguese/dev_manifest.json
+log_ds_meta.cml.audio_dir=/home/antonio/NeMo/data/cml-portuguese/audio_preprocessed



    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    
[NeMo W 2025-02-06 00:43:14 nemo_logging:361] /opt/conda/envs/nemo/lib/python3.10/site-packages/lightning/fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
    
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[NeMo I 2025-02-06 00:43:14 exp_manager:450] ExpManager schema
[NeMo I 2025-02-06 00:43:14 exp_manager:451] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_

# FastPitch Training 

In [60]:
# NOTE(review): in the visible notebook `dataset_file_dir` is first assigned in the
# FastPitch configuration cell that follows, so on a fresh kernel this inspection
# cell raises NameError unless that cell has been run first.
dataset_file_dir

PosixPath('/home/antonio/NeMo/scripts/tts_dataset_files')

In [59]:
fastpitch_training_script = NEMO_EXAMPLES_DIR / "fastpitch.py"

# Training length: total steps = epochs * steps_per_epoch.
epochs = 10
steps_per_epoch = 10

# NOTE(review): presumably has to match the number of speakers in speakers.json — confirm.
num_speakers = 11
sample_rate = 44100

# Directory holding the FastPitch configs, one file per supported sampling rate.
fastpitch_config_dir = NEMO_CONFIG_DIR / "fastpitch"

fastpitch_config_filename = {
    22050: "fastpitch_22050.yaml",
    44100: "fastpitch_44100.yaml",
}.get(sample_rate)
if fastpitch_config_filename is None:
    raise ValueError(f"Unsupported sampling rate {sample_rate}")

# Text-processing resources shipped with the NeMo repository.
# NOTE(review): the file names suggest English (CMUdict) resources; confirm they
# are appropriate for Portuguese text.
dataset_file_dir = NEMO_DIR / "scripts" / "tts_dataset_files"
phoneme_dict_path = dataset_file_dir / "ipa_cmudict-0.7b_nv23.01.txt"
heteronyms_path = dataset_file_dir / "heteronyms-052722"

# Artifacts produced by the earlier speaker-map and feature cells.
speaker_path = DATA_DIR / "speakers.json"
feature_dir = DATA_DIR / "features"
stats_path = DATA_DIR / "feature_stats.json"

def get_latest_checkpoint(checkpoint_dir):
    """Pick the checkpoint file to load from ``checkpoint_dir``.

    A file whose name ends with ".nemo" wins (the first one encountered);
    otherwise the last encountered file whose name ends with "last.ckpt" is used.

    Args:
        checkpoint_dir: Directory (``Path``) to search; it is listed non-recursively.

    Returns:
        Path of the selected checkpoint file.

    Raises:
        ValueError: If the directory contains neither kind of file.
    """
    candidates = list(checkpoint_dir.iterdir())

    # Preferred: the first ".nemo" file in listing order.
    nemo_files = [path for path in candidates if str(path.name).endswith(".nemo")]
    if nemo_files:
        return nemo_files[0]

    # Fallback: the last "...last.ckpt" file in listing order.
    last_ckpts = [path for path in candidates if str(path.name).endswith("last.ckpt")]
    if last_ckpts:
        return last_ckpts[-1]

    raise ValueError(f"Could not find latest checkpoint in {checkpoint_dir}")

# HiFi-GAN model for generating audio predictions from FastPitch output
vocoder_type = "hifigan"
vocoder_checkpoint_path = get_latest_checkpoint(hifigan_exp_output_dir / "checkpoints")

# The run id determines where the experiment is saved under `exp_dir`.
run_id = "test_run"
exp_dir = DATA_DIR / "exps"
fastpitch_exp_output_dir = exp_dir / "FastPitch" / run_id
fastpitch_log_dir = fastpitch_exp_output_dir / "logs"

if torch.cuda.is_available():
    accelerator="gpu"
    batch_size = 8
else:
    accelerator="cpu"
    batch_size = 4

# Command-line overrides handed to the training script.
args = [
    f"--config-path={fastpitch_config_dir}",
    f"--config-name={fastpitch_config_filename}",
    f"n_speakers={num_speakers}",
    f"speaker_path={speaker_path}",
    f"max_epochs={epochs}",
    f"weighted_sampling_steps_per_epoch={steps_per_epoch}",
    f"phoneme_dict_path={phoneme_dict_path}",
    f"heteronyms_path={heteronyms_path}",
    f"feature_stats_path={stats_path}",
    f"log_dir={fastpitch_log_dir}",
    f"vocoder_type={vocoder_type}",
    # Plain single quotes, no backslashes: `run_script` passes each item to the
    # subprocess as-is (no shell), so backslash-escaped quotes would reach the
    # script literally. The quotes keep the path a single quoted value even when
    # the checkpoint file name contains characters such as "=".
    f"vocoder_checkpoint_path='{vocoder_checkpoint_path}'",
    f"trainer.accelerator={accelerator}",
    f"exp_manager.exp_dir={exp_dir}",
    f"+exp_manager.version={run_id}",
    f"+train_ds_meta.{dataset_name}.manifest_path={train_manifest_filepath}",
    f"+train_ds_meta.{dataset_name}.audio_dir={audio_dir}",
    f"+train_ds_meta.{dataset_name}.feature_dir={feature_dir}",
    f"+val_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}",
    f"+val_ds_meta.{dataset_name}.audio_dir={audio_dir}",
    f"+val_ds_meta.{dataset_name}.feature_dir={feature_dir}",
    f"+log_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}",
    f"+log_ds_meta.{dataset_name}.audio_dir={audio_dir}",
    f"+log_ds_meta.{dataset_name}.feature_dir={feature_dir}"
]

In [56]:
# Launch FastPitch training with the `args` built in the FastPitch configuration cell.
run_script(fastpitch_training_script, args)

python /home/antonio/NeMo/examples/tts/fastpitch.py
--config-path=/home/antonio/NeMo/examples/tts/conf/fastpitch
--config-name=fastpitch_22050.yaml
n_speakers=11
speaker_path=/home/antonio/NeMo/data/cml-portuguese/speakers.json
max_epochs=10
weighted_sampling_steps_per_epoch=10
phoneme_dict_path=/home/antonio/NeMo/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt
heteronyms_path=/home/antonio/NeMo/scripts/tts_dataset_files/heteronyms-052722
feature_stats_path=/home/antonio/NeMo/data/cml-portuguese/feature_stats.json
log_dir=/home/antonio/NeMo/data/cml-portuguese/exps/FastPitch/test_run/logs
vocoder_type=hifigan
vocoder_checkpoint_path=\'/home/antonio/NeMo/data/cml-portuguese/exps/HifiGan/test_run/checkpoints/HifiGan.nemo\'
trainer.accelerator=gpu
exp_manager.exp_dir=/home/antonio/NeMo/data/cml-portuguese/exps
+exp_manager.version=test_run
+train_ds_meta.cml.manifest_path=/home/antonio/NeMo/data/cml-portuguese/train_manifest.json
+train_ds_meta.cml.audio_dir=/home/antonio/NeMo/data