diff --git a/everyvoice/base_cli/helpers.py b/everyvoice/base_cli/helpers.py index a0b75f08..b4d6502f 100644 --- a/everyvoice/base_cli/helpers.py +++ b/everyvoice/base_cli/helpers.py @@ -35,39 +35,26 @@ def load_config_base_command( - name: Enum, model_config: Union[ DFAlignerConfig, EveryVoiceConfig, FastSpeech2Config, HiFiGANConfig ], - configs, # Must include the above in model-specific command config_args: List[str], config_path: Path, ): from everyvoice.utils import update_config_from_cli_args - if config_path: - config = model_config.load_config_from_path(config_path) - elif name: - config = model_config.load_config_from_path(configs[name.value]) - else: - logger.error( - "You must either choose a of a preconfigured dataset, or provide a to a preprocessing configuration file." - ) - exit() + config = model_config.load_config_from_path(config_path) config = update_config_from_cli_args(config_args, config) return config def preprocess_base_command( - name: Enum, model_config: Union[ DFAlignerConfig, EveryVoiceConfig, FastSpeech2Config, HiFiGANConfig ], - configs, - steps, - preprocess_categories, + steps: List[str], # Must include the above in model-specific command config_args: List[str], config_path: Path, @@ -78,34 +65,24 @@ def preprocess_base_command( ): from everyvoice.preprocessor import Preprocessor - config = load_config_base_command( - name, model_config, configs, config_args, config_path - ) - to_process = [x.name for x in steps] + config = load_config_base_command(model_config, config_args, config_path) preprocessor = Preprocessor(config) - if not steps: - logger.info( - f"No specific preprocessing data requested, processing everything from dataset '{name}'" - ) - to_process = list(preprocess_categories.__members__.keys()) if isinstance(config, FastSpeech2Config) and config.model.use_phonological_feats: - to_process.append("pfs") + steps.append("pfs") preprocessor.preprocess( output_path=output_path, cpus=cpus, overwrite=overwrite, - to_process=to_process, + to_process=steps, debug=debug, ) - return preprocessor, config, to_process + return preprocessor, config, steps def train_base_command( - name: Enum, model_config: Union[ DFAlignerConfig, EveryVoiceConfig, FastSpeech2Config, HiFiGANConfig ], - configs, data_module: Union[ AlignerDataModule, E2EDataModule, FastSpeech2DataModule, HiFiGANDataModule ], @@ -119,9 +96,7 @@ def train_base_command( nodes: int, strategy: str, ): - config = load_config_base_command( - name, model_config, configs, config_args, config_path - ) + config = load_config_base_command(model_config, config_args, config_path) logger.info("Loading modules for training...") pbar = tqdm(range(4)) pbar.set_description("Loading pytorch and friends") diff --git a/everyvoice/base_cli/interfaces.py b/everyvoice/base_cli/interfaces.py index c835ff8e..9d3b48d9 100644 --- a/everyvoice/base_cli/interfaces.py +++ b/everyvoice/base_cli/interfaces.py @@ -21,12 +21,28 @@ def load_config_base_command_interface( def preprocess_base_command_interface( - config_args: List[str] = typer.Option(None, "--config-args", "-c"), - config_path: Path = typer.Option( - None, "--config-path", "-p", exists=True, dir_okay=False, file_okay=True + config_path: Path = typer.Argument( + ..., + exists=True, + dir_okay=False, + file_okay=True, + help="The path to your model configuration file.", + ), + config_args: List[str] = typer.Option( + None, "-c", "--config-args", help="Overwrite the configuration" + ), + output_path: Optional[Path] = typer.Option( + "filelist.psv", + "-o", + "--output", + help="The path to where the processed data filelist should be written", + ), + cpus: Optional[int] = typer.Option( + min(4, mp.cpu_count()), + "-C", + "--cpus", + help="How many CPUs to use when preprocessing", ), - output_path: Optional[Path] = typer.Option("filelist.psv", "-o", "--output"), - cpus: Optional[int] = typer.Option(min(4, mp.cpu_count()), "-C", "--cpus"), overwrite: bool = typer.Option(False, "-O", "--overwrite"), debug: bool = typer.Option(False, "-D", "--debug"), ): @@ -34,9 +50,15 @@ def preprocess_base_command_interface( def train_base_command_interface( - config_args: List[str] = typer.Option(None, "--config", "-c"), - config_path: Path = typer.Option( - None, "--config-path", "-p", exists=True, dir_okay=False, file_okay=True + config_path: Path = typer.Argument( + ..., + exists=True, + dir_okay=False, + file_okay=True, + help="The path to your model configuration file.", + ), + config_args: List[str] = typer.Option( + None, "-c", "--config-args", help="Overwrite the configuration" ), accelerator: str = typer.Option( "auto", diff --git a/everyvoice/cli.py b/everyvoice/cli.py index 27f1ef56..a864afbd 100644 --- a/everyvoice/cli.py +++ b/everyvoice/cli.py @@ -2,7 +2,6 @@ import typer -from everyvoice.config import CONFIGS from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.cli import ( preprocess as preprocess_fs2, ) @@ -101,7 +100,7 @@ def new_dataset(): This command will preprocess all of the data you need for use with EveryVoice. - By default every step of the preprocessor will be done, but you can run specific commands by adding them as arguments for example: **everyvoice preprocess energy pitch** + By default every step of the preprocessor will be done, but you can run specific commands by adding them as options for example: **everyvoice preprocess path/to/config.yaml -s energy -s pitch** """, )(preprocess_fs2) @@ -145,22 +144,18 @@ def new_dataset(): help=""" # Synthesize Help - - **text-to-spec** --- this is the most common model to run + - **text-to-spec** --- this is the most common model to run for performing normal speech synthesis. - - **spec-to-wav** --- this is the model that turns your spectral features into audio. It is also known as a 'vocoder'. You will typically not need to train your own version. Please refer to [https://pathtocheckpoints](https://pathtocheckpoints) for more information. + - **spec-to-wav** --- this is the model that turns your spectral features into audio. this type of synthesis is also known as copy synthesis and unless you know what you are doing, you probably don't want to do this. """, ) synthesize_group.command( name="text-to-wav", - short_help="Given some text and a trained model, generate some audio", - help="Given some text and a trained model, generate some audio.", )(synthesize_fs2) synthesize_group.command( name="spec-to-wav", - short_help="Given some Mel spectrograms and a trained model, generate some audio", - help="Given some Mel spectrograms and a trained model, generate some audio.", )(synthesize_hfg) app.add_typer( @@ -170,11 +165,6 @@ def new_dataset(): ) -_config_keys = {k: k for k in CONFIGS.keys()} - -CONFIGS_ENUM = Enum("CONFIGS", _config_keys) # type: ignore - - class TestSuites(str, Enum): all = "all" configs = "configs" diff --git a/everyvoice/config/__init__.py b/everyvoice/config/__init__.py index ed4386eb..e69de29b 100644 --- a/everyvoice/config/__init__.py +++ b/everyvoice/config/__init__.py @@ -1,16 +0,0 @@ -from pathlib import Path -from typing import Dict - -from everyvoice.config import __file__ as everyvoice_file - -CONFIGS: Dict[str, Path] = { - "base": Path(everyvoice_file).parent / "base" / "base_composed.yaml", - "lj": Path(everyvoice_file).parent / "lj" / "lj.yaml", - "istft": Path(everyvoice_file).parent / "lj" / "lj_istft.yaml", - "openslr": Path(everyvoice_file).parent / "openslr" / "openslr.yaml", -} - - -class ConfigError(Exception): - def __init__(self, msg): - super().__init__(msg) diff --git a/everyvoice/config/base/base_composed.yaml b/everyvoice/config/base/base_composed.yaml deleted file mode 100644 index 69832164..00000000 --- a/everyvoice/config/base/base_composed.yaml +++ /dev/null @@ -1,4 +0,0 @@ -aligner: "./config/default/aligner.yaml" -feature_prediction: "./config/default/fastspeech2.yaml" -vocoder: "./config/default/hifigan.yaml" -training: "./model/e2e/config/training.yaml" diff --git a/everyvoice/config/base/base_shared.yaml b/everyvoice/config/base/base_shared.yaml deleted file mode 100644 index 923107e3..00000000 --- a/everyvoice/config/base/base_shared.yaml +++ /dev/null @@ -1,185 +0,0 @@ -aligner: - model: - lstm_dim: 512 - conv_dim: 512 - training: - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - name: adamw - binned_sampler: True - plot_steps: 1000 - extraction_method: "beam" - batch_size: 32 - save_top_k_ckpts: 3 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 - preprocessing: "./config/default/preprocessing.yaml" - text: "./config/default/text_eng.yaml" -feature_prediction: - model: - encoder: - layers: 4 - heads: 2 - hidden_dim: 256 - feedforward_dim: 1024 - conv_filter_size: 1024 - conv_kernel_size: 9 - dropout: 0.2 - depthwise: True - conformer: True - decoder: - layers: 6 - heads: 2 - hidden_dim: 256 - feedforward_dim: 1024 - conv_filter_size: 1024 - conv_kernel_size: 9 - dropout: 0.2 - depthwise: True - conformer: True - variance_adaptor: - variance_predictors: - pitch: - level: "phone" - transform: "none" - loss: "mse" - n_layers: 5 - loss_weights: 5e-2 - kernel_size: 3 - dropout: 0.5 - hidden_dim: 256 - n_bins: 256 - depthwise: True - energy: - level: "phone" - transform: "none" - loss: "mse" - n_layers: 5 - loss_weights: 5e-2 - kernel_size: 3 - dropout: 0.5 - hidden_dim: 256 - n_bins: 256 - depthwise: True - duration: - transform: "none" - loss: "mse" - n_layers: 5 - loss_weights: 5e-2 - kernel_size: 3 - dropout: 0.5 - hidden_dim: 256 - n_bins: 256 - depthwise: True - learn_alignment: False - max_length: 1000 - mel_loss: "mse" - mel_loss_weight: 5e-1 - phonological_feats_size: 38 - use_phonological_feats: False - use_postnet: True - multilingual: True - multispeaker: - embedding_type: "id" - every_layer: False - dvector_gmm: False - training: - use_weighted_sampler: False - freeze_layers: - all_layers: False - encoder: False - decoder: False - postnet: False - variance: - energy: False - duration: False - pitch: False - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - warmup_steps: 4000 - early_stopping: - metric: "none" - patience: 4 - tf: - ratio: 1.0 - linear_schedule: False - linear_schedule_start: 0 - linear_schedule_end: 20 - linear_schedule_end_ratio: 0.0 - batch_size: 16 - save_top_k_ckpts: 5 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 - preprocessing: "./config/default/preprocessing.yaml" - text: "./config/default/text_eng.yaml" -vocoder: - model: - resblock: "1" - upsample_rates: [8, 8, 2, 2] - upsample_kernel_sizes: [16, 16, 4, 4] - upsample_initial_channel: 512 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - depthwise_separable_convolutions: - generator: False - activation_function: "everyvoice.utils.original_hifigan_leaky_relu" - istft_layer: False - training: - generator_warmup_steps: 0 - gan_type: "original" - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - name: adamw - wgan_clip_value: 0.01 - use_weighted_sampler: False - batch_size: 16 - save_top_k_ckpts: 5 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 - preprocessing: "./config/default/preprocessing.yaml" -training: "./model/e2e/config/training.yaml" diff --git a/everyvoice/config/default/aligner.yaml b/everyvoice/config/default/aligner.yaml deleted file mode 100644 index 1c9f73e9..00000000 --- a/everyvoice/config/default/aligner.yaml +++ /dev/null @@ -1,31 +0,0 @@ -model: - lstm_dim: 512 - conv_dim: 512 -training: - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - name: adamw - binned_sampler: True - plot_steps: 1000 - extraction_method: "beam" - batch_size: 32 - save_top_k_ckpts: 3 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 -preprocessing: "./config/default/preprocessing.yaml" -text: "./config/default/text_eng.yaml" diff --git a/everyvoice/config/default/fastspeech2.yaml b/everyvoice/config/default/fastspeech2.yaml deleted file mode 100644 index 8ad0df37..00000000 --- a/everyvoice/config/default/fastspeech2.yaml +++ /dev/null @@ -1,112 +0,0 @@ -model: - encoder: - layers: 4 - heads: 2 - hidden_dim: 256 - feedforward_dim: 1024 - conv_filter_size: 1024 - conv_kernel_size: 9 - dropout: 0.2 - depthwise: True - conformer: True - decoder: - layers: 6 - heads: 2 - hidden_dim: 256 - feedforward_dim: 1024 - conv_filter_size: 1024 - conv_kernel_size: 9 - dropout: 0.2 - depthwise: True - conformer: True - variance_adaptor: - variance_predictors: - pitch: - level: "phone" - transform: "none" - loss: "mse" - n_layers: 5 - loss_weights: 5e-2 - kernel_size: 3 - dropout: 0.5 - hidden_dim: 256 - n_bins: 256 - depthwise: True - energy: - level: "phone" - transform: "none" - loss: "mse" - n_layers: 5 - loss_weights: 5e-2 - kernel_size: 3 - dropout: 0.5 - hidden_dim: 256 - n_bins: 256 - depthwise: True - duration: - transform: "none" - loss: "mse" - n_layers: 5 - loss_weights: 5e-2 - kernel_size: 3 - dropout: 0.5 - hidden_dim: 256 - n_bins: 256 - depthwise: True - learn_alignment: False - max_length: 1000 - mel_loss: "mse" - mel_loss_weight: 1 - phonological_feats_size: 38 - use_phonological_feats: False - use_postnet: True - multilingual: True - multispeaker: - embedding_type: "id" - every_layer: False - dvector_gmm: False -training: - use_weighted_sampler: False - freeze_layers: - all_layers: False - encoder: False - decoder: False - postnet: False - variance: - energy: False - duration: False - pitch: False - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - warmup_steps: 4000 - early_stopping: - metric: "none" - patience: 4 - tf: - ratio: 1.0 - linear_schedule: False - linear_schedule_start: 0 - linear_schedule_end: 20 - linear_schedule_end_ratio: 0.0 - batch_size: 16 - save_top_k_ckpts: 5 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 - vocoder_path: null -preprocessing: "./config/default/preprocessing.yaml" -text: "./config/default/text_eng.yaml" diff --git a/everyvoice/config/default/hifigan.yaml b/everyvoice/config/default/hifigan.yaml deleted file mode 100644 index 30ed591a..00000000 --- a/everyvoice/config/default/hifigan.yaml +++ /dev/null @@ -1,39 +0,0 @@ -model: - resblock: "1" - upsample_rates: [8, 8, 2, 2] - upsample_kernel_sizes: [16, 16, 4, 4] - upsample_initial_channel: 512 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - depthwise_separable_convolutions: - generator: False - activation_function: "everyvoice.utils.original_hifigan_leaky_relu" - istft_layer: False -training: - generator_warmup_steps: 0 - gan_type: "original" - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - name: adamw - wgan_clip_value: 0.01 - use_weighted_sampler: False - batch_size: 16 - save_top_k_ckpts: 5 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 -preprocessing: "./config/default/preprocessing.yaml" diff --git a/everyvoice/config/default/istft.yaml b/everyvoice/config/default/istft.yaml deleted file mode 100644 index 38c2b264..00000000 --- a/everyvoice/config/default/istft.yaml +++ /dev/null @@ -1,39 +0,0 @@ -model: - resblock: "1" - upsample_rates: [8, 8] - upsample_kernel_sizes: [16, 16] - upsample_initial_channel: 512 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - depthwise_separable_convolutions: - generator: False - activation_function: "everyvoice.utils.original_hifigan_leaky_relu" - istft_layer: True -training: - generator_warmup_steps: 0 - gan_type: "original" - optimizer: - learning_rate: 1e-4 - eps: 1e-8 - weight_decay: 0.01 - betas: [0.9, 0.98] - name: adamw - wgan_clip_value: 0.01 - use_weighted_sampler: False - batch_size: 16 - save_top_k_ckpts: 5 - ckpt_steps: null - ckpt_epochs: 1 - max_epochs: 1000 - seed: 1234 - finetune_checkpoint: null - filelist: "./preprocessed/YourDataSet/processed_filelist.psv" - filelist_loader: "everyvoice.utils.generic_dict_loader" - logger: - name: "Base-iSTFT-Experiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "base" - val_data_workers: 0 - train_data_workers: 0 -preprocessing: "./config/default/preprocessing.yaml" diff --git a/everyvoice/config/default/preprocessing.yaml b/everyvoice/config/default/preprocessing.yaml deleted file mode 100644 index 6e8d3c62..00000000 --- a/everyvoice/config/default/preprocessing.yaml +++ /dev/null @@ -1,44 +0,0 @@ -dataset: "YourDataSet" -pitch_type: "pyworld" -pitch_phone_averaging: True -energy_phone_averaging: True -value_separator: "--" -save_dir: "./preprocessed/YourDataSet" -audio: - min_audio_length: 0.25 - max_audio_length: 11.0 - max_wav_value: 32768.0 - norm_db: -3.0 - sil_threshold: 1.0 - sil_duration: 0.1 - input_sampling_rate: 22050 - output_sampling_rate: 22050 - alignment_sampling_rate: 22050 - target_bit_depth: 16 - alignment_bit_depth: 16 - fft_window_frames: 1024 - fft_hop_frames: 256 - f_min: 0 - f_max: 8000 - n_fft: 1024 - n_mels: 80 - spec_type: "mel-librosa" - vocoder_segment_size: 8192 -source_data: - - label: "LJ_TEST" - data_dir: "./data/lj/wavs" - textgrid_dir: "./tests/data/lj/textgrids" - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/lj_test.psv" - sox_effects: - - ["channels", "1"] # convert to mono - - ["rate", "16000"] # resample - - ["norm", "-3.0"] # normalize to -3 dB - - # remove silence throughout the file - - "silence" - - "1" # Above periods silence; ie. allow 1 second of silence at beginning - - "0.1" # Above periods silence duration - - "1.0%" # Above periods silence threshold - - "-1" # See https://linux.die.net/man/1/sox#:~:text=To%20remove%20silence,of%20the%20audio. - - "0.1" # Below periods silence duration - - "1.0%" # Below periods silence threshold diff --git a/everyvoice/config/default/text_eng.yaml b/everyvoice/config/default/text_eng.yaml deleted file mode 100644 index 1100ab30..00000000 --- a/everyvoice/config/default/text_eng.yaml +++ /dev/null @@ -1,9 +0,0 @@ -cleaners: - - everyvoice.utils.lower - - everyvoice.utils.collapse_whitespace - - everyvoice.utils.nfc_normalize -symbols: - silence: [""] - pad: "_" - punctuation: "-';:,.!?¡¿—…\"«»“” " - lowercase_letters: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] diff --git a/everyvoice/config/lj/lj.yaml b/everyvoice/config/lj/lj.yaml deleted file mode 100644 index b958f800..00000000 --- a/everyvoice/config/lj/lj.yaml +++ /dev/null @@ -1,17 +0,0 @@ -aligner: - extend_from: "./config/default/aligner.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" - preprocessing: "./config/lj/preprocessing.yaml" -feature_prediction: - extend_from: "./config/default/fastspeech2.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" -vocoder: - extend_from: "./config/default/hifigan.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" -training: "./model/e2e/config/lj_training.yaml" diff --git a/everyvoice/config/lj/lj_istft.yaml b/everyvoice/config/lj/lj_istft.yaml deleted file mode 100644 index 6c7824e2..00000000 --- a/everyvoice/config/lj/lj_istft.yaml +++ /dev/null @@ -1,17 +0,0 @@ -aligner: - extend_from: "./config/default/aligner.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" - preprocessing: "./config/lj/preprocessing.yaml" -feature_prediction: - extend_from: "./config/default/fastspeech2.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" -vocoder: - extend_from: "./config/default/istft.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" -training: "./model/e2e/config/lj_training.yaml" diff --git a/everyvoice/config/lj/preprocessing.yaml b/everyvoice/config/lj/preprocessing.yaml deleted file mode 100644 index 2c5a25a3..00000000 --- a/everyvoice/config/lj/preprocessing.yaml +++ /dev/null @@ -1,33 +0,0 @@ -dataset: "LJ" -pitch_type: "pyworld" -pitch_phone_averaging: True -energy_phone_averaging: True -value_separator: "--" -save_dir: "./preprocessed/LJ" -audio: - min_audio_length: 0.25 - max_audio_length: 11.0 - max_wav_value: 32768.0 - norm_db: -3.0 - sil_threshold: 1.0 - sil_duration: 0.1 - input_sampling_rate: 22050 - output_sampling_rate: 22050 - alignment_sampling_rate: 22050 - target_bit_depth: 16 - alignment_bit_depth: 16 - fft_window_frames: 1024 - fft_hop_frames: 256 - f_min: 0 - f_max: 8000 - n_fft: 1024 - n_mels: 80 - spec_type: "mel-librosa" - vocoder_segment_size: 8192 -source_data: - - label: "LJ" - data_dir: "./data/lj/wavs" - textgrid_dir: "./data/lj/textgrids" - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/lj_full.psv" - sox_effects: [] diff --git a/everyvoice/config/openslr/openslr.yaml b/everyvoice/config/openslr/openslr.yaml deleted file mode 100644 index 786aafd5..00000000 --- a/everyvoice/config/openslr/openslr.yaml +++ /dev/null @@ -1,22 +0,0 @@ -aligner: - extend_from: "./config/default/aligner.yaml" - override_with: - training: - filelist: "./preprocessed/OpenSLR/processed_filelist.psv" - preprocessing: "./config/openslr/preprocessing.yaml" -feature_prediction: - extend_from: "./config/default/fastspeech2.yaml" - override_with: - training: - filelist: "./preprocessed/OpenSLR/processed_filelist.psv" - preprocessing: "./config/openslr/preprocessing.yaml" -vocoder: - extend_from: "./config/default/istft.yaml" - override_with: - model: - upsample_rates: [8, 8, 2] # because takes 24k inputs and 48k outputs we have to add another upsample layer - upsample_kernel_sizes: [16, 16, 4] - training: - filelist: "./preprocessed/OpenSLR/processed_filelist.psv" - preprocessing: "./config/openslr/preprocessing.yaml" -training: "./model/e2e/config/training.yaml" diff --git a/everyvoice/config/openslr/preprocessing.yaml b/everyvoice/config/openslr/preprocessing.yaml deleted file mode 100644 index 674c8ebd..00000000 --- a/everyvoice/config/openslr/preprocessing.yaml +++ /dev/null @@ -1,51 +0,0 @@ -dataset: "OpenSLR" -pitch_type: "pyworld" -pitch_phone_averaging: True -energy_phone_averaging: True -value_separator: "--" -save_dir: "./preprocessed/OpenSLR" -audio: - min_audio_length: 0.25 - max_audio_length: 11.0 - max_wav_value: 32768.0 - norm_db: -3.0 - sil_threshold: 1.0 - sil_duration: 0.1 - input_sampling_rate: 24000 - output_sampling_rate: 48000 - alignment_sampling_rate: 24000 - target_bit_depth: 16 - alignment_bit_depth: 16 - fft_window_frames: 1024 - fft_hop_frames: 256 - f_min: 0 - f_max: 8000 - n_fft: 1024 - n_mels: 80 - spec_type: "mel-librosa" - vocoder_segment_size: 16384 -source_data: - - label: "afr" - data_dir: "./data/OpenSLR/af_za/za/afr/wavs" - textgrid_dir: - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/af_full.psv" - sox_effects: "./config/openslr/sox_effects.json" - - label: "sso" - data_dir: "./data/OpenSLR/st_za/za/sso/wavs" - textgrid_dir: - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/st_full.psv" - sox_effects: "./config/openslr/sox_effects.json" - - label: "tsn" - data_dir: "./data/OpenSLR/tn_za/za/tsn/wavs" - textgrid_dir: - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/tn_full.psv" - sox_effects: "./config/openslr/sox_effects.json" - - label: "xho" - data_dir: "./data/OpenSLR/xh_za/za/xho/wavs" - textgrid_dir: - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/xh_full.psv" - sox_effects: "./config/openslr/sox_effects.json" diff --git a/everyvoice/config/openslr/sox_effects.json b/everyvoice/config/openslr/sox_effects.json deleted file mode 100644 index cb353e6c..00000000 --- a/everyvoice/config/openslr/sox_effects.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - ["channels", "1"], - ["rate", "16000"], - ["norm", "-3.0"], - [ - "silence", - "1", - "0.1", - "1.0%", - "-1", - "0.1", - "1.0%" - ] -] diff --git a/everyvoice/config/test/preprocessing.yaml b/everyvoice/config/test/preprocessing.yaml deleted file mode 100644 index 54255280..00000000 --- a/everyvoice/config/test/preprocessing.yaml +++ /dev/null @@ -1,44 +0,0 @@ -dataset: "YourDataSet" -pitch_type: "pyworld" -pitch_phone_averaging: True -energy_phone_averaging: True -value_separator: "--" -save_dir: "./preprocessed/YourDataSet" -audio: - min_audio_length: 0.25 - max_audio_length: 11.0 - max_wav_value: 32768.0 - norm_db: -3.0 - sil_threshold: 1.0 - sil_duration: 0.1 - input_sampling_rate: 22050 - output_sampling_rate: 22050 - alignment_sampling_rate: 22050 - target_bit_depth: 16 - alignment_bit_depth: 16 - fft_window_frames: 1024 - fft_hop_frames: 256 - f_min: 0 - f_max: 8000 - n_fft: 1024 - n_mels: 80 - spec_type: "mel-librosa" - vocoder_segment_size: 8192 -source_data: - - label: "LJ_TEST" - data_dir: "./tests/data/lj/wavs" - textgrid_dir: "./tests/data/lj/textgrids" - filelist_loader: "everyvoice.utils.load_lj_metadata_hifigan" # TODO: sort out callables - filelist: "./filelists/lj_test.psv" - sox_effects: - - ["channels", "1"] # convert to mono - - ["rate", "16000"] # resample - - ["norm", "-3.0"] # normalize to -3 dB - - # remove silence throughout the file - - "silence" - - "1" # Above periods silence; ie. allow 1 second of silence at beginning - - "0.1" # Above periods silence duration - - "1.0%" # Above periods silence threshold - - "-1" # See https://linux.die.net/man/1/sox#:~:text=To%20remove%20silence,of%20the%20audio. - - "0.1" # Below periods silence duration - - "1.0%" # Below periods silence threshold diff --git a/everyvoice/exceptions.py b/everyvoice/exceptions.py new file mode 100644 index 00000000..cf1e1475 --- /dev/null +++ b/everyvoice/exceptions.py @@ -0,0 +1,12 @@ +class InvalidConfiguration(Exception): + def __init__(self, msg): + super().__init__(self) + self.msg = msg + + def __str__(self): + return self.msg + + +class ConfigError(Exception): + def __init__(self, msg): + super().__init__(msg) diff --git a/everyvoice/model/aligner/DeepForcedAligner b/everyvoice/model/aligner/DeepForcedAligner index 9efe0b98..9abe9006 160000 --- a/everyvoice/model/aligner/DeepForcedAligner +++ b/everyvoice/model/aligner/DeepForcedAligner @@ -1 +1 @@ -Subproject commit 9efe0b98a8cfb3730d78fb13ef56f4a4e98f0a8d +Subproject commit 9abe900667fc710a664974239c399aa458547f31 diff --git a/everyvoice/model/e2e/cli.py b/everyvoice/model/e2e/cli.py index f7e61cc4..318b2740 100644 --- a/everyvoice/model/e2e/cli.py +++ b/everyvoice/model/e2e/cli.py @@ -1,32 +1,24 @@ -from enum import Enum - import typer from merge_args import merge_args from everyvoice.base_cli.interfaces import train_base_command_interface -from everyvoice.model.e2e.config import CONFIGS, EveryVoiceConfig +from everyvoice.model.e2e.config import EveryVoiceConfig app = typer.Typer( pretty_exceptions_show_locals=False, help="End-to-end training: jointly train the FastSpeech2 and HiFiGAN networks", ) -_config_keys = {k: k for k in CONFIGS.keys()} - -CONFIGS_ENUM = Enum("CONFIGS", _config_keys) # type: ignore - @app.command() @merge_args(train_base_command_interface) -def train(name: CONFIGS_ENUM = typer.Option(None, "--name", "-n"), **kwargs): +def train(**kwargs): from everyvoice.base_cli.helpers import train_base_command from everyvoice.model.e2e.dataset import E2EDataModule from everyvoice.model.e2e.model import EveryVoice train_base_command( - name=name, model_config=EveryVoiceConfig, - configs=CONFIGS, model=EveryVoice, data_module=E2EDataModule, monitor="validation/mel_spec_error", diff --git a/everyvoice/model/e2e/config/__init__.py b/everyvoice/model/e2e/config/__init__.py index 946af118..8568f0e7 100644 --- a/everyvoice/model/e2e/config/__init__.py +++ b/everyvoice/model/e2e/config/__init__.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Dict, Union +from typing import Union from pydantic import Field, FilePath, validator from pydantic.fields import ModelField @@ -8,11 +8,7 @@ from everyvoice.model.aligner.config import AlignerConfig from everyvoice.model.feature_prediction.config import FeaturePredictionConfig from everyvoice.model.vocoder.config import VocoderConfig -from everyvoice.utils import ( - load_config_from_json_or_yaml_path, - rel_path_to_abs_path, - return_configs_from_dir, -) +from everyvoice.utils import load_config_from_json_or_yaml_path, rel_path_to_abs_path class E2ETrainingConfig(BaseTrainingConfig): @@ -43,7 +39,3 @@ def load_config_from_path( """Load a config from a path""" config = load_config_from_json_or_yaml_path(path) return EveryVoiceConfig(**config) - - -CONFIG_DIR = Path(__file__).parent -CONFIGS: Dict[str, Path] = return_configs_from_dir(CONFIG_DIR) diff --git a/everyvoice/model/e2e/config/base.yaml b/everyvoice/model/e2e/config/base.yaml deleted file mode 100644 index 69832164..00000000 --- a/everyvoice/model/e2e/config/base.yaml +++ /dev/null @@ -1,4 +0,0 @@ -aligner: "./config/default/aligner.yaml" -feature_prediction: "./config/default/fastspeech2.yaml" -vocoder: "./config/default/hifigan.yaml" -training: "./model/e2e/config/training.yaml" diff --git a/everyvoice/model/e2e/config/lj.yaml b/everyvoice/model/e2e/config/lj.yaml deleted file mode 100644 index 86b52f46..00000000 --- a/everyvoice/model/e2e/config/lj.yaml +++ /dev/null @@ -1,13 +0,0 @@ -aligner: - extend_from: "./config/default/aligner.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" - preprocessing: "./config/lj/preprocessing.yaml" -feature_prediction: "./model/feature_prediction/FastSpeech2_lightning/fs2/config/lj.yaml" -vocoder: - extend_from: "./config/default/istft.yaml" - override_with: - training: - filelist: "./preprocessed/LJ/processed_filelist.psv" -training: "./model/e2e/config/lj_training.yaml" diff --git a/everyvoice/model/e2e/config/lj_training.yaml b/everyvoice/model/e2e/config/lj_training.yaml deleted file mode 100644 index 1baad7dd..00000000 --- a/everyvoice/model/e2e/config/lj_training.yaml +++ /dev/null @@ -1,18 +0,0 @@ -batch_size: 16 -save_top_k_ckpts: 3 -ckpt_steps: null -ckpt_epochs: 1 -max_epochs: 1000 -seed: 1234 -finetune_checkpoint: null -vocoder_checkpoint: "./logs_and_checkpoints/LJ/base-vocoder/checkpoints/last.ckpt" -feature_prediction_checkpoint: "./logs_and_checkpoints/LJ/fastspeech2-base/checkpoints/last.ckpt" -filelist: "./preprocessed/LJ/processed_filelist.psv" -filelist_loader: "everyvoice.utils.generic_dict_loader" -logger: - name: "LJ" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "e2e" -val_data_workers: 0 -train_data_workers: 4 diff --git a/everyvoice/model/e2e/config/training.yaml b/everyvoice/model/e2e/config/training.yaml deleted file mode 100644 index 07107405..00000000 --- a/everyvoice/model/e2e/config/training.yaml +++ /dev/null @@ -1,18 +0,0 @@ -batch_size: 16 -save_top_k_ckpts: 3 -ckpt_steps: null -ckpt_epochs: 1 -max_epochs: 1000 -seed: 1234 -finetune_checkpoint: null -feature_prediction_checkpoint: null -vocoder_checkpoint: null -filelist: "./preprocessed/LJ/processed_filelist.psv" -filelist_loader: "everyvoice.utils.generic_dict_loader" -logger: - name: "BaseExperiment" - save_dir: "./logs_and_checkpoints" - sub_dir: "everyvoice.utils.get_current_time" - version: "e2e" -val_data_workers: 0 -train_data_workers: 4 diff --git a/everyvoice/model/feature_prediction/FastSpeech2_lightning b/everyvoice/model/feature_prediction/FastSpeech2_lightning index 8b9d88f2..87324d4b 160000 --- a/everyvoice/model/feature_prediction/FastSpeech2_lightning +++ b/everyvoice/model/feature_prediction/FastSpeech2_lightning @@ -1 +1 @@ -Subproject commit 8b9d88f2738a6c0725ae0a356d19c38da5c29792 +Subproject commit 87324d4be22a6072ed9ac92ea72591a4642ef9f5 diff --git a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning index 20e1c9d6..b12df5fe 160000 --- a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning +++ b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning @@ -1 +1 @@ -Subproject commit 20e1c9d66d9d9224cb83930bbbadf4aff1f980e4 +Subproject commit b12df5fef312129bbed24da5c3aa03888545ce03 diff --git a/everyvoice/preprocessor/__init__.py b/everyvoice/preprocessor/__init__.py index aab1a310..1d5c0826 100644 --- a/everyvoice/preprocessor/__init__.py +++ b/everyvoice/preprocessor/__init__.py @@ -28,8 +28,8 @@ from torchaudio.sox_effects import apply_effects_tensor from tqdm import tqdm -from everyvoice.config import ConfigError from everyvoice.config.preprocessing_config import PitchCalculationMethod +from everyvoice.exceptions import ConfigError from everyvoice.model.aligner.config import AlignerConfig from everyvoice.model.feature_prediction.config import FeaturePredictionConfig from everyvoice.model.vocoder.config import VocoderConfig diff --git a/everyvoice/tests/test_configs.py b/everyvoice/tests/test_configs.py index 933f5b31..c13ee52c 100755 --- a/everyvoice/tests/test_configs.py +++ b/everyvoice/tests/test_configs.py @@ -1,17 +1,23 @@ #!/usr/bin/env python import json +import tempfile from pathlib import Path from unittest import TestCase, main import yaml +from everyvoice import exceptions from everyvoice.config.preprocessing_config import Dataset, PreprocessingConfig from everyvoice.model.aligner.config import AlignerConfig from everyvoice.model.e2e.config import E2ETrainingConfig, EveryVoiceConfig from everyvoice.model.feature_prediction.config import FeaturePredictionConfig from everyvoice.model.vocoder.config import VocoderConfig -from everyvoice.utils import expand_config_string_syntax, lower +from everyvoice.utils import ( + expand_config_string_syntax, + load_config_from_json_or_yaml_path, + lower, +) class ConfigTest(TestCase): @@ -82,6 +88,12 @@ def test_changes(self): self.assertEqual(self.config.feature_prediction.text.cleaners, [lower]) self.assertEqual(self.config.feature_prediction.text.symbols.pad, "FOO") + def test_load_empty_config(self): + with tempfile.NamedTemporaryFile(prefix="test", mode="w", suffix=".yaml") as tf: + tf.write(" ") + with self.assertRaises(exceptions.InvalidConfiguration): + load_config_from_json_or_yaml_path(Path(tf.name)) + def test_change_with_indices(self): """Text the --config-args can also work with arrays""" config = FeaturePredictionConfig() diff --git a/everyvoice/text/__init__.py b/everyvoice/text/__init__.py index 32f12583..f40ec00a 100644 --- a/everyvoice/text/__init__.py +++ b/everyvoice/text/__init__.py @@ -6,7 +6,7 @@ from loguru import logger from nltk.tokenize import RegexpTokenizer -from everyvoice.config import ConfigError +from everyvoice.exceptions import ConfigError from everyvoice.model.aligner.config import AlignerConfig from everyvoice.model.feature_prediction.config import FeaturePredictionConfig from everyvoice.text.features import get_features diff --git a/everyvoice/utils/__init__.py b/everyvoice/utils/__init__.py index 74bb743b..4d6df2d4 100644 --- a/everyvoice/utils/__init__.py +++ b/everyvoice/utils/__init__.py @@ -15,6 +15,7 @@ from pympi.Praat import TextGrid import everyvoice +from everyvoice import exceptions # Regular expression matching whitespace: _whitespace_re = re.compile(r"\s+") @@ -57,6 +58,8 @@ def load_config_from_json_or_yaml_path(path: Path): raise ValueError(f"Config file '{path}' does not exist") with open(path, "r", encoding="utf8") as f: config = json.load(f) if path.suffix == ".json" else yaml.safe_load(f) + if not config: + raise exceptions.InvalidConfiguration(f"Your configuration at {path} was empty") return config