fix: fix preprocessing and convert bool options to flags, use `unidecode` to decode non-ASCII filenames in `pre-resample` (#147)

BREAKING CHANGE: Preprocessing default parameters changed and some options became flags.
voicepaw · Mar 27, 2023 · 98d7ee2
1 parent 1694f44
commit 98d7ee2
Show file tree

Hide file tree

Showing 9 changed files with 115 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -126,7 +126,7 @@ svc --model-path <model-path> source.wav
 
 #### Local
 
-Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (subfolders are acceptable) and run:
+Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (subfolders and non-ASCII filenames are acceptable) and run:
 
 ```shell
 svc pre-resample

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -62,6 +62,7 @@ cm-time = ">=0.1.2"
 pysimplegui = ">=4.6"
 pebble = ">=5.0"
 torchcrepe = ">=0.0.17"
+unidecode = "^1.3.6"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -141,7 +141,11 @@ def train(config_path: Path, model_path: Path):
     help="f0 prediction method",
 )
 @click.option(
-    "-a", "--auto-predict-f0", type=bool, default=True, help="auto predict f0"
+    "-a/-na",
+    "--auto-predict-f0/--no-auto-predict-f0",
+    type=bool,
+    default=True,
+    help="auto predict f0",
 )
 @click.option(
     "-r", "--cluster-infer-ratio", type=float, default=0, help="cluster infer ratio"
@@ -157,7 +161,11 @@ def train(config_path: Path, model_path: Path):
 )
 @click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
 @click.option(
-    "-ab", "--absolute-thresh", type=bool, default=False, help="absolute thresh"
+    "-ab/-nab",
+    "--absolute-thresh/--no-absolute-thresh",
+    type=bool,
+    default=False,
+    help="absolute thresh",
 )
 def infer(
     # paths
@@ -247,10 +255,10 @@ def infer(
 )
 @click.option("-t", "--transpose", type=int, default=12, help="transpose")
 @click.option(
-    "-a",
-    "--auto-predict-f0",
+    "-a/-na",
+    "--auto-predict-f0/--no-auto-predict-f0",
     type=bool,
-    default=False,
+    default=True,
     help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)",
 )
 @click.option(
@@ -409,8 +417,17 @@ def vc(
     default=-1,
     help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
 )
+@click.option("-d", "--top-db", type=float, default=30, help="top db")
+@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
+@click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
 def pre_resample(
-    input_dir: Path, output_dir: Path, sampling_rate: int, n_jobs: int
+    input_dir: Path,
+    output_dir: Path,
+    sampling_rate: int,
+    n_jobs: int,
+    top_db: int,
+    frame_seconds: float,
+    hop_seconds: float,
 ) -> None:
     """Preprocessing part 1: resample"""
     from .preprocess_resample import preprocess_resample
@@ -422,6 +439,9 @@ def pre_resample(
         output_dir=output_dir,
         sampling_rate=sampling_rate,
         n_jobs=n_jobs,
+        top_db=top_db,
+        frame_seconds=frame_seconds,
+        hop_seconds=hop_seconds,
     )
 
 
@@ -484,15 +504,14 @@ def pre_config(
 )
 @click.option(
     "-n",
-    "--n_jobs",
     "--n-jobs",
     type=int,
     default=4,
     help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
 )
 @click.option(
-    "-f",
-    "--force-rebuild",
+    "-f/-nf",
+    "--force-rebuild/--no-force-rebuild",
     type=bool,
     default=True,
     help="force rebuild existing preprocessed files",

diff --git a/src/so_vits_svc_fork/preprocess_flist_config.py b/src/so_vits_svc_fork/preprocess_flist_config.py
@@ -2,7 +2,6 @@
 
 import json
 import os
-import re
 from copy import deepcopy
 from logging import getLogger
 from pathlib import Path
@@ -36,9 +35,10 @@ def preprocess_config(
         spk_id += 1
         paths = []
         for path in tqdm(list((input_dir / speaker).glob("**/*.wav"))):
-            pattern = re.compile(r"^[\.a-zA-Z0-9_\/]+$")
-            if not pattern.match(path.name):
-                LOG.warning(f"file name {path} contains non-alphanumeric characters.")
+            if not path.name.isascii():
+                LOG.warning(
+                    f"file name {path} contains non-ascii characters. torch.save() and torch.load() may not work."
+                )
             if get_duration(filename=path) < 0.3:
                 LOG.warning(f"skip {path} because it is too short.")
                 continue
@@ -54,21 +54,21 @@ def preprocess_config(
 
     LOG.info(f"Writing {train_list_path}")
     train_list_path.parent.mkdir(parents=True, exist_ok=True)
-    with train_list_path.open("w") as f:
+    with train_list_path.open("w", encoding="utf-8") as f:
         for fname in train:
             wavpath = fname.as_posix()
             f.write(wavpath + "\n")
 
     LOG.info(f"Writing {val_list_path}")
     val_list_path.parent.mkdir(parents=True, exist_ok=True)
-    with val_list_path.open("w") as f:
+    with val_list_path.open("w", encoding="utf-8") as f:
         for fname in val:
             wavpath = fname.as_posix()
             f.write(wavpath + "\n")
 
     LOG.info(f"Writing {test_list_path}")
     test_list_path.parent.mkdir(parents=True, exist_ok=True)
-    with test_list_path.open("w") as f:
+    with test_list_path.open("w", encoding="utf-8") as f:
         for fname in test:
             wavpath = fname.as_posix()
             f.write(wavpath + "\n")
@@ -85,5 +85,5 @@ def preprocess_config(
     config["data"]["validation_files"] = val_list_path.as_posix()
     LOG.info(f"Writing {config_path}")
     config_path.parent.mkdir(parents=True, exist_ok=True)
-    with config_path.open("w") as f:
+    with config_path.open("w", encoding="utf-8") as f:
         json.dump(config, f, indent=2)
diff --git a/src/so_vits_svc_fork/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocess_hubert_f0.py
@@ -12,6 +12,7 @@
 from tqdm import tqdm
 
 from . import utils
+from .preprocess_utils import check_hubert_min_duration
 from .utils import HUBERT_SAMPLING_RATE
 
 LOG = getLogger(__name__)
@@ -26,13 +27,17 @@ def _process_one(
     f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
     force_rebuild: bool = False,
 ):
-    wav, sr = librosa.load(filepath, sr=sampling_rate)
+    audio, sr = librosa.load(filepath, sr=sampling_rate)
+
+    if not check_hubert_min_duration(audio, sr):
+        LOG.info(f"Skip {filepath} because it is too short.")
+        return
 
     # Compute HuBERT content
     soft_path = filepath.parent / (filepath.name + ".soft.pt")
-    if not soft_path.exists() or force_rebuild:
+    if (not soft_path.exists()) or force_rebuild:
         wav16k = librosa.resample(
-            wav, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
+            audio, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
         )
         wav16k = torch.from_numpy(wav16k).to(device)
         c = utils.get_hubert_content(hubert_model, wav_16k_tensor=wav16k)
@@ -42,9 +47,9 @@ def _process_one(
 
     # Compute f0
     f0_path = filepath.parent / (filepath.name + ".f0.npy")
-    if not f0_path.exists() or force_rebuild:
+    if (not f0_path.exists()) or force_rebuild:
         f0 = utils.compute_f0(
-            wav, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method
+            audio, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method
         )
         np.save(f0_path, f0)
     else:

diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocess_resample.py
@@ -9,6 +9,9 @@
 import soundfile
 from joblib import Parallel, delayed
 from tqdm_joblib import tqdm_joblib
+from unidecode import unidecode
+
+from .preprocess_utils import check_hubert_min_duration
 
 LOG = getLogger(__name__)
 
@@ -45,7 +48,15 @@ def is_relative_to(path: Path, *other):
         return False
 
 
-def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None:
+def _preprocess_one(
+    input_path: Path,
+    output_path: Path,
+    sr: int,
+    *,
+    top_db: int,
+    frame_seconds: float,
+    hop_seconds: float,
+) -> None:
     """Preprocess one audio file."""
 
     try:
@@ -57,17 +68,37 @@ def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None:
         LOG.warning(f"Failed to load {input_path} due to {e}")
         return
 
+    if not check_hubert_min_duration(audio, sr):
+        LOG.info(f"Skip {input_path} because it is too short.")
+        return
+
     # Adjust volume
     audio /= max(audio.max(), -audio.min())
 
     # Trim silence
-    audio, _ = librosa.effects.trim(audio, top_db=20)
+    audio, _ = librosa.effects.trim(
+        audio,
+        top_db=top_db,
+        frame_length=int(frame_seconds * sr),
+        hop_length=int(hop_seconds * sr),
+    )
+
+    if not check_hubert_min_duration(audio, sr):
+        LOG.info(f"Skip {input_path} because it is too short.")
+        return
 
     soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")
 
 
 def preprocess_resample(
-    input_dir: Path | str, output_dir: Path | str, sampling_rate: int, n_jobs: int = -1
+    input_dir: Path | str,
+    output_dir: Path | str,
+    sampling_rate: int,
+    n_jobs: int = -1,
+    *,
+    top_db: int = 30,
+    frame_seconds: float = 0.1,
+    hop_seconds: float = 0.05,
 ) -> None:
     input_dir = Path(input_dir)
     output_dir = Path(output_dir)
@@ -92,6 +123,13 @@ def preprocess_resample(
             continue
         speaker_name = in_path_relative.parts[0]
         file_name = in_path_relative.with_suffix(".wav").name
+        new_filename = unidecode(file_name)
+        if new_filename != file_name:
+            LOG.warning(
+                f"Filename {file_name} contains non-ASCII characters. "
+                f"Replaced with {new_filename}."
+            )
+            file_name = new_filename
         out_path = output_dir / speaker_name / file_name
         out_path = _get_unique_filename(out_path, out_paths)
         out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -102,6 +140,12 @@ def preprocess_resample(
 
     with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
         Parallel(n_jobs=n_jobs)(
-            delayed(_preprocess_one)(*args, sr=sampling_rate)
+            delayed(_preprocess_one)(
+                *args,
+                sr=sampling_rate,
+                top_db=top_db,
+                frame_seconds=frame_seconds,
+                hop_seconds=hop_seconds,
+            )
             for args in in_and_out_paths
         )
diff --git a/src/so_vits_svc_fork/preprocess_utils.py b/src/so_vits_svc_fork/preprocess_utils.py
@@ -0,0 +1,5 @@
+from numpy import ndarray
+
+
+def check_hubert_min_duration(audio: ndarray, sr: int) -> bool:
+    return len(audio) / sr >= 0.3
diff --git a/tests/dataset_raw/34j/nested/あ.wav b/tests/dataset_raw/34j/nested/あ.wav