Skip to content

Commit

Permalink
fix: fix preprocessing and convert bool options to flags, use `unidecode` to decode non-ASCII filenames in `pre-resample` (#147)
Browse files Browse the repository at this point in the history

BREAKING CHANGE: Preprocessing default parameters changed and some options became flags.
  • Loading branch information
34j committed Mar 27, 2023
1 parent 1694f44 commit 98d7ee2
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 43 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ svc --model-path <model-path> source.wav

#### Local

Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (subfolders are acceptable) and run:
Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (subfolders and non-ASCII filenames are acceptable) and run:

```shell
svc pre-resample
Expand Down
30 changes: 14 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ cm-time = ">=0.1.2"
pysimplegui = ">=4.6"
pebble = ">=5.0"
torchcrepe = ">=0.0.17"
unidecode = "^1.3.6"

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3"
Expand Down
37 changes: 28 additions & 9 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,11 @@ def train(config_path: Path, model_path: Path):
help="f0 prediction method",
)
@click.option(
"-a", "--auto-predict-f0", type=bool, default=True, help="auto predict f0"
"-a/-na",
"--auto-predict-f0/--no-auto-predict-f0",
type=bool,
default=True,
help="auto predict f0",
)
@click.option(
"-r", "--cluster-infer-ratio", type=float, default=0, help="cluster infer ratio"
Expand All @@ -157,7 +161,11 @@ def train(config_path: Path, model_path: Path):
)
@click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
@click.option(
"-ab", "--absolute-thresh", type=bool, default=False, help="absolute thresh"
"-ab/-nab",
"--absolute-thresh/--no-absolute-thresh",
type=bool,
default=False,
help="absolute thresh",
)
def infer(
# paths
Expand Down Expand Up @@ -247,10 +255,10 @@ def infer(
)
@click.option("-t", "--transpose", type=int, default=12, help="transpose")
@click.option(
"-a",
"--auto-predict-f0",
"-a/-na",
"--auto-predict-f0/--no-auto-predict-f0",
type=bool,
default=False,
default=True,
help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)",
)
@click.option(
Expand Down Expand Up @@ -409,8 +417,17 @@ def vc(
default=-1,
help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
)
@click.option("-d", "--top-db", type=float, default=30, help="top db")
@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
@click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
def pre_resample(
input_dir: Path, output_dir: Path, sampling_rate: int, n_jobs: int
input_dir: Path,
output_dir: Path,
sampling_rate: int,
n_jobs: int,
top_db: int,
frame_seconds: float,
hop_seconds: float,
) -> None:
"""Preprocessing part 1: resample"""
from .preprocess_resample import preprocess_resample
Expand All @@ -422,6 +439,9 @@ def pre_resample(
output_dir=output_dir,
sampling_rate=sampling_rate,
n_jobs=n_jobs,
top_db=top_db,
frame_seconds=frame_seconds,
hop_seconds=hop_seconds,
)


Expand Down Expand Up @@ -484,15 +504,14 @@ def pre_config(
)
@click.option(
"-n",
"--n_jobs",
"--n-jobs",
type=int,
default=4,
help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
)
@click.option(
"-f",
"--force-rebuild",
"-f/-nf",
"--force-rebuild/--no-force-rebuild",
type=bool,
default=True,
help="force rebuild existing preprocessed files",
Expand Down
16 changes: 8 additions & 8 deletions src/so_vits_svc_fork/preprocess_flist_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import json
import os
import re
from copy import deepcopy
from logging import getLogger
from pathlib import Path
Expand Down Expand Up @@ -36,9 +35,10 @@ def preprocess_config(
spk_id += 1
paths = []
for path in tqdm(list((input_dir / speaker).glob("**/*.wav"))):
pattern = re.compile(r"^[\.a-zA-Z0-9_\/]+$")
if not pattern.match(path.name):
LOG.warning(f"file name {path} contains non-alphanumeric characters.")
if not path.name.isascii():
LOG.warning(
f"file name {path} contains non-ascii characters. torch.save() and torch.load() may not work."
)
if get_duration(filename=path) < 0.3:
LOG.warning(f"skip {path} because it is too short.")
continue
Expand All @@ -54,21 +54,21 @@ def preprocess_config(

LOG.info(f"Writing {train_list_path}")
train_list_path.parent.mkdir(parents=True, exist_ok=True)
with train_list_path.open("w") as f:
with train_list_path.open("w", encoding="utf-8") as f:
for fname in train:
wavpath = fname.as_posix()
f.write(wavpath + "\n")

LOG.info(f"Writing {val_list_path}")
val_list_path.parent.mkdir(parents=True, exist_ok=True)
with val_list_path.open("w") as f:
with val_list_path.open("w", encoding="utf-8") as f:
for fname in val:
wavpath = fname.as_posix()
f.write(wavpath + "\n")

LOG.info(f"Writing {test_list_path}")
test_list_path.parent.mkdir(parents=True, exist_ok=True)
with test_list_path.open("w") as f:
with test_list_path.open("w", encoding="utf-8") as f:
for fname in test:
wavpath = fname.as_posix()
f.write(wavpath + "\n")
Expand All @@ -85,5 +85,5 @@ def preprocess_config(
config["data"]["validation_files"] = val_list_path.as_posix()
LOG.info(f"Writing {config_path}")
config_path.parent.mkdir(parents=True, exist_ok=True)
with config_path.open("w") as f:
with config_path.open("w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
15 changes: 10 additions & 5 deletions src/so_vits_svc_fork/preprocess_hubert_f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from tqdm import tqdm

from . import utils
from .preprocess_utils import check_hubert_min_duration
from .utils import HUBERT_SAMPLING_RATE

LOG = getLogger(__name__)
Expand All @@ -26,13 +27,17 @@ def _process_one(
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
force_rebuild: bool = False,
):
wav, sr = librosa.load(filepath, sr=sampling_rate)
audio, sr = librosa.load(filepath, sr=sampling_rate)

if not check_hubert_min_duration(audio, sr):
LOG.info(f"Skip {filepath} because it is too short.")
return

# Compute HuBERT content
soft_path = filepath.parent / (filepath.name + ".soft.pt")
if not soft_path.exists() or force_rebuild:
if (not soft_path.exists()) or force_rebuild:
wav16k = librosa.resample(
wav, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
audio, orig_sr=sampling_rate, target_sr=HUBERT_SAMPLING_RATE
)
wav16k = torch.from_numpy(wav16k).to(device)
c = utils.get_hubert_content(hubert_model, wav_16k_tensor=wav16k)
Expand All @@ -42,9 +47,9 @@ def _process_one(

# Compute f0
f0_path = filepath.parent / (filepath.name + ".f0.npy")
if not f0_path.exists() or force_rebuild:
if (not f0_path.exists()) or force_rebuild:
f0 = utils.compute_f0(
wav, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method
audio, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method
)
np.save(f0_path, f0)
else:
Expand Down
52 changes: 48 additions & 4 deletions src/so_vits_svc_fork/preprocess_resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import soundfile
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
from unidecode import unidecode

from .preprocess_utils import check_hubert_min_duration

LOG = getLogger(__name__)

Expand Down Expand Up @@ -45,7 +48,15 @@ def is_relative_to(path: Path, *other):
return False


def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None:
def _preprocess_one(
input_path: Path,
output_path: Path,
sr: int,
*,
top_db: int,
frame_seconds: float,
hop_seconds: float,
) -> None:
"""Preprocess one audio file."""

try:
Expand All @@ -57,17 +68,37 @@ def _preprocess_one(input_path: Path, output_path: Path, sr: int) -> None:
LOG.warning(f"Failed to load {input_path} due to {e}")
return

if not check_hubert_min_duration(audio, sr):
LOG.info(f"Skip {input_path} because it is too short.")
return

# Adjust volume
audio /= max(audio.max(), -audio.min())

# Trim silence
audio, _ = librosa.effects.trim(audio, top_db=20)
audio, _ = librosa.effects.trim(
audio,
top_db=top_db,
frame_length=int(frame_seconds * sr),
hop_length=int(hop_seconds * sr),
)

if not check_hubert_min_duration(audio, sr):
LOG.info(f"Skip {input_path} because it is too short.")
return

soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")


def preprocess_resample(
input_dir: Path | str, output_dir: Path | str, sampling_rate: int, n_jobs: int = -1
input_dir: Path | str,
output_dir: Path | str,
sampling_rate: int,
n_jobs: int = -1,
*,
top_db: int = 30,
frame_seconds: float = 0.1,
hop_seconds: float = 0.05,
) -> None:
input_dir = Path(input_dir)
output_dir = Path(output_dir)
Expand All @@ -92,6 +123,13 @@ def preprocess_resample(
continue
speaker_name = in_path_relative.parts[0]
file_name = in_path_relative.with_suffix(".wav").name
new_filename = unidecode(file_name)
if new_filename != file_name:
LOG.warning(
f"Filename {file_name} contains non-ASCII characters. "
f"Replaced with {new_filename}."
)
file_name = new_filename
out_path = output_dir / speaker_name / file_name
out_path = _get_unique_filename(out_path, out_paths)
out_path.parent.mkdir(parents=True, exist_ok=True)
Expand All @@ -102,6 +140,12 @@ def preprocess_resample(

with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
Parallel(n_jobs=n_jobs)(
delayed(_preprocess_one)(*args, sr=sampling_rate)
delayed(_preprocess_one)(
*args,
sr=sampling_rate,
top_db=top_db,
frame_seconds=frame_seconds,
hop_seconds=hop_seconds,
)
for args in in_and_out_paths
)
5 changes: 5 additions & 0 deletions src/so_vits_svc_fork/preprocess_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from numpy import ndarray


def check_hubert_min_duration(audio: ndarray, sr: int) -> bool:
    """Return whether *audio* lasts at least 0.3 seconds at sample rate *sr*.

    The duration is computed as ``len(audio) / sr``, i.e. the length of the
    first axis of ``audio`` divided by the sampling rate.

    Args:
        audio: Sample array. NOTE(review): assumed to be 1-D mono with
            samples on the first axis -- confirm against callers.
        sr: Sampling rate (samples per second); must be non-zero, otherwise
            the division raises ``ZeroDivisionError``.

    Returns:
        ``True`` when the computed duration is at least 0.3 seconds,
        ``False`` otherwise.
    """
    min_duration_seconds = 0.3
    return len(audio) / sr >= min_duration_seconds
Binary file added tests/dataset_raw/34j/nested/あ.wav
Binary file not shown.

0 comments on commit 98d7ee2

Please sign in to comment.