fix(preprocess): fix dtype in sf.read() to save memory and fix preprocess_resample (#132)
voicepaw · Mar 26, 2023 · 0af1e13 · 0af1e13
1 parent 4203f37
commit 0af1e13
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 31 deletions.
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -434,14 +434,26 @@ def vc(
     help="path to output dir",
 )
 @click.option("-s", "--sampling-rate", type=int, default=44100, help="sampling rate")
-def pre_resample(input_dir: Path, output_dir: Path, sampling_rate: int) -> None:
+@click.option(
+    "-n",
+    "--n-jobs",
+    type=int,
+    default=-1,
+    help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
+)
+def pre_resample(
+    input_dir: Path, output_dir: Path, sampling_rate: int, n_jobs: int
+) -> None:
     """Preprocessing part 1: resample"""
     from .preprocess_resample import preprocess_resample
 
     input_dir = Path(input_dir)
     output_dir = Path(output_dir)
     preprocess_resample(
-        input_dir=input_dir, output_dir=output_dir, sampling_rate=sampling_rate
+        input_dir=input_dir,
+        output_dir=output_dir,
+        sampling_rate=sampling_rate,
+        n_jobs=n_jobs,
     )
 
 

diff --git a/src/so_vits_svc_fork/preprocess_resample.py b/src/so_vits_svc_fork/preprocess_resample.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Iterable
 
-import audioread.exceptions
 import librosa
 import numpy as np
 import soundfile
@@ -48,37 +47,38 @@ def is_relative_to(path: Path, *other):
         return False
 
 
-def preprocess_resample(
-    input_dir: Path | str, output_dir: Path | str, sampling_rate: int
-) -> None:
-    input_dir = Path(input_dir)
-    output_dir = Path(output_dir)
-    """Preprocess audio files in input_dir and save them to output_dir."""
+def _preprocess_one(input_path: Path, output_path: Path, sampling_rate: int) -> None:
+    """Preprocess one audio file."""
+
+    try:
+        audio, sr = sf.read(input_path, dtype="float32")
 
-    def preprocess_one(input_path: Path, output_path: Path) -> None:
-        """Preprocess one audio file."""
+    # Audioread is the last backend it will attempt, so this is the exception thrown on failure
+    except Exception as e:
+        # Failure due to attempting to load a file that is not audio, so return early
+        LOG.warning(f"Failed to load {input_path} due to {e}")
+        return
 
-        try:
-            audio, sr = sf.read(input_path)
+    # Trim silence
+    audio, _ = librosa.effects.trim(audio, top_db=20)
 
-        # Audioread is the last backend it will attempt, so this is the exception thrown on failure
-        except audioread.exceptions.NoBackendError as e:
-            # Failure due to attempting to load a file that is not audio, so return early
-            LOG.warning(f"Failed to load {input_path} due to {e}")
-            return
+    # Adjust volume
+    peak = np.abs(audio).max()
+    if peak > 1.0:
+        audio = 0.98 * audio / peak
 
-        # Trim silence
-        audio, _ = librosa.effects.trim(audio, top_db=20)
+    # Resample
+    audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+    audio /= max(audio.max(), -audio.min())
+    soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
 
-        # Adjust volume
-        peak = np.abs(audio).max()
-        if peak > 1.0:
-            audio = 0.98 * audio / peak
 
-        # Resample
-        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-        audio /= max(audio.max(), -audio.min())
-        soundfile.write(output_path, audio, samplerate=sampling_rate, subtype="PCM_16")
+def preprocess_resample(
+    input_dir: Path | str, output_dir: Path | str, sampling_rate: int, n_jobs: int = -1
+) -> None:
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    """Preprocess audio files in input_dir and save them to output_dir."""
 
     in_paths = []
     out_paths = []
@@ -108,4 +108,7 @@ def preprocess_one(input_path: Path, output_path: Path) -> None:
     in_and_out_paths = list(zip(in_paths, out_paths))
 
     with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
-        Parallel(n_jobs=-1)(delayed(preprocess_one)(*args) for args in in_and_out_paths)
+        Parallel(n_jobs=n_jobs)(
+            delayed(_preprocess_one)(*args, sampling_rate=sampling_rate)
+            for args in in_and_out_paths
+        )
diff --git a/src/so_vits_svc_fork/preprocess_speaker_diarization.py b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
@@ -21,7 +21,7 @@ def _process_one(
     huggingface_token: str | None = None,
 ) -> None:
     try:
-        audio, sr = sf.read(input_path)
+        audio, sr = sf.read(input_path, dtype="float32")
     except Exception as e:
         LOG.warning(f"Failed to read {input_path}: {e}")
         return

diff --git a/src/so_vits_svc_fork/preprocess_split.py b/src/so_vits_svc_fork/preprocess_split.py
@@ -20,7 +20,7 @@ def _process_one(
     hop_seconds: float = 0.1,
 ):
     try:
-        audio, sr = sf.read(input_path)
+        audio, sr = sf.read(input_path, dtype="float32")
     except Exception as e:
         LOG.warning(f"Failed to read {input_path}: {e}")
         return