In [26]:
#!/usr/bin/env python3
from pathlib import Path
import numpy as np
from scipy.io.wavfile import read, write
from scipy.signal import butter, filtfilt

def lowpass_filter(data: np.ndarray, fs: int, cutoff: float, order: int = 6) -> np.ndarray:
    """
    Apply a zero-phase Butterworth low-pass filter to a 1D or 2D audio array.

    data:    np.ndarray of shape (n_samples,) or (n_samples, n_channels)
    fs:      sampling rate in Hz
    cutoff:  cutoff frequency in Hz; must satisfy 0 < cutoff < fs / 2
    order:   filter order

    Returns a float64 array with the same shape as `data`, whatever the input
    dtype (integer input is converted before filtering, never truncated back).
    Raises ValueError when the cutoff is not strictly between 0 and Nyquist.
    """
    # Local import: the surrounding cell only imports butter/filtfilt.
    from scipy.signal import sosfiltfilt

    nyq = fs / 2
    if not 0 < cutoff < nyq:
        raise ValueError(
            f"cutoff must be between 0 and the Nyquist frequency ({nyq} Hz), got {cutoff} Hz"
        )

    # Second-order sections stay numerically well-behaved at higher orders,
    # unlike the (b, a) polynomial form.
    sos = butter(order, cutoff / nyq, btype="low", analog=False, output="sos")

    # Filtering along axis 0 covers both layouts: a mono vector, or one column
    # per channel.
    samples = np.asarray(data, dtype=np.float64)
    return sosfiltfilt(sos, samples, axis=0)

# parameters
CUTOFF_HZ = 4700.0  # 4.7 kHz
FILTER_ORDER = 10
LANGS = ["en", "jp"]

# NOTE: every WAV is overwritten in place, so re-running this cell filters
# audio that has already been filtered.
for lang in LANGS:

    audio_dir = Path(f"./test_dataset/{lang}/audio")
    if not audio_dir.exists():
        continue

    for wav_path in audio_dir.glob("*.wav"):
        # read
        fs, data = read(wav_path)

        # filter (in floating point)
        filtered = lowpass_filter(data.astype(float), fs, CUTOFF_HZ, FILTER_ORDER)

        # Integer PCM: the filtered signal can overshoot the original sample
        # range, and a plain float->int cast truncates toward zero and does
        # not saturate. Round to nearest and clip to the dtype's limits first.
        if np.issubdtype(data.dtype, np.integer):
            limits = np.iinfo(data.dtype)
            filtered = np.clip(np.rint(filtered), limits.min, limits.max)

        # overwrite, keeping the file's original sample dtype
        write(wav_path, fs, np.asarray(filtered, data.dtype))
        print(f"Filtered {wav_path}")

print(f"All files have been low-pass filtered at {CUTOFF_HZ / 1000:g} kHz in place.")


Filtered test_dataset/en/audio/en_krishna_032.wav
Filtered test_dataset/en/audio/en_krishna_026.wav
Filtered test_dataset/en/audio/en_krishna_027.wav
Filtered test_dataset/en/audio/en_krishna_033.wav
Filtered test_dataset/en/audio/en_krishna_019.wav
Filtered test_dataset/en/audio/en_krishna_025.wav
Filtered test_dataset/en/audio/en_krishna_031.wav
Filtered test_dataset/en/audio/en_krishna_030.wav
Filtered test_dataset/en/audio/en_krishna_024.wav
Filtered test_dataset/en/audio/en_krishna_018.wav
Filtered test_dataset/en/audio/en_krishna_020.wav
Filtered test_dataset/en/audio/en_krishna_034.wav
Filtered test_dataset/en/audio/en_krishna_008.wav
Filtered test_dataset/en/audio/en_krishna_009.wav
Filtered test_dataset/en/audio/en_krishna_035.wav
Filtered test_dataset/en/audio/en_krishna_021.wav
Filtered test_dataset/en/audio/en_krishna_037.wav
Filtered test_dataset/en/audio/en_krishna_023.wav
Filtered test_dataset/en/audio/en_krishna_022.wav
Filtered test_dataset/en/audio/en_krishna_036.wav


In [27]:
from pathlib import Path
import pandas as pd
from utils.prompt_making import make_prompt

# Dataset root and the transcript CSV that belongs to each language folder.
DATA_DIR = Path("./test_dataset")
TRANS_FILES = {
    "en": "english_transcripts.csv",
    "jp": "japanese_transcripts.csv",
}

for lang, fname in TRANS_FILES.items():
    lang_dir = DATA_DIR / lang
    csv_path = lang_dir / fname
    orig_dir = lang_dir / "audio"

    # Load the transcript table once; malformed lines are dropped by the parser.
    df = pd.read_csv(csv_path, engine="python", on_bad_lines="skip", encoding="utf-8")

    # Register one prompt per row: the row id names both the prompt and its
    # reference WAV inside the language's audio folder.
    for row in df.itertuples(index=False):
        prompt_name = f"{row.id}"
        audio_path = orig_dir / f"{prompt_name}.wav"
        make_prompt(
            name=prompt_name,
            audio_prompt_path=str(audio_path),
            transcript=row.text,
        )


In [28]:
#!/usr/bin/env python3
from pathlib import Path
import pandas as pd
from scipy.io.wavfile import write as write_wav
from tqdm.auto import tqdm

import utils.generation as gen
from utils.generation import SAMPLE_RATE
from train_utils.icefall.utils import load_checkpoint

# 1) Load the models a single time, before any generation.
gen.preload_models()

# 2) Checkpoint file for each model variant being compared.
checkpoint_paths = {
    "original":   "./checkpoints/vallex-checkpoint.pt",
    "pure":       "./checkpoints/checkpoint-30-pure.pt",
    "LoRA_whole": "./checkpoints/checkpoint-30-whole.pt",
    "LoRA_AR":    "./checkpoints/checkpoint-30-AR.pt",
    "LoRA_NAR":   "./checkpoints/checkpoint-30-NAR.pt",
}

# 3) Transcript CSV per language, relative to DATA_DIR/<lang>/.
DATA_DIR = Path("./test_dataset")
TRANS_FILES = {
    "en": "english_transcripts.csv",
    "jp": "japanese_transcripts.csv",
}

# 4) Accent label handed to the generator for each language.
accents = {"en": "English", "jp": "日本語"}

# 5) For every variant, synthesize every transcript row of every language.
for variant, ckpt_path in checkpoint_paths.items():
    print(f"\n=== Generating variant: {variant} ===")
    # NOTE(review): "original" is never loaded here; presumably
    # preload_models() already put those weights into gen.model. Re-running
    # this cell in the same kernel after another variant was loaded would then
    # depend on preload_models() restoring them - confirm.
    if variant != "original":
        load_checkpoint(ckpt_path, gen.model, None, None, None)

    for lang, csv_name in TRANS_FILES.items():
        lang_dir = DATA_DIR / lang
        csv_path = lang_dir / csv_name
        df = pd.read_csv(csv_path, engine="python", on_bad_lines="skip", encoding="utf-8")

        # Each variant writes into its own folder next to the language's data.
        out_dir = lang_dir / variant
        out_dir.mkdir(parents=True, exist_ok=True)

        # One progress-bar tick per synthesized file.
        progress = tqdm(
            df.itertuples(index=False),
            total=len(df),
            desc=f"{lang.upper()} / {variant}",
            unit="wav",
        )
        for row in progress:
            file_id = row.id
            out_path = out_dir / f"{file_id}_{variant}.wav"

            audio_array = gen.generate_audio(
                text=row.text,
                prompt=file_id,
                language=lang,
                accent=accents[lang],
            )
            write_wav(str(out_path), SAMPLE_RATE, audio_array)


  WeightNorm.apply(module, name, dim)



=== Generating variant: original ===


EN / original:   0%|          | 0/96 [00:00<?, ?wav/s]

VALL-E EOS [438 -> 837]


EN / original:   1%|          | 1/96 [00:15<24:38, 15.56s/wav]

VALL-E EOS [589 -> 1310]


EN / original:   2%|▏         | 2/96 [00:45<37:11, 23.74s/wav]

VALL-E EOS [549 -> 1068]


EN / original:   3%|▎         | 3/96 [01:11<38:23, 24.77s/wav]

VALL-E EOS [274 -> 577]


EN / original:   4%|▍         | 4/96 [01:22<30:10, 19.68s/wav]

VALL-E EOS [422 -> 946]


EN / original:   5%|▌         | 5/96 [01:44<30:49, 20.33s/wav]

VALL-E EOS [274 -> 644]


EN / original:   6%|▋         | 6/96 [01:59<27:42, 18.48s/wav]

VALL-E EOS [553 -> 1215]


EN / original:   7%|▋         | 7/96 [02:38<37:20, 25.18s/wav]

VALL-E EOS [293 -> 716]


EN / original:   8%|▊         | 8/96 [02:54<32:46, 22.35s/wav]

VALL-E EOS [593 -> 1273]


EN / original:   9%|▉         | 9/96 [03:25<36:24, 25.11s/wav]

VALL-E EOS [209 -> 389]


EN / original:  10%|█         | 10/96 [03:32<27:46, 19.38s/wav]

VALL-E EOS [424 -> 869]


EN / original:  11%|█▏        | 11/96 [03:51<27:32, 19.44s/wav]

VALL-E EOS [283 -> 547]


EN / original:  12%|█▎        | 12/96 [04:03<23:55, 17.09s/wav]

VALL-E EOS [413 -> 846]


EN / original:  14%|█▎        | 13/96 [04:29<27:18, 19.75s/wav]

VALL-E EOS [438 -> 1013]


EN / original:  15%|█▍        | 14/96 [05:02<32:31, 23.80s/wav]

VALL-E EOS [483 -> 901]


EN / original:  16%|█▌        | 15/96 [05:22<30:44, 22.77s/wav]

VALL-E EOS [300 -> 786]


EN / original:  17%|█▋        | 16/96 [05:44<29:45, 22.32s/wav]

VALL-E EOS [338 -> 680]


EN / original:  18%|█▊        | 17/96 [06:02<27:40, 21.02s/wav]

VALL-E EOS [345 -> 832]


EN / original:  19%|█▉        | 18/96 [06:23<27:20, 21.03s/wav]

VALL-E EOS [169 -> 460]


EN / original:  20%|█▉        | 19/96 [06:34<23:05, 17.99s/wav]

VALL-E EOS [347 -> 828]


EN / original:  21%|██        | 20/96 [06:53<23:20, 18.43s/wav]

VALL-E EOS [492 -> 977]


EN / original:  22%|██▏       | 21/96 [07:20<26:20, 21.07s/wav]

VALL-E EOS [250 -> 538]


EN / original:  23%|██▎       | 22/96 [07:31<22:12, 18.01s/wav]

VALL-E EOS [263 -> 515]


EN / original:  24%|██▍       | 23/96 [07:41<19:01, 15.63s/wav]

VALL-E EOS [284 -> 809]


EN / original:  25%|██▌       | 24/96 [08:02<20:26, 17.04s/wav]

VALL-E EOS [229 -> 496]


EN / original:  26%|██▌       | 25/96 [08:13<18:09, 15.34s/wav]

VALL-E EOS [382 -> 761]


EN / original:  27%|██▋       | 26/96 [08:30<18:38, 15.98s/wav]

VALL-E EOS [371 -> 817]


EN / original:  28%|██▊       | 27/96 [08:49<19:06, 16.62s/wav]

VALL-E EOS [239 -> 523]


EN / original:  29%|██▉       | 28/96 [08:59<16:40, 14.72s/wav]

VALL-E EOS [370 -> 1004]


EN / original:  30%|███       | 29/96 [09:24<20:02, 17.95s/wav]

VALL-E EOS [389 -> 869]


EN / original:  31%|███▏      | 30/96 [09:45<20:39, 18.78s/wav]

VALL-E EOS [589 -> 1392]


EN / original:  32%|███▏      | 31/96 [10:21<25:58, 23.97s/wav]

VALL-E EOS [302 -> 819]


EN / original:  33%|███▎      | 32/96 [10:39<23:38, 22.16s/wav]

VALL-E EOS [326 -> 603]


EN / original:  34%|███▍      | 33/96 [10:53<20:39, 19.67s/wav]

VALL-E EOS [192 -> 462]


EN / original:  35%|███▌      | 34/96 [11:05<17:59, 17.41s/wav]

VALL-E EOS [716 -> 1490]


EN / original:  36%|███▋      | 35/96 [11:50<26:04, 25.65s/wav]

VALL-E EOS [354 -> 770]


EN / original:  38%|███▊      | 36/96 [12:06<22:50, 22.84s/wav]

VALL-E EOS [368 -> 684]


EN / original:  39%|███▊      | 37/96 [12:19<19:36, 19.94s/wav]

VALL-E EOS [474 -> 1035]


EN / original:  40%|███▉      | 38/96 [12:44<20:32, 21.25s/wav]

VALL-E EOS [394 -> 892]


EN / original:  41%|████      | 39/96 [13:15<23:09, 24.37s/wav]

VALL-E EOS [206 -> 438]


EN / original:  42%|████▏     | 40/96 [13:25<18:33, 19.88s/wav]

VALL-E EOS [304 -> 634]


EN / original:  42%|████▏     | 40/96 [13:40<19:08, 20.50s/wav]


KeyboardInterrupt: 

In [None]:
#!/usr/bin/env python3
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa, librosa.display
import pyworld as pw
import parselmouth
import soundfile as sf
from scipy.io.wavfile import write as write_wav
from language_tool_python import LanguageTool

# ─── Existing helper functions ─────────────────────────────────────────────

def ensure_dir(d: Path):
    """Create directory `d` and any missing parents; an existing directory is left as is."""
    os.makedirs(d, exist_ok=True)

def plot_waveform(y, sr, out_path: Path, title: str):
    """Draw the waveform of `y` and save it as an image at `out_path`.

    y:        audio samples, handed to librosa.display.waveshow
    sr:       sampling rate in Hz
    out_path: destination image file
    title:    label; the figure is titled "<title> waveform"
    """
    # Explicit figure/axes instead of pyplot's implicit "current figure", so
    # the exact figure created here is the one that gets closed.
    fig, ax = plt.subplots(figsize=(10, 3))
    try:
        librosa.display.waveshow(y, sr=sr, color='red', ax=ax)
        ax.set_title(f"{title} waveform")
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Release the figure even when drawing or saving raises; otherwise
        # failed calls would leave open figures behind.
        plt.close(fig)

def plot_spectrogram(y, sr, out_path: Path, title: str):
    """Draw a dB-scaled STFT spectrogram of `y` and save it at `out_path`.

    y:        audio samples
    sr:       sampling rate in Hz
    out_path: destination image file
    title:    label; the figure is titled "<title> spectrogram"

    The STFT uses n_fft=1024 and hop_length=256; magnitudes are converted to
    dB relative to the maximum magnitude.
    """
    hop_length = 256
    D = librosa.stft(y, n_fft=1024, hop_length=hop_length)
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

    # Explicit figure/axes instead of pyplot's implicit "current figure", so
    # the exact figure created here is the one that gets closed.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        img = librosa.display.specshow(
            S_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='hz', ax=ax
        )
        # Bind the colorbar to this image rather than to pyplot's current mappable.
        fig.colorbar(img, ax=ax, format='%+2.0f dB')
        ax.set_title(f"{title} spectrogram")
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Release the figure even when drawing or saving raises.
        plt.close(fig)

def extract_pitch_sp(y, sr):
    """Estimate the F0 contour of `y` with pyworld's harvest and summarize it.

    Returns a dict with:
      f0_contour: the full F0 array returned by harvest
      f0_mean:    mean over the frames whose F0 is greater than zero
      f0_std:     standard deviation over those same frames
    """
    # harvest is given float64 samples; its second return value (the frame
    # times) is not used.
    contour, _ = pw.harvest(y.astype(np.float64), sr)
    voiced = contour[contour > 0]

    mean_f0 = float(np.nanmean(voiced))
    std_f0 = float(np.nanstd(voiced))
    return {
        'f0_contour': contour,
        'f0_mean': mean_f0,
        'f0_std': std_f0,
    }

def extract_formant_jitter_shimmer_hnr(path):
    """Measure formants and voice-quality statistics for the audio file at `path`.

    Returns a dict with the NaN-ignoring means of formants 1-3 (each sampled
    at 100 evenly spaced times across the sound), plus local jitter, local
    shimmer and mean harmonicity (HNR) as computed by Praat.
    """
    praat_call = parselmouth.praat.call
    snd = parselmouth.Sound(path)

    # Burg formant tracks, sampled on a fixed 100-point grid over the sound.
    formants = snd.to_formant_burg()
    times = np.linspace(snd.xmin, snd.xmax, 100)

    def mean_formant(number):
        # NaN-ignoring mean of one formant over the sampling grid.
        samples = [formants.get_value_at_time(number, ti) for ti in times]
        return np.nanmean(samples)

    formant_means = {f'F{number}_mean': mean_formant(number) for number in (1, 2, 3)}

    # Jitter and shimmer are both measured from the same periodic point process.
    point_process = praat_call(snd, "To PointProcess (periodic, cc)", 75.0, 600.0)
    jitter = praat_call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    shimmer = praat_call(
        [snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6
    )

    harmonicity = praat_call(snd, "To Harmonicity (cc)", 0.01, 75.0, 0.1, 4.5)
    hnr = praat_call(harmonicity, "Get mean", 0, 0)

    return {
        **formant_means,
        'jitter': float(jitter),
        'shimmer': float(shimmer),
        'hnr': float(hnr),
    }

# Shared LanguageTool clients, keyed by language code, so that repeated calls
# reuse one client instead of constructing a new one every time.
_LANGUAGE_TOOLS = {}


def _get_language_tool(language: str):
    """Return the shared LanguageTool client for `language`, creating it on first use."""
    tool = _LANGUAGE_TOOLS.get(language)
    if tool is None:
        tool = LanguageTool(language, remote_server='https://api.languagetool.org')
        _LANGUAGE_TOOLS[language] = tool
    return tool


def extract_errors(text: str, language: str = 'en-US'):
    """Return the set of grammar issues LanguageTool reports for `text`.

    text:     the text to check
    language: LanguageTool language code; defaults to 'en-US'

    Each issue is a tuple (start_offset, end_offset, rule_id). Empty text
    yields an empty set without contacting the server.
    """
    if not text:
        return set()
    matches = _get_language_tool(language).check(text)
    return {(m.offset, m.offset + m.errorLength, m.ruleId) for m in matches}

def grammar_fidelity(orig_text: str, synth_text: str):
    """Score how well the grammar issues of `orig_text` reappear in `synth_text`.

    Issues found by extract_errors are compared as sets. Returns
    (precision, recall, f1). Precision and recall default to 1.0 when their
    denominator is zero; f1 is 0.0 when precision and recall are both zero.
    """
    reference = extract_errors(orig_text)
    candidate = extract_errors(synth_text)

    shared = len(reference & candidate)     # issues present in both texts
    missed = len(reference - candidate)     # issues only in the original
    spurious = len(candidate - reference)   # issues only in the synthesized text

    if shared + missed:
        recall = shared / (shared + missed)
    else:
        recall = 1.0

    if shared + spurious:
        precision = shared / (shared + spurious)
    else:
        precision = 1.0

    if precision + recall:
        f1 = 2*precision*recall/(precision+recall)
    else:
        f1 = 0.0
    return precision, recall, f1

In [None]:
def main():
    """Compare each synthesized WAV with its reference recording and print a summary.

    For every language and model variant folder under ./test_dataset, each
    synthesized file is paired with its reference audio and transcript.
    Waveform and spectrogram plots for both signals are saved under
    results/<lang>/<variant>/<rec_id>/, pitch and voice-quality features are
    extracted for both, and the values are printed side by side.
    """
    # 1) Read the CSVs and build an id -> text map per language
    transcripts = {}
    for lang, fname in [('en','english_transcripts.csv'),
                        ('jp','japanese_transcripts.csv')]:
        df = pd.read_csv(f"./test_dataset/{lang}/{fname}", engine="python", on_bad_lines="skip", encoding="utf-8")
        transcripts[lang] = {row.id: row.text for row in df.itertuples(index=False)}

    # 2) Directories to evaluate
    ref_dirs = {
        'en': Path("./test_dataset/en/audio"),
        'jp': Path("./test_dataset/jp/audio"),
    }
    variants = ["original","pure","LoRA_whole","LoRA_AR","LoRA_NAR"]

    # 3) Top-level folder for the evaluation outputs
    OUT_ROOT = Path("results")

    for lang in ['en','jp']:
        # Reference audio keyed by the file stem with its first "_"-separated
        # token (the language prefix) removed
        ref_map = {p.stem.split('_',1)[-1]: p for p in ref_dirs[lang].glob("*.wav")}
        lang_texts = transcripts[lang]

        for variant in variants:
            syn_dir = Path(f"./test_dataset/{lang}/{variant}")
            if not syn_dir.is_dir(): continue

            for syn_path in syn_dir.glob("*.wav"):
                # File name, e.g. en_krishna_001_original.wav
                stem = syn_path.stem
                # Extract just the id
                core = stem
                # Drop the "{lang}_" prefix
                if core.startswith(f"{lang}_"):
                    core = core[len(lang)+1:]
                # Drop the "_{variant}" suffix
                if core.endswith(f"_{variant}"):
                    core = core[:-(len(variant)+1)]
                rec_id = core

                # Id with only the variant suffix removed. The transcript CSV
                # ids are the reference file stems, which keep the language
                # prefix, so this is the key the transcript map uses.
                csv_id = stem[:-(len(variant)+1)] if stem.endswith(f"_{variant}") else stem

                # Reference audio
                if rec_id not in ref_map:
                    print(f"⚠️  {lang}/{variant}: 참조 오디오 {rec_id} 없음, 건너뜁니다.")
                    continue
                ref_path = ref_map[rec_id]

                # Transcript: try the prefixed CSV id first, then fall back to
                # the prefix-less id in case the CSV stores ids that way
                text = lang_texts.get(csv_id) or lang_texts.get(rec_id, "")
                if not text:
                    print(f"⚠️  {lang}/{variant}: 텍스트 {rec_id} 없음")
                    continue

                # Folder for this file's evaluation results
                out_dir = OUT_ROOT / lang / variant / rec_id
                ensure_dir(out_dir)

                # 4) Load audio (both resampled to 16 kHz)
                y_ref, sr = librosa.load(str(ref_path), sr=16000)
                y_syn, _  = librosa.load(str(syn_path), sr=16000)

                # 5) Plots
                plot_waveform   (y_ref, sr, out_dir/"ref_waveform.png",   f"{rec_id} ref")
                plot_spectrogram(y_ref, sr, out_dir/"ref_spectrogram.png",f"{rec_id} ref")
                plot_waveform   (y_syn, sr, out_dir/"syn_waveform.png",   f"{rec_id} syn")
                plot_spectrogram(y_syn, sr, out_dir/"syn_spectrogram.png",f"{rec_id} syn")

                # 6) Feature extraction
                p_ref = extract_pitch_sp(y_ref, sr)
                p_syn = extract_pitch_sp(y_syn, sr)
                f_ref = extract_formant_jitter_shimmer_hnr(str(ref_path))
                f_syn = extract_formant_jitter_shimmer_hnr(str(syn_path))

                # The text evaluation compares the transcript with itself, so
                # it is an exact match and F1 = 1.0
                prec, rec, f1 = grammar_fidelity(text, text)

                # 7) Console summary
                print(f"\n─── [{lang}/{variant}/{rec_id}] ───")
                print(f"Pitch mean  ref {p_ref['f0_mean']:.1f}Hz  | syn {p_syn['f0_mean']:.1f}Hz")
                print(f"Formant1    ref {f_ref['F1_mean']:.1f}Hz | syn {f_syn['F1_mean']:.1f}Hz")
                print(f"Jitter      ref {f_ref['jitter']:.6f}   | syn {f_syn['jitter']:.6f}")
                print(f"HNR         ref {f_ref['hnr']:.3f}   | syn {f_syn['hnr']:.3f}")
                print(f"Grammar F1      {f1:.3f}")
