# CosyVoice3 Fine-tuning (Debi & Marlene)

순서대로 실행. 모든 torchcodec/PyTorch 호환성 문제 패치 완료.

**모델:** Fun-CosyVoice3-0.5B-2512 (한국어 지원)

**패치 목록:**
1. file_utils.py - torchaudio.load -> soundfile
2. processor.py - torchaudio.load -> soundfile  
3. train_utils.py - PyTorch distributed 호환성
4. extract_embedding.py - kaldi.fbank 80-bin mel
5. extract_speech_token.py - whisper 128-bin mel + feats_length
6. cosyvoice2.yaml - vocab_size(speech_token_size) 자동 수정 + max_epoch / lr / warmup_steps 조정
7. data.list - 절대 경로 사용
8. 자동 백업 - 5분마다 Google Drive 저장 (세션 종료 대비)

In [None]:
# 1. Check the GPU
# Runs nvidia-smi so the GPU attached to this runtime (if any) is printed.
!nvidia-smi

In [None]:
# 2. Clone CosyVoice
# --recursive already pulls the submodules; the explicit submodule update is
# repeated from inside the checkout after changing into it.
!git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
%cd CosyVoice
!git submodule update --init --recursive

In [None]:
# 3. Install dependencies
# grpcio / grpcio-tools are installed first, from prebuilt wheels only
# (--only-binary=:all:), before the repository requirements.
!pip install -q grpcio==1.62.0 grpcio-tools==1.62.0 --only-binary=:all:
!pip install -q -r requirements.txt --ignore-installed
!pip install -q modelscope onnxruntime-gpu openai-whisper
!pip install -q hyperpyyaml hydra-core lightning wget pyworld
!pip install -q x_transformers conformer

# Pin compatible versions (kept last so this is the final install step)
!pip install -q numpy==1.26.4 scipy==1.11.4 numba==0.59.1 llvmlite==0.42.0 transformers==4.40.0

print("설치 완료 - 런타임 재시작 필요")

In [None]:
# 4. Restart the runtime (required!)
# Sends signal 9 to this kernel's own process so the kernel dies immediately;
# the next cell is meant to be run in the fresh kernel.
import os
os.kill(os.getpid(), 9)

In [None]:
# 5. After the restart - set up paths & apply all patches
%cd /content/CosyVoice

import sys
import os
# Put the CosyVoice checkout and its bundled Matcha-TTS at the front of the
# import path.
sys.path.insert(0, '/content/CosyVoice')
sys.path.insert(0, '/content/CosyVoice/third_party/Matcha-TTS')

# ===== 1. Patch file_utils.py (torchcodec -> soundfile) =====
# The string below becomes the complete new contents of
# cosyvoice/utils/file_utils.py: load_wav reads through soundfile, and
# convert_onnx_to_trt / export_cosyvoice2_vllm are defined as no-op stubs.
file_utils_patch = '''
import os, json, torch, torchaudio
import soundfile as sf
import numpy as np
import logging
logging.getLogger("matplotlib").setLevel(logging.WARNING)
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s")

def read_lists(list_file):
    with open(list_file, "r", encoding="utf8") as f:
        return [line.strip() for line in f]

def read_json_lists(list_file):
    results = {}
    for fn in read_lists(list_file):
        with open(fn, "r", encoding="utf8") as f:
            results.update(json.load(f))
    return results

def load_wav(wav, target_sr, min_sr=16000):
    data, sr = sf.read(wav, dtype="float32")
    if data.ndim == 1:
        speech = torch.from_numpy(data).unsqueeze(0)
    else:
        speech = torch.from_numpy(data.T).mean(dim=0, keepdim=True)
    if sr != target_sr:
        assert sr >= min_sr
        speech = torchaudio.transforms.Resample(sr, target_sr)(speech)
    return speech

def convert_onnx_to_trt(*args, **kwargs): pass
def export_cosyvoice2_vllm(*args, **kwargs): pass
'''
# Overwrites the module in place ('w' mode); no backup copy is made here.
with open('cosyvoice/utils/file_utils.py', 'w') as f:
    f.write(file_utils_patch)

# ===== 2. Patch processor.py (torchaudio.load -> soundfile) =====
# Replaces the one torchaudio.load(...) statement in the dataset processor with
# a soundfile-based decode that fills the same two sample fields.
processor_path = 'cosyvoice/dataset/processor.py'
with open(processor_path, 'r') as f:
    processor_content = f.read()

old_load = "sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))"
# Continuation lines carry 8 spaces so they line up with the statement being
# replaced (assumes that statement is indented by 8 spaces in processor.py).
new_load = """# soundfile로 교체 (torchcodec 우회)
        import soundfile as sf
        audio_data, sr = sf.read(BytesIO(sample['audio_data']), dtype='float32')
        if audio_data.ndim == 1:
            sample['speech'] = torch.from_numpy(audio_data).unsqueeze(0)
        else:
            sample['speech'] = torch.from_numpy(audio_data.T).mean(dim=0, keepdim=True)
        sample['sample_rate'] = sr"""

if old_load in processor_content:
    processor_content = processor_content.replace(old_load, new_load)
    with open(processor_path, 'w') as f:
        f.write(processor_content)
    print("processor.py 패치 완료")
elif new_load in processor_content:
    # Re-running the cell: the replacement text is already in the file.
    print("processor.py 이미 패치됨")
else:
    # Neither the original nor the patched statement is present, so nothing
    # was changed. Say so instead of letting the cell look successful.
    print("경고: processor.py 패치 대상 없음 - torchaudio.load 호출을 찾지 못했습니다")

# ===== 3. Patch train_utils.py (PyTorch version compatibility) =====
# Replaces the private `group_join.options._timeout` lookup with a fixed
# 30-minute timedelta.
# NOTE(review): the replacement text relies on train_utils.py already doing
# `import datetime` - confirm against the checked-out version.
train_utils_path = 'cosyvoice/utils/train_utils.py'
with open(train_utils_path, 'r') as f:
    train_content = f.read()

old_timeout = "timeout=group_join.options._timeout"
new_timeout = "timeout=datetime.timedelta(seconds=1800)"

if old_timeout in train_content:
    train_content = train_content.replace(old_timeout, new_timeout)
    with open(train_utils_path, 'w') as f:
        f.write(train_content)
    print("train_utils.py 패치 완료")
elif new_timeout in train_content:
    # Re-running the cell: the replacement text is already in the file.
    print("train_utils.py 이미 패치됨")
else:
    # Nothing matched, so nothing was changed. Report it instead of staying
    # silent while the cell summary claims every patch was applied.
    print("경고: train_utils.py 패치 대상 없음 - group_join.options._timeout 을 찾지 못했습니다")

# ===== 4. Replacement for extract_embedding.py (kaldi.fbank, 80 mel bins) =====
# Complete source of the speaker-embedding extraction tool. It is only defined
# here; the cell writes it to disk further down.
embedding_patch = '''
import sys, os, argparse, torch
sys.path.insert(0, "/content/CosyVoice")
import soundfile as sf
import torchaudio
import torchaudio.compliance.kaldi as kaldi
import onnxruntime as ort
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def load_wav_sf(path, target_sr=16000):
    """Read a wav with soundfile, average channels to mono, resample to target_sr."""
    data, sr = sf.read(path, dtype="float32")
    if data.ndim > 1: data = data.mean(axis=1)
    audio = torch.from_numpy(data).unsqueeze(0)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(sr, target_sr)(audio)
    return audio

def single_job(utt, utt2wav, ort_session):
    """Return (utt, embedding); embedding is the flattened ONNX output as a list."""
    audio = load_wav_sf(utt2wav[utt])
    feat = kaldi.fbank(audio, num_mel_bins=80, dither=0, sample_frequency=16000)
    # Subtract the per-bin mean over frames before running the model.
    feat = feat - feat.mean(dim=0, keepdim=True)
    embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(0).numpy()})[0].flatten().tolist()
    return utt, embedding

def main(args):
    """Write utt2embedding.pt and spk2embedding.pt into args.dir."""
    utt2wav, utt2spk = {}, {}
    # The notebook writes these Kaldi-style files as UTF-8, so read them as UTF-8.
    with open(f"{args.dir}/wav.scp", encoding="utf-8") as f:
        for line in f:
            # maxsplit=1 keeps wav paths that contain spaces intact.
            p = line.strip().split(maxsplit=1)
            if len(p) == 2: utt2wav[p[0]] = p[1]
    with open(f"{args.dir}/utt2spk", encoding="utf-8") as f:
        for line in f:
            p = line.strip().split()
            if len(p) == 2: utt2spk[p[0]] = p[1]

    option = ort.SessionOptions()
    option.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    option.intra_op_num_threads = 1
    ort_session = ort.InferenceSession(args.onnx_path, sess_options=option, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

    utt2embedding, spk2embedding = {}, {}
    with ThreadPoolExecutor(max_workers=args.num_thread) as ex:
        futures = {ex.submit(single_job, u, utt2wav, ort_session): u for u in utt2wav}
        for fut in tqdm(as_completed(futures), total=len(futures)):
            try:
                utt, emb = fut.result()
                # Look the speaker up first so an utterance missing from
                # utt2spk is not recorded in only one of the two outputs.
                spk = utt2spk[utt]
                utt2embedding[utt] = emb
                spk2embedding.setdefault(spk, []).append(emb)
            except Exception as e:
                # Best effort: report which utterance failed and keep going.
                print(f"Error {futures[fut]}: {e}")

    # Speaker embedding = mean of that speaker's utterance embeddings.
    for k, v in spk2embedding.items():
        spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()

    torch.save(utt2embedding, f"{args.dir}/utt2embedding.pt")
    torch.save(spk2embedding, f"{args.dir}/spk2embedding.pt")
    print(f"Saved {len(utt2embedding)} utt embeddings, {len(spk2embedding)} spk embeddings")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", required=True)
    parser.add_argument("--onnx_path", required=True)
    parser.add_argument("--num_thread", type=int, default=4)
    main(parser.parse_args())
'''

# ===== 5. Replacement for extract_speech_token.py (whisper 128-bin mel + feats_length) =====
# Complete source of the speech-token extraction tool. It is only defined
# here; the cell writes it to disk further down.
token_patch = '''
import sys, os, argparse, torch, logging
sys.path.insert(0, "/content/CosyVoice")
import soundfile as sf
import torchaudio
import numpy as np
import onnxruntime as ort
import whisper
from tqdm import tqdm

def load_wav_sf(path, target_sr=16000):
    """Read a wav with soundfile, average channels to mono, resample to target_sr."""
    data, sr = sf.read(path, dtype="float32")
    if data.ndim > 1: data = data.mean(axis=1)
    audio = torch.from_numpy(data).unsqueeze(0)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(sr, target_sr)(audio)
    return audio

def main(args):
    """Write utt2speech_token.pt (utt id -> list of token ids) into args.dir."""
    utt2wav = {}
    # The notebook writes wav.scp as UTF-8, so read it as UTF-8.
    with open(f"{args.dir}/wav.scp", encoding="utf-8") as f:
        for line in f:
            # maxsplit=1 keeps wav paths that contain spaces intact.
            p = line.strip().split(maxsplit=1)
            if len(p) == 2: utt2wav[p[0]] = p[1]

    option = ort.SessionOptions()
    option.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    option.intra_op_num_threads = 1
    ort_session = ort.InferenceSession(args.onnx_path, sess_options=option, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

    utt2token = {}
    for utt in tqdm(utt2wav):
        try:
            audio = load_wav_sf(utt2wav[utt])
            if audio.shape[1] / 16000 > 30:
                # Clips over 30 s get an empty token list; say so, because the
                # empty entry would otherwise reach the training data unnoticed.
                print(f"Skip {utt}: longer than 30s, storing empty token list")
                utt2token[utt] = []
                continue
            feat = whisper.log_mel_spectrogram(audio, n_mels=128)
            # Second model input is the number of mel frames (feats_length).
            tokens = ort_session.run(None, {
                ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
                ort_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)
            })[0].flatten().tolist()
            utt2token[utt] = tokens
        except Exception as e:
            # Best effort: report the failing utterance and keep going.
            print(f"Error {utt}: {e}")

    torch.save(utt2token, f"{args.dir}/utt2speech_token.pt")
    print(f"Saved {len(utt2token)} speech tokens")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", required=True)
    parser.add_argument("--onnx_path", required=True)
    main(parser.parse_args())
'''

# Save the patched tool scripts.
# Both scripts go into the example recipe's tools directory first and then
# into the repository-level tools directory.
patched_tools = (
    ('extract_embedding.py', embedding_patch),
    ('extract_speech_token.py', token_patch),
)
for tools_dir in ('examples/libritts/cosyvoice2/tools', 'tools'):
    os.makedirs(tools_dir, exist_ok=True)
    for script_name, script_source in patched_tools:
        with open(f'{tools_dir}/{script_name}', 'w') as f:
            f.write(script_source)

import numpy as np
print(f"모든 패치 완료! numpy: {np.__version__}")
# Summary of what this cell patched, one line per item.
for summary_line in (
    "패치 목록:",
    "  1. file_utils.py - soundfile 사용",
    "  2. processor.py - soundfile 사용",
    "  3. train_utils.py - PyTorch 호환성",
    "  4. extract_embedding.py - kaldi.fbank 80-bin",
    "  5. extract_speech_token.py - whisper 128-bin + feats_length",
):
    print(summary_line)

In [None]:
# 6. Download the model (2512 = Korean)
from huggingface_hub import snapshot_download

# Local directory the snapshot is downloaded into; later cells read
# checkpoints and ONNX files from here.
MODEL_DIR = '/content/CosyVoice/pretrained_models/CosyVoice3-0.5B'
snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir=MODEL_DIR)
print(f"다운로드 완료: {MODEL_DIR}")

In [None]:
# 7. Mount Google Drive & copy the data
from google.colab import drive
drive.mount('/content/drive')

# Copy the transcript JSON and the audio archive out of Drive, then unpack the
# archive into /content (-o overwrites existing files without prompting).
!cp /content/drive/MyDrive/debi_tts/filtered_transcripts.json /content/
!cp /content/drive/MyDrive/debi_tts/Debi_Marlene_KOR.zip /content/
!unzip -q -o /content/Debi_Marlene_KOR.zip -d /content/
print("데이터 복사 완료")

In [None]:
# 8. Load the model
from cosyvoice.cli.cosyvoice import AutoModel
import json
import soundfile as sf

# MODEL_DIR is the snapshot directory set in the download cell.
cosyvoice = AutoModel(model_dir=MODEL_DIR)
print(f"모델 로드 성공! Sample Rate: {cosyvoice.sample_rate}")

In [None]:
# 9. Load the transcript data
import os

with open('/content/filtered_transcripts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One list of transcript entries per character, selected by the 'character' field.
debi_lines, marlene_lines = (
    [entry for entry in data if entry['character'] == character_name]
    for character_name in ('Debi', 'Marlene')
)
print(f"Debi: {len(debi_lines)}개, Marlene: {len(marlene_lines)}개")

def get_wav_path(character, filename):
    """Return the first existing wav path for *filename* under /content/<character>.

    The name is tried as given and then with spaces replaced by underscores.
    Returns None when neither candidate exists.
    """
    candidates = (
        f'/content/{character}/{filename}',
        f'/content/{character}/{filename.replace(" ", "_")}',
    )
    return next((path for path in candidates if os.path.exists(path)), None)

In [None]:
# 10. Zero-shot test (Debi)
from IPython.display import Audio, display

# Use the first Debi line that is longer than 10 characters, contains neither
# '!' nor '?', and whose wav file can be found on disk.
debi_prompt = None
for d in debi_lines:
    if len(d['text']) > 10 and '!' not in d['text'] and '?' not in d['text']:
        # Transcript paths use backslashes; keep only the file name.
        wav_path = get_wav_path('Debi', d['path'].split('\\')[-1])
        if wav_path:
            debi_prompt = {'path': wav_path, 'text': d['text']}
            break

# Guard against "no usable prompt" the same way the Marlene test does;
# without it the lookups below raise TypeError on None.
if debi_prompt:
    print(f"프롬프트: {debi_prompt['text']}")
    prompt_text = f"You are a helpful assistant.<|endofprompt|>{debi_prompt['text']}"
    test_text = "안녕! 나는 데비야! 오늘 뭐 할래?"

    for i, out in enumerate(cosyvoice.inference_zero_shot(tts_text=test_text, prompt_text=prompt_text, prompt_wav=debi_prompt['path'], stream=False)):
        sf.write(f'/content/test_debi_{i}.wav', out['tts_speech'].squeeze().cpu().numpy(), cosyvoice.sample_rate)

    print("[원본]")
    display(Audio(debi_prompt['path']))
    print("[생성]")
    display(Audio('/content/test_debi_0.wav'))
else:
    print("Debi 프롬프트 없음 - 조건에 맞는 wav 파일을 찾지 못했습니다")

In [None]:
# 11. Zero-shot test (Marlene)
# Prompt = first Marlene line longer than 10 characters, without '!' or '?',
# whose wav file exists on disk.
marlene_prompt = None
for d in marlene_lines:
    line_text = d['text']
    if len(line_text) <= 10 or '!' in line_text or '?' in line_text:
        continue
    # Transcript paths use backslashes; keep only the file name.
    wav_path = get_wav_path('Marlene', d['path'].split('\\')[-1])
    if wav_path:
        marlene_prompt = {'path': wav_path, 'text': line_text}
        break

if marlene_prompt:
    print(f"프롬프트: {marlene_prompt['text']}")
    prompt_text = f"You are a helpful assistant.<|endofprompt|>{marlene_prompt['text']}"
    test_text = "...시끄러워. 조용히 해."

    # Each generated chunk is written to its own numbered wav file.
    marlene_outputs = cosyvoice.inference_zero_shot(
        tts_text=test_text,
        prompt_text=prompt_text,
        prompt_wav=marlene_prompt['path'],
        stream=False,
    )
    for i, out in enumerate(marlene_outputs):
        waveform = out['tts_speech'].squeeze().cpu().numpy()
        sf.write(f'/content/test_marlene_{i}.wav', waveform, cosyvoice.sample_rate)

    print("[원본]")
    display(Audio(marlene_prompt['path']))
    print("[생성]")
    display(Audio('/content/test_marlene_0.wav'))

---
# SFT Fine-tuning
---

In [None]:
# 12. Prepare the training data
# Writes the Kaldi-style wav.scp / text / utt2spk / spk2utt files for every
# transcript entry whose wav file exists on disk.
from pathlib import Path

DATA_DIR = Path('/content/CosyVoice/examples/libritts/cosyvoice2/data/debi_marlene')
DATA_DIR.mkdir(parents=True, exist_ok=True)

wav_scp, text_lines, utt2spk = [], [], []
spk2utt = {'Debi': [], 'Marlene': []}

for i, item in enumerate(data):
    # Transcript paths use backslashes; keep only the file name.
    filename = item['path'].split('\\')[-1]
    wav_path = get_wav_path(item['character'], filename)
    if not wav_path: continue

    # The index comes from the position in `data`, so ids stay unique even
    # when entries are skipped.
    utt_id = f"{item['character']}_{i:04d}"
    wav_scp.append(f'{utt_id} {wav_path}')
    text_lines.append(f"{utt_id} {item['text']}")
    utt2spk.append(f"{utt_id} {item['character']}")
    # setdefault: utt2spk accepts any character name, so spk2utt must too
    # instead of raising KeyError for one outside the two preset keys.
    spk2utt.setdefault(item['character'], []).append(utt_id)

(DATA_DIR / 'wav.scp').write_text('\n'.join(wav_scp), encoding='utf-8')
(DATA_DIR / 'text').write_text('\n'.join(text_lines), encoding='utf-8')
(DATA_DIR / 'utt2spk').write_text('\n'.join(utt2spk), encoding='utf-8')
(DATA_DIR / 'spk2utt').write_text('\n'.join([f"{k} {' '.join(v)}" for k,v in spk2utt.items()]), encoding='utf-8')

print(f"Debi: {len(spk2utt['Debi'])}개, Marlene: {len(spk2utt['Marlene'])}개")

In [None]:
# 13. Extract speaker embeddings
# Runs the extract_embedding.py tool over the debi_marlene data directory with
# the campplus ONNX model from MODEL_DIR.
!python examples/libritts/cosyvoice2/tools/extract_embedding.py \
    --dir examples/libritts/cosyvoice2/data/debi_marlene \
    --onnx_path {MODEL_DIR}/campplus.onnx

In [None]:
# 14. Extract speech tokens
# Runs the extract_speech_token.py tool over the same data directory with the
# speech_tokenizer_v3 ONNX model from MODEL_DIR.
!python examples/libritts/cosyvoice2/tools/extract_speech_token.py \
    --dir examples/libritts/cosyvoice2/data/debi_marlene \
    --onnx_path {MODEL_DIR}/speech_tokenizer_v3.onnx

In [None]:
# 15. Convert to parquet
# Packs the prepared data directory into parquet shards of 100 utterances
# each, using 4 worker processes, then lists the output directory.
!mkdir -p examples/libritts/cosyvoice2/data/debi_marlene/parquet
!python examples/libritts/cosyvoice2/tools/make_parquet_list.py \
    --num_utts_per_parquet 100 \
    --num_processes 4 \
    --src_dir examples/libritts/cosyvoice2/data/debi_marlene \
    --des_dir examples/libritts/cosyvoice2/data/debi_marlene/parquet

!ls examples/libritts/cosyvoice2/data/debi_marlene/parquet/

In [None]:
# 16. 훈련 데이터 리스트 (절대 경로 사용!)
%cd /content/CosyVoice/examples/libritts/cosyvoice2

import glob

# 존재하는 parquet 파일만 절대 경로로
parquet_files = sorted(glob.glob('/content/CosyVoice/examples/libritts/cosyvoice2/data/debi_marlene/parquet/*.tar'))
data_list = '\n'.join(parquet_files)

with open('data/train.data.list', 'w') as f:
    f.write(data_list)
with open('data/dev.data.list', 'w') as f:
    f.write(data_list)

print(f"Parquet 파일 {len(parquet_files)}개:")
!cat data/train.data.list

In [None]:
# 17. Fix vocab_size and the training settings
# Reads the decoder size from the pretrained LLM checkpoint and rewrites
# cosyvoice2.yaml in place: speech_token_size, max_epoch, lr, warmup_steps.
import torch
import re

MODEL_DIR = '/content/CosyVoice/pretrained_models/CosyVoice3-0.5B'
# NOTE(review): weights_only=False unpickles arbitrary objects; only load
# checkpoints from a source you trust.
ckpt = torch.load(f'{MODEL_DIR}/llm.pt', map_location='cpu', weights_only=False)
target_vocab = ckpt['llm_decoder.weight'].shape[0]
print(f"체크포인트 vocab_size: {target_vocab}")

config_path = '/content/CosyVoice/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml'
with open(config_path, 'r') as f:
    content = f.read()

# 1. Fix vocab_size
match = re.search(r'speech_token_size:\s*(\d+)', content)
if match:
    current = int(match.group(1))
    # speech_token_size is set to the decoder row count minus 3.
    # NOTE(review): presumably the model adds 3 extra rows on top of
    # speech_token_size - confirm for this checkpoint.
    offset = 3
    new_size = target_vocab - offset
    if current != new_size:
        content = re.sub(r'speech_token_size:\s*\d+', f'speech_token_size: {new_size}', content)
        print(f"speech_token_size 수정: {current} -> {new_size}")
    else:
        print(f"speech_token_size 이미 올바름: {current}")
else:
    # Do not fail silently: training with a mismatched size breaks later,
    # far away from this cell.
    print("경고: speech_token_size 항목을 찾지 못했습니다 - 설정 파일을 확인하세요")

# 2. max_epoch (200 -> 60, to limit overfitting)
content = re.sub(r'max_epoch:\s*\d+', 'max_epoch: 60', content)
print("max_epoch: 60으로 설정 (15, 30, 45, 60 체크포인트 저장)")

# 3. Learning rate (1e-5 -> 5e-6, for more stable training)
# \b keeps the pattern from rewriting keys that merely end in "lr".
content = re.sub(r'\blr:\s*[\d.e-]+', 'lr: 5e-6', content)
print("lr: 5e-6으로 설정 (더 안정적인 학습)")

# 4. warmup_steps (2500 -> 500, because the dataset is small)
content = re.sub(r'warmup_steps:\s*\d+', 'warmup_steps: 500', content)
print("warmup_steps: 500으로 설정")

with open(config_path, 'w') as f:
    f.write(content)
print("\n설정 파일 저장 완료!")

In [None]:
# 18. Start the automatic backup (copy to Google Drive every 5 minutes, in
# case the session ends)
import os

# Source of a standalone watcher script: every 300 seconds it copies the whole
# experiment directory into Drive, merging into the existing destination
# (dirs_exist_ok=True). Copy failures are printed and the loop continues.
backup_script = '''
import time, shutil, os
src = "/content/CosyVoice/examples/libritts/cosyvoice2/exp/debi_marlene"
dst = "/content/drive/MyDrive/debi_tts/cosyvoice3_finetuned/debi_marlene"
while True:
    time.sleep(300)  # 5분
    if os.path.exists(src):
        try:
            shutil.copytree(src, dst, dirs_exist_ok=True)
            print(f"백업 완료: {dst}")
        except Exception as e:
            print(f"백업 실패: {e}")
'''
with open('/content/backup.py', 'w') as f:
    f.write(backup_script)
# Launched in the background with output going to /content/backup.log.
# Each run of this cell starts one more watcher process.
!nohup python /content/backup.py > /content/backup.log 2>&1 &

# 체크포인트 자동 삭제 (15 에포크 단위로 유지: 15, 30, 45, 60) - Flow 폴더
cleanup_script = '''
import time, glob, os, re

d = "/content/CosyVoice/examples/libritts/cosyvoice2/exp/debi_marlene/flow"
keep_epochs = {15, 30, 45, 60}  # 유지할 에포크

while True:
    time.sleep(60)  # 1분마다 확인
    files = glob.glob(f"{d}/epoch_*_whole.pt")
    if not files:
        continue
    
    for f in files:
        # epoch 번호 추출
        match = re.search(r'epoch_(\d+)_whole\.pt', f)
        if match:
            epoch = int(match.group(1))
            # 유지할 에포크가 아니고, 최신 2개에도 포함되지 않으면 삭제
            sorted_files = sorted(files)
            if epoch not in keep_epochs and f not in sorted_files[-2:]:
                try:
                    os.remove(f)
                    print(f"삭제: epoch_{epoch}")
                except:
                    pass
'''
with open('/content/cleanup.py', 'w') as f:
    f.write(cleanup_script)
!nohup python /content/cleanup.py > /dev/null 2>&1 &

print("자동 백업 시작 (5분마다 Drive 저장)")
print("자동 삭제 시작 (15, 30, 45, 60 에포크 + 최신 2개 유지)")

In [None]:
# 19. Train Flow (fine-tunes voice quality / speaker characteristics)
%cd /content/CosyVoice/examples/libritts/cosyvoice2

MODEL_DIR = '/content/CosyVoice/pretrained_models/CosyVoice3-0.5B'

# Single-process torchrun launch of train.py for the flow model, starting from
# the pretrained flow.pt. Checkpoints go to exp/debi_marlene/flow. The train
# and cv lists are the ones written by the data-list cell.
!PYTHONPATH=/content/CosyVoice:/content/CosyVoice/third_party/Matcha-TTS \
    torchrun --nproc_per_node=1 --master_port=29501 \
    /content/CosyVoice/cosyvoice/bin/train.py \
    --train_engine torch_ddp \
    --config conf/cosyvoice2.yaml \
    --train_data data/train.data.list \
    --cv_data data/dev.data.list \
    --model flow \
    --checkpoint {MODEL_DIR}/flow.pt \
    --model_dir exp/debi_marlene/flow \
    --tensorboard_dir tensorboard/debi_marlene/flow \
    --num_workers 2 \
    --prefetch 50 \
    --pin_memory \
    --use_amp

In [None]:
# 20. 모델 저장 (Google Drive) - 절대 경로 사용
import shutil
import os

src = '/content/CosyVoice/examples/libritts/cosyvoice2/exp/debi_marlene'
dst = '/content/drive/MyDrive/debi_tts/cosyvoice3_finetuned/debi_marlene'

if os.path.exists(src):
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copytree(src, dst, dirs_exist_ok=True)
    print(f"저장 완료: {dst}")
    !ls -la {dst}/llm/
else:
    print(f"소스 폴더 없음: {src}")
    print("자동 백업 확인:")
    !ls -la /content/drive/MyDrive/debi_tts/cosyvoice3_finetuned/ 2>/dev/null || echo "백업 없음"

In [None]:
# 21. 파인튜닝 모델 테스트 (Flow)
%cd /content/CosyVoice
import glob, shutil
import soundfile as sf
from IPython.display import Audio, display
from cosyvoice.cli.cosyvoice import AutoModel

MODEL_DIR = '/content/CosyVoice/pretrained_models/CosyVoice3-0.5B'

# Flow 체크포인트 찾기 (로컬 또는 Drive 백업)
ckpts = sorted(glob.glob('/content/CosyVoice/examples/libritts/cosyvoice2/exp/debi_marlene/flow/epoch_*_whole.pt'))
if not ckpts:
    ckpts = sorted(glob.glob('/content/drive/MyDrive/debi_tts/cosyvoice3_finetuned/debi_marlene/flow/epoch_*_whole.pt'))

if ckpts:
    print(f"Flow 체크포인트: {ckpts[-1]}")
    shutil.copy(ckpts[-1], f'{MODEL_DIR}/flow.pt')
    cosyvoice_ft = AutoModel(model_dir=MODEL_DIR)
    print(f"Speaker: {cosyvoice_ft.list_available_spks()}")
    
    for i, out in enumerate(cosyvoice_ft.inference_sft(tts_text='안녕! 파인튜닝 완료!', spk_id='Debi', stream=False)):
        sf.write(f'/content/test_sft_{i}.wav', out['tts_speech'].squeeze().cpu().numpy(), cosyvoice_ft.sample_rate)
    display(Audio('/content/test_sft_0.wav'))
else:
    print("Flow 체크포인트 없음")