# Qwen3-TTS Epoch 비교 테스트

**테스트 목표:** batch size(2, 8)별로 epoch 수(3, 10)가 파인튜닝 결과에 미치는 영향 비교

| 모델 | batch | epoch |
|------|-------|-------|
| C-3 | 2 | 3 |
| C-10 | 2 | 10 |
| v6-3 | 8 | 3 |
| v6-10 | 8 | 10 |

## 셀 1: 환경 설정 (공통)

In [None]:
# Environment setup shared by every experiment in this notebook.
# Install the SoX command-line tool through apt.
!apt-get install -y sox
# Python packages imported by the data-preparation cell (soundfile, librosa,
# tqdm, huggingface_hub).
!pip install -q soundfile librosa tqdm huggingface_hub
# flash-attn: the comparison cell loads models with
# attn_implementation="flash_attention_2".
!pip install flash-attn --no-build-isolation
# Clone the Qwen3-TTS repository, switch into it, and install it in editable mode.
!git clone https://github.com/QwenLM/Qwen3-TTS.git /content/Qwen3-TTS-repo
%cd /content/Qwen3-TTS-repo
!pip install -e .
print("환경 설정 완료!")

In [None]:
from google.colab import drive

# Mount Google Drive: the dataset is read from it and the training cells write
# their output directories to it.
drive.mount('/content/drive')

# Patch the fine-tuning script in place, replacing log_with="tensorboard" with
# log_with=None (presumably to disable TensorBoard logging -- confirm against
# sft_12hz.py).
sft_path = "/content/Qwen3-TTS-repo/finetuning/sft_12hz.py"
patch_old = 'log_with="tensorboard"'
patch_new = 'log_with=None'

with open(sft_path, 'r', encoding='utf-8') as f:
    code = f.read()

if patch_old in code:
    with open(sft_path, 'w', encoding='utf-8') as f:
        f.write(code.replace(patch_old, patch_new))
    print("패치 완료!")
else:
    # Either the cell was already run once or the upstream script changed.
    # Report it instead of claiming success for a patch that did nothing.
    print(f"경고: 패치 대상 문자열을 찾지 못했습니다 ({patch_old}). "
          "이미 패치되었거나 스크립트가 변경되었을 수 있습니다.")

In [None]:
from huggingface_hub import snapshot_download
import os, json, librosa, soundfile as sf
from tqdm import tqdm

# Download the base model snapshot from the Hugging Face Hub into a local
# directory; the training cells pass that directory as --init_model_path.
model_path = snapshot_download(
    repo_id="Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    local_dir="/content/qwen3_tts_model",
)

# Convert the dataset audio to 24 kHz: every .wav in AUDIO_DIR is loaded with
# sr=24000 and written to OUTPUT_DIR under the same file name.
AUDIO_DIR = "/content/drive/MyDrive/debi_tts_data/audio"
OUTPUT_DIR = "/content/audio_24k"
os.makedirs(OUTPUT_DIR, exist_ok=True)

files = [f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]
failed = []  # (file name, exception) for every file that could not be converted
for f in tqdm(files):
    try:
        audio, _ = librosa.load(os.path.join(AUDIO_DIR, f), sr=24000)
        sf.write(os.path.join(OUTPUT_DIR, f), audio, 24000)
    except Exception as exc:
        # Still best-effort (one bad file must not stop the run), but record
        # the failure. Catching Exception rather than using a bare except also
        # lets Ctrl-C / kernel interrupt stop the loop.
        failed.append((f, exc))

# Report what was skipped so missing samples are not a silent surprise later.
if failed:
    print(f"변환 실패: {len(failed)}/{len(files)}개")
    for name, exc in failed:
        print(f"  - {name}: {exc!r}")

# Build the training JSONL: read the source manifest from Drive, keep only the
# entries whose audio exists in OUTPUT_DIR, point them at the converted copy,
# attach the shared reference audio, and write the result to
# /content/debi_24k.jsonl.
with open("/content/drive/MyDrive/debi_tts_data/debi_finetune.jsonl", 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing empty line) instead of crashing in json.loads.
    data = [json.loads(line) for line in f if line.strip()]

valid_files = set(os.listdir(OUTPUT_DIR))
REF_AUDIO = "/content/audio_24k/Debi_airSupply_2_01.wav"
if not os.path.isfile(REF_AUDIO):
    # Every sample references this file, so flag its absence right away.
    print(f"경고: 참조 오디오 파일이 없습니다: {REF_AUDIO}")

filtered = []
for item in data:
    filename = os.path.basename(item['audio'])
    if filename in valid_files:
        # Use OUTPUT_DIR (the directory that was just listed) rather than a
        # second hard-coded copy of the same path.
        item['audio'] = os.path.join(OUTPUT_DIR, filename)
        item['ref_audio'] = REF_AUDIO
        filtered.append(item)

with open("/content/debi_24k.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in filtered)

print(f"준비 완료: {len(filtered)}개 샘플")

In [None]:
# Tokenization: run the repository's prepare_data.py on the JSONL written by
# the previous cell (/content/debi_24k.jsonl), using the
# Qwen/Qwen3-TTS-Tokenizer-12Hz tokenizer on cuda:0. The output file,
# /content/debi_tokenized.jsonl, is the --train_jsonl input of every training
# cell below.
!python /content/Qwen3-TTS-repo/finetuning/prepare_data.py \
    --device cuda:0 \
    --tokenizer_model_path Qwen/Qwen3-TTS-Tokenizer-12Hz \
    --input_jsonl /content/debi_24k.jsonl \
    --output_jsonl /content/debi_tokenized.jsonl

---
# C-3 (batch 2, epoch 3)

In [None]:
# C-3 run: batch size 2, 3 epochs, lr 1e-6, speaker name "debi".
# The output directory on Drive is removed first, so results from an earlier
# run of this configuration are deleted.
!rm -rf /content/drive/MyDrive/debi_C_epoch3
!python /content/Qwen3-TTS-repo/finetuning/sft_12hz.py \
    --init_model_path /content/qwen3_tts_model \
    --train_jsonl /content/debi_tokenized.jsonl \
    --output_model_path /content/drive/MyDrive/debi_C_epoch3 \
    --batch_size 2 \
    --lr 1e-6 \
    --num_epochs 3 \
    --speaker_name debi

---
# C-10 (batch 2, epoch 10)

In [None]:
# C-10 run: batch size 2, 10 epochs, lr 1e-6, speaker name "debi".
# The output directory on Drive is removed first, so results from an earlier
# run of this configuration are deleted.
!rm -rf /content/drive/MyDrive/debi_C_epoch10
!python /content/Qwen3-TTS-repo/finetuning/sft_12hz.py \
    --init_model_path /content/qwen3_tts_model \
    --train_jsonl /content/debi_tokenized.jsonl \
    --output_model_path /content/drive/MyDrive/debi_C_epoch10 \
    --batch_size 2 \
    --lr 1e-6 \
    --num_epochs 10 \
    --speaker_name debi

---
# v6-3 (batch 8, epoch 3)

In [None]:
# v6-3 run: batch size 8, 3 epochs, lr 1e-6, speaker name "debi".
# The output directory on Drive is removed first, so results from an earlier
# run of this configuration are deleted.
!rm -rf /content/drive/MyDrive/debi_v6_epoch3
!python /content/Qwen3-TTS-repo/finetuning/sft_12hz.py \
    --init_model_path /content/qwen3_tts_model \
    --train_jsonl /content/debi_tokenized.jsonl \
    --output_model_path /content/drive/MyDrive/debi_v6_epoch3 \
    --batch_size 8 \
    --lr 1e-6 \
    --num_epochs 3 \
    --speaker_name debi

---
# v6-10 (batch 8, epoch 10)

In [None]:
# v6-10 run: batch size 8, 10 epochs, lr 1e-6, speaker name "debi".
# The output directory on Drive is removed first, so results from an earlier
# run of this configuration are deleted.
!rm -rf /content/drive/MyDrive/debi_v6_epoch10
!python /content/Qwen3-TTS-repo/finetuning/sft_12hz.py \
    --init_model_path /content/qwen3_tts_model \
    --train_jsonl /content/debi_tokenized.jsonl \
    --output_model_path /content/drive/MyDrive/debi_v6_epoch10 \
    --batch_size 8 \
    --lr 1e-6 \
    --num_epochs 10 \
    --speaker_name debi

---
# 비교 테스트

In [None]:
import torch
from IPython.display import Audio, display
from qwen_tts import Qwen3TTSModel

# Sentences synthesized with every model so the outputs can be compared by ear.
TEST_TEXTS = [
    "뭐야, 왜 이렇게 늦은 거야?",
    "드디어 해냈다! 이겼어!",
    "정말 고마워, 잊지 않을게.",
]

# (display label, checkpoint directory) per training run. Each path points at
# the run's checkpoint-epoch-<num_epochs - 1> directory on Drive.
MODELS = [
    ("C-3 (batch2, epoch3)", "/content/drive/MyDrive/debi_C_epoch3/checkpoint-epoch-2"),
    ("C-10 (batch2, epoch10)", "/content/drive/MyDrive/debi_C_epoch10/checkpoint-epoch-9"),
    ("v6-3 (batch8, epoch3)", "/content/drive/MyDrive/debi_v6_epoch3/checkpoint-epoch-2"),
    ("v6-10 (batch8, epoch10)", "/content/drive/MyDrive/debi_v6_epoch10/checkpoint-epoch-9"),
]

banner = "=" * 50

# Load one checkpoint at a time, play every test sentence, then release the
# model before moving on to the next checkpoint.
for label, checkpoint_dir in MODELS:
    print(banner, label, banner, sep="\n")

    tts = Qwen3TTSModel.from_pretrained(
        checkpoint_dir,
        device_map="cuda:0",
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )

    for number, sentence in enumerate(TEST_TEXTS, start=1):
        print(f"\n[{number}] {sentence}")
        wavs, sr = tts.generate_custom_voice(text=sentence, speaker="debi")
        # Only the first returned waveform is played.
        display(Audio(wavs[0], rate=sr))

    # Drop the model reference and clear the CUDA cache before the next load.
    del tts
    torch.cuda.empty_cache()
    print("\n")

---
# 결과 정리

| 모델 | batch | epoch | 결과 |
|------|-------|-------|------|
| C-3 | 2 | 3 | |
| C-10 | 2 | 10 | |
| v6-3 | 8 | 3 | |
| v6-10 | 8 | 10 | |

**결론 (테스트 후 작성):**
- epoch를 늘리면 캐릭터성이 강해지는가?
- 과적합이 발생하는가?