# Qwen3-TTS 파인튜닝 - Marlene Voice

**테스트 목표:** v6 vs C 설정 비교 + 제로샷 비교

| 모델 | attn | lr | batch |
|------|------|-----|-------|
| v6 | flash | 1e-6 | 8 |
| C | flash | 1e-6 | 2 |
| 제로샷 | - | - | - |

**사전 준비:** Google Drive(`MyDrive`)에 `marlene_tts_data` 폴더 업로드 — 폴더 안에 `.wav` 파일이 들어 있는 `audio/` 폴더와 `marlene_finetune.jsonl` 파일이 있어야 합니다.

## 셀 1: 환경 설정

In [None]:
# Environment setup: install system/Python dependencies, then install
# Qwen3-TTS from a fresh clone of the upstream repository.
# sox is installed as a system package via apt.
!apt-get install -y sox
!pip install -q soundfile librosa tqdm huggingface_hub
# flash-attn is installed with pip's build isolation disabled.
!pip install flash-attn --no-build-isolation
# Clone the repository, switch into it, and install it in editable mode.
# NOTE(review): re-running this cell will likely fail at `git clone` because
# the target directory already exists — confirm before re-running.
!git clone https://github.com/QwenLM/Qwen3-TTS.git /content/Qwen3-TTS-repo
%cd /content/Qwen3-TTS-repo
!pip install -e .
print("환경 설정 완료!")

## 셀 2: Google Drive 마운트

In [None]:
from google.colab import drive

# Mount Google Drive so that later cells can read the dataset from it and
# write the fine-tuned checkpoints back to it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
print("드라이브 마운트 완료!")

## 셀 3: 학습 코드 패치

In [None]:
# Patch the upstream SFT script in place: replace the TensorBoard tracker
# argument with `log_with=None`. Nothing else in the script is modified.
sft_path = "/content/Qwen3-TTS-repo/finetuning/sft_12hz.py"

OLD_LOGGER_ARG = 'log_with="tensorboard"'
NEW_LOGGER_ARG = 'log_with=None'

# Read and write explicitly as UTF-8 so the patch does not depend on the
# runtime's default locale encoding.
with open(sft_path, 'r', encoding='utf-8') as f:
    code = f.read()

if OLD_LOGGER_ARG in code:
    code = code.replace(OLD_LOGGER_ARG, NEW_LOGGER_ARG)
    with open(sft_path, 'w', encoding='utf-8') as f:
        f.write(code)
    print("패치 완료! (flash_attention_2 유지)")
elif NEW_LOGGER_ARG in code:
    # The cell was run before; the file already contains the replacement.
    print("이미 패치되어 있습니다. (flash_attention_2 유지)")
else:
    # Neither form is present, so the script no longer matches what this
    # patch expects. Report it instead of claiming success.
    print(f"경고: {sft_path} 에서 {OLD_LOGGER_ARG} 문자열을 찾지 못했습니다. 패치가 적용되지 않았습니다.")

## 셀 4: 모델 다운로드

In [None]:
from huggingface_hub import snapshot_download

# Download the base checkpoint from the Hugging Face Hub into a local
# directory; later cells refer to this directory by its literal path.
BASE_MODEL_REPO = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
BASE_MODEL_DIR = "/content/qwen3_tts_model"

model_path = snapshot_download(BASE_MODEL_REPO, local_dir=BASE_MODEL_DIR)
print(f"모델 준비 완료: {model_path}")

## 셀 5: 마를렌 오디오 변환 + JSONL 생성

In [None]:
import os, json, librosa, soundfile as sf
from tqdm import tqdm

# Convert every Marlene .wav clip on Drive to 24 kHz and store the result in
# a local directory under the same file name.
AUDIO_DIR = "/content/drive/MyDrive/marlene_tts_data/audio"
OUTPUT_DIR = "/content/audio_24k_marlene"
TARGET_SR = 24000
os.makedirs(OUTPUT_DIR, exist_ok=True)

files = [f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]

# Conversion stays best-effort (one bad clip must not abort the whole run),
# but failures are collected and reported instead of being silently dropped.
# Catching Exception rather than using a bare except lets KeyboardInterrupt
# stop the loop.
failed = []
for f in tqdm(files):
    try:
        audio, _ = librosa.load(os.path.join(AUDIO_DIR, f), sr=TARGET_SR)
        sf.write(os.path.join(OUTPUT_DIR, f), audio, TARGET_SR)
    except Exception as err:
        failed.append((f, err))

print(f"오디오 변환 완료: {len(os.listdir(OUTPUT_DIR))}개")
if failed:
    print(f"변환 실패: {len(failed)}개")
    for name, err in failed:
        print(f"  - {name}: {err!r}")

In [None]:
# Rewrite the source JSONL so that every kept sample points at the locally
# converted 24 kHz audio and carries the shared reference clip.
SRC_JSONL = "/content/drive/MyDrive/marlene_tts_data/marlene_finetune.jsonl"
DST_JSONL = "/content/marlene_24k.jsonl"

with open(SRC_JSONL, 'r', encoding='utf-8') as f:
    # Skip blank lines: json.loads raises on an empty/whitespace-only string.
    data = [json.loads(line) for line in f if line.strip()]

valid_files = set(os.listdir(OUTPUT_DIR))
REF_AUDIO = "/content/audio_24k_marlene/Marlene_airSupply_1_01.wav"

# Every output sample references REF_AUDIO, so a missing file is reported
# here rather than surfacing later in another cell.
if not os.path.isfile(REF_AUDIO):
    print(f"경고: 참조 오디오를 찾을 수 없습니다: {REF_AUDIO}")

# Keep only samples whose audio file exists in the converted directory.
filtered = []
for item in data:
    filename = item['audio']
    if filename in valid_files:
        filtered.append({
            "audio": f"/content/audio_24k_marlene/{filename}",
            "text": item['text'],
            "ref_audio": REF_AUDIO
        })

with open(DST_JSONL, 'w', encoding='utf-8') as f:
    for item in filtered:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"JSONL 준비 완료: {len(filtered)}개 샘플")

# Make dropped samples visible so a file-name mismatch is noticed early.
skipped = len(data) - len(filtered)
if skipped:
    print(f"제외된 샘플 (변환된 오디오 없음): {skipped}개")

## 셀 6: 데이터 토큰화

In [None]:
# Run the repository's prepare_data.py on cuda:0 with the
# Qwen/Qwen3-TTS-Tokenizer-12Hz tokenizer, reading /content/marlene_24k.jsonl
# and writing /content/marlene_tokenized.jsonl (the file the training cells
# pass as --train_jsonl).
# NOTE(review): presumably this adds audio codec tokens to each sample —
# confirm against prepare_data.py.
!python /content/Qwen3-TTS-repo/finetuning/prepare_data.py \
    --device cuda:0 \
    --tokenizer_model_path Qwen/Qwen3-TTS-Tokenizer-12Hz \
    --input_jsonl /content/marlene_24k.jsonl \
    --output_jsonl /content/marlene_tokenized.jsonl

---
# v6 설정 (batch 8)

In [None]:
# Train the "v6" configuration: batch size 8, learning rate 1e-6, 5 epochs,
# speaker name "marlene".
# WARNING: the rm -rf below deletes any existing v6 output directory on Drive
# before training starts.
!rm -rf /content/drive/MyDrive/marlene_model_v6
# Fine-tune from the local base checkpoint on the tokenized JSONL; the output
# directory is on Google Drive.
!python /content/Qwen3-TTS-repo/finetuning/sft_12hz.py \
    --init_model_path /content/qwen3_tts_model \
    --train_jsonl /content/marlene_tokenized.jsonl \
    --output_model_path /content/drive/MyDrive/marlene_model_v6 \
    --batch_size 8 \
    --lr 1e-6 \
    --num_epochs 5 \
    --speaker_name marlene

---
# C 설정 (batch 2)

In [None]:
# Train the "C" configuration: batch size 2, learning rate 1e-6, 5 epochs,
# speaker name "marlene". Only the batch size and the output directory differ
# from the v6 run.
# WARNING: the rm -rf below deletes any existing C output directory on Drive
# before training starts.
!rm -rf /content/drive/MyDrive/marlene_model_C
# Fine-tune from the local base checkpoint on the tokenized JSONL; the output
# directory is on Google Drive.
!python /content/Qwen3-TTS-repo/finetuning/sft_12hz.py \
    --init_model_path /content/qwen3_tts_model \
    --train_jsonl /content/marlene_tokenized.jsonl \
    --output_model_path /content/drive/MyDrive/marlene_model_C \
    --batch_size 2 \
    --lr 1e-6 \
    --num_epochs 5 \
    --speaker_name marlene

---
# 비교 테스트: v6 vs C vs 제로샷

In [None]:
import torch
from IPython.display import Audio, display
from qwen_tts import Qwen3TTSModel

# Sentences synthesized by every model in the comparison; the later
# comparison cells reuse this list.
TEST_TEXTS = [
    "뭐야, 왜 이렇게 늦은 거야?",
    "드디어 해냈다! 이겼어!",
    "정말 고마워, 잊지 않을게.",
]

# ---------- v6 model (batch 8) ----------
print("=" * 50, "v6 모델 (batch 8)", "=" * 50, sep="\n")

# Load the v6 fine-tuned checkpoint from Drive onto the GPU in bfloat16.
V6_CHECKPOINT = "/content/drive/MyDrive/marlene_model_v6/checkpoint-epoch-4"
tts_v6 = Qwen3TTSModel.from_pretrained(
    V6_CHECKPOINT,
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16,
    device_map="cuda:0",
)

# Synthesize each test sentence with the "marlene" speaker and show an
# inline audio player for the first returned waveform.
for idx, text in enumerate(TEST_TEXTS, start=1):
    print(f"\n[v6-{idx}] {text}")
    wavs, sr = tts_v6.generate_custom_voice(speaker="marlene", text=text)
    first_wav = wavs[0]
    display(Audio(first_wav, rate=sr))

# Drop the model reference and clear the CUDA cache before the next cell
# loads another model.
del tts_v6
torch.cuda.empty_cache()

In [None]:
# ---------- C model (batch 2) ----------
print("=" * 50, "C 모델 (batch 2)", "=" * 50, sep="\n")

# Load the C fine-tuned checkpoint from Drive onto the GPU in bfloat16.
C_CHECKPOINT = "/content/drive/MyDrive/marlene_model_C/checkpoint-epoch-4"
tts_c = Qwen3TTSModel.from_pretrained(
    C_CHECKPOINT,
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16,
    device_map="cuda:0",
)

# Synthesize each test sentence with the "marlene" speaker and show an
# inline audio player for the first returned waveform.
for idx, text in enumerate(TEST_TEXTS, start=1):
    print(f"\n[C-{idx}] {text}")
    wavs, sr = tts_c.generate_custom_voice(speaker="marlene", text=text)
    first_wav = wavs[0]
    display(Audio(first_wav, rate=sr))

# Drop the model reference and clear the CUDA cache before the next cell
# loads another model.
del tts_c
torch.cuda.empty_cache()

In [None]:
# ---------- Base model voice clone (zero-shot) ----------
banner = "=" * 50
print(banner)
print("Base 모델 Voice Clone (제로샷)")
print(banner)

# Load the base checkpoint (not fine-tuned) onto the GPU in bfloat16.
base_model = Qwen3TTSModel.from_pretrained(
    "/content/qwen3_tts_model",
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16,
    device_map="cuda:0",
)

# Build one clone prompt from the reference clip and its reference text, then
# reuse it for every test sentence.
ref_audio = "/content/audio_24k_marlene/Marlene_airSupply_1_01.wav"
ref_text = "필요한 게 있었으면 좋겠는데"
prompt = base_model.create_voice_clone_prompt(ref_text=ref_text, ref_audio=ref_audio)

for idx, text in enumerate(TEST_TEXTS, start=1):
    print(f"\n[제로샷-{idx}] {text}")
    wavs, sr = base_model.generate_voice_clone(voice_clone_prompt=prompt, text=text)
    first_wav = wavs[0]
    display(Audio(first_wav, rate=sr))

print("\n비교 완료!")

---
# 결과 정리

| 모델 | 결과 |
|------|------|
| v6 (batch 8) | |
| C (batch 2) | |
| 제로샷 | |