# LoRA Rank Ablation Study

对 `diagnosis_generator`（Qwen3-1.7B）做 LoRA rank 消融实验：
- 对比 rank = 8, 16, 32, 64, 128
- alpha = rank × 2（保持比值 2.0 不变）
- 其余超参固定（lr=1e-4, batch=2, grad_acc=8, epochs=2）

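评估指标：
1. Training loss 曲线
2. JSON 格式正确率
3. 字段完整性（results/recommendations/recomm_short）
4. 概率合理性（3 个疾病概率之和 ≈ 1.0）
5. 推理速度
6. 训练显存占用

（说明：本 notebook 的 Part B 只汇总训练阶段的硬指标，即训练 loss、训练显存峰值、训练耗时/吞吐和 adapter 大小；第 2–5 项属于推理阶段指标，不在本 notebook 的 Part B 中计算。）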

**注意**: 每个 rank 训练完后需要 **重启 Runtime** 释放显存，再训练下一个。
建议按 rank 从小到大依次训练。

In [None]:
# 0. Setup
# Clone the project repository on the first run (or pull the latest commit on
# later runs), change into it, install the training dependencies, and make a
# Hugging Face token available to the session.
import os
repo_dir = '/content/Intel_Health'
if not os.path.exists(repo_dir):
    !git clone https://github.com/DemonRain7/Intel_Health.git {repo_dir}
else:
    !git -C {repo_dir} pull
%cd {repo_dir}

# Colab ships with torch preinstalled; do not reinstall it, otherwise a
# circular-import error occurs.
!pip -q install "transformers>=4.46" datasets peft accelerate bitsandbytes sentencepiece loguru

# Prefer the HF_TOKEN stored in Colab Secrets; on any failure fall back to the
# interactive huggingface_hub login prompt.
# NOTE(review): `Exception` already covers `ImportError`, so the tuple below is
# redundant and the handler catches every ordinary exception raised in the try.
try:
    from google.colab import userdata
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
    print("HF_TOKEN loaded from Colab Secrets")
except (ImportError, Exception):
    from huggingface_hub import login
    login()

In [None]:
# 1. Mount Google Drive
# Mounts Drive, defines the dataset / adapter / merged-model locations used by
# the rest of the notebook, and prints a per-file line count of the SFT data.
from google.colab import drive
drive.mount('/content/drive')

DRIVE_ROOT = '/content/drive/MyDrive/Code_Project/IntelHealth'
SFT_DATA_DIR = f'{DRIVE_ROOT}/datasets/agent_sft/diagnosis_generator'
ABLATION_OUTPUT_ROOT = f'{DRIVE_ROOT}/models/adapters/ablation'
ABLATION_MERGED_ROOT = f'{DRIVE_ROOT}/models/merged/ablation'

os.makedirs(ABLATION_OUTPUT_ROOT, exist_ok=True)
os.makedirs(ABLATION_MERGED_ROOT, exist_ok=True)

# Sanity-check the training data: list every JSONL file with its line count.
data_files = [f for f in os.listdir(SFT_DATA_DIR) if f.endswith('.jsonl')]
print(f'SFT data dir: {SFT_DATA_DIR}')
print(f'Data files: {data_files}')
if not data_files:
    # An empty directory would otherwise pass silently and only fail later,
    # inside the training subprocess.
    print(f'[WARN] No .jsonl files found in {SFT_DATA_DIR}')
for data_file in data_files:
    path = os.path.join(SFT_DATA_DIR, data_file)
    # Decode explicitly as UTF-8 (the same encoding the JSON readers in this
    # notebook use) so the count does not depend on the locale default.
    with open(path, encoding='utf-8') as fp:
        count = sum(1 for _ in fp)
    print(f'  {data_file}: {count} samples')

## Part A: Training（每个 rank 跑一次，跑完重启 Runtime）

**修改下面的 `CURRENT_RANK` 后运行此 cell 和下一个 cell。**

训练顺序建议：8 → 16 → 32 → 64 → 128（每个跑完后重启 Runtime 再跑下一个）

In [None]:
# 2. Select the rank to train in this session
# ============================
# Train exactly one rank per session: after training, restart the Runtime,
# change the value below, and run again.
CURRENT_RANK = 64  # <-- edit here: 8, 16, 32, 64, 128
# ============================

MODEL_NAME = 'Qwen/Qwen3-1.7B'
CURRENT_ALPHA = CURRENT_RANK * 2  # keeps alpha / rank fixed at 2
OUTPUT_DIR = f'{ABLATION_OUTPUT_ROOT}/rank{CURRENT_RANK}'

# Warn (without aborting) when an earlier run already left files in the
# output directory; otherwise make sure the directory exists.
_existing = os.listdir(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else []
if _existing:
    print(f'WARNING: {OUTPUT_DIR} already exists and is not empty!')
    print('Contents:', _existing)
    print('If you want to retrain, delete the directory first.')
else:
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the effective configuration as an aligned two-column listing.
print('\n=== Ablation Config ===')
for _label, _value in (
    ('Rank:', CURRENT_RANK),
    ('Alpha:', CURRENT_ALPHA),
    ('Alpha/Rank:', CURRENT_ALPHA / CURRENT_RANK),
    ('Model:', MODEL_NAME),
    ('Data:', SFT_DATA_DIR),
    ('Output:', OUTPUT_DIR),
):
    print(f'{_label:<12}{_value}')

In [None]:
# 3. Train
import subprocess, shlex, sys, time as _time, os

# Command line for the fine-tuning script. Only --lora_rank / --lora_alpha
# change between ablation runs; every other hyper-parameter is pinned here.
cmd = [
    'python', 'training/supervised_finetuning.py',
    '--model_name_or_path',          MODEL_NAME,
    '--tokenizer_name_or_path',      MODEL_NAME,
    '--train_file_dir',              SFT_DATA_DIR,
    '--output_dir',                  OUTPUT_DIR,
    '--template_name',               'qwen',
    '--do_train',
    '--fp16',
    '--gradient_checkpointing',
    '--per_device_train_batch_size', '2',
    '--gradient_accumulation_steps',  '8',
    '--num_train_epochs',            '2',
    '--learning_rate',               '1e-4',
    '--lora_rank',                   str(CURRENT_RANK),
    '--lora_alpha',                  str(CURRENT_ALPHA),
    '--lora_dropout',                '0.05',
    '--model_max_length',            '256',
    '--logging_steps',               '5',
    '--save_strategy',               'epoch',
]

print(f'Training rank={CURRENT_RANK}, alpha={CURRENT_ALPHA}...')
print(f'Command: {" ".join(shlex.quote(x) for x in cmd)}')
print('=' * 60)

# Show actual train JSONL files to avoid accidental data contamination
train_jsonl_files = sorted([f for f in os.listdir(SFT_DATA_DIR) if f.endswith('.jsonl')])
print(f'Training JSONL files ({len(train_jsonl_files)}): {train_jsonl_files}')
if any('dapt' in f.lower() for f in train_jsonl_files):
    print('[WARN] Found dapt JSONL file(s); this may contaminate rank ablation training')

# Run training in a child process and stream its merged stdout/stderr line by
# line. Using Popen as a context manager guarantees the stdout pipe is closed
# and the child is waited on even if streaming the output raises.
_t0 = _time.time()
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
) as proc:
    for line in proc.stdout:
        print(line, end='', flush=True)
    ret = proc.wait()
# Wall-clock duration of the child process; read again by the GPU-stats cell.
_elapsed = _time.time() - _t0

if ret != 0:
    raise RuntimeError(f'Training failed with exit code {ret}')
print(f'\nTraining rank={CURRENT_RANK} complete in {_elapsed/60:.1f} min!')
print(f'Adapter saved to: {OUTPUT_DIR}')

# Report whether a gpu_stats.json file is present in the output directory.
child_gpu_stats_path = f'{OUTPUT_DIR}/gpu_stats.json'
if os.path.exists(child_gpu_stats_path):
    print(f'Child GPU stats found: {child_gpu_stats_path}')
else:
    print(f'[WARN] Child GPU stats not found: {child_gpu_stats_path}')

In [None]:
# 4. Save GPU stats after training (read from child process)
import json, os

# Base record for this run; the hardware fields are merged in below when the
# gpu_stats.json file exists in the output directory.
gpu_stats = {
    'rank': CURRENT_RANK,
    'alpha': CURRENT_ALPHA,
    'training_time_min': round(_elapsed / 60, 1),
}
child_stats_path = f'{OUTPUT_DIR}/gpu_stats.json'

if not os.path.exists(child_stats_path):
    print(f'[WARN] Child GPU stats missing: {child_stats_path}')
    gpu_stats['stats_source'] = 'fallback_no_child_stats'
else:
    with open(child_stats_path) as f:
        child_stats = json.load(f)
    # Copy over only the fields the report cells read; absent keys become None.
    for _key in ('gpu_name', 'total_gb', 'peak_allocated_gb', 'peak_reserved_gb'):
        gpu_stats[_key] = child_stats.get(_key)
    gpu_stats['stats_source'] = 'training/supervised_finetuning.py'

# Persist the merged record next to the adapter and echo the same JSON text.
stats_path = f'{OUTPUT_DIR}/ablation_gpu_stats.json'
_stats_text = json.dumps(gpu_stats, indent=2)
with open(stats_path, 'w') as f:
    f.write(_stats_text)
print(f'GPU stats saved to {stats_path}')
print(_stats_text)

print('\n>>> Training done. Restart runtime, set CURRENT_RANK, and run next rank.')
print('>>> After all ranks are trained, go to Part B for merge + evaluation.')

---

## Part B: Training Metrics Tradeoff (Run After All Ranks Are Trained)

This section only uses hard training metrics:
- Training loss (first / final / best)
- Peak training VRAM
- Training runtime and throughput
- Adapter size

No inference-format metrics (such as JSON valid rate or results>=3) are used in this section.


In [None]:
# 5. Discover available rank runs
import os

# Paths are re-declared here so Part B can run in a fresh runtime without
# re-executing the Part A cells.
DRIVE_ROOT = '/content/drive/MyDrive/Code_Project/IntelHealth'
ABLATION_OUTPUT_ROOT = f'{DRIVE_ROOT}/models/adapters/ablation'
RANKS = [8, 16, 32, 64, 128]

# Artifacts probed in every rank directory.
_ARTIFACTS = (
    'trainer_state.json',
    'train_results.json',
    'adapter_config.json',
    'adapter_model.safetensors',
    'ablation_gpu_stats.json',
    'gpu_stats.json',
)

trained_ranks = []
print('Checking rank directories...')
for rank in RANKS:
    rank_dir = f'{ABLATION_OUTPUT_ROOT}/rank{rank}'
    if os.path.isdir(rank_dir):
        _present = {name: os.path.exists(f'{rank_dir}/{name}') for name in _ARTIFACTS}
        has_state = _present['trainer_state.json']
        has_train_results = _present['train_results.json']
        has_adapter = _present['adapter_config.json'] and _present['adapter_model.safetensors']
        has_gpu_stats = _present['ablation_gpu_stats.json'] or _present['gpu_stats.json']

        # GPU stats are reported but are not required for a rank to be READY.
        ready = has_state and has_train_results and has_adapter
        status = 'INCOMPLETE'
        if ready:
            status = 'READY'
            trained_ranks.append(rank)
        print(
            f'  rank={rank:>3}: {status} '
            f'(state={has_state}, train_results={has_train_results}, adapter={has_adapter}, gpu_stats={has_gpu_stats})'
        )
    else:
        print(f'  rank={rank:>3}: NOT FOUND ({rank_dir})')

print(f'\nReady ranks: {trained_ranks}')
if len(trained_ranks) == 0:
    raise RuntimeError('No READY ranks found. Run Part A training first.')

In [None]:
# 6. Load per-rank training metrics
import json
import pandas as pd

def _read_json(path):
    """Return the object decoded from the UTF-8 JSON file at *path*."""
    with open(path, mode='r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload

def _loss_stats_from_state(state):
    """Summarise the ``'loss'`` values logged in ``state['log_history']``.

    Only entries that contain a ``'loss'`` key are considered.

    Returns:
        A ``(first, final, best)`` tuple, where *best* is the minimum logged
        loss, or ``(None, None, None)`` when no entry carries a loss.
    """
    history = state.get('log_history', [])
    logged = [entry['loss'] for entry in history if 'loss' in entry]
    if logged:
        return logged[0], logged[-1], min(logged)
    return None, None, None

def _round_or_none(value, digits):
    """Round *value* to *digits* decimal places, passing ``None`` through."""
    return None if value is None else round(value, digits)


# Build one table row per READY rank from the files in its output directory.
rows = []
for rank in trained_ranks:
    rank_dir = f'{ABLATION_OUTPUT_ROOT}/rank{rank}'

    state = _read_json(f'{rank_dir}/trainer_state.json')
    train_results = _read_json(f'{rank_dir}/train_results.json')
    adapter_cfg = _read_json(f'{rank_dir}/adapter_config.json')

    # Prefer ablation_gpu_stats.json, then gpu_stats.json; use an empty dict
    # when neither file exists.
    gpu_stats = {}
    for _candidate in ('ablation_gpu_stats.json', 'gpu_stats.json'):
        gpu_stats_path = f'{rank_dir}/{_candidate}'
        if os.path.exists(gpu_stats_path):
            gpu_stats = _read_json(gpu_stats_path)
            break

    first_loss, final_loss, best_loss = _loss_stats_from_state(state)
    if first_loss is None or final_loss is None:
        loss_drop = None
    else:
        loss_drop = first_loss - final_loss

    adapter_path = f'{rank_dir}/adapter_model.safetensors'
    if os.path.exists(adapter_path):
        adapter_size_mb = round(os.path.getsize(adapter_path) / (1024**2), 1)
    else:
        adapter_size_mb = None

    # train_results reports seconds; the GPU-stats fallback is already minutes.
    if 'train_runtime' in train_results:
        runtime_min = train_results['train_runtime'] / 60.0
    else:
        runtime_min = gpu_stats.get('training_time_min')

    rows.append({
        'Rank': rank,
        'Alpha': int(adapter_cfg.get('lora_alpha', rank * 2)),
        'First Loss': _round_or_none(first_loss, 4),
        'Final Loss': _round_or_none(final_loss, 4),
        'Best Loss': _round_or_none(best_loss, 4),
        'Loss Drop': _round_or_none(loss_drop, 4),
        'Train Runtime (min)': _round_or_none(runtime_min, 2),
        'Train Samples/s': round(train_results.get('train_samples_per_second', 0), 3),
        'Train Steps/s': round(train_results.get('train_steps_per_second', 0), 3),
        'Peak VRAM (GB)': gpu_stats.get('peak_allocated_gb'),
        'Peak Reserved VRAM (GB)': gpu_stats.get('peak_reserved_gb'),
        'Adapter Size (MB)': adapter_size_mb,
        'Global Steps': state.get('global_step'),
    })

metrics_df = pd.DataFrame(rows)
metrics_df = metrics_df.sort_values('Rank').reset_index(drop=True)

print('=== Training Metrics By Rank ===')
display(metrics_df)

In [None]:
# 7. Compute balance score + Pareto front (loss vs VRAM)
import numpy as np

# Fail fast: the normalisation and ranking below need at least one rank row.
if metrics_df.empty:
    raise RuntimeError('metrics_df is empty')

def _minmax(series):
    """Linearly rescale *series* so its minimum maps to 0 and its maximum to 1.

    The input is cast to float first. When the value range is smaller than
    1e-12 (effectively constant), a series of zeros with the same index is
    returned instead of dividing by that range.
    """
    values = series.astype(float)
    lowest = values.min()
    span = values.max() - lowest
    if span < 1e-12:
        return pd.Series([0.0] * len(values), index=values.index)
    return (values - lowest) / span

# Put both objectives on a common [0, 1] scale so they can be mixed in a
# single score (lower is better for both).
metrics_df['Norm Final Loss'] = _minmax(metrics_df['Final Loss'])
metrics_df['Norm Peak VRAM'] = _minmax(metrics_df['Peak VRAM (GB)'])

# You can adjust these weights based on interview story
W_LOSS = 0.5
W_VRAM = 0.5
metrics_df['Balance Score'] = (
    W_LOSS * metrics_df['Norm Final Loss'] +
    W_VRAM * metrics_df['Norm Peak VRAM']
)

# Pareto front over (Final Loss, Peak VRAM): a rank is dominated when another
# rank is no worse on both objectives and strictly better on at least one.
# A rank with a missing objective (e.g. no GPU stats file was found for it)
# cannot be compared: every comparison against NaN is False, so it would
# never be dominated and would be flagged Pareto optimal by accident. Such
# rows are therefore excluded from the front explicitly.
_objectives = metrics_df[['Final Loss', 'Peak VRAM (GB)']].astype(float)
_complete = _objectives.notna().all(axis=1).to_numpy()
_points = _objectives.to_numpy()

pareto = []
for i, point in enumerate(_points):
    if not _complete[i]:
        pareto.append(False)
        continue
    dominated = any(
        _complete[j] and (other <= point).all() and (other < point).any()
        for j, other in enumerate(_points)
        if j != i
    )
    pareto.append(not dominated)

metrics_df['Pareto Optimal'] = pareto

# Pareto-optimal ranks first, then the lowest balance score, then the
# smallest rank as the final tie-breaker.
ranked_df = metrics_df.sort_values(
    by=['Pareto Optimal', 'Balance Score', 'Rank'],
    ascending=[False, True, True]
).reset_index(drop=True)
recommended = ranked_df.iloc[0]

show_cols = [
    'Rank', 'Alpha', 'Final Loss', 'Best Loss', 'Loss Drop',
    'Peak VRAM (GB)', 'Train Runtime (min)', 'Train Samples/s',
    'Adapter Size (MB)', 'Pareto Optimal', 'Balance Score'
]

print('=== Ranked by Training Tradeoff ===')
display(ranked_df[show_cols])
print(
    f"Recommended rank by training tradeoff: rank={int(recommended['Rank'])} "
    f"(FinalLoss={recommended['Final Loss']}, PeakVRAM={recommended['Peak VRAM (GB)']}GB, "
    f"Score={recommended['Balance Score']:.4f})"
)

In [None]:
# 8. Visualize hard training metrics
import matplotlib.pyplot as plt
import numpy as np

plot_df = metrics_df.sort_values('Rank').reset_index(drop=True)
x = np.arange(len(plot_df))
labels = [str(r) for r in plot_df['Rank']]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))


def _annotated_bars(ax, column, color, title, ylabel, offset):
    """Draw one bar per rank for *column* and print each value above its bar."""
    values = plot_df[column]
    bars = ax.bar(labels, values, color=color)
    ax.set_title(title)
    ax.set_xlabel('LoRA Rank')
    ax.set_ylabel(ylabel)
    for bar, value in zip(bars, values):
        ax.text(
            bar.get_x() + bar.get_width()/2,
            bar.get_height() + offset,
            f'{value:.2f}',
            ha='center',
            fontsize=9,
        )


# (1) Final vs Best loss
loss_ax = axes[0, 0]
for _column, _marker in (('Final Loss', 'o'), ('Best Loss', 's')):
    loss_ax.plot(labels, plot_df[_column], marker=_marker, label=_column)
loss_ax.set_title('Loss vs Rank')
loss_ax.set_xlabel('LoRA Rank')
loss_ax.set_ylabel('Loss')
loss_ax.grid(True, alpha=0.3)
loss_ax.legend()

# (2) Peak VRAM
_annotated_bars(
    axes[0, 1], 'Peak VRAM (GB)', '#3f8efc',
    'Peak Training VRAM vs Rank', 'GB', 0.03,
)

# (3) Runtime
_annotated_bars(
    axes[1, 0], 'Train Runtime (min)', '#f39c12',
    'Training Runtime vs Rank', 'Minutes', 0.05,
)

# (4) Pareto scatter: Loss vs VRAM, with the Pareto-optimal ranks drawn larger
pareto_ax = axes[1, 1]
pareto_ax.scatter(plot_df['Peak VRAM (GB)'], plot_df['Final Loss'], s=80, color='#808080', label='Rank')
pareto_df = plot_df[plot_df['Pareto Optimal']]
pareto_ax.scatter(pareto_df['Peak VRAM (GB)'], pareto_df['Final Loss'], s=140, color='#e74c3c', label='Pareto optimal')
for _vram, _loss, _rank in zip(plot_df['Peak VRAM (GB)'], plot_df['Final Loss'], plot_df['Rank']):
    pareto_ax.text(_vram + 0.02, _loss + 0.002, f"r{int(_rank)}", fontsize=9)
pareto_ax.set_title('Loss-VRAM Pareto View')
pareto_ax.set_xlabel('Peak VRAM (GB)')
pareto_ax.set_ylabel('Final Loss')
pareto_ax.grid(True, alpha=0.3)
pareto_ax.legend()

plt.tight_layout()
plot_path = 'ablation_rank_training_tradeoff.png'
fig.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Plot saved to: {plot_path}')

In [None]:
# 9. Auto-generate interview talking points
# Three best rows of the ranked table, trimmed to the headline columns.
top3 = ranked_df.head(3)[['Rank', 'Final Loss', 'Peak VRAM (GB)', 'Train Runtime (min)', 'Balance Score']]

# Assemble the summary as a list of lines and emit it in one go.
_compared_ranks = sorted(metrics_df['Rank'].tolist())
_rec_rank = int(recommended['Rank'])
_talking_points = [
    '=== Interview Talking Points (Training Metrics Only) ===',
    f"1) We compared LoRA rank {_compared_ranks} under fixed training setup.",
    (
        f"2) Recommended rank={_rec_rank} by balancing final loss and peak VRAM "
        f"(weights: loss={W_LOSS}, vram={W_VRAM})."
    ),
    (
        f"3) Recommended rank metrics: final_loss={recommended['Final Loss']}, "
        f"peak_vram={recommended['Peak VRAM (GB)']}GB, runtime={recommended['Train Runtime (min)']}min."
    ),
    '4) We used only hard training metrics for this decision (no format-dependent inference metric).',
    '',
    'Top-3 ranks by Balance Score:',
]
print('\n'.join(_talking_points))
display(top3)

In [None]:
# 10. Concise table for slides/interview
# Columns kept for the slide-sized summary, in display order.
_INTERVIEW_COLUMNS = [
    'Rank', 'Alpha', 'Final Loss', 'Best Loss', 'Peak VRAM (GB)',
    'Train Runtime (min)', 'Train Samples/s', 'Adapter Size (MB)',
    'Pareto Optimal', 'Balance Score',
]
interview_df = ranked_df.loc[:, _INTERVIEW_COLUMNS].copy()

display(interview_df)

In [None]:
# 11. Optional: rank pick policy notes
# Policy bullets, printed one per line under a single heading.
_POLICY_NOTES = (
    'Keep training setup fixed across ranks.',
    'Compare Final Loss and Peak VRAM as primary axes.',
    'Use Pareto front + weighted balance score to pick final rank.',
    'Keep inference-format metrics out of this decision report.',
)
print('Rank selection policy used in this notebook:')
for _note in _POLICY_NOTES:
    print(f'- {_note}')

In [None]:
# 12. Save hard-metrics report to Google Drive
import json, shutil

report_dir = f'{DRIVE_ROOT}/docs/ablation'
os.makedirs(report_dir, exist_ok=True)

# JSON key -> ranked_df column for the recommended rank's headline metrics.
_METRIC_COLUMNS = {
    'final_loss': 'Final Loss',
    'best_loss': 'Best Loss',
    'peak_vram_gb': 'Peak VRAM (GB)',
    'train_runtime_min': 'Train Runtime (min)',
    'balance_score': 'Balance Score',
}

report_payload = {
    'metric_scope': 'training_only',
    'weights': {'loss': W_LOSS, 'vram': W_VRAM},
    'recommended_rank': int(recommended['Rank']),
    'recommended_metrics': {
        key: float(recommended[column]) for key, column in _METRIC_COLUMNS.items()
    },
    'ranks': ranked_df.to_dict(orient='records'),
}

# --- JSON report ---
json_path = f'{report_dir}/ablation_train_tradeoff.json'
_json_text = json.dumps(report_payload, indent=2, ensure_ascii=False)
with open(json_path, 'w', encoding='utf-8') as f:
    f.write(_json_text)
print(f'JSON saved to: {json_path}')


def _markdown_row(r):
    """Format one ranked_df row as a line of the Markdown metrics table."""
    return (
        f"| {int(r['Rank'])} | {int(r['Alpha'])} | {r['Final Loss']:.4f} | {r['Best Loss']:.4f} | "
        f"{r['Peak VRAM (GB)']:.2f} | {r['Train Runtime (min)']:.2f} | {r['Train Samples/s']:.3f} | "
        f"{r['Balance Score']:.4f} | {'Y' if r['Pareto Optimal'] else 'N'} |"
    )


# --- Markdown report: fixed header followed by one table row per rank ---
md_lines = [
    '# LoRA Rank Training Tradeoff Report',
    '',
    '- Metric scope: training-only (no inference-format metrics).',
    f'- Recommended rank: **{int(recommended["Rank"])}**',
    f'- Balance weights: loss={W_LOSS}, vram={W_VRAM}',
    '',
    '## Core Metrics by Rank',
    '',
    '| Rank | Alpha | Final Loss | Best Loss | Peak VRAM(GB) | Runtime(min) | Samples/s | Balance Score | Pareto |',
    '|---:|---:|---:|---:|---:|---:|---:|---:|:---:|',
]
md_lines.extend(_markdown_row(r) for _, r in ranked_df.iterrows())

md_path = f'{report_dir}/ablation_train_tradeoff.md'
with open(md_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(md_lines))
print(f'Markdown saved to: {md_path}')

# --- Plot: copy it next to the reports if the plotting cell produced one ---
_local_plot = 'ablation_rank_training_tradeoff.png'
if os.path.exists(_local_plot):
    out_plot = f'{report_dir}/{_local_plot}'
    shutil.copy(_local_plot, out_plot)
    print(f'Plot copied to: {out_plot}')

print('\nDone. Training tradeoff report is ready for interview use.')