# SAEB-Flow — High-Fidelity Astex 85 Benchmark

**GitHub -> Kaggle -> Multi-GPU Parallel -> Auto-packaged results**

### 特點：
1. **雙 GPU 加速**：自動偵測 Kaggle T4 x2 並進行並行計算。
2. **高精度模式**：一鍵開啟 1000 步 + 64 Batch 大採樣。
3. **多種子集成**：支援多個 Random Seed 跑分（ENSEMBLE_SEEDS）。

In [None]:
# ═══════════════════════════════════════════════
# Cell 1: Configuration (set the experiment intensity here)
# ═══════════════════════════════════════════════

# --- Run-mode switches -------------------------------------------------
# Enable high-fidelity mode (1000 steps, batch 64).
HIGH_FIDELITY = True
# Comma-separated random seeds for the ensemble, e.g. "42,43,44".
ENSEMBLE_SEEDS = "42,43"
# Run on both GPUs in parallel when more than one is available.
USE_MULTI_GPU = True

# --- Filesystem layout -------------------------------------------------
WORK = '/kaggle/working'
REPO_DIR = WORK + '/MaxFlow'
SRC_DIR = REPO_DIR + '/src'
# The Kaggle dataset must be attached to the notebook beforehand.
PDB_DIR = '/kaggle/input/astex-diverse'

In [None]:
# ═══════════════════════════════════════════════
# Cell 2: Clean the environment & fetch the code
# ═══════════════════════════════════════════════
import os
import shutil
import subprocess
import sys

# Wipe the outputs of any previous run (and the old clone) so that stale
# files cannot mix with the results of this run.
for old in [f'{WORK}/results', f'{WORK}/plots', f'{WORK}/cache', REPO_DIR]:
    if os.path.exists(old):
        shutil.rmtree(old)
os.makedirs(f'{WORK}/results', exist_ok=True)
os.makedirs(f'{WORK}/plots',   exist_ok=True)

# Shallow clone: only the latest commit is needed to run the benchmark.
GITHUB_REPO = 'https://github.com/DragX0826/MaxFlow.git'
subprocess.run(['git', 'clone', '--depth=1', GITHUB_REPO, REPO_DIR], check=True)

# Record the short commit hash; it is reused later to label the packaged results.
git_hash = subprocess.check_output(['git', '-C', REPO_DIR, 'rev-parse', '--short', 'HEAD']).decode().strip()
print(f'Cloned: {GITHUB_REPO} @{git_hash}')

# Make the cloned sources importable from this notebook.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Run pip through the kernel's own interpreter so the requirements are
# installed into the same environment that later launches the benchmark
# (a bare `pip` on PATH may belong to a different Python installation).
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-r', f'{SRC_DIR}/requirements.txt', '-q'],
    check=True,
)
print('Environment ready.')

In [None]:
# ═══════════════════════════════════════════════
# Cell 3: GPU detection & command preparation
# ═══════════════════════════════════════════════
import torch

gpu_count = torch.cuda.device_count()
print(f'Detected GPUs: {gpu_count}')

# Base invocation of the benchmark script, run with the kernel's interpreter.
cmd = [sys.executable, f'{SRC_DIR}/run_benchmark.py', '--bench_astex']
cmd += ['--pdb_dir', PDB_DIR]
cmd += ['--mode', 'inference']
cmd += ['--output_dir', f'{WORK}/results']

# Optional flags driven by the configuration cell.
if HIGH_FIDELITY:
    cmd += ['--high_fidelity']
    print('>>> Running in HIGH FIDELITY mode (1000 steps, batch 64)')
if ENSEMBLE_SEEDS:
    cmd += ['--seeds', ENSEMBLE_SEEDS]
    print(f'>>> Using seeds: {ENSEMBLE_SEEDS}')

# Choose between sequential and multi-GPU execution.
if not USE_MULTI_GPU or gpu_count <= 1:
    print('>>> Running in sequential mode (one target at a time).')
    # --kaggle forces sequential mode.
    cmd += ['--kaggle']
else:
    print(f'>>> Utilizing {gpu_count} GPUs for parallel processing.')
    # Author's note: in run_benchmark.py, omitting --kaggle while passing
    # num_gpus > 1 automatically enables mp-spawn parallelism.
    cmd += ['--num_gpus', str(gpu_count)]

In [None]:
# ═══════════════════════════════════════════════
# Cell 4: Run the benchmark
# ═══════════════════════════════════════════════
import os

print('Benchmark started... (Logs streaming below)')

# A child Python process block-buffers its stdout when it is attached to a
# pipe, so its log lines would only show up in large delayed chunks.
# PYTHONUNBUFFERED makes the child flush immediately so the logs really
# stream; the rest of the environment is passed through unchanged.
child_env = dict(os.environ, PYTHONUNBUFFERED='1')

# Using Popen as a context manager guarantees the pipe is closed and the
# process is waited for, even if the cell is interrupted mid-run.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # merge stderr so errors appear inline with the logs
    text=True,
    bufsize=1,                 # line-buffered reads on the parent side
    env=child_env,
) as proc:
    for line in proc.stdout:
        print(line.rstrip())
    proc.wait()
print(f'\nFinished with exit code: {proc.returncode}')

In [None]:
# ═══════════════════════════════════════════════
# Cell 5: Packaging & visualisation
# ═══════════════════════════════════════════════
import glob

# Gather plot files from the candidate source directories into WORK/plots.
os.makedirs(f'{WORK}/plots', exist_ok=True)
for src_plot_dir in [f'{SRC_DIR}/plots', 'plots']:
    if not os.path.isdir(src_plot_dir):
        continue
    for f in glob.glob(f'{src_plot_dir}/*'):
        # shutil.copy only handles regular files; skip sub-directories.
        if not os.path.isfile(f):
            continue
        try:
            shutil.copy(f, f'{WORK}/plots/')
        except shutil.SameFileError:
            # The relative 'plots' directory resolves to the destination
            # itself when the working directory is WORK; nothing to copy.
            pass

# Run the repository's packaging script, using WORK as both source and
# output; the label embeds the short git hash of the cloned commit.
pack_cmd = [sys.executable, f'{SRC_DIR}/scripts/pack_results.py', '--src', WORK, '--output', WORK, '--label', f'hi_fi_git{git_hash}']
pack_proc = subprocess.run(pack_cmd, capture_output=True, text=True)
print(pack_proc.stdout)
if pack_proc.returncode != 0:
    # Surface the failure instead of silently discarding stderr.
    print(f'pack_results.py failed (exit code {pack_proc.returncode}):\n{pack_proc.stderr}')

import pandas as pd
import numpy as np
import IPython.display as display

# Summarise the benchmark CSV if the run produced one.
csv_path = f'{WORK}/results/benchmark_results.csv'
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    arr = df['best_rmsd'].values
    # SR@2A: percentage of rows whose best_rmsd is below 2.0.
    print(f'\nSR@2A: {(arr<2.0).mean()*100:.1f}% | Median: {np.median(arr):.2f}A')
    # Show the 20 rows with the lowest best_rmsd.
    display.display(df.sort_values('best_rmsd').head(20))

# Render every collected PNG plot inline, in filename order.
for png in sorted(glob.glob(f'{WORK}/plots/*.png')):
    print(f'\n>>> {os.path.basename(png)}')
    display.display(display.Image(png, width=800))