## making dataset

### fragment.pdb fragment.xtc

In [16]:
def crop_fragments_3chain(md_pdb, md_dcd, out_dir,
                          cropping_length=85,
                          start_center=1,
                          stride=14,
                          n_frags=23,
                          verbose=True):
    """
    Cut a three-chain trajectory into fragments and write one PDB + XTC per fragment.

    For each of ``n_frags`` centers (``start_center``, ``start_center + stride``, ...)
    a window of ``cropping_length`` consecutive resids is taken at three fixed
    resid offsets (one per chain); the union of the three windows, restricted to
    ``protein``, is the fragment.

    Parameters:
      - md_pdb, md_dcd  : topology and trajectory passed to ``mda.Universe``
      - out_dir         : output directory (created if missing)
      - cropping_length : number of resids per chain window
      - start_center    : first center index
      - stride          : step between consecutive centers
      - n_frags         : number of fragments to write
      - verbose         : print one progress line per fragment and a final message

    Output files are named ``<first>-<last>.pdb`` / ``.xtc`` after the chain-1
    resid window. The PDB holds frame 0; the XTC holds every trajectory frame.

    Raises:
      RuntimeError: a fragment selection matches no atoms.
    """
    import os
    import MDAnalysis as mda

    os.makedirs(out_dir, exist_ok=True)
    universe = mda.Universe(md_pdb, md_dcd)

    for frag_idx in range(n_frags):
        center = start_center + frag_idx * stride
        shift = 3 * (center - 1)

        # First resid of the window on chain 1 / chain 2 / chain 3.
        starts = (shift + 17, shift + 1054 + 10, shift + 17 + 2080)
        windows = [(first, first + cropping_length - 1) for first in starts]
        (x1, e1), (x2, e2), (x3, e3) = windows

        resid_terms = " or ".join(f"resid {lo}:{hi}" for lo, hi in windows)
        fragment = universe.select_atoms(f"protein and ({resid_terms})")
        if fragment.n_atoms == 0:
            raise RuntimeError(
                f"[center={center}] selection 為空\n"
                f"x1={x1}-{e1}, x2={x2}-{e2}, x3={x3}-{e3}"
            )

        # Files are named after the chain-1 resid window.
        stem = f"{x1}-{e1}"
        pdb_out = os.path.join(out_dir, stem + ".pdb")
        xtc_out = os.path.join(out_dir, stem + ".xtc")

        if verbose:
            print(f"[center={center}] chain1={x1}-{e1} | atoms={fragment.n_atoms}")

        # PDB: coordinates of the first frame.
        universe.trajectory[0]
        fragment.write(pdb_out)

        # XTC: the fragment's coordinates for every frame of the trajectory.
        with mda.coordinates.XTC.XTCWriter(xtc_out, n_atoms=fragment.n_atoms) as writer:
            for _ in universe.trajectory:
                writer.write(fragment)

    if verbose:
        print("All fragments written (named by chain1 resid range).")


In [None]:
# Topology (PDB) and NPT trajectory (DCD) of run 860_SER_1, plus the directory
# that receives the cropped fragment files.
pdb = '/mnt/hdd/jeff/dataset/output/collagen/SER/raw/860_SER_1/raw/860_SER_1.pdb'
dcd = '/mnt/hdd/jeff/dataset/output/collagen/SER/raw/860_SER_1/npt-out/860_SER_1.dcd'
out_dir = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/860_SER_1'
# Window length, stride and fragment count are left at the function defaults.
crop_fragments_3chain(pdb, dcd, out_dir)

## analysis

### rmsd

In [18]:
import numpy as np
import matplotlib.pyplot as plt

def rmsd_check(pdb_path, xtc_path, out_png,
               align_sel="all",
               stride=1):
    """
    Basic RMSD sanity check for a single trajectory.

    Steps:
      1) load pdb + xtc
      2) superpose the whole trajectory onto frame 0 using ``align_sel``
      3) compute the RMSD of ``align_sel`` to frame 0 for every ``stride``-th frame
      4) plot RMSD vs. frame index and save the figure to ``out_png``

    Parameters:
      - pdb_path, xtc_path : input structure / trajectory
      - out_png            : output image path; a missing parent directory is created
      - align_sel          : atom selection used for both the fit and the RMSD
      - stride             : sample every ``stride``-th frame

    Returns:
      (frames, rmsd): two 1-D numpy arrays of equal length holding the sampled
      frame indices and the RMSD value of each sampled frame.

    Raises:
      ImportError: MDAnalysis is not installed.
      ValueError: ``align_sel`` selects no atoms.
    """
    import os

    try:
        import MDAnalysis as mda
        from MDAnalysis.analysis import align
    except ImportError as e:
        # Keep the original failure as the cause, like the other analysis helpers do.
        raise ImportError("需要 MDAnalysis：pip install MDAnalysis") from e

    u = mda.Universe(pdb_path, xtc_path)

    sel = u.select_atoms(align_sel)
    if sel.n_atoms == 0:
        raise ValueError(f"Selection '{align_sel}' 沒有選到任何原子")

    # Reference coordinates: frame 0, captured before the trajectory is modified.
    u.trajectory[0]
    ref_coords = sel.positions.copy()

    # Fit every frame onto frame 0 (the universe serves as its own reference).
    align.AlignTraj(
        u,
        u,
        select=align_sel,
        in_memory=True
    ).run()

    rmsd = []
    frames = []
    for ts in u.trajectory[::stride]:
        diff = sel.positions - ref_coords
        # Root of the mean squared per-atom displacement.
        rmsd.append(np.sqrt((diff * diff).sum(axis=1).mean()))
        frames.append(ts.frame)

    rmsd = np.asarray(rmsd)
    frames = np.asarray(frames)

    # Make sure the target directory exists so the result is not lost at savefig.
    out_parent = os.path.dirname(out_png)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.plot(frames, rmsd)
        ax.set_xlabel("Frame")
        ax.set_ylabel("RMSD (Å)")
        fig.tight_layout()
        fig.savefig(out_png, dpi=200)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

    print(f"[DONE] RMSD saved to {out_png}")
    return frames, rmsd


In [21]:
# Fragment 773-857 of run 860_SER_1 (structure + trajectory) and the PNG to write.
pdb = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/860_SER_1/773-857.pdb'
xtc = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/860_SER_1/773-857.xtc'
png = '/mnt/hdd/jeff/dataset/output/collagen/SER/analysis/rmsd/860_SER_1_773-857_rmsd.png'
# Defaults: align_sel="all", stride=1. The call returns (frames, rmsd).
rmsd_check(pdb, xtc, png)

[DONE] RMSD saved to /mnt/hdd/jeff/dataset/output/collagen/SER/analysis/rmsd/860_SER_1_773-857_rmsd.png


(array([   0,    1,    2, ..., 4997, 4998, 4999], shape=(5000,)),
 array([1.2078752e-07, 1.4152685e+00, 1.8873442e+00, ..., 5.4410090e+00,
        5.3222466e+00, 5.3258209e+00], shape=(5000,), dtype=float32))

### w2 distance

#### 折線圖

In [22]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

def pca_w2_by_center(dir1, dir2, out_png,
                     align_sel="name CA",
                     pca_n_components=10,
                     pca_use_components=2,
                     stride=1,
                     frame_start=0,
                     frame_end=None,
                     max_raw_frames=1500):
    """
    Compare matching fragments of two directories with a Gaussian 2-Wasserstein distance.

    For every key that exists in both ``dir1`` and ``dir2`` as ``key.pdb`` + ``key.xtc``:
      1) superpose both trajectories onto frame ``frame_start`` of dir1's trajectory
         (one shared reference)
      2) keep only the requested raw-frame window (default: the first 1500 raw frames)
      3) flatten the selected coordinates, fit one PCA on the pooled frames and
         project both sets into that PCA space
      4) compute the Gaussian W2 distance between the two projected sets using the
         first ``pca_use_components`` components
      5) plot W2 (y) against the leading number of the key (x); key "17-101" -> x = 17

    Frame window, in order of precedence:
      (1) ``frame_end`` given          -> [frame_start, frame_end)
      (2) else ``max_raw_frames`` given -> [frame_start, frame_start + max_raw_frames)
      (3) else                          -> up to the last frame

    W2 (Gaussian / Bures):
      W2^2 = ||m1-m2||^2 + Tr(C1 + C2 - 2*(C2^{1/2} C1 C2^{1/2})^{1/2})

    Parameters:
      - dir1, dir2         : directories holding ``key.pdb`` / ``key.xtc`` pairs
      - out_png            : output image path; a missing parent directory is created
      - align_sel          : atom selection used for the fit and as PCA features
      - pca_n_components   : upper bound on the number of fitted components
      - pca_use_components : number of leading components fed into W2
      - stride             : sample every ``stride``-th raw frame of the window
      - frame_start, frame_end, max_raw_frames : frame window, see above

    Returns:
      (xs_common, w2_list): x values (leading key numbers) and the W2 distance per key,
      both ordered by x.

    Raises:
      ImportError: MDAnalysis, scikit-learn or scipy is missing.
      FileNotFoundError: the two directories share no key.
      ValueError: invalid frame window, too few frames, mismatching selections,
        too few samples/features for PCA, or a clearly negative W2^2.
    """
    try:
        import MDAnalysis as mda
        from MDAnalysis.analysis import align
    except ImportError as e:
        raise ImportError("需要先安裝 MDAnalysis：pip install MDAnalysis") from e

    try:
        from sklearn.decomposition import PCA
    except ImportError as e:
        raise ImportError("需要先安裝 scikit-learn：pip install scikit-learn") from e

    try:
        from scipy.linalg import sqrtm
    except ImportError as e:
        raise ImportError("需要先安裝 scipy：pip install scipy") from e

    def _key_x_from_stem(stem):
        """Return the leading integer of a stem ("123" -> 123, "17-101" -> 17), else None."""
        m = re.match(r"^(\d+)(?:-(\d+))?$", stem)
        if not m:
            return None
        return int(m.group(1))

    def _list_keys(d):
        """List stems in ``d`` that have both a .pdb and a .xtc, sorted by leading number."""
        items = []
        for fn in os.listdir(d):
            if not fn.endswith(".pdb"):
                continue
            stem = fn[:-4]
            x = _key_x_from_stem(stem)
            if x is None:
                continue
            if os.path.exists(os.path.join(d, f"{stem}.xtc")):
                items.append((x, stem))
        items.sort(key=lambda t: t[0])
        return [stem for _, stem in items]

    def _resolve_frame_end(frame_start_, frame_end_, max_raw_frames_):
        """Apply the frame-window precedence; None means "until the last frame"."""
        if frame_end_ is not None:
            return frame_end_
        if max_raw_frames_ is not None:
            return frame_start_ + int(max_raw_frames_)
        return None

    def _load_align_coords(pdb, xtc, ref_u, sel,
                           stride_, frame_start_, frame_end_):
        """Fit the frame window of (pdb, xtc) onto ref_u and return (n_frames, 3*n_atoms)."""
        u = mda.Universe(pdb, xtc)
        mob = u.select_atoms(sel)
        ref = ref_u.select_atoms(sel)

        if mob.n_atoms != ref.n_atoms:
            raise ValueError(
                f"Selection 原子數不一致：mobile={mob.n_atoms}, ref={ref.n_atoms}。"
                f"請檢查 align_sel 是否一致，或兩邊拓樸是否相同。"
            )

        # The frame window goes to run(): that is where the analysis API takes
        # start/stop/step. Given to the constructor they are not applied by
        # current MDAnalysis releases, so the whole trajectory would be fitted.
        align.AlignTraj(
            u, ref_u, select=sel,
            in_memory=True
        ).run(start=frame_start_, stop=frame_end_, step=stride_)

        # Read back exactly the frames that were fitted.
        coords = [
            mob.positions.astype(np.float64).reshape(-1)
            for _ in u.trajectory[frame_start_:frame_end_:stride_]
        ]
        return np.asarray(coords, dtype=np.float64)

    def _w2_gaussian(X, Y):
        """Gaussian 2-Wasserstein distance between sample sets X and Y, each (n, d)."""
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.float64)
        if X.ndim != 2 or Y.ndim != 2:
            raise ValueError("X/Y 必須是 2D array: (n_samples, d)")
        if X.shape[1] != Y.shape[1]:
            raise ValueError("X/Y 維度 d 不一致")

        m1 = X.mean(axis=0)
        m2 = Y.mean(axis=0)
        # np.cov returns a 0-d array when d == 1; force a (d, d) matrix so the
        # single-component case works too.
        C1 = np.atleast_2d(np.cov(X, rowvar=False, bias=False))
        C2 = np.atleast_2d(np.cov(Y, rowvar=False, bias=False))

        # Small ridge keeps the matrix square roots well defined.
        eps = 1e-8
        C1 = C1 + eps * np.eye(C1.shape[0])
        C2 = C2 + eps * np.eye(C2.shape[0])

        C2_sqrt = np.real(sqrtm(C2))
        mid = C2_sqrt @ C1 @ C2_sqrt
        mid_sqrt = np.real(sqrtm(mid))

        w2_sq = np.sum((m1 - m2) ** 2) + np.trace(C1 + C2 - 2.0 * mid_sqrt)
        w2_sq = float(np.real(w2_sq))

        # Tiny negative values are round-off; anything larger is a real problem.
        if w2_sq < 0 and w2_sq > -1e-6:
            w2_sq = 0.0
        if w2_sq < 0:
            raise ValueError(f"W2^2 計算出負值：{w2_sq}（數值不穩定或資料異常）")

        return np.sqrt(w2_sq)

    # Keys present in both directories, in dir1's sorted order.
    keys_in_dir2 = set(_list_keys(dir2))
    keys_common = [k for k in _list_keys(dir1) if k in keys_in_dir2]

    if not keys_common:
        raise FileNotFoundError(
            "找不到共同的 keys（兩邊都要有 {key}.pdb 與 {key}.xtc；"
            "key 可以是 123 或 17-101）。"
        )

    # x axis: leading number of each key (e.g. 17-101 -> 17).
    xs_common = [_key_x_from_stem(k) for k in keys_common]

    frame_end_eff = _resolve_frame_end(frame_start, frame_end, max_raw_frames)
    if frame_end_eff is not None and frame_end_eff <= frame_start:
        raise ValueError("frame_end 必須大於 frame_start")

    w2_list = []

    for key in keys_common:
        pdb1 = os.path.join(dir1, f"{key}.pdb")
        xtc1 = os.path.join(dir1, f"{key}.xtc")
        pdb2 = os.path.join(dir2, f"{key}.pdb")
        xtc2 = os.path.join(dir2, f"{key}.xtc")

        # Shared reference: dir1's trajectory positioned at frame_start. It is a
        # separate universe, so fitting the mobile universes never moves it.
        ref_u = mda.Universe(pdb1, xtc1)
        ref_u.trajectory[frame_start]

        X1 = _load_align_coords(pdb1, xtc1, ref_u, align_sel,
                                stride, frame_start, frame_end_eff)
        X2 = _load_align_coords(pdb2, xtc2, ref_u, align_sel,
                                stride, frame_start, frame_end_eff)

        # A covariance needs at least two frames on each side.
        if X1.shape[0] < 2 or X2.shape[0] < 2:
            raise ValueError(
                f"key={key} 的 frame 數不足（dir1={X1.shape[0]}, dir2={X2.shape[0]}），"
                f"請檢查 frame_start / frame_end / max_raw_frames"
            )

        X_all = np.vstack([X1, X2])

        # The number of components is capped by both sample count and dimension.
        n_comp = min(pca_n_components, X_all.shape[0], X_all.shape[1])
        if n_comp < 2:
            raise ValueError(
                f"key={key} 的樣本數/特徵數太小，無法做 PCA（n_comp={n_comp}）。"
            )

        pca = PCA(n_components=n_comp, random_state=0)
        Z_all = pca.fit_transform(X_all)
        Z1 = Z_all[:len(X1)]
        Z2 = Z_all[len(X1):]

        d_use = min(pca_use_components, Z1.shape[1])
        w2 = _w2_gaussian(Z1[:, :d_use], Z2[:, :d_use])
        w2_list.append(w2)

        print(
            f"[OK] key={key}: W2={w2:.6f} – "
            f"PC dims used={d_use}, frames1={len(Z1)}, frames2={len(Z2)}, "
            f"raw_frames=[{frame_start}:{frame_end_eff}], stride={stride}"
        )

    # Make sure the target directory exists so the results are not lost at savefig.
    out_parent = os.path.dirname(out_png)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.plot(xs_common, w2_list, marker="o")
        ax.set_xlabel("chain1 start resid (or center)")
        ax.set_ylabel("W2 distance (Gaussian, PCA space)")
        fig.tight_layout()
        fig.savefig(out_png, dpi=200)
    finally:
        plt.close(fig)

    print(f"[DONE] saved: {out_png}")
    return xs_common, w2_list


In [None]:
# Fragment directories of the two runs to compare (263_SER_1 vs 860_SER_1)
# and the PNG that receives the W2-per-fragment line plot.
dir1 = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/263_SER_1'
dir2 = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/860_SER_1'
out_png = '/mnt/hdd/jeff/dataset/output/collagen/SER/analysis/w2/w2_by_center.png'
# All keyword arguments (selection, PCA sizes, frame window) stay at their defaults.
pca_w2_by_center(dir1, dir2,out_png)

#### pca圖

In [46]:
import numpy as np
import matplotlib.pyplot as plt

def pca_pc1_pc2_ca(a_pdb, a_xtc, b_pdb, b_xtc, out_png,
                   stride=1,
                   frame_start=0,
                   frame_end=None,
                   max_raw_frames=1500,
                   label_a="83",
                   label_b="282"):
    """
    CA-only PCA of two trajectories, drawn as a frame-coloured PC1-PC2 scatter.

    Steps:
      1) superpose a and b onto frame ``frame_start`` of a (CA atoms)
      2) keep only the requested frame window
      3) collect the CA coordinates of every sampled frame
      4) fit a 2-component PCA on the pooled frames of a and b
      5) scatter PC1 vs PC2
         - a: "Reds" colormap
         - b: "Blues" colormap
         - colour = index of the sampled frame inside the window (0, 1, 2, ...)

    Frame window, in order of precedence:
      (1) frame_end != None          -> [frame_start, frame_end)
      (2) elif max_raw_frames != None -> [frame_start, frame_start + max_raw_frames)
      (3) else                        -> up to the last frame

    Parameters:
      - a_pdb, a_xtc, b_pdb, b_xtc : the two structure / trajectory pairs
      - out_png          : output image path; a missing parent directory is created
      - stride           : sample every ``stride``-th raw frame of the window
      - frame_start, frame_end, max_raw_frames : frame window, see above
      - label_a, label_b : legend labels of the two data sets (defaults keep the
                           labels this function has always drawn)

    Raises:
      ImportError: MDAnalysis or scikit-learn is missing.
      ValueError: invalid frame window, no or mismatching CA atoms, or no frames selected.
    """
    import os

    try:
        import MDAnalysis as mda
        from MDAnalysis.analysis import align
    except ImportError as e:
        raise ImportError("需要 MDAnalysis：pip install MDAnalysis") from e

    try:
        from sklearn.decomposition import PCA
    except ImportError as e:
        raise ImportError("需要 scikit-learn：pip install scikit-learn") from e

    # ---------- resolve frame_end ----------
    if frame_end is None and max_raw_frames is not None:
        frame_end = frame_start + int(max_raw_frames)
    if frame_end is not None and frame_end <= frame_start:
        raise ValueError("frame_end 必須大於 frame_start")

    # ---------- load ----------
    ua = mda.Universe(a_pdb, a_xtc)
    ub = mda.Universe(b_pdb, b_xtc)

    sel = "protein and name CA"
    ca_a = ua.select_atoms(sel)
    ca_b = ub.select_atoms(sel)

    if ca_a.n_atoms != ca_b.n_atoms:
        raise ValueError(
            f"CA 原子數不一致：a={ca_a.n_atoms}, b={ca_b.n_atoms}"
        )
    if ca_a.n_atoms == 0:
        raise ValueError(f"Selection '{sel}' 沒有選到任何原子")

    # ---------- reference (a at frame_start) ----------
    # A dedicated universe: in_memory=True replaces the reader of the mobile
    # universe, so the frame pointer of `ua` itself is not a reliable reference
    # and b would otherwise be fitted to whatever frame `ua` ends up on.
    ref = mda.Universe(a_pdb, a_xtc)
    ref.trajectory[frame_start]

    # ---------- align both to the reference (only the selected slice) ----------
    # The window is passed to run(), where the analysis API accepts start/stop/step.
    for mobile in (ua, ub):
        align.AlignTraj(
            mobile, ref, select=sel,
            in_memory=True
        ).run(start=frame_start, stop=frame_end, step=stride)

    # ---------- collect coords ----------
    def _collect(universe, atoms):
        """Return ((n_frames, 3*n_atoms) coordinates, 0-based sample indices)."""
        rows = [
            atoms.positions.astype(np.float64).reshape(-1)
            for _ in universe.trajectory[frame_start:frame_end:stride]
        ]
        return (np.asarray(rows, dtype=np.float64),
                np.arange(len(rows), dtype=np.int32))

    Xa, frame_a = _collect(ua, ca_a)
    Xb, frame_b = _collect(ub, ca_b)

    if Xa.shape[0] == 0 or Xb.shape[0] == 0:
        raise ValueError("選到的 frame 數為 0，請檢查 frame_start / frame_end / max_raw_frames")

    # ---------- PCA (fit on combined) ----------
    Xall = np.vstack([Xa, Xb])
    pca = PCA(n_components=2, random_state=0)
    Zall = pca.fit_transform(Xall)

    Za = Zall[:len(Xa)]
    Zb = Zall[len(Xa):]

    # ---------- plot ----------
    out_parent = os.path.dirname(out_png)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        ax.scatter(
            Za[:, 0], Za[:, 1],
            c=frame_a,
            cmap="Reds",
            s=10,
            alpha=0.7,
            label=label_a
        )
        ax.scatter(
            Zb[:, 0], Zb[:, 1],
            c=frame_b,
            cmap="Blues",
            s=10,
            alpha=0.7,
            label=label_b
        )
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.legend()
        fig.tight_layout()
        fig.savefig(out_png, dpi=200)
    finally:
        plt.close(fig)

    print(f"[DONE] saved {out_png}")


In [49]:
# Same fragment (227-311) taken from the two runs: a = 263_SER_1, b = 860_SER_1.
a_pdb = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/263_SER_1/227-311.pdb'
a_xtc = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/263_SER_1/227-311.xtc'
b_pdb = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/860_SER_1/227-311.pdb'
b_xtc = '/mnt/hdd/jeff/dataset/output/collagen/SER/fragment/860_SER_1/227-311.xtc'
out_png = '/mnt/hdd/jeff/dataset/output/collagen/SER/analysis/pca/227-311.png'
# Frame window and stride are left at the function defaults.
pca_pc1_pc2_ca(a_pdb, a_xtc, b_pdb, b_xtc, out_png)

[DONE] saved /mnt/hdd/jeff/dataset/output/collagen/SER/analysis/pca/227-311.png


### dccm

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def dccm_ca_from_traj(pdb, dcd, max_raw_frames, out_png,
                      stride=1,
                      frame_start=0,
                      align_sel="protein and name CA",
                      vmin=-1.0, vmax=1.0):
    """
    Compute a DCCM (dynamic cross-correlation matrix) for the selected atoms and plot it.

    Steps:
      1) superpose the trajectory onto its own frame ``frame_start``
      2) use only ``max_raw_frames`` raw frames from ``frame_start`` on (sampled with ``stride``)
      3) compute C_ij = <dr_i . dr_j> / sqrt(<|dr_i|^2> <|dr_j|^2>), where dr is the
         displacement from the mean position over the sampled frames
      4) save the matrix as a heatmap to ``out_png``

    Parameters:
      - pdb, dcd       : structure / trajectory
      - max_raw_frames : number of raw frames to use (e.g. 1500)
      - out_png        : output image path; a missing parent directory is created
      - stride         : sampling step (default 1)
      - frame_start    : first frame and reference frame (default 0)
      - align_sel      : selection used for the fit and for the DCCM (default: CA atoms)
      - vmin, vmax     : colour scale limits of the heatmap

    Returns:
      dccm: (N, N) float64 array, N = number of selected atoms. Entries involving
      an atom with zero fluctuation are set to 0.

    Raises:
      ImportError: MDAnalysis is not installed.
      ValueError: ``max_raw_frames`` <= 0, empty selection, or fewer than 2 sampled frames.
    """
    import os

    try:
        import MDAnalysis as mda
        from MDAnalysis.analysis import align
    except ImportError as e:
        raise ImportError("需要 MDAnalysis：pip install MDAnalysis") from e

    # ---------- frame range ----------
    frame_end = frame_start + int(max_raw_frames)
    if frame_end <= frame_start:
        raise ValueError("max_raw_frames 必須 > 0")

    # ---------- load ----------
    u = mda.Universe(pdb, dcd)
    sel_atoms = u.select_atoms(align_sel)
    n_atoms = sel_atoms.n_atoms
    if n_atoms == 0:
        raise ValueError("selection 沒抓到原子，請檢查 align_sel")

    # ---------- reference (frame_start) ----------
    # A dedicated universe: in_memory=True replaces the reader of the mobile
    # universe, so the frame pointer of `u` itself is not a reliable reference.
    ref = mda.Universe(pdb, dcd)
    ref.trajectory[frame_start]

    # ---------- align (only selected slice) ----------
    # The window is passed to run(), where the analysis API accepts start/stop/step.
    align.AlignTraj(
        u, ref, select=align_sel,
        in_memory=True
    ).run(start=frame_start, stop=frame_end, step=stride)

    # ---------- collect positions ----------
    coords = np.asarray([
        sel_atoms.positions.astype(np.float64)
        for _ in u.trajectory[frame_start:frame_end:stride]
    ])   # (T, N, 3)

    if coords.shape[0] < 2:
        raise ValueError("frame 數不足，無法計算 DCCM")

    # ---------- fluctuations ----------
    delta = coords - coords.mean(axis=0)    # (T, N, 3)
    n_frames = delta.shape[0]

    # ---------- DCCM ----------
    # <dr_i . dr_j> for all pairs in one matrix product instead of a Python loop
    # over N^2 pairs: stack the x/y/z components of every frame as rows.
    flat = delta.transpose(0, 2, 1).reshape(-1, n_atoms)   # (T*3, N)
    cross = flat.T @ flat / n_frames                        # (N, N)

    # <|dr_i|^2> is the diagonal of the cross term.
    var = np.diag(cross)
    denom = np.sqrt(np.outer(var, var))

    # Pairs involving a motionless atom have no defined correlation; report 0.
    dccm = np.divide(cross, denom, out=np.zeros_like(cross), where=denom > 0)

    # ---------- plot ----------
    out_parent = os.path.dirname(out_png)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        im = ax.imshow(dccm, cmap="coolwarm", vmin=vmin, vmax=vmax, origin="lower")
        fig.colorbar(im, ax=ax, label="DCCM")
        ax.set_xlabel("CA index")
        ax.set_ylabel("CA index")
        fig.tight_layout()
        fig.savefig(out_png, dpi=200)
    finally:
        plt.close(fig)

    print(f"[DONE] DCCM saved: {out_png}")
    return dccm


In [None]:
# Raw topology (PDB) and NPT trajectory (DCD) of run 263_SER_1, and the PNG
# that receives the DCCM heatmap.
pdb = '/mnt/hdd/jeff/dataset/output/collagen/SER/raw/263_SER_1/raw/263_SER_1.pdb'
dcd = '/mnt/hdd/jeff/dataset/output/collagen/SER/raw/263_SER_1/npt-out/263_SER_1.dcd'
out_png = '/mnt/hdd/jeff/dataset/output/collagen/SER/analysis/dccm/263_SER_1.png'
# 1500 is max_raw_frames; stride, frame_start and align_sel stay at their defaults.
dccm_ca_from_traj(pdb, dcd, 1500, out_png)