In [None]:
import os
import pathlib
import warnings

import numpy as np
import swalign
from Bio.PDB import *
from Bio.PDB.PDBExceptions import PDBConstructionWarning

from utils import *

In [None]:
# Folder containing the bmDCA parameters used to generate the synthetic MSAs (both without and with phylogeny)
bmDCA_PARAMETERS_DIR = pathlib.Path("data/Synthetic_data/bmDCA_parameters")

# Folders containing plmDCA coupling scores inferred from the synthetic MSAs
plmDCA_EQ_INFERRED_SCORES_DIR = pathlib.Path("data/Synthetic_data/equilibrium/coupling_scores/plmDCA_inferred")
plmDCA_TREE_INFERRED_SCORES_DIR = pathlib.Path("data/Synthetic_data/tree/coupling_scores/plmDCA_inferred")

# Folders containing MSA Transformer coupling scores inferred from the synthetic MSAs
MSA_TR_EQ_INFERRED_SCORES_DIR = pathlib.Path("data/Synthetic_data/equilibrium/coupling_scores/MSA_Transformer_inferred")
MSA_TR_TREE_INFERRED_SCORES_DIR = pathlib.Path("data/Synthetic_data/tree/coupling_scores/MSA_Transformer_inferred")

# Folder to host PDB structures.
# parents=True also creates a missing "data" folder (os.mkdir would raise FileNotFoundError there),
# and exist_ok=True makes re-running this cell a no-op instead of relying on a separate exists() check.
PDB_DIR = pathlib.Path("data/PDB_structures")
PDB_DIR.mkdir(parents=True, exist_ok=True)

# MSA data dictionary

``"pfam_seq"`` is the sequence from the Pfam full MSA corresponding to the PDB structure with ID ``"pdb_id"``. 

In [None]:
# One record per Pfam family: (family, PDB entry ID, chain ID, sequence from the Pfam full MSA).
# The record order is the iteration order of ``msa_data``.
_msa_records = [
    ("PF00004", "4d81", "A",
     "ILLYGPPGCGKTMIAAAVANELDSEFIHVDAASIMSKWLGEAEKNVAKIFKTARELSKPAIIFIDELDALLASY-TSEVGGEARVRNQFLKEMDGLADKISKVYVIGATNKPWRLDEPFL-RRFQKRIYIT-"),
    ("PF00005", "1l7v", "C",
     "--PLSGEVRAGEILHLVGPNGAGKSTLLARMAGMTS-GKGSIQFAGQPLEAWSATKLALHRAYLSQQQTPPFAMPVWHYQHDKTRTELLNDVAGALALDDKLGRSTNQLSGGEWQRVRLAAVVLQAGQLLLLDEPMN"),
    ("PF00041", "3up1", "A",
     "-APFDLSVVYRGANDFVVTFNTSHKKYVKVLMHDVAYRQEKDENKWTHVNLSSTKLTLLQRKLQPAAMYEIKVRSIPDHYKGFWS"),
    ("PF00072", "3ilh", "A",
     "VLLIDDDDIVNFLNTTIIRTHRVEEIQSVTSGNAAINKLNELYPSIICIDINMPGINGWELIDLFKQHFNKSIVCLLSSSLDPRDQAKAEASDVDYYVSKPLTANALN----"),
    ("PF00076", "3nnh", "A",
     "-FVGQVPRTWSEKDLRELFEQYGAVYEINVLRDNPPQSKGCCFVTFYTRKAALEAQNALHNMKV-----"),
    ("PF00096", "4r2a", "A",
     "YACPSCDRRFSRSDELTRHIRIH"),
    ("PF00153", "1okc", "A",
     "--RYFAGNLASGGAAGATSLCFVYPLDFARTRLAADVGKGAQREFTGLGNCITKIFKSDGLRGLQGFNSVQGIIIYRAAYGVYDTAKGMLP---"),
    ("PF00512", "3dge", "A",
     "MKTEFIANISHERTPLTAIKAYAETIYNSELDLSTLKEFLEVIIDQSNHLENLLNELLDFSRLE--"),
    ("PF00595", "1be9", "A",
     "-IVIHR-GSTGLGFNIVGGEDGE---GIFISFILAGGPADLSGLRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTII--"),
    ("PF01535", "4m57", "A",
     "VTYHTLVGGYSSLEMFSEAREVIGYMVQHGL"),
    ("PF02518", "3g7e", "A",
     "-DGTGLHHMVFEVVDNAIDAGHCKEIIVTIH---ADNSVSVQDDGRGIPTGIHPHAGGKFDD-NSYKVSGGLHGVGVSVVNALSQKLELVIQRGETEKTGTMVRFWPSLE-"),
    ("PF07679", "1fhg", "A",
     "PYFTKTILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEGNCSLTISEVCGDDDAKYTCKAVNSLGEATCTAELLV"),
    ("PF00271", "3ex7", "C",
     "-KFDTLCDLY-DTLTITQAVIFCNTKRKVDWTEKMREA-NFTVSSMHGDMPQKERESIMKEFRSGASRVLISTDVWARGLDVPQVSLIINYDLPNNRELYIHRIGRSGRYG"),
    ("PF00397", "4rex", "A",
     "LPAGWEMAKTSS-GQRYFLNHIDQTTTWQDP"),
    ("PF13354", "6qw8", "A",
     "--DNSQILYRADERFAMCSTSKVMAAAAVLKKSESENLLNQRVEIKKSDLVNYNPIAEKHVNGTMSLAESAAALQYSDNVAMNKLIAHVGPASVTAFARQLGDETFRLDRTEPTLNAIPGDPRDTTSPRAMAQTLRNLTLGKALGDSLVTWMKNTTGAASIQAGLPAWVVGDKTGSGYGTTNDIAVIWPDRAPLILV-"),
]

# Family name -> {"pdb_id", "chain_id", "pfam_seq"}
msa_data = {
    family: {"pdb_id": pdb_code, "chain_id": chain_label, "pfam_seq": sequence}
    for family, pdb_code, chain_label, sequence in _msa_records
}

# Align with PDB data

In [None]:
# Scoring scheme of the local aligner: reward for identical residues,
# penalty for differing residues, and penalty applied to gaps
match, mismatch = 2, -2
gap_penalty = -2

# Identity-based scoring matrix, then the local aligner built on top of it
scoring = swalign.IdentityScoringMatrix(match, mismatch)
sw = swalign.LocalAlignment(
    scoring,
    gap_penalty=gap_penalty,
)

In [None]:
# Per-family results, keyed by Pfam family name:
#   idxs_chains    - first value returned by indices_in_ref_and_query (indices on the PDB chain side)
#   idxs_pfam_seqs - second value returned by indices_in_ref_and_query (indices on the Pfam sequence side)
#   dist_mat       - output of calc_min_dist_matrix for the chain, restricted to the chain indices
idxs_chains = {}
idxs_pfam_seqs = {}
dist_mat = {}

# Neither object depends on the family being processed, so build them once
# instead of once per loop iteration
pdbl = PDBList()
pdb_parser = MMCIFParser()

for msa_name, msa_info in msa_data.items():
    print(msa_name)
    pdb_id = msa_info["pdb_id"]
    chain_id = msa_info["chain_id"]
    pfam_seq = msa_info["pfam_seq"]

    # Download and parse structure
    pdbl.retrieve_pdb_file(pdb_id,
                           pdir=PDB_DIR,
                           file_format="mmCif")
    with warnings.catch_warnings():
        # Structure-construction warnings are silenced for the parsing step only
        warnings.simplefilter("ignore", category=PDBConstructionWarning)
        structure = pdb_parser.get_structure(pdb_id, f"{PDB_DIR}/{pdb_id}.cif")
    # First model of the structure, then the chain of interest
    chain = structure[0][chain_id]

    # Convert to one-letter encoding
    pdb_seq = to_one_letter_seq(chain)
    print(f"Ref: {pdb_seq = }")
    print(f"Query: {pfam_seq = }")

    # Align PDB sequence (reference) with PFAM sequence (query)
    alignment = sw.align(pdb_seq, pfam_seq)
    alignment.dump()

    # Store matching indices and PDB distance matrix for matching indices
    idxs_chain, idxs_pfam_seq = indices_in_ref_and_query(alignment)
    idxs_chains[msa_name] = idxs_chain
    idxs_pfam_seqs[msa_name] = idxs_pfam_seq
    dist_mat[msa_name] = calc_min_dist_matrix(chain, idxs_chain)
    print()

# Compute ground truth (bmDCA) coupling scores for the synthetic MSAs

In [None]:
# Family name -> bmDCA score matrix restricted to the sites matched with the PDB chain
bmDCA_scores = {}
for pfam_family in msa_data:
    # Only the second value returned by read_bmDCA_parameters is used; the first is discarded
    _, J2 = read_bmDCA_parameters(bmDCA_PARAMETERS_DIR / f"{pfam_family}.txt")
    idx_subset = np.asarray(idxs_pfam_seqs[pfam_family])  # Restrict to sites matching with the PDB
    # Select the rows, then the columns, of the matched sites
    bmDCA_scores[pfam_family] = zero_sum_gauge_frob_scores(J2)[idx_subset, :][:, idx_subset]  # Use APC (default)

# Read in plmDCA scores inferred from the synthetic MSAs

In [None]:
# Family name -> symmetric plmDCA score matrix restricted to the sites matched with the PDB chain,
# one dictionary per kind of synthetic MSA (equilibrium / tree)
plmDCA_equilibrium_scores = {}
plmDCA_tree_scores = {}
for pfam_family in msa_data:
    length = len(msa_data[pfam_family]["pfam_seq"])
    idx_subset = np.asarray(idxs_pfam_seqs[pfam_family])  # Restrict to sites matching with the PDB
    for dic, path in zip([plmDCA_equilibrium_scores, plmDCA_tree_scores],
                         [plmDCA_EQ_INFERRED_SCORES_DIR, plmDCA_TREE_INFERRED_SCORES_DIR]):
        # Columns 0 and 1 hold the site indices of a pair and column 2 its score.
        # ndmin=2 keeps the table two-dimensional even if the file contains a single row.
        scores = np.loadtxt(path / f"{pfam_family}.txt", ndmin=2)
        scores[:, :2] -= 1  # Convert 1-based indexing (Julia) to 0-based indexing (Python)
        mat = np.zeros((length, length), dtype=np.float64)  # Initialize scores matrix
        mat[tuple(scores[:, :2].astype(int).T)] = scores[:, 2]  # Populate scores matrix
        # Symmetrize into a fresh array: an in-place `mat += mat.T` would write into the
        # very buffer that the transposed view is still reading from
        mat = mat + mat.T
        mat = mat[idx_subset, :][:, idx_subset]
        dic[pfam_family] = mat

# Read in MSA Transformer scores inferred from the synthetic MSAs

These scores were obtained from each synthetic MSA by computing contact probabilities (scores) with MSA Transformer [(Rao et al., 2021)](https://proceedings.mlr.press/v139/rao21a.html) on 10 sub-MSAs, defined as the rows with labels 0-9 in ``data/Synthetic_data/MSA_Transformer_subsample_labels``, and then averaging the resulting 10 score matrices.

In [None]:
# Family name -> MSA Transformer score matrix restricted to the sites matched with the PDB chain,
# one dictionary per kind of synthetic MSA (equilibrium / tree)
MSA_Tr_equilibrium_scores = {}
MSA_Tr_tree_scores = {}
for pfam_family in msa_data:
    # Each pair is (dictionary to fill, folder holding the score files for that kind of MSA)
    for dic, path in (
        (MSA_Tr_equilibrium_scores, MSA_TR_EQ_INFERRED_SCORES_DIR),
        (MSA_Tr_tree_scores, MSA_TR_TREE_INFERRED_SCORES_DIR),
    ):
        scores_file = path / f"{pfam_family}.txt"
        mat = np.loadtxt(scores_file)
        # Keep only the rows, then only the columns, of the sites matching with the PDB
        idx_subset = np.asarray(idxs_pfam_seqs[pfam_family])
        mat = mat[idx_subset, :]
        mat = mat[:, idx_subset]
        dic[pfam_family] = mat

In [None]:
# PF00096 is left out of the analysis because it is too short
too_short_families = {"PF00096"}
msa_names_long = [family for family in msa_data if family not in too_short_families]
print(f"{msa_names_long = }")