## Bepipred 3 dataset

- job_name: unique identifier for protein, comes from hash of seq
- seq: amino acid sequence of protein
- test: boolean indicating if seq is part of test set
- epitope_boolmask: boolean array the same length as seq indicating if the AA at that position is an epitope residue
- raw_protein_id: original ID assigned to protein in BP3C50ID set
- RSA: relative solvent accessibility of the protein at each AA, calculated by FreeSASA
- SA: absolute solvent accessibility of the protein at each AA, calculated by FreeSASA

In [22]:
import polars as pl

# Parquet file holding the BP3C50ID dataset, including the RSA and SA columns.
BP3_PARQUET_PATH = "../data/bp3c50id/bp3c50id.rsa.parquet"
bp3 = pl.read_parquet(BP3_PARQUET_PATH)

# Bare last expression: renders the frame as a table (polars elides the middle rows).
bp3

job_name,seq,test,epitope_boolmask,raw_protein_id,RSA,SA
str,str,bool,list[bool],str,list[f64],list[f64]
"""bf2a62534941cf895971e1daa33a46…","""LIQTPSSLLVQTNHTAKMSCEVKSISKLTS…",true,"[false, false, … false]","""3b9k_B""","[0.205823, 0.471213, … 1.001547]","[36.957627, 82.806331, … 152.205138]"
"""d4febd28417e8a4bf6266337c7a2de…","""GNVDLVFLFDGSMSLQPDEFQKILDFMKDV…",true,"[false, false, … false]","""3hi6_A""","[0.840245, 0.294451, … 0.605393]","[68.13546, 42.698276, … 129.669178]"
"""17d233a2b305a3544cf6c164f8ad67…","""DERETWSGKVDFLLSVIGFAVDLANVWRFP…",true,"[false, false, … false]","""4xp9_C""","[1.100846, 1.039373, … 1.03129]","[157.156786, 181.038055, … 185.178502]"
"""34e0c5de18ccd222f24d4bc9d0f0e4…","""KAMHVAQPAVVLASSRGIASFVCEYASPGK…",true,"[true, true, … false]","""5ggv_Y""","[0.731129, 0.872878, … 1.227995]","[149.866882, 94.934212, … 168.493256]"
"""f4c930a3f1b5fb78cef62c5021adc0…","""GSHHHHHHGSGTDITNQLTNVTVGIDSGTT…",true,"[false, false, … false]","""5jq6_A""","[1.462795, 0.796439, … 0.90302]","[118.618012, 94.250586, … 119.379304]"
…,…,…,…,…,…,…
"""2c282aeeb88596bf1f1f99be1bb7f0…","""LDKIDLSYETTESGDTAVSEDSYDKYASQN…",false,"[false, false, … false]","""7jum_A""","[0.474081, 0.908022, … 0.986187]","[85.126053, 129.629226, … 143.007018]"
"""5196520df0000bf1b3fafa8c0e9ecc…","""TDRQLAEEYLYRYGYTRVASLGPALLLLQK…",false,"[false, false, … false]","""5th9_A""","[0.871364, 0.432033, … 0.256134]","[122.513787, 61.677101, … 54.861323]"
"""96836e4358c57e3f571a4f2bb8a8f8…","""LPWLNVSADGDNVHLVLNVSEEQHFGLSLY…",false,"[false, false, … true]","""6hga_B""","[0.93068, 0.110532, … 1.220648]","[167.112849, 15.166135, … 223.341912]"
"""9d838eec0c24655e9902a3ac128a34…","""CSSPPCECHQEEDFRVTCKDIQRIPSLPPS…",false,"[false, false, … false]","""2xwt_C""","[0.479508, 1.059907, … 0.528113]","[63.390948, 125.429402, … 74.252646]"


## Getting structural embeddings for a protein


In [17]:
from pathlib import Path

from mdaf3.AF3OutputParser import AF3Output


# Directory containing one inference output folder per job_name.
INF_DIR = Path("../data/bp3c50id/inference")

# Worked example: the job_name in the first row of the dataset.
sample_job_name = bp3["job_name"][0]
sample_inference_dir = INF_DIR / sample_job_name

af3_output = AF3Output(sample_inference_dir)

# Pull both embedding arrays for this job; only the single embedding is displayed.
af3_single_embed = af3_output.get_single_embeddings()
af3_pairwise_embed = af3_output.get_pair_embeddings()

# NOTE(review): in the displayed output the trailing rows are identical, which
# may be padding up to 256 rows — confirm before using rows beyond len(seq).
af3_single_embed

array([[-316.  , -916.  , -148.  , ...,  114.5 , -175.  ,   56.75],
       [-498.  , -616.  , -120.5 , ...,  198.  , -235.  ,  105.5 ],
       [-508.  , -608.  , -107.  , ...,  288.  , -241.  ,   -6.03],
       ...,
       [-370.  , -528.  , -242.  , ...,  163.  , -460.  , -227.  ],
       [-370.  , -528.  , -242.  , ...,  163.  , -460.  , -227.  ],
       [-370.  , -528.  , -242.  , ...,  163.  , -460.  , -227.  ]],
      shape=(256, 384), dtype=float16)

### The af3_output object can do a lot more:


In [19]:
# Build a universe object for the predicted structure so atoms can be picked
# with a selection string.
# NOTE(review): presumably an MDAnalysis Universe, going by the method name — confirm.
u = af3_output.get_mda_universe()

# Select every alpha carbon (atom name "CA") in the topology.
calphas = u.select_atoms("name CA")

# The tempfactors attribute of the selection is reported here as the pLDDT
# of each alpha carbon.
print(f"Alpha carbon pLDDT: {calphas.tempfactors}")

# Contact probability array for this prediction.
# NOTE(review): shape and indexing are not shown in this notebook — check
# AF3Output before relying on them.
contact_probs = af3_output.get_contact_prob_ndarr()

Alpha carbon pLDDT: [89.05000305 90.90000153 93.55000305 92.66999817 95.01999664 96.
 97.02999878 96.05000305 97.12000275 96.63999939 96.61000061 95.05999756
 93.01999664 93.45999908 92.69000244 93.5        93.80999756 93.55000305
 92.87999725 91.15000153 88.44000244 82.69999695 76.98000336 72.58000183
 68.90000153 72.23999786 69.37999725 71.36000061 77.56999969 84.30999756
 87.66999817 91.04000092 92.34999847 93.26999664 93.         92.87000275
 86.84999847 79.66000366 76.95999908 76.51000214 80.98999786 81.33999634
 84.69999695 89.47000122 90.79000092 90.59999847 90.86000061 90.16999817
 89.73999786 88.19000244 84.12999725 81.91999817 78.51000214 80.23999786
 81.58000183 82.37999725 84.91000366 85.86000061 87.         85.43000031
 80.48000336 74.48000336 74.86000061 60.11000061 59.43999863 62.54999924
 57.09000015 70.15000153 77.37999725 75.16000366 75.26999664 73.08999634
 64.12999725 57.47999954 55.90000153 61.22000122 75.45999908 83.11000061
 84.58000183 87.79000092 90.01999664 90



## Getting LM embeddings for a protein


In [21]:
import os
from pathlib import Path

import torch

# Directory of precomputed ESM encodings, read as "<job_name>.pt" files.
# The cluster path remains the default so existing runs are unchanged; set the
# ESM_ENCODING_DIR environment variable to point the notebook at another copy.
ESM_ENCODING_DIR = Path(
    os.environ.get("ESM_ENCODING_DIR", "/tgen_labs/altin/esm_encodings")
)

# Worked example: the job_name in the first row of the dataset.
sample_job_name = bp3.select("job_name")[0].item()

# torch.load unpickles the file, which can execute arbitrary code for an
# untrusted file. weights_only=True restricts loading to tensors and plain
# containers, which is all this file is expected to hold (the displayed
# result is a single tensor).
esm_2_embed = torch.load(
    ESM_ENCODING_DIR / (sample_job_name + ".pt"),
    weights_only=True,
)

# NOTE(review): in the displayed output the last feature column is 117 for
# every row shown — confirm whether that is an intentionally appended value
# before feeding this tensor to a model.
esm_2_embed

tensor([[-7.9217e-02, -8.2230e-02,  5.8380e-02,  ...,  2.4681e-01,
          9.6495e-02,  1.1700e+02],
        [ 2.7191e-01,  1.3160e-01, -1.2749e-01,  ...,  8.3813e-02,
          2.6999e-02,  1.1700e+02],
        [ 7.5211e-02, -1.2474e-01, -3.1285e-01,  ..., -7.0912e-02,
         -1.3021e-01,  1.1700e+02],
        ...,
        [-9.3008e-02,  1.5062e-01,  3.5336e-01,  ..., -3.2767e-01,
         -1.1053e-01,  1.1700e+02],
        [ 6.3777e-02,  1.2429e-01,  2.3989e-01,  ..., -2.6909e-01,
          8.8695e-02,  1.1700e+02],
        [ 7.8697e-02, -1.0143e-02,  3.3305e-01,  ..., -1.6285e-01,
          1.1192e-01,  1.1700e+02]])