##

# Assess the structural overlap of actual pairs vs translations

In [1]:
import datasets
from nomelt.translate import translate_sequences
import torch
import esm
import os
import copy
from tqdm import tqdm

import Bio.pairwise2 as pairwise2
import biotite.structure.io as bsio

In [2]:
from Bio.PDB import *
import nglview as nv
import ipywidgets



In [3]:
from Bio.Align import substitution_matrices

In [4]:
import datasets
import numpy

In [5]:
mat = substitution_matrices.load('BLOSUM62')

In [6]:
parser = PDBParser()

In [7]:
torch.cuda.is_available()

True

In [8]:
ESMFOLD = esm.pretrained.esmfold_v1().eval()

[2023-08-18 12:57:39,503] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-08-18 12:57:50.086650: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-18 12:57:51.613166: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [9]:
def get_structures(sequences, ids, dir='./tmp/esm', gpu_id=None):
    
    if gpu_id is None:
        device='cuda'
    else:
        device=f'cuda:{gpu_id}'

    if not os.path.exists(dir):
        os.makedirs(dir)


    local_model = copy.deepcopy(ESMFOLD)
    local_model.to(device)

    pdb_outputs = {}
    
    with torch.no_grad():
        for pos in tqdm(range(0, len(sequences), 4)):
            batch_sequences = sequences[pos:pos + 4]
            batch_ids = ids[pos:pos + 4]
            
            outputs = local_model.infer(batch_sequences)
            
            # Convert outputs to pdb and save them immediately for the current batch
            batch_pdb_list = local_model.output_to_pdb(outputs)
            for seq_id, pdb_data in zip(batch_ids, batch_pdb_list):
                pdb_filename = os.path.join(dir, f"{seq_id}.pdb")
                with open(pdb_filename, "w") as f:
                    f.write(pdb_data)
                pdb_outputs[seq_id] = pdb_filename
    del local_model
    return pdb_outputs

## translate a training example

In [10]:
dataset = datasets.load_from_disk('../data/dataset')

In [11]:
train_example = dataset['train'].select([numpy.random.randint(len(dataset['train']))])[0]

In [12]:
meso=train_example['meso_seq']
# thermo=train_example['thermo_seq']

In [None]:
meso

In [20]:
translation = translate_sequences([meso], model_path='../data/nomelt-model/model')

In [21]:
translation[0][0]

array([['MSKAHILVVDDEKPIVDIIKFNLEKEGYKVTASYDGEDALNRIKNENFDMVLLDVMLPKLDGFSVCKKVREFSDVPIIMITAKADEVDKVLGLELGADDYITKPFGIRELIARIRANLRRTAQSAAQDGKVLKAGNLTLNPETFEVKKDGKVIELTVREYELLKFLMSQKGQVFSREELLEKVWDYEYYGDVRTVDVTVRRLREKIEDNPSEPNFILTKRGIGYYFNPNI']],
      dtype='<U230')

### true alignment

In [17]:
alignment = pairwise2.align.globalds(meso, thermo, mat, -2,-1, one_alignment_only=True, penalize_extend_when_opening=False, penalize_end_gaps=False)

In [18]:
print(pairwise2.format_alignment(*alignment[0], full_sequences=True))

MRLLLLED--DI-T-LGEGL-RDYLRSDGYL-VDWCSNLAQAR-AL-IS-E-PYD-AWLLDWNLPDG-SGIDWL-RSLRAKGLR-VPALLLTARDRLSD-RIEGLDSGADDFLVKPFAPEELSARLRAISRRVAG-S--ALRKAFGPVEIDLNAKAAWFEGQGVELTAREWGILEALVLRAGRIVSKADL-EALVL-GFDSELASNST-EVHVFKLRSKL---GKAL--IETVRGLGYRI-----PAA
||.|..||  .. . |.|.| |   |  | | ||  .. ...| .| .. | ||| | ..|..|||  .|.. | |.|||.. | .|.|.||||.|. | ...||..||||.|||||..|||.|||.|..||.|| |  .||  .||.|.|..|.....||..|||||.|...||.||||||...||..| |.|.. .||..  || | ||.|..||.||   |. |  |||.||.|||.     || 
MRVLVVEDEPELRARLVEALGR---R--G-LAVD--AT-GEGREGLFLAREIPYDVA-VVDLGLPD-LDGLE-LIRTLRAED-RPLPVLILTARGRW-DEKVAGLEAGADDYLVKPFHMEELVARLNALVRRAAGWSDPVLR--VGPLEVDTRAQRVRVEGREVELTAFEYRLLEQLVLRAGEVLSKSELTEHLYAQDFDRD--SN-TIEVFVARLRRKLDPEGR-LRPIETLRGQGYRLAWRREPA-
  Score=501



In [23]:
alignment = pairwise2.align.globalds(meso, translation[0][0], mat, -2,-1, one_alignment_only=True, penalize_extend_when_opening=False, penalize_end_gaps=False)

In [24]:
print(pairwise2.format_alignment(*alignment[0], full_sequences=True))

M-R--LLLLED-----DI---TLG-EGLRDY-LR-S-DGYLVDWCS-NLAQARALI-SEPYDAWLLDWNLP--DG-SGIDWLRSLRAKGLR-VPALLLTAR-DRLSDRIEGLDSGADDFLVKPFAPEELSARLRA-ISRRVA------GSALRKAFGPVEIDLNAKAAWFE----GQGV-ELTAREWGILEALVLRAGRIVSKADL-EALVLGFDSELASN-ST-EVHVFKLRSKL----GKA-LIETVRGLGYRI-PAA--
| .  .|...|     ||   .|. ||   | .. | ||  .|  . |    |  | .|..|..|||..||  || | .  ....| . .. ||....||. |.. |...||..||||...|||...||.||.|| . ||.|      |..| || |  ...||...  ||    |. | |||.||...|..|....|...|...| |. |  .|.|.... .| .|.|..||.|.    ... .|.|.||.||.. |    
MSKAHILVVDDEKPIVDIIKFNLEKEG---YKVTASYDG--ED--ALN----R--IKNENFDMVLLDVMLPKLDGFS-V--CKKVR-E-FSDVPIIMITAKADEV-DKVLGLELGADDYITKPFGIRELIARIRANL-RRTAQSAAQDGKVL-KA-G--NLTLNPET--FEVKKDGK-VIELTVREYELLKFLMSQKGQVFSREELLEK-V--WDYEYYGDVRTVDVTVRRLREKIEDNPSEPNFILTKRGIGYYFNP--NI
  Score=351



### structure differences

In [43]:
# predict structures
structures = get_structures([meso, thermo, translation[0][0]], ['meso_training', 'thermo_training', 'trans_training'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.37s/it]


In [44]:
structures

{'meso_training': './tmp/esm/meso_training.pdb',
 'thermo_training': './tmp/esm/thermo_training.pdb',
 'trans_training': './tmp/esm/trans_training.pdb'}

In [45]:
meso_struct = parser.get_structure('meso_training', structures['meso_training'])



In [46]:
nv.show_biopython(meso_struct, gui=True)

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [47]:
thermo_struct = parser.get_structure('thermo_training', structures['thermo_training'])
nv.show_biopython(thermo_struct, gui=True)



NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [48]:
trans_struct = parser.get_structure('trans_training', structures['trans_training'])
nv.show_biopython(trans_struct, gui=True)



NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

The structure is mostly recovered for the training example. 

# Run estimator on the three sequences

In [49]:
from nomelt.thermo_estimation import mAFminDGEstimator, mAFminDGArgs, AlphaFoldArgs

import logging
logger = logging.getLogger('root')
logger.setLevel(logging.INFO)

logger.info("test")

In [None]:
af_args= AlphaFoldArgs(
    data_dir='/mmfs1/gscratch/cheme/usr/evankomp/af_reduced_dbs/',
    db_preset='reduced_dbs',
    use_precomputed_msas=False,
    base_executable_path="/mmfs1/home/evankomp/cheme/repos/alphafold/run_singularity.py",
    model_preset='model_4',
    models_to_relax='none'
)
estimator_args=mAFminDGArgs(
    af_params=af_args,
    wdir='./tmp/af_dg/',
    use_relaxed=False,
    num_replicates=20,
    fix_msas=False
)
estimator = mAFminDGEstimator(args=estimator_args)
sequences = [
    meso,
    thermo,
    translation[0][0]
]
ids = ['meso_ex', "thermo_ex", "trans_ex"]
outs = estimator.run(sequences=sequences, ids=ids)

I0814 09:29:23.740637 23097726142272 run_singularity.py:136] Binding /mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/af_dg -> /mnt/fasta_path_0
I0814 09:29:23.740730 23097726142272 run_singularity.py:136] Binding /mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/af_dg -> /mnt/fasta_path_1
I0814 09:29:23.740778 23097726142272 run_singularity.py:136] Binding /mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/af_dg -> /mnt/fasta_path_2
I0814 09:29:23.740824 23097726142272 run_singularity.py:136] Binding /mmfs1/gscratch/cheme/usr/evankomp/af_reduced_dbs/uniref90 -> /mnt/uniref90_database_path
I0814 09:29:23.740863 23097726142272 run_singularity.py:136] Binding /mmfs1/gscratch/cheme/usr/evankomp/af_reduced_dbs/mgnify -> /mnt/mgnify_database_path
I0814 09:29:23.740900 23097726142272 run_singularity.py:136] Binding /mmfs1/gscratch/cheme/usr/evankomp/af_reduced_dbs -> /mnt/data_dir
I0814 09:29:23.740935 23097726142272 run_singularity.py:136] Binding /mmfs1/gsc