## Find a test example with several thermophilic targets and visualize the superimposed structures

In [1]:
import esm
import torch
import os
import datasets
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from Bio.PDB import *
import nglview as nv
import ipywidgets

# Shared Biopython PDB parser; used further down to load the superimposed structure files.
parser  = PDBParser()



### First find the example

In [2]:
# Load the test split and keep only rows whose cluster status is
# 'extreme' or 'unique'.
ds = datasets.load_from_disk('../data/dataset')['test'].filter(
    lambda row: row['status_in_cluster'] in ['extreme', 'unique']
)

Filter:   0%|          | 0/159970 [00:00<?, ? examples/s]

In [3]:
# Model predictions: a tab-separated file without a header row.
predictions = pd.read_csv(
    '../data/nomelt-model/predictions.tsv',
    header=None,
    sep='\t',
)

In [4]:
# Remove every whitespace character from each sequence string (the file stores
# sequences with separators between residues), then name the columns:
#   'm' - mesophilic input, matched against the dataset's 'meso_seq'
#   'g' - generated sequence, used below as the "translated" sequence
#   't' - presumably the thermophilic target; not used in this notebook (TODO confirm)
#
# `DataFrame.applymap` is deprecated since pandas 2.1. Mapping each column with
# `Series.map` is the same element-wise operation and runs on old and new pandas.
predictions = predictions.apply(
    lambda column: column.map(lambda s: ''.join(s.split()))
)
predictions.columns = ['m', 'g', 't']

In [5]:
# Keep only dataset rows whose mesophilic sequence has a model prediction.
# The lookup set is built once: `x in predictions['m'].values` rescans the whole
# array for every dataset row, which is quadratic over tens of thousands of rows.
predicted_meso_seqs = set(predictions['m'])
ds = ds.filter(lambda x: x['meso_seq'] in predicted_meso_seqs)

Filter:   0%|          | 0/34537 [00:00<?, ? examples/s]

In [6]:
ds

Dataset({
    features: ['meso_seq', 'thermo_seq', 'taxid', 'query_align_cov', 'subject_align_cov', 'bit_score', 'scaled_local_symmetric_percent_id', 'meso_seq_len', 'thermo_seq_len', 'seq_len_diff', 'meso_temp', 'thermo_temp', 'index', 'cluster', 'status_in_cluster'],
    num_rows: 1064
})

In [7]:
# Number of dataset rows per mesophilic sequence, most frequent first.
# The top-level `pd.value_counts` is deprecated (pandas 2.1+); the
# `Series.value_counts` method returns the same counts.
meso_replica_counts = pd.Series(ds['meso_seq']).value_counts()

In [8]:
# The mesophilic sequence that appears in the most rows
# (on a tie, the first maximum is taken).
mesoseq = meso_replica_counts.idxmax()

In [9]:
# Narrow the dataset to the rows paired with the chosen mesophilic sequence.
ds = ds.filter(lambda row: mesoseq == row['meso_seq'])

Filter:   0%|          | 0/1064 [00:00<?, ? examples/s]

In [10]:
ds

Dataset({
    features: ['meso_seq', 'thermo_seq', 'taxid', 'query_align_cov', 'subject_align_cov', 'bit_score', 'scaled_local_symmetric_percent_id', 'meso_seq_len', 'thermo_seq_len', 'seq_len_diff', 'meso_temp', 'thermo_temp', 'index', 'cluster', 'status_in_cluster'],
    num_rows: 3
})

In [11]:
# Thermophilic sequences from the rows that remain for the selected mesophilic sequence.
thermo_seqs = ds['thermo_seq']

In [12]:
# Generated ("translated") sequence for the chosen mesophilic input; when
# several prediction rows match, the first one is used.
translated_seq = predictions.loc[predictions['m'] == mesoseq, 'g'].iloc[0]

In [13]:
translated_seq

'MSKAHILVVDDEKPIVDIIKFNLEKEGYKVTASYDGEDALNRIKNENFDMVLLDVMLPKLDGFSVCKKVREFSDVPIIMITAKADEVDKVLGLELGADDYITKPFGIRELIARIRANLRRTAQSAAQDGKVLKAGNLTLNPETFEVKKDGKVIELTVREYELLKFLMSQKGQVFSREELLEKVWDYEYYGDVRTVDVTVRRLREKIEDNPSEPNFILTKRGIGYYFNPNI'

### Get structure predictions

In [14]:
# Instantiate the pretrained ESMFold v1 model and switch it to eval (inference) mode.
ESMFOLD = esm.pretrained.esmfold_v1().eval()

[2023-09-22 14:07:02,849] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-09-22 14:07:21.622733: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-22 14:07:23.041900: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [15]:
# Move the model onto the GPU; this cell requires a CUDA device.
ESMFOLD = ESMFOLD.cuda()

In [17]:
def esm_one_struc(name, sequence, out_dir='./tmp/esmfold_predicts/'):
    """Predict a structure with ESMFold and cache it on disk as a PDB file.

    The cache is keyed by ``name`` only: if ``<out_dir>/<name>.pdb`` already
    exists it is reused as-is, even when ``sequence`` differs from the one that
    produced it. Delete the file to force a fresh prediction.

    Parameters
    ----------
    name : str
        Base name of the output file (without the ``.pdb`` extension).
    sequence : str
        Sequence passed to ``ESMFOLD.infer_pdb``.
    out_dir : str, optional
        Directory holding the PDB files; created if it does not exist.

    Returns
    -------
    str
        Path of the PDB file for ``name``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)
    pdb_path = os.path.join(out_dir, f"{name}.pdb")

    if not os.path.exists(pdb_path):
        with torch.no_grad():
            output = ESMFOLD.infer_pdb(sequence)

        # Write to a sibling file and rename it into place, so an interrupted
        # write never leaves a truncated .pdb that later runs would treat as
        # a valid cached prediction.
        partial_path = pdb_path + ".part"
        with open(partial_path, "w") as f:
            f.write(output)
        os.replace(partial_path, pdb_path)
    return pdb_path

In [18]:
# Every sequence to fold, keyed by label. Insertion order (thermophilic targets
# first, then 'meso', then 'trans') is significant: later cells pair sequences,
# files and superimposed outputs by position.
structs_to_run = dict(
    [(f'thermo_{i}', t) for i, t in enumerate(thermo_seqs)]
    + [('meso', mesoseq), ('trans', translated_seq)]
)

In [19]:
# Fold each sequence (or reuse its cached PDB) and record the file path per label.
files = {
    label: esm_one_struc(label, seq)
    for label, seq in structs_to_run.items()
}

In [20]:
files

{'thermo_0': './tmp/esmfold_predicts/thermo_0.pdb',
 'thermo_1': './tmp/esmfold_predicts/thermo_1.pdb',
 'thermo_2': './tmp/esmfold_predicts/thermo_2.pdb',
 'meso': './tmp/esmfold_predicts/meso.pdb',
 'trans': './tmp/esmfold_predicts/trans.pdb'}

### View structures

In [21]:
from nomelt.thermo_estimation.optimizer import OptTrajSuperimposer

In [22]:
# Sequences and structure files are passed in the same label order;
# `values` is simply a running index, one per structure.
imposer = OptTrajSuperimposer(
    list(structs_to_run.values()),
    list(files.values()),
    values=range(len(files)),
)

In [23]:
dir(imposer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_make_movie',
 '_parse_universes',
 '_run_vmd_script',
 '_save_trajectory',
 '_superimpose_one',
 '_vmd_script_single',
 'make_optimization_movie',
 'output_dir',
 'output_files',
 'ref_struct',
 'run',
 'sequences',
 'structure_files',
 'temp_dir',
 'universes',
 'values']

In [26]:
# Run the superimposition. The result is indexed by position in the next cell,
# so presumably it holds one output file per input structure, in input order (TODO confirm).
imposed_files = imposer.run()



In [27]:
# Parse each superimposed file; the positional index doubles as the
# Biopython structure id and as the lookup into `imposed_files`.
structures = {}
for idx, label in enumerate(files):
    structures[label] = parser.get_structure(idx, imposed_files[idx])



In [28]:
structures

{'thermo_0': <Structure id=0>,
 'thermo_1': <Structure id=1>,
 'thermo_2': <Structure id=2>,
 'meso': <Structure id=3>,
 'trans': <Structure id=4>}

In [76]:
# First figure: a fresh NGL widget (with the GUI panel enabled) for the overlay of targets and translation.
view = nv.NGLWidget(gui=True)

In [77]:
# Add every parsed structure to the widget, keeping a handle per label.
components = {}
for label, structure in structures.items():
    components[label] = view.add_component(nv.BiopythonStructure(structure))

In [78]:
# Drop the cartoon representation from every component; the structures to
# show get an explicitly coloured cartoon in the next cell.
for component in components.values():
    component.remove_cartoon()

In [79]:
# Thermophilic targets share one colour and the translated sequence gets its
# own; 'meso' is deliberately left without a representation in this figure.
# Targets are selected by key prefix instead of hard-coding thermo_0..thermo_2,
# because the number of thermophilic partners depends on the selected example.
for label, component in components.items():
    if label.startswith('thermo_'):
        component.add_representation('cartoon', color='#FFD6D6')
components['trans'].add_representation('cartoon', color='#C890FF')

In [80]:
view

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [81]:
import time
def save_image(view, name, timeout=60.0, poll_interval=0.1):
    """Render ``view`` to ``./tmp/<name>.png``.

    ``view.render_image()`` returns an image object whose ``value`` is empty
    until the rendered bytes arrive, so this function polls until it is filled.
    It is meant to run in a background thread (see the cells that call it).

    Parameters
    ----------
    view : object
        Widget exposing ``render_image()`` (an ``nglview.NGLWidget`` here).
    name : str
        Base name of the PNG file written under ``./tmp``.
    timeout : float, optional
        Maximum number of seconds to wait for the image data.
    poll_interval : float, optional
        Seconds to sleep between checks.

    Raises
    ------
    TimeoutError
        If no image data arrives within ``timeout`` seconds. Without this
        bound the loop never ends when the widget fails to render, leaving a
        non-daemon thread spinning forever.
    """
    img = view.render_image()

    deadline = time.monotonic() + timeout
    while not img.value:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"no image data received for '{name}' within {timeout} s"
            )
        time.sleep(poll_interval)

    # Make sure the output directory exists before writing.
    os.makedirs("./tmp", exist_ok=True)
    with open(f"./tmp/{name}.png", "wb") as f:
        f.write(img.value)


# `threading` was never imported anywhere in the notebook, so on a fresh kernel
# this cell (and the later snapshot cells) raised NameError.
import threading

# Save the overlay figure from a background thread while save_image waits for the rendered image.
thread = threading.Thread(target=save_image, args=(view, 'thermo_superimposed'), daemon=False)
thread.start()

In [82]:
# Second figure: mesophilic input versus translated output only.
view = nv.NGLWidget(gui=True)

# Load all structures first, then clear their cartoons, then colour the two of interest.
components = {}
for label, structure in structures.items():
    components[label] = view.add_component(nv.BiopythonStructure(structure))
for component in components.values():
    component.remove_cartoon()
for label, colour in (('meso', '#CCF5FF'), ('trans', '#C890FF')):
    components[label].add_representation('cartoon', color=colour)
view

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [83]:
# Save the current view to ./tmp/meso_trans.png from a background thread.
thread = threading.Thread(
    target=save_image,
    kwargs={'view': view, 'name': 'meso_trans'},
    daemon=False,
)
thread.start()

In [85]:
# Third figure: the mesophilic structure on its own; no representation
# changes are applied here.
view = nv.NGLWidget(gui=True)
meso_structure = nv.BiopythonStructure(structures['meso'])
view.add_component(meso_structure)
view

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [86]:
# Save the current view to ./tmp/meso.png from a background thread.
thread = threading.Thread(
    target=save_image,
    kwargs={'view': view, 'name': 'meso'},
    daemon=False,
)
thread.start()

In [87]:
# Fourth figure: the translated structure on its own; no representation
# changes are applied here.
view = nv.NGLWidget(gui=True)
trans_structure = nv.BiopythonStructure(structures['trans'])
view.add_component(trans_structure)
view

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [93]:
# Save the current view to ./tmp/trans.png from a background thread.
thread = threading.Thread(
    target=save_image,
    kwargs={'view': view, 'name': 'trans'},
    daemon=False,
)
thread.start()

In [89]:
mesoseq

'MTEGARILVVDDEHYLADLAANALRRAGFQAEVAGTGGAALAVGLSRRPDLLVLDLRLAKGPGGSLADQLRRFGCSIPVLFLLGRDATQQDKITGLSVPGADYLGKPFSLGELVARCRAALRRSTGAGSPLLSCAGLRLDEDAHLVLRDETRVDLSPTEFRLLRHLLTHQNRVLTKQHILDHVWEYDYAGEDSVVPTYISYLRRKVDARREPMIHTIPRTGYVLRPPTPPAGPS'

In [90]:
thermo_seqs

['MEKLLIIDDEEMFVKGLKLSLEEEGFEVDAAYDGEEGLDKVRLGNYDLVILDIMLPKLDGFSVCREIRTFSNIPIIMLTARGDDIDKIVGIEIGADDYLAKPFNTRELTARIRALLRRATNPYTKRKDEIRRGELYINIPERAVYKRGKRIELTNKEFEILVLLASNPGKVYTKDKLLDLIWGFDFYGDTNTVTVHVRKLREKIEDDPANPQYIFTKWGAGYYMK',
 'MNKKILVVDDEKPIADILKFNLEKEGFEVTTAYDGESAVNKTLEDQPDLVLLDIMLPKKDGFQVLREIRKKLQIPILMLTAKEEEVDKVLGLELGADDYITKPFSIRELIARVKANLRRAEISLVNGNELIVSDSLVIDLNKYEVKKGNTIIELTLREFELLKFLATRAGQVFSREKLLEEVWGYEYYGDIRTVDVTIRRLREKVEDDSSNPKYVLTKRGVGYYFRRS',
 'MANKILIVDDEPLLVKGLKYSLEQDGYTVDAAYDGKEALDKFEKDDFDLIILDLMLPSIDGLEVCQKIRQKSQVPIIMLTAKGEDISKILGLEYGADDYLTKPFNILELKARIKAILRRVNTSETKIGEQVIKMDDFTINTLGRKVIAKDKEINLTAKEFDLLLLLASNPGKVFSREELLKIIWGYEYFGDLRTVDVHIRRLREKIEKNSSQPEYILTKWGVGYYFRNKT']