# Look at similar sequences to 1enh in training set

See `diamond_1enh.py` for the local alignment search that finds similar sequences. Here we look at the top hits from that search.

### Open an index of the dataset

In [1]:
import pandas as pd
from Bio import SeqIO
import numpy as np
import Bio.pairwise2 as pairwise2
from Bio.Align import substitution_matrices
mat = substitution_matrices.load('BLOSUM62')
import os



In [2]:
# Build random-access indexes over the two FASTA files (keyed by record ID),
# then print the record stored under "seq_52" in each as a quick sanity check.
fasta_index = SeqIO.index(filename="./diamond_1enh/training_meso_db.fasta", format="fasta")
thermo_index = SeqIO.index(filename="./diamond_1enh/training_thermo.fasta", format="fasta")
print(fasta_index["seq_52"].seq, thermo_index["seq_52"].seq, sep="\n")

MKILLGEDEEKMRELLRKFFEKAGYSVVEVSDGEAALTKFYEEQFDLAVLDWMMPKLSGIKVAKRMKQERNIKILMLTAKNLPEDEVQALVVGADDYLAKPFHAAVLLARVAKLLGVLQQDIKQPLVFIPAEQRALFQGEELALTKREYELLLYLYQNRGQALTREQILLAVWGLDHETDERTVDSFIRILREKIGKERIQTVYGIGYRFDDQKT
MRILVVDDEAPMRALLRLFLEQHGFAVSEAEDGYEALERVRTERPDLVLLDIMLPGIDGWAVCRILRRESDVPVIMLTARDDVRDRVSGLEAGADDYLVKPFAEEELLARIRAVLRRTPKVEGRPADRVIVGPLLIDVPAREAYCHGRRLNLTPKEFDLLALLARHPGQVLDRARIIERVWGWDYDGDVRTVDTHVKTLRAKLVAAGCSRHLIETVRSIGYRLNPHHDGRGGTIP


The index works: both lookups return a sequence.

### Open the hits for 1enh found by BLAST

In [7]:
# Load the search hits as a headerless, tab-separated table, skipping the first 5 lines of the file.
# NOTE(review): presumably the 5 skipped lines are a preamble rather than hits — confirm against blast_out.tsv.
hits = pd.read_csv('./diamond_1enh/blast_out.tsv', sep='\t', skiprows=5, header=None)

In [8]:
# Column 1 of the hits table, with missing entries dropped; these values are used
# below as record IDs for lookups in both FASTA indexes.
hit_ids = hits[1].dropna().values

In [9]:
# Look up every hit ID in both indexes, so meso_seqs[k] and thermo_seqs[k]
# come from the same hit ID. Sequences are converted to plain strings.
meso_seqs = [str(fasta_index[s].seq) for s in hit_ids]
thermo_seqs = [str(thermo_index[s].seq) for s in hit_ids]

In [10]:
# Tabulate the paired sequences: column 'm' holds meso_seqs, column 't' holds thermo_seqs.
df = pd.DataFrame({'m': meso_seqs, 't': thermo_seqs})

In [11]:
# Display the table of paired sequences.
df

Unnamed: 0,m,t
0,MSTKSRTRSKTRLSRALGIALTPKAARFLEKRPYAPGEHGRTKRKT...,MARYTGPVCRLCRREGMKLYLKGSRCYTDKCAFERRPFPPGQHGRN...
1,MSTKSRTRSKTRLSRALGIALTPKAARFLEKRPYAPGEHGRTKRKT...,MARYTGPSCKLCRREGVKLYLKGDRCYSDKCALVRRPYAPGQHGHN...
2,MSTKSRTRSKTRLSRALGIALTPKAARFLEKRPYAPGEHGRTKRKT...,MARYTGPVCRLCRREGIKLYLKGERCYTPKCAIDRRGYAPGQHGQM...
3,MSTKSRTRSKTRLSRALGIALTPKAARFLEKRPYAPGEHGRTKRKT...,MARHTGPVCRLCRREGLKLYLKGEKCYTDKCPVSRRNYAPGQHGKA...
4,MSTKSRTRSKTRLSRALGIALTPKAARFLEKRPYAPGEHGRTKRKT...,MARYTGPVCKLCRREGIKLFLKGDRCYTDKCAIARRNYAPGQHGHT...
...,...,...
398,MNHPAQQLDPQFLPQHIALVMDGNGRWAQQRGMKRTEGHKRGEQVL...,MVFFRQKNRDLSDKIDKNKLPIHIGIIMDGNGRWAQKRGMMRFYGH...
399,MNHPAQQLDPQFLPQHIALVMDGNGRWAQQRGMKRTEGHKRGEQVL...,MFEFLKRKKIKIDKEKMPQHIAIIMDGNGRWAKKRGLPRSAGHRFG...
400,MNHPAQQLDPQFLPQHIALVMDGNGRWAQQRGMKRTEGHKRGEQVL...,MRIPNHIGIIPDGNRRWALKNGLNKEDGYDFGLKPGLELFRLCKKV...
401,MNHPAQQLDPQFLPQHIALVMDGNGRWAQQRGMKRTEGHKRGEQVL...,MGIKEIDLIDPKKLPCHIAIIMDGNGRWAQKRGLPRIAGHWAGAEA...


### Make structure predictions

In [12]:
import torch
import esm
# Import only what is used instead of `from Bio.PDB import *`, which dumps
# every Bio.PDB name into the notebook namespace.
from Bio.PDB import PDBParser
import nglview as nv
import ipywidgets

# Parser used to read the predicted PDB files back in as Biopython structures.
parser = PDBParser()

# Load the pretrained ESMFold v1 model, switch it to eval mode, and move it to the GPU.
model = esm.pretrained.esmfold_v1()
model = model.eval().cuda()



[2023-08-02 14:37:28,433] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-08-02 14:37:35.995484: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-02 14:37:37.115567: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [13]:
# Directory that receives the predicted PDB files; no error if it already exists.
os.makedirs("./enh_similar_seq/", exist_ok=True)

In [14]:
# Query sequence that the hits are compared against.
# NOTE(review): presumably the amino-acid sequence of PDB entry 1enh — confirm the source.
ENH1 = "RPRTAFSSEQLARLKREFNENRYLTERRRQQLSSELGLNEAQIKIWFQNKRAKI"

In [15]:
# Lower chunk sizes have lower memory requirements at the cost of reduced speed.
# model.set_chunk_size(128)
def one_struc(i, sequence, out_dir="./enh_similar_seq/"):
    """Predict a structure for ``sequence`` and load it as a Biopython structure.

    The PDB text returned by ``model.infer_pdb`` is written to
    ``<out_dir>/structure_<i>.pdb`` and then parsed back with the notebook-level
    ``parser``. An existing file with the same label is overwritten, so each
    distinct prediction needs a distinct ``i``.

    Parameters
    ----------
    i : any value usable in an f-string
        Label used in the output file name and in the structure id (``tmp_<i>``).
    sequence : str
        Sequence to predict. Multimer prediction can be done with chains
        separated by ':'.
    out_dir : str, optional
        Directory that receives the PDB file. It is created if missing.
        Defaults to the directory this notebook has always written to.

    Returns
    -------
    The structure object returned by ``parser.get_structure`` for the written file.
    """
    # Create the directory here so the function does not depend on another cell having run.
    os.makedirs(out_dir, exist_ok=True)
    pdb_path = os.path.join(out_dir, f"structure_{i}.pdb")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.infer_pdb(sequence)

    with open(pdb_path, "w") as f:
        f.write(output)

    return parser.get_structure(f'tmp_{i}', pdb_path)
    

In [32]:
def align(a, b):
    """Print the best global alignment of ``a`` and ``b``, wrapped at 79 columns.

    The alignment is scored with the notebook-level substitution matrix ``mat``,
    a gap-open penalty of -11 and a gap-extend penalty of -1; end gaps are not
    penalized. Each printed block has three rows: the aligned ``a``, the match
    line, and the aligned ``b``. Nothing is returned.
    """
    best_hit = pairwise2.align.globalds(
        a,
        b,
        mat,
        -11,
        -1,
        one_alignment_only=True,
        penalize_extend_when_opening=False,
        penalize_end_gaps=False,
    )[0]
    rows = pairwise2.format_alignment(*best_hit, full_sequences=True).split('\n')

    line_width = 79

    def _wrap(row):
        # Cut one formatted row into consecutive pieces of at most line_width characters.
        return [row[start:start + line_width] for start in range(0, len(row), line_width)]

    # rows[0] is sequence a, rows[1] the match line, rows[2] sequence b.
    wrapped = [_wrap(rows[k]) for k in (0, 1, 2)]
    print(''.join('\n'.join(triple) + '\n\n' for triple in zip(*wrapped)))

In [33]:
# Align the query sequence against the meso sequence of the first hit.
align(ENH1, meso_seqs[0])

-------------------------------RPRTAFSSEQLARLKREFNENRYLTERRRQQLSSELGLNEAQIKIWFQ
                               ||   .......|.||..........|..|.|....|..|||.||.|.
MSTKSRTRSKTRLSRALGIALTPKAARFLEKRP---YAPGEHGRTKRKTDSDYAVRLREKQRLRAQYGIREAQLKIAFE

NKRAKI-------------------------------------------------------------------------
..|...                                                                         
QARRRQGLTGENLVEILETRLDALLVRSAIARTTAQARQMIVHRHILVDGELVDRPSFRVKPGQLMHVKARSEGMEPFQ

------------------------------------------------------
                                                      
VAAAGGHVDLLPKLPPYLEVEVDKLQARLVRAPKRIEIPVTCEVQLVVEYYAAR




Not a very good alignment. Let's look at the predicted structures of all three sequences: 1enh, the meso hit, and its thermo counterpart.

In [16]:
# Predict the structure of the query sequence (written to structure_-1.pdb) and view it.
enh_struct = one_struc(-1, ENH1)
nv.show_biopython(enh_struct, gui=True)



NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [28]:
# Predict the structure of the first hit's meso sequence (written to structure_0.pdb) and view it.
meso_struct = one_struc(0, meso_seqs[0])
nv.show_biopython(meso_struct, gui=True)



NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [30]:
# Predict the structure of the first hit's thermo sequence and view it.
# Use a distinct file label: label 0 is already taken by the meso prediction
# (structure_0.pdb), which the superposition step further down reads from disk.
# Reusing 0 here would silently replace that file with the thermo structure.
thermo_struct = one_struc("thermo_0", thermo_seqs[0])
nv.show_biopython(thermo_struct, gui=True)



NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

The thermo structure looks like the meso one

In [34]:
# Align the meso and thermo sequences of the first hit against each other.
align(meso_seqs[0], thermo_seqs[0])

MSTKSRTRSKTRLSRALGIALTPKAAR------FLEKRPYAPGEHGRTKRKTDSDYAVRLREKQRLRAQYGIREAQLKI
  ....|....||.|..|..|..|..|      ..|.||..||.|||...|. |.|...|||||.....||..|.|...
--MARYTGPVCRLCRREGMKLYLKGSRCYTDKCAFERRPFPPGQHGRNRKKL-SEYGLQLREKQKVKRIYGVLERQFER

AFEQARRRQGLTGENLVEILETRLDALLVRSAIARTTAQARQMIVHRHILVDGELVDRPSFRVKPGQLMHVKARSEG-M
.||.|.|..|.|||||..|||.|||....|...|...|||||...|.|..|.|..|..||..|..|....|...|.. |
YFEMAERMKGVTGENLLQILERRLDNVVYRMGFASSRAQARQLVRHGHFTVNGKRVNIPSYLVDVGDVIAVAEKSAAKM

EPFQVAAAGGHVDLLPKLPPYLEVEVDKLQARLVRAPKRIEIPVTCEVQLVVEYYAAR
|.|......|...   ...|.|.|..|||.......|.|..|.|.....|.||.|.. 
EHFKALREQGPAG---NIVPWLSVDFDKLEGTVTALPTRQDIDVPIQEHLIVELYSK-




The thermo and meso sequences align well and are quite similar.

In [36]:
from nomelt.thermo_estimation.optimizer import OptTrajSuperimposer

In [37]:
# Set up the superposition of the first meso hit's predicted structure with the
# predicted structure of the query; sequences and structure files are listed in the same order.
# NOTE(review): `trials_dataframe=1` looks like a placeholder value — confirm what
# OptTrajSuperimposer expects for this argument.
siser = OptTrajSuperimposer(
    sequences=[meso_seqs[0], ENH1],
    structure_files=[
        "./enh_similar_seq/structure_0.pdb",
        "./enh_similar_seq/structure_-1.pdb",
    ],
    trials_dataframe=1,
)

In [38]:
# Run the superimposer; the value displayed below is a list of two PDB file paths.
siser.run()

atoms:    N_ref=262, N_traj=267
but we attempt to create a valid selection (use strict=True to disable this heuristic).


['/mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/opt_traj_superimposer/structure_0000.pdb',
 '/mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/opt_traj_superimposer/structure_0001.pdb']

In [39]:
# Load the two superimposed PDB files and wrap them for nglview.
# The absolute paths are copied from the output of `siser.run()` above, so this cell
# only works on the machine that produced them.
# structure_0001.pdb is loaded as the query and structure_0000.pdb as the other
# structure, matching the order of the inputs given to OptTrajSuperimposer.
# NOTE(review): assumes the output files keep the input order — confirm.
enh_struct = nv.BiopythonStructure(parser.get_structure('enh', '/mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/opt_traj_superimposer/structure_0001.pdb'))
other_struct = nv.BiopythonStructure(parser.get_structure('other', '/mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/opt_traj_superimposer/structure_0000.pdb'))



In [82]:
# Start from an empty NGL viewer (with the GUI enabled) so both structures can be added to one scene.
view = nv.NGLWidget(gui=True)

In [83]:
# Add the superimposed query structure to the viewer and request a blue cartoon representation.
view.add_component(enh_struct)
view.add_representation("cartoon", selection="all", color='blue')

In [84]:
# Display the viewer (only the query structure has been added at this point).
view

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [85]:
# Re-load the superimposed structure_0000.pdb for the viewer.
# The structure id is 'other', matching how this same file is labelled when it is first
# loaded; it was previously labelled 'enh' here, a copy-paste slip, since the query
# structure is structure_0001.pdb.
other_struct = nv.BiopythonStructure(parser.get_structure('other', '/mmfs1/gscratch/cheme/usr/evankomp/repos/nomelt/analysis/tmp/opt_traj_superimposer/structure_0000.pdb'))



In [86]:
# Add the second superimposed structure to the same viewer.
view.add_component(other_struct)

<nglview.component.ComponentViewer at 0x15034f76d1f0>

In [87]:
# Display the viewer again, now holding both superimposed structures.
view

NGLWidget(n_components=2)

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [39]:
# Predict and view the structure of the last hit sequence.
# NOTE(review): `hit_seqs` is not assigned in any cell shown in this notebook; it must be
# defined before this cell can run on a fresh kernel.
# The file label is "hit_<index>" rather than -1, because structure_-1.pdb already holds
# the query prediction that the superposition step reads from disk.
struct = one_struc(f"hit_{len(hit_seqs) - 1}", hit_seqs[-1])
nv.show_biopython(struct, gui=True)



NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [40]:
# Predict a structure for every hit sequence and keep one viewer per hit.
# NOTE(review): `hit_seqs` is not assigned in any cell shown in this notebook; it must be
# defined before this cell can run on a fresh kernel.
views = []
for i, seq in enumerate(hit_seqs):
    # Label files "hit_<i>" so they do not overwrite structure_0.pdb, which holds the
    # first meso prediction written earlier in the notebook.
    hit_struct = one_struc(f"hit_{i}", seq)
    # Append directly instead of rebinding `view`, which names the superposition viewer above.
    views.append(nv.show_biopython(hit_struct, gui=True))



In [21]:
# Show the viewer for the first hit (requires the `views` list built by the loop over hit_seqs).
views[0]

NGLWidget(n_components=1)

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [28]:
# Show the viewer for the second hit.
views[1]

NGLWidget(n_components=1)

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [23]:
# Show the viewer for the third hit.
views[2]

NGLWidget(n_components=1)

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [24]:
# Show the viewer for the fourth hit.
views[3]

NGLWidget(n_components=1)

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [29]:
# Show the viewer for the hit at index 10.
views[10]

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [31]:
# Show the viewer for the last hit.
views[-1]

NGLWidget()

Tab(children=(Box(children=(Box(children=(Box(children=(Label(value='step'), IntSlider(value=1, min=-100)), la…

In [32]:
# Print the last hit sequence.
# NOTE(review): `hit_seqs` is not assigned in any cell shown in this notebook.
hit_seqs[-1]

'MVYAMAMILLVEDDRIITAALSRALTDAGHVVRPVGRAADALKIVTDERPDLVILDLGLPDIDGTDALRMMRSVSDVPVIVATARRSEADIISLLSAGADDYVTKPFSGGHILARISAVLRRSRAATTEAPSAITVGDLMINPRQRRAELRGEPLQLTRREFDVLAYLAERVGQVISRRELMNEVWNQARIGEEQTIDVHISWLRRKLGETAAQPRFLHTVRGVGVMMVDPR'

In [33]:
# Print the first hit sequence.
# NOTE(review): `hit_seqs` is not assigned in any cell shown in this notebook.
hit_seqs[0]

'MAERVIRVVVVDDEPMVCAHLRTILSSAVDIDVVDLAQDGAAAVEAVMRHRPHVVLMDLRMPGVDGLTAIERITRLPAPPAVVALTTFDADQYVIRALRAGAAGFLVKTTPPEDLIGLVRVAADGHTVLSPAATRRLVAAQAGEQAARERALRRVRELTERETEVLSALGEGLSNAQIATRLSLSEATVKSYVSRMLVKLDCANRTQAGLLAYDAGLATR'