In [1]:
import os
import time
# science
import numpy as np
import torch
from einops import repeat, rearrange

In [2]:
# data / process
import joblib
import sidechainnet
# vocabulary object handed to get_prot, which uses its int2char to turn integer codes into a sequence string
VOCAB = sidechainnet.utils.sequence.ProteinVocabulary()

In [3]:
# self module
import mp_nerf

### Load a protein in SCN format - you can skip this since a joblib file is provided

In [4]:
# Load the CASP7 SidechainNet data; the result is indexed by split name below (e.g. dataloaders['train']).
dataloaders = sidechainnet.load(casp_version=7, with_pytorch="dataloaders")
dataloaders.keys() # ['train', 'train-eval', 'test', 'valid-10', ..., 'valid-90']
# Example dataset repr (NOTE: taken from a CASP12 load, not the CASP7 data requested above):
# ProteinDataset(casp_version=12, split='train', n_proteins=81454,
#               created='Sep 20, 2020')

SidechainNet was loaded from ./sidechainnet_data/sidechainnet_casp7_30.pkl.


dict_keys(['train', 'train-eval', 'test', 'valid-10', 'valid-20', 'valid-30', 'valid-40', 'valid-50', 'valid-70', 'valid-90'])

In [5]:
def get_prot(dataloader_=None, vocab_=None, min_len=80, max_len=150, verbose=True, split="train"):
    """ Gets a protein from sidechainnet and returns
        the right attrs for training.
        The first protein (in iteration order) that satisfies the
        length constraints is returned.
        Inputs:
        * dataloader_: mapping from split name to an iterable of sidechainnet batches
        * vocab_: sidechainnet VOCAB class (its `int2char` is used)
        * min_len: int. minimum sequence length (inclusive)
        * max_len: int. maximum sequence length (inclusive)
        * verbose: bool. if True, print the length of the selected protein
        * split: str. key of `dataloader_` to iterate over
        Outputs: (cleaned, without padding)
        (seq_str, int_seq, coords, angles, padding_seq, mask, pid)
        Raises:
        * ValueError: if no protein in the split satisfies the constraints
    """
    pad_token = 20      # integer code counted as sequence padding
    atoms_per_aa = 14   # coordinate rows stored per residue in batch.crds

    for batch in dataloader_[split]:
        for i in range(batch.int_seqs.shape[0]):
            # number of padding positions according to the sequence ...
            padding_seq = (batch.int_seqs[i] == pad_token).sum().item()
            # ... and according to the angles (rows whose absolute values sum to 0)
            padding_angles = (torch.abs(batch.angs[i]).sum(dim=-1) == 0).long().sum().item()

            # skip entries where both counts disagree
            # (presumably all-zero angle rows inside the real sequence - TODO confirm)
            if padding_seq != padding_angles:
                continue

            # check for appropriate length
            real_len = batch.int_seqs[i].shape[0] - padding_seq
            if not (min_len <= real_len <= max_len):
                continue

            # strip padding tokens; `-0 or None` keeps everything when there is no padding
            end = -padding_seq or None
            seq = ''.join([vocab_.int2char(aa) for aa in batch.int_seqs[i].tolist()])[:end]
            int_seq = batch.int_seqs[i][:end]
            angles  = batch.angs[i][:end]
            mask    = batch.msks[i][:end]
            coords  = batch.crds[i][:-padding_seq * atoms_per_aa or None]

            if verbose:
                print("stopping at sequence of length", real_len)
            # returning directly leaves both loops at once
            return seq, int_seq, coords, angles, padding_seq, mask, batch.pids[i]

    raise ValueError(
        "no protein with length in [{0}, {1}] found in split '{2}'".format(min_len, max_len, split)
    )

In [6]:
# take the first training protein whose unpadded length lies in [700, 1000]
seq, int_seq, true_coords, angles, padding_seq, mask, pid = get_prot(
    dataloader_=dataloaders,
    vocab_=VOCAB,
    min_len=700,
    max_len=1000,
)

stopping at sequence of length 821


### Load joblib file

In [7]:
# show what is stored in the experiments/ directory
!ls experiments

100_info.joblib     400_info.joblib     700_info.joblib     logs_experiment.txt
200_info.joblib     500_info.joblib     800_info.joblib     profile_csv
300_info.joblib     600_info.joblib     900_info.joblib


In [8]:
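# Optional: cache the protein selected above, or reload a previously cached one.
# NOTE: `pid` and `true_coords` are the notebook-level names returned by get_prot; the target file
#       below is not among the files listed by `ls experiments` above, so dump it before loading.
# joblib.dump({"seq": seq, "int_seq": int_seq, "angles": angles,
#              "id": pid, "true_coords": true_coords}, "experiments/727_aas_seq_and_angles.joblib")
# info = joblib.load("experiments/727_aas_seq_and_angles.joblib")
# seq, int_seq, angles, id_, true_coords = info["seq"], info["int_seq"], info["angles"], info["id"], info["true_coords"]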

### Test algo

In [14]:
# prefer the GPU when CUDA is usable, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [16]:
%%timeit
# measure time to featurize: build the scaffolds from the sequence and its angles
# (the angles.to(device) transfer is part of the timed statement)
mp_nerf.proteins.build_scaffolds_from_scn_angles(seq, angles.to(device))

56.3 ms ± 2.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# featurize: build the scaffolds that are unpacked into protein_fold (**scaffolds) in the cells below
scaffolds = mp_nerf.proteins.build_scaffolds_from_scn_angles(seq, angles.to(device))

In [18]:
%%timeit
# convert coords - fold
# timing only: the same two statements are repeated in the next cell, which is where
# coords / coords_flat are defined for the rest of the notebook
coords, mask = mp_nerf.proteins.protein_fold(seq, **scaffolds, device=device)
# merge the first two axes so every row is one 'd'-sized entry
coords_flat  = rearrange(coords, 'l c d -> (l c) d') 

16.2 ms ± 213 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
# fold the protein from the scaffolds built above
# NOTE(review): this rebinds `mask`, which until here held the mask returned by get_prot
coords, mask = mp_nerf.proteins.protein_fold(seq, **scaffolds, device=device)
# merge the first two axes ('l' and 'c') into one; coords_flat is what StructureBuilder receives below
coords_flat  = rearrange(coords, 'l c d -> (l c) d') 

#### Profiling

In [20]:
%load_ext snakeviz
%snakeviz mp_nerf.proteins.protein_fold(seq[:-padding_seq], **scaffolds, device=device)

 
*** Profile stats marshalled to file '/var/folders/lh/zgndpx8x755_lcsq48lp_5t40000gn/T/tmptb0jhy8g'. 
Embedding SnakeViz in this document...


#### Display

In [21]:
# wrap the folded (predicted) coordinates together with the integer sequence and render them in 3D
sb = sidechainnet.StructureBuilder(int_seq, crd=coords_flat) 
sb.to_3Dmol()

<py3Dmol.view at 0x1329d42d0>

In [22]:
# base structure with current coords
# (built from the coordinates returned by get_prot; swap in coords_flat to view the prediction instead)
sb = sidechainnet.StructureBuilder(int_seq, crd=true_coords) # coords_flat

# scn custom nerf
# NOTE(review): `padding_angles` is not defined at notebook level, so the line below cannot be
# re-enabled as written
# sb = sidechainnet.StructureBuilder(int_seq[:-padding_angles], angles[:-padding_seq])

# put structure coords in wrapper
sb._initialize_coordinates_and_PdbCreator()
# NOTE(review): rebinds true_coords to the builder's own copy, so re-running this cell feeds the
# previous result back in - confirm that this is idempotent
true_coords = sb.coords
# true_coords = rearrange(scn_struct_coords, '(l c) d -> l c d', c=14)

sb.to_3Dmol()

<py3Dmol.view at 0x1329d4950>

### Save oriented to manually diagnose

In [23]:
# make sure the output folder exists before writing into it
# NOTE(review): assumes to_pdb does not create missing directories itself - harmless if it does
os.makedirs("preds", exist_ok=True)

# save predicted
sb = sidechainnet.StructureBuilder(int_seq, crd=coords_flat) 
sb.to_pdb("preds/predicted.pdb")
# save labels
sb = sidechainnet.StructureBuilder(int_seq, crd=true_coords) 
sb.to_pdb("preds/labels.pdb")
# go here: https://molstar.org/viewer/
# load chains and use superimposition tool