### Load Data
Load in data from data folder and spit it out in data loader object.

In [1]:
import os
import torch
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

from torch.utils.data.dataloader import DataLoader
from foldingdiff import modelling
from foldingdiff import datasets as dsets
import numpy as np

# instance variables 'trim_strategy', 'pad', 'min_length', 'pdbs_src', 'structures', 'cache_dir', 'rng', 'means', 'all_lengths', '_length_rng', 'feature_idx'

clean_dset = dsets.CathCanonicalAnglesOnlyDataset(pad=128, trim_strategy='randomcrop')

print("instance variables in clean_dset", clean_dset.__dict__.keys())
print("all lengths", clean_dset.all_lengths)

max_length = 0 #need to find maximum length for padding purposes
for length in clean_dset.all_lengths:
    if length > max_length:
        max_length = length 

dset_angles = [] #list of matrices, each matrix represents one protein
dset_fnames = [] #list of file names, aligned to dset_angles

for i in range(len(clean_dset.structures)):
    structure = clean_dset.structures[i]
    structure_np = structure["angles"].to_numpy()
    np.nan_to_num(structure_np)
    rows = structure_np.shape[0]
    rows_to_add = max_length - rows
    structure_np_padded = np.pad(structure_np, pad_width=((0,rows_to_add), (0,0)))

    # we need to pad this to have the shape (max_length, 9)
    dset_angles.append(structure_np_padded)
    dset_fnames.append(structure["fname"])

dl = DataLoader(dset_angles, batch_size=32, shuffle=False)

'''
for (batch_idx, data) in enumerate(dl):
    print(data.size())
    print(batch_idx)
'''



  from .autonotebook import tqdm as notebook_tqdm


instance variables in clean_dset dict_keys(['trim_strategy', 'pad', 'min_length', 'pdbs_src', 'fnames', 'structures', 'cache_dir', 'rng', 'means', 'all_lengths', '_length_rng', 'feature_idx'])
all lengths [135, 192, 241, 157, 65, 230, 146, 197, 85, 63, 123, 139, 248, 173, 244, 101, 104, 157, 154, 130, 346, 79, 81, 307, 101, 85, 176, 123, 302, 383, 95, 201, 62, 116, 101, 113, 81, 121, 141, 126, 206, 73, 345, 226, 75, 154, 119, 99, 112, 150, 115, 97, 93, 210, 118, 78, 173, 69, 91, 52, 119, 171, 74, 86, 101, 138, 127, 113, 168, 141, 221, 307, 219, 155, 67, 125, 87, 216, 391, 144, 142, 216, 52, 88, 256, 102, 179, 205, 164, 488, 291, 103, 87, 74, 100, 85, 173, 135, 288, 126, 87, 196, 241, 126, 138, 165, 100, 520, 112, 175, 106, 139, 449, 106, 113, 147, 54, 128, 259, 98, 168, 41, 176, 80, 188, 65, 146, 205, 141, 48, 103, 193, 44, 152, 271, 208, 189, 201, 165, 221, 125, 108, 176, 141, 174, 145, 148, 129, 47, 121, 81, 193, 131, 123, 84, 105, 96, 97, 268, 142, 269, 94, 280, 480, 155, 80, 119, 1

'\nfor (batch_idx, data) in enumerate(dl):\n    print(data.size())\n    print(batch_idx)\n'

From angles, construct a pdb file

In [3]:
from foldingdiff import angles_and_coords as angles
from foldingdiff import nerf as nerf
 
test_structure = clean_dset.structures[0] # example structure
 
# extract psi, phi, and omega angles from test_structure
test_structure_phi = test_structure['angles']['phi']
test_structure_psi = test_structure['angles']['psi']
test_structure_omega = test_structure['angles']['omega']
test_structure_tau = test_structure['angles']['tau']
test_structure_c_n = test_structure['angles']['CA:C:1N']
test_structure_n_ca = test_structure['angles']['C:1N:1CA']

#build the object for nerf construction
nerf_build_kwargs = dict(
    phi_dihedrals=test_structure_phi, #0
    psi_dihedrals=test_structure_psi, #1
    omega_dihedrals=test_structure_omega, #2
    bond_angle_ca_c=test_structure_tau, #3
    bond_angle_c_n = test_structure_c_n, #4
    bond_angle_n_ca=test_structure_n_ca #5
)
 
#call nerf builder to get cartesian coordinates from angles
nerf_builder = nerf.NERFBuilder(**nerf_build_kwargs)
coords = (
    nerf_builder.cartesian_coords
)
 
#convert the cartesian coordinates to a pdb file using built in function
angles.write_coords_to_pdb(coords, "output.pdb")

'output.pdb'

In [20]:
print(len(dset_fnames))

1868


In [21]:
np.shape(data)

torch.Size([12, 1031, 9])

In [22]:
k = data[0]

In [23]:
print(k[0:100, :])

tensor([[ 1.3297e+00,  1.4589e+00,  1.5292e+00,         nan,  1.3919e+00,
         -3.1207e+00,  1.9474e+00,  2.0333e+00,  2.1278e+00],
        [ 1.3269e+00,  1.4594e+00,  1.5143e+00, -1.9010e+00,  2.4706e+00,
          3.1064e+00,  1.7855e+00,  2.0275e+00,  2.1369e+00],
        [ 1.3211e+00,  1.4423e+00,  1.5125e+00, -2.1535e+00,  1.9841e+00,
         -3.0987e+00,  1.9876e+00,  2.0377e+00,  2.0963e+00],
        [ 1.3252e+00,  1.4591e+00,  1.5244e+00, -1.8065e+00,  2.0973e+00,
          3.0857e+00,  1.8540e+00,  2.0268e+00,  2.1231e+00],
        [ 1.3264e+00,  1.4565e+00,  1.5203e+00, -1.9705e+00,  1.6751e+00,
          3.1109e+00,  1.8284e+00,  2.0327e+00,  2.1084e+00],
        [ 1.3264e+00,  1.4584e+00,  1.5277e+00, -1.2916e+00,  3.0877e+00,
         -3.1322e+00,  1.9264e+00,  2.0426e+00,  2.1116e+00],
        [ 1.3422e+00,  1.4647e+00,  1.5320e+00,  1.0526e+00,  1.5431e+00,
          3.1409e+00,  1.9734e+00,  2.0334e+00,  2.2000e+00],
        [ 1.3304e+00,  1.4613e+00,  1.5253e+00, 

In [24]:
k

tensor([[1.3297, 1.4589, 1.5292,  ..., 1.9474, 2.0333, 2.1278],
        [1.3269, 1.4594, 1.5143,  ..., 1.7855, 2.0275, 2.1369],
        [1.3211, 1.4423, 1.5125,  ..., 1.9876, 2.0377, 2.0963],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])