### Load Data
Load the data from the data folder and expose it through a `DataLoader` object.

In [1]:
import os
import torch
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

from torch.utils.data.dataloader import DataLoader
from foldingdiff import modelling
from foldingdiff import datasets as dsets
import numpy as np

# Instance variables on the dataset object (matches the keys printed below):
# 'trim_strategy', 'pad', 'min_length', 'pdbs_src', 'fnames', 'structures',
# 'cache_dir', 'rng', 'means', 'all_lengths', '_length_rng', 'feature_idx'

clean_dset = dsets.CathCanonicalAnglesOnlyDataset(pad=128, trim_strategy='randomcrop')

print("instance variables in clean_dset", clean_dset.__dict__.keys())
print("all lengths", clean_dset.all_lengths)

# Longest structure in the dataset. Every angle matrix is zero-padded to this
# many rows so that all samples share one shape and can be batched together.
# default=0 keeps the previous result (0) if the dataset has no structures.
max_length = max(clean_dset.all_lengths, default=0)

dset_angles = []  # list of matrices, each matrix represents one protein
dset_fnames = []  # list of file names, aligned to dset_angles

for structure in clean_dset.structures:
    # presumably structure["angles"] is a pandas DataFrame with one row per
    # residue -- TODO confirm against the foldingdiff dataset class
    structure_np = structure["angles"].to_numpy()

    # Pad to the shape (max_length, n_features): zero rows are appended at the
    # bottom only; the feature columns are left untouched.
    # NOTE(review): np.pad only adds zeros and does not touch existing values;
    # NaNs are visible in the printed batches, so confirm they are handled
    # downstream.
    rows_to_add = max_length - structure_np.shape[0]
    structure_np_padded = np.pad(structure_np, pad_width=((0, rows_to_add), (0, 0)))

    dset_angles.append(structure_np_padded)
    dset_fnames.append(structure["fname"])

# shuffle=False keeps batch order aligned with dset_fnames.
dl = DataLoader(dset_angles, batch_size=32, shuffle=False)

# Sanity check: print each batch's shape and index. `data` keeps the last
# batch after the loop finishes.
for batch_idx, data in enumerate(dl):
    print(data.size())
    print(batch_idx)


  from .autonotebook import tqdm as notebook_tqdm


instance variables in clean_dset dict_keys(['trim_strategy', 'pad', 'min_length', 'pdbs_src', 'fnames', 'structures', 'cache_dir', 'rng', 'means', 'all_lengths', '_length_rng', 'feature_idx'])
all lengths [62, 133, 82, 150, 146, 124, 230, 135, 114, 87, 45, 95, 42, 329, 88, 98, 184, 195, 119, 211, 56, 119, 357, 111, 130, 172, 97, 271, 101, 47, 242, 123, 176, 76, 162, 103, 109, 176, 86, 52, 110, 112, 90, 85, 148, 143, 256, 150, 45, 397, 188, 100, 50, 114, 87, 89, 185, 121, 107, 116, 40, 292, 71, 305, 117, 185, 417, 145, 185, 196, 95, 173, 40, 198, 261, 40, 126, 109, 86, 177, 286, 133, 159, 97, 303, 91, 151, 279, 155, 180, 175, 212, 182, 244, 118, 132, 83, 42, 243, 57, 95, 119, 123, 155, 212, 140, 206, 279, 45, 179, 230, 99, 53, 230, 66, 215, 100, 327, 152, 143, 95, 56, 47, 253, 125, 173, 227, 105, 121, 94, 126, 282, 189, 200, 146, 127, 322, 198, 172, 136, 84, 487, 217, 185, 385, 106, 98, 108, 419, 99, 158, 111, 153, 140, 155, 185, 135, 106, 202, 94, 243, 110, 133, 448, 200, 144, 215, 267

In [3]:
# Number of file names collected while building dset_angles (one per structure).
print(len(dset_fnames))

1868


In [5]:
# Shape of the batch currently held in `data` (the last one yielded by the loop over dl).
np.shape(data)

torch.Size([12, 1031, 9])

In [11]:
# Take the first sample of the batch currently held in `data`.
k = data[0]

In [16]:
# Inspect rows 100-199 (all columns) of the selected sample.
print(k[100:200, :])

tensor([[ 1.3258,  1.4510,  1.5274, -1.8882, -2.8869, -3.0935,  1.9680,  2.0186,
          2.1170],
        [ 1.3264,  1.4520,  1.5244, -1.2709, -0.4216, -3.1249,  1.9303,  2.0344,
          2.1169],
        [ 1.3290,  1.4575,  1.5267,  1.4876,  2.7619,  3.1223,  1.9131,  2.0268,
          2.1321],
        [ 1.3313,  1.4566,  1.4989, -2.4237,  2.2328,  3.1293,  1.9155,  2.0188,
          2.1230],
        [ 1.3166,  1.4344,  1.5182, -1.3832,  2.0315,  3.0908,  1.8540,  2.0185,
          2.0819],
        [ 1.3248,  1.4489,  1.5264, -1.7260,  2.2672,  3.1270,  1.8672,  2.0101,
          2.1342],
        [ 1.3265,  1.4513,  1.5453, -2.1315,  1.7734,  3.1005,  1.8416,  2.0163,
          2.1180],
        [ 1.3283,  1.4631,  1.5286, -1.5530,  1.8912, -3.1223,  1.9854,  2.0101,
          2.1650],
        [ 1.3496,  1.4700,  1.5316, -1.4062,  2.1831, -3.1413,  1.9422,  2.0572,
          2.1038],
        [ 0.0000,  0.0000,  0.0000, -1.1997,     nan,     nan,     nan,     nan,
             nan],


In [None]:
# Display the whole selected sample.
k