In [1]:
import h5py
import pandas as pd
from deeprankcore.dataset import GraphDataset
from typing import List, Union

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Data location ---------------------------------------------------------
# These pieces are combined below into the folder that holds the HDF5 file.
run_day_data = '17102022'  # last component of the data folder path
resolution_data = 'residue'  # either 'residue' or 'atomic'
protein_class = 'I'
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
project_folder = '/Users/giuliacrocioni/Desktop/docs/eScience/projects/3D-vac/snellius_50/'  # local resized df path
folder_data = (
    f'{project_folder}data/pMHC{protein_class}'
    f'/features_output_folder/GNN/{resolution_data}/{run_day_data}'
)

# --- Feature names to inspect ----------------------------------------------
# Keys looked up under each entry's 'node_features' group in the HDF5 file.
node_features = [
    "res_type",
    "res_charge",
    "res_size",
    "polarity",
    "hb_donors",
    "hb_acceptors",
    "pssm",
    "info_content",
    "bsa",
    "hse",
    "sasa",
    "res_depth",
]
# Keys looked up under each entry's 'edge_features' group in the HDF5 file.
edge_features = [
    "same_chain",
    "distance",
    "covalent",
    "electrostatic",
    "vanderwaals",
]

# The HDF5 file is named after the chosen resolution and lives in folder_data.
input_data_path = f'{folder_data}/{resolution_data}.hdf5'
# Inspect the first entry of the HDF5 file: for every configured node and edge
# feature, print its name, array shape, Python type, and min/max values.
with h5py.File(input_data_path, 'r') as hdf5:
    # Only the first entry is inspected; None when the file has no entries,
    # in which case nothing is printed.
    mol = next(iter(hdf5.keys()), None)
    if mol is not None:
        # (printed header, HDF5 group name, feature names to look up in it)
        feature_groups = (
            ('Node features:\n', 'node_features', node_features),
            ('Edge features:\n', 'edge_features', edge_features),
        )
        for header, group_name, feature_names in feature_groups:
            print(header)
            group = hdf5[mol][group_name]
            for feat in feature_names:
                # Load the dataset into memory once and reuse it for every
                # statistic, instead of re-reading it from disk each time.
                values = group[feat][:]
                print(feat)
                print(values.shape)
                print(type(values))
                print('Min:')
                print(values.min())
                print('Max:')
                print(values.max())
                print('\n')

Node features:

res_type
(165, 20)
<class 'numpy.ndarray'>
Min:
0.0
Max:
1.0


res_charge
(165,)
<class 'numpy.ndarray'>
Min:
-1.65
Max:
0.0


res_size
(165,)
<class 'numpy.ndarray'>
Min:
0
Max:
10


polarity
(165, 4)
<class 'numpy.ndarray'>
Min:
0.0
Max:
1.0


hb_donors
(165,)
<class 'numpy.ndarray'>
Min:
0
Max:
5


hb_acceptors
(165,)
<class 'numpy.ndarray'>
Min:
0
Max:
4


pssm
(165, 20)
<class 'numpy.ndarray'>
Min:
-4.0
Max:
11.0


info_content
(165,)
<class 'numpy.ndarray'>
Min:
0.0
Max:
2.33


bsa
(165,)
<class 'numpy.ndarray'>
Min:
0.0
Max:
211.9266056718234


hse
(165, 3)
<class 'numpy.ndarray'>
Min:
0.0
Max:
28.0


sasa
(165,)
<class 'numpy.ndarray'>
Min:
0.0
Max:
182.90009607344837


res_depth
(165,)
<class 'numpy.ndarray'>
Min:
1.672157916125892
Max:
5.935929233676673


Edge features:

same_chain
(5670,)
<class 'numpy.ndarray'>
Min:
0.0
Max:
1.0


distance
(5670,)
<class 'numpy.ndarray'>
Min:
1.306496842705713
Max:
14.998359676978012


covalent
(5670,)
<class 'numpy.ndarray'>

In [18]:
# Re-import the local `transform` module so edits made to its source file are
# picked up without restarting the kernel.
# `importlib.reload` replaces the `imp` module, which is deprecated and was
# removed in Python 3.12. It returns the reloaded module, so the cell still
# displays the module repr as its output.
import importlib

from deeprankcore.tools import transform

importlib.reload(transform)

<module 'deeprankcore.tools.transform' from '/Users/giuliacrocioni/Desktop/docs/eScience/projects/3D-vac/deeprank-core/deeprankcore/tools/transform.py'>

In [19]:
# Query the HDF5 file through the project helper. The result unpacks into three
# values; the following cell prints them as lists of node feature names, edge
# feature names, and target names.
hdf5_summary = transform.hdf5_to_pandas(input_data_path, target_features='binary')
node_features_n, edge_features_n, targets = hdf5_summary

In [20]:
# Show what the helper returned, one collection per line:
# node feature names, edge feature names, then target names.
print(node_features_n, edge_features_n, targets, sep='\n')

['bsa', 'hb_acceptors', 'hb_donors', 'hse', 'info_content', 'polarity', 'pssm', 'res_charge', 'res_depth', 'res_size', 'res_type', 'sasa']
['covalent', 'distance', 'electrostatic', 'same_chain', 'vanderwaals']
['BA', 'binary', 'cluster']


In [6]:
# Show the hand-written feature lists from the configuration cell, for
# comparison with the names read from the HDF5 file.
for configured_names in (node_features, edge_features):
    print(configured_names)

['res_type', 'res_charge', 'res_size', 'polarity', 'hb_donors', 'hb_acceptors', 'pssm', 'info_content', 'bsa', 'hse', 'sasa', 'res_depth']
['same_chain', 'distance', 'covalent', 'electrostatic', 'vanderwaals']
