In [2]:
from tempfile import mkdtemp
from shutil import rmtree
import os
import h5py
from deeprankcore.preprocess import preprocess
from deeprankcore.models.query import ProteinProteinInterfaceResidueQuery
from deeprankcore.feature import amino_acid, atomic_contact, biopython, bsa, pssm, sasa
from tests.utils import PATH_TEST
from deeprankcore.DataSet import HDF5DataSet
from deeprankcore.Trainer import Trainer
from deeprankcore.ginet import GINet
from deeprankcore.models.metrics import OutputExporter
from deeprankcore.tools.score import get_all_scores
from deeprankcore.domain.features import nodefeats as Nfeat
from deeprankcore.domain.features import edgefeats
from deeprankcore.domain import targettypes as targets
import tempfile


# --- Inputs: model, reference and per-chain PSSM files under PATH_TEST ------
chain_id1, chain_id2 = "A", "B"

pdb_path = str(PATH_TEST / "data/pdb/1ATN/1ATN_1w.pdb")
ref_path = str(PATH_TEST / "data/ref/1ATN/1ATN.pdb")

# One PSSM file per chain; the file name embeds the chain id.
pssm_path1, pssm_path2 = (
    str(PATH_TEST / f"data/pssm/1ATN/1ATN.{chain}.pdb.pssm")
    for chain in (chain_id1, chain_id2)
)

# --- Scratch locations ------------------------------------------------------
# Two independent temporary directories: one receives the preprocessed HDF5
# files (via `prefix`), the other is handed to the metrics exporter.
# NOTE(review): neither directory is removed in the visible cells, although
# `rmtree` is imported — confirm whether cleanup happens elsewhere.
output_directory = mkdtemp()
metrics_directory = mkdtemp()
prefix = os.path.join(output_directory, "test-preprocess")

# Feature modules passed to `preprocess`.
feature_modules = [
    amino_acid,
    atomic_contact,
    biopython,
    bsa,
    pssm,
    sasa,
]

# Target values obtained by scoring the model against the reference structure.
all_targets = get_all_scores(pdb_path, ref_path)

# Build `count_queries` queries, all describing the same A/B interface of the
# same model, each carrying the PSSM paths and the precomputed targets.
count_queries = 2
queries = [
    ProteinProteinInterfaceResidueQuery(
        pdb_path,
        chain_id1,
        chain_id2,
        pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},
        targets=all_targets,
    )
    for _ in range(count_queries)
]

# Run the feature modules over the queries; results are written to HDF5 files
# whose paths start with `prefix`.
# NOTE(review): the trailing 2 is presumably the process count — confirm
# against the `preprocess` signature.
output_paths = preprocess(feature_modules, queries, prefix, 2)
assert len(output_paths) > 0

# Gather the top-level keys (graph names) of every produced HDF5 file.
graph_names = []
for h5_path in output_paths:
    with h5py.File(h5_path, "r") as h5_file:
        graph_names.extend(h5_file.keys())

# Every query id must show up among the stored graph names.
for query in queries:
    query_id = query.get_query_id()
    assert query_id in graph_names, f"missing in output: {query_id}"

# Node and edge features requested from the HDF5 graphs.
node_features = [Nfeat.RESTYPE, Nfeat.POLARITY, Nfeat.BSA, Nfeat.RESDEPTH, Nfeat.HSE, Nfeat.INFOCONTENT, Nfeat.PSSM]
edge_features = [edgefeats.DISTANCE]

# --- Train / validation / test split over the preprocessed files ------------
n_files = len(output_paths)
if n_files < 3:
    # Three disjoint, non-empty splits need at least three files; fail here
    # with a clear message instead of silently building an empty dataset.
    raise ValueError(
        "need at least 3 preprocessed HDF5 files for a train/validation/test "
        f"split, got {n_files}"
    )

# Roughly 80% of the files go to training and the last file is the test set;
# whatever lies in between is the validation set. The train boundary is
# clamped to n_files - 2 so the validation slice is never empty: a plain
# int(n_files * 0.8) equals n_files - 1 for 3 to 5 files, which would leave
# nothing between train and test. For 6 or more files the clamp has no effect.
n_train = min(int(n_files * 0.8), n_files - 2)


def _make_dataset(hdf5_paths):
    """Build an HDF5DataSet over `hdf5_paths` with the shared notebook settings.

    Args:
        hdf5_paths: list of HDF5 file paths belonging to one split.

    Returns:
        An HDF5DataSet using `node_features`, `edge_features`, the binary
        target and "mcl" clustering.
    """
    return HDF5DataSet(
        hdf5_path=hdf5_paths,
        node_feature=node_features,
        edge_feature=edge_features,
        target=targets.BINARY,
        clustering_method="mcl",
    )


dataset_train = _make_dataset(output_paths[:n_train])
dataset_val = _make_dataset(output_paths[n_train:n_files - 1])
# One-element list, so all three splits are passed in the same form.
dataset_test = _make_dataset(output_paths[-1:])

# The exporter is handed the temporary metrics directory created earlier.
metrics_exporters = [OutputExporter(metrics_directory)]

# Wire the three datasets and the GINet network class into the trainer.
trainer = Trainer(
    dataset_train,
    dataset_val,
    dataset_test,
    GINet,
    metrics_exporters=metrics_exporters,
    transform_sigmoid=True,
    batch_size=64,
)

  from .autonotebook import tqdm as notebook_tqdm
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.expand_dims(charges, axis=1) \
  potentials = numpy.expand_dims(charges, axis=0) * numpy.ex

   ['/tmp/tmpcmf17kt_/test-preprocess-18078.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18073.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18069.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18074.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18071.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18075.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18077.hdf5', '/tmp/tmpcmf17kt_/test-preprocess-18072.hdf5'] dataset                 : 100%|██████████| 8/8 [00:00<00:00, 652.77it/s, mol=test-preprocess-18072.hdf5]
   ['/tmp/tmpcmf17kt_/test-preprocess-18070.hdf5'] dataset                 : 100%|██████████| 1/1 [00:00<00:00, 284.11it/s, mol=test-preprocess-18070.hdf5]
   ['/tmp/tmpcmf17kt_/test-preprocess-18076.hdf5'] dataset                 : 100%|██████████| 1/1 [00:00<00:00, 215.37it/s, mol=test-preprocess-18076.hdf5]


  0%|          | 0/8 [00:00<?, ?it/s]no clustering group found
 12%|█▎        | 1/8 [00:00<00:01,  4.50it/s]no clustering group found
 25%|██▌       | 2/8 [00:00<00:00,  6.31it/s]no clustering group found
no clustering group found
 50%|█████     | 4/8 [00:00<00:00,  8.34it/s]no clustering group found
 62%|██████▎   | 5/8 [00:00<00:00,  8.65it/s]no clustering group found
no clustering group found
 88%|████████▊ | 7/8 [00:00<00:00,  9.56it/s]no clustering group found
100%|██████████| 8/8 [00:00<00:00,  8.81it/s]
  0%|          | 0/1 [00:00<?, ?it/s]no clustering group found
100%|██████████| 1/1 [00:00<00:00, 12.75it/s]
  0%|          | 0/1 [00:00<?, ?it/s]no clustering group found
100%|██████████| 1/1 [00:00<00:00, 11.53it/s]


In [4]:
# Display the trainer's exporter DataFrame before any training has run; the
# recorded output shows only the column headers
# (phase, epoch, entry, output, target, loss) and no rows.
trainer.complete_exporter.df

Unnamed: 0,phase,epoch,entry,output,target,loss


In [5]:
# Train for two epochs with validation enabled.
# NOTE(review): the recorded output of this cell ends in
# `TypeError: cannot pickle 'module' object`, right after a PyTables
# performance warning about object-typed columns issued at `self.df.to_hdf(`.
# The failure appears to occur while the exporter DataFrame is written to
# HDF5 — root cause not visible here; investigate before relying on this
# notebook running end to end.
trainer.train(nepoch=2, validate=True)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->Index(['phase', 'entry', 'output', 'target'], dtype='object')]

  self.df.to_hdf(


TypeError: cannot pickle 'module' object