In [3]:
from pathlib import Path
import pkg_resources as pkg
# Resolve the deeprankcore package directory, then derive the sibling
# "tests" folder that the cells below read their input files from.
# NOTE(review): pkg_resources is deprecated upstream in setuptools; consider
# importlib.resources when this notebook is next touched.
PATH_DEEPRANK_CORE = Path(pkg.resource_filename("deeprankcore", ""))
ROOT = PATH_DEEPRANK_CORE.parent
PATH_TEST = ROOT.joinpath("tests")
from deeprankcore.query import (
    QueryCollection,
    ProteinProteinInterfaceResidueQuery,
    SingleResidueVariantResidueQuery,
    ProteinProteinInterfaceAtomicQuery)
from deeprankcore.tools.target import compute_targets
from deeprankcore.dataset import save_hdf5_keys
from deeprankcore.domain.aminoacidlist import alanine, phenylalanine
import glob
import os
import h5py
import pandas as pd

- Generating 1ATN_ppi.hdf5

In [2]:
import warnings
from Bio import BiopythonWarning

# Residue-level protein-protein interface queries for the four 1ATN models.
with warnings.catch_warnings():
    # Both warning categories are ignored only while this block runs;
    # catch_warnings restores the previous filters on exit.
    warnings.simplefilter("ignore", BiopythonWarning)
    warnings.simplefilter("ignore", RuntimeWarning)

    # Input files, all located under the repository's tests folder.
    chain_id1, chain_id2 = "A", "B"
    ref_path = str(PATH_TEST / "data/ref/1ATN/1ATN.pdb")
    pssm_path1 = str(PATH_TEST / "data/pssm/1ATN/1ATN.A.pdb.pssm")
    pssm_path2 = str(PATH_TEST / "data/pssm/1ATN/1ATN.B.pdb.pssm")
    pdb_paths = [
        str(PATH_TEST / relative_path)
        for relative_path in (
            "data/pdb/1ATN/1ATN_1w.pdb",
            "data/pdb/1ATN/1ATN_2w.pdb",
            "data/pdb/1ATN/1ATN_3w.pdb",
            "data/pdb/1ATN/1ATN_4w.pdb",
        )
    ]

    queries = QueryCollection()

    # One query per model; targets are computed for each model against the
    # shared reference structure.
    for pdb_path in pdb_paths:
        targets = compute_targets(pdb_path, ref_path)
        interface_query = ProteinProteinInterfaceResidueQuery(
            pdb_path=pdb_path,
            chain_id1=chain_id1,
            chain_id2=chain_id2,
            targets=targets,
            pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},
        )
        queries.add(interface_query)

    # Process every collected query; the returned paths are kept for inspection.
    output_paths = queries.process()

- Generating train.hdf5, valid.hdf5, test.hdf5

In [5]:
# Local data
# NOTE(review): project_folder is an absolute, machine-specific path; this cell
# only runs where that folder exists. Consider a configurable data directory.
project_folder = '/Users/giuliacrocioni/Desktop/docs/eScience/projects/3D-vac/snellius_data/snellius_100_07122022/'
csv_file_name = 'BA_pMHCI_human_quantitative.csv'
models_folder_name = 'exp_nmers_all_HLA_quantitative'
data = 'pMHCI'
resolution = 'residue' # either 'residue' or 'atomic'
distance_cutoff = 15 # max distance in Å between two interacting residues/atoms of two proteins

csv_file_path = f'{project_folder}data/external/processed/I/{csv_file_name}'
models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'

# Collect the model structures and the per-chain PSSM files. Each list is
# sorted because the query loop below pairs the files by position.
pdb_files = sorted(glob.glob(os.path.join(models_folder_path + '/pdb', '*.pdb')))
print(f'{len(pdb_files)} pdbs found.')
pssm_m = sorted(glob.glob(os.path.join(models_folder_path + '/pssm', '*.M.*.pssm')))
# Report the size of the list that was actually globbed (previously every
# message printed len(pdb_files), hiding missing pssm files).
print(f'{len(pssm_m)} MHC pssms found.')
pssm_p = sorted(glob.glob(os.path.join(models_folder_path + '/pssm', '*.P.*.pssm')))
print(f'{len(pssm_p)} peptide pssms found.')

# Positional pairing is only meaningful when the three lists have the same
# length; fail early instead of mispairing files or hitting an IndexError.
if not (len(pdb_files) == len(pssm_m) == len(pssm_p)):
    raise ValueError(
        'pdb and pssm file counts differ '
        f'({len(pdb_files)} pdbs, {len(pssm_m)} MHC pssms, {len(pssm_p)} peptide pssms); '
        'queries pair these files by sorted position.')

csv_data = pd.read_csv(csv_file_path)
# Rows without a cluster get the sentinel value -1.
csv_data.cluster = csv_data.cluster.fillna(-1)
# CSV IDs are derived from the file name: text before the first '.', with
# '-' replaced by '_'. os.path.basename handles any path separator.
pdb_ids_csv = [os.path.basename(pdb_file).split('.')[0].replace('-', '_') for pdb_file in pdb_files]
# First matching CSV row per ID; raises IndexError if an ID is absent from the CSV.
clusters = [csv_data[csv_data.ID == pdb_id].cluster.values[0] for pdb_id in pdb_ids_csv]
bas = [csv_data[csv_data.ID == pdb_id].measurement_value.values[0] for pdb_id in pdb_ids_csv]

queries = QueryCollection()
print(f'Adding {len(pdb_files)} queries to the query collection ...')
for i, pdb_file in enumerate(pdb_files):
    queries.add(
        ProteinProteinInterfaceResidueQuery(
            pdb_path = pdb_file,
            chain_id1 = "M",
            chain_id2 = "P",
            distance_cutoff = distance_cutoff,
            targets = {
                'binary': int(float(bas[i]) <= 500), # binary target value
                'BA': bas[i], # continuous target value
                'cluster': clusters[i]
                },
            pssm_paths = {
                "M": pssm_m[i],
                "P": pssm_p[i]
                }))
print('Queries created and ready to be processed.\n')

output_paths = queries.process()
print(output_paths)

100 pdbs found.
100 MHC pssms found.
100 peptide pssms found.
Adding 100 queries to the query collection ...
Queries created and ready to be processed.



chain M not found in the structure
Query residue-ppi-BA-101047:M-P's graph was not saved in the hdf5 file; check the query's files
tri_norm: face with normal vector of lenght 0
tri_norm: face with normal vector of lenght 0
tri_norm: face with normal vector of lenght 0
tri_norm: face with normal vector of lenght 0


['processed-queries.hdf5']


In [None]:
# dividing hdf5 file in train, valid, test
# NOTE(review): the query cell's captured output lists 'processed-queries.hdf5',
# not 'residue.hdf5' — confirm which file this cell is meant to read.
hdf5_path = 'residue.hdf5'
train_clusters = [3, 4, 5, 2]
val_clusters = [1, 8]
test_clusters = [6]
target = 'target_values'  # name of the group read under each entry
feature = 'cluster'       # name of the value read inside that group

clusters = {}   # cluster id (int) -> number of entries carrying it
train_ids = []
val_ids = []
test_ids = []
# '/Users/giuliacrocioni/remote_snellius/data/pMHCI/features_output_folder/GNN/residue/13072022/residue.hdf5'
with h5py.File(hdf5_path, 'r') as hdf5:

    total_entries = len(hdf5.keys())
    # Avoid ZeroDivisionError in the percentages below when the file is empty.
    percent_base = max(total_entries, 1)

    for key in hdf5.keys():
        feature_value = float(hdf5[key][target][feature][()])
        # Entries whose cluster is in none of the three lists are left out of
        # every split, but are still counted in the per-cluster summary.
        if feature_value in train_clusters:
            train_ids.append(key)
        elif feature_value in val_clusters:
            val_ids.append(key)
        elif feature_value in test_clusters:
            test_ids.append(key)

        # Look up and store under the same int key, so the count cannot be
        # reset when the float value and its int key differ.
        cluster_id = int(feature_value)
        clusters[cluster_id] = clusters.get(cluster_id, 0) + 1


    print(f'Trainset contains {len(train_ids)} data points, {round(100*len(train_ids)/percent_base, 2)}% of the total data.')
    print(f'Validation set contains {len(val_ids)} data points, {round(100*len(val_ids)/percent_base, 2)}% of the total data.')
    print(f'Test set contains {len(test_ids)} data points, {round(100*len(test_ids)/percent_base, 2)}% of the total data.\n')

    # Per-cluster summary, largest group first.
    for (key, value) in sorted(clusters.items(), key=lambda x:x[1], reverse=True):
        print(f'Group with value {key}: {value} data points, {round(100*value/percent_base, 2)}% of total data.')

# Write one file per split from the collected entry keys.
save_hdf5_keys(hdf5_path, train_ids, 'train.hdf5', hardcopy = True)
save_hdf5_keys(hdf5_path, val_ids, 'valid.hdf5', hardcopy = True)
save_hdf5_keys(hdf5_path, test_ids, 'test.hdf5', hardcopy = True)

- Generating variants.hdf5

In [7]:
# Single-residue variant queries on the 3C8P test structure: one query for
# each number from 1 to count_queries, all sharing the same targets.
count_queries = 5
pdb_path = str(PATH_TEST / "data/pdb/3C8P/3C8P.pdb")
ref_path = str(PATH_TEST / "data/ref/3C8P/3C8P.pdb")
targets = compute_targets(pdb_path, ref_path)

# PSSM files for chains "A" and "B"; the paths do not depend on the loop.
pssm_path_a = str(PATH_TEST / "data/pssm/3C8P/3C8P.A.pdb.pssm")
pssm_path_b = str(PATH_TEST / "data/pssm/3C8P/3C8P.B.pdb.pssm")

queries = QueryCollection()

for number in range(1, count_queries + 1):
    # Positional arguments are presumably: structure path, chain id, residue
    # number, insertion code, wild-type and variant amino acid — confirm
    # against SingleResidueVariantResidueQuery's signature.
    query = SingleResidueVariantResidueQuery(
        pdb_path,
        "A",
        number,
        None,
        alanine,
        phenylalanine,
        pssm_paths={"A": pssm_path_a, "B": pssm_path_b},
        targets=targets,
    )
    queries.add(query)

# Process every collected query and keep the returned paths.
output_paths = queries.process()

- Generating atomic.hdf5

In [8]:
# Atomic-level protein-protein interface queries for the four 1ATN models.
chain_id1, chain_id2 = "A", "B"
ref_path = str(PATH_TEST / "data/ref/1ATN/1ATN.pdb")
pssm_path1 = str(PATH_TEST / "data/pssm/1ATN/1ATN.A.pdb.pssm")
pssm_path2 = str(PATH_TEST / "data/pssm/1ATN/1ATN.B.pdb.pssm")
pdb_paths = [
    str(PATH_TEST / relative_path)
    for relative_path in (
        "data/pdb/1ATN/1ATN_1w.pdb",
        "data/pdb/1ATN/1ATN_2w.pdb",
        "data/pdb/1ATN/1ATN_3w.pdb",
        "data/pdb/1ATN/1ATN_4w.pdb",
    )
]

queries = QueryCollection()

# One query per model; targets are computed for each model against the
# shared reference structure.
for pdb_path in pdb_paths:
    targets = compute_targets(pdb_path, ref_path)
    atomic_query = ProteinProteinInterfaceAtomicQuery(
        pdb_path=pdb_path,
        chain_id1=chain_id1,
        chain_id2=chain_id2,
        targets=targets,
        pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},
    )
    queries.add(atomic_query)

# Process every collected query and keep the returned paths.
output_paths = queries.process()