In [987]:
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from rdkit import Chem

In [988]:
# Load the KEGG compound/reaction/enzyme table and rename the matched-entry
# columns to the EC names used by the rest of the notebook.
biological_ligands = pd.read_pickle(
    "../kegg_information/final_kegg_compound_reaction_enzyme_df.pkl"
).rename(columns={"matched_entry": "ecID", "matched_name": "ecName"})

# One row per unique_id, keeping its molecule object.
biological_ligands_mols = biological_ligands[["unique_id", "mol"]].drop_duplicates(subset="unique_id")

# SMILES strings are only generated for rows that have a molecule object;
# the remaining rows keep a missing descriptor.
has_mol = biological_ligands_mols["mol"].notna()
biological_ligands_mols.loc[has_mol, "mol_smiles"] = biological_ligands_mols.loc[has_mol, "mol"].apply(
    lambda mol: Chem.MolToSmiles(mol)
)

# Node file for biological ligands: identifier plus SMILES descriptor.
biological_ligands_nodes = biological_ligands_mols[["unique_id", "mol_smiles"]].rename(
    columns={"unique_id": "uniqueID:ID", "mol_smiles": "descriptor"}
)
biological_ligands_nodes.to_csv("biological_ligand_mols.csv", sep="\t", index=False)

# Distinct (pdb_entry, ecID, ecName) rows, excluding the placeholder written
# when the KEGG API returned nothing.
ec_nodes = biological_ligands[["pdb_entry", "ecID", "ecName"]].drop_duplicates()
ec_nodes = ec_nodes.loc[ec_nodes["pdb_entry"] != "KEGG API did not return result"]

In [989]:
# Relationship file linking each biological ligand to its EC number.
# Rows are kept only when the ligand has an identifier and the EC number
# is one of the retained EC nodes.
linked_rows = biological_ligands["unique_id"].notna() & biological_ligands["ecID"].isin(ec_nodes["ecID"])
biological_ligand_ec_rels = (
    biological_ligands.loc[linked_rows, ["unique_id", "ecID"]]
    .drop_duplicates()
    .rename(columns={"unique_id": ":START_ID", "ecID": ":END_ID(ec-id)"})
)
biological_ligand_ec_rels.to_csv("biological_ligand_ec_rels.csv", sep="\t", index=False)

In [990]:
# Unique pdb_entry values become the identifiers of the pdbEC nodes.
pdb_ec_nodes = ec_nodes.pdb_entry.drop_duplicates().rename("pdbEC:ID(pdb-ec-id)")

# Split every EC number on "." into its levels and derive the cumulative
# identifiers for each level of the hierarchy, e.g. for EC 1.2.3.4:
#   ecClass = "1", ecSubClass = "1.2", ecSubSubClass = "1.2.3".
# The fourth component is not needed here because ecID already holds the full number.
ec_levels = ec_nodes.ecID.str.split(".", expand = True)
ec_nodes["ecClass"] = ec_levels[0]
ec_nodes["ecSubClass"] = ec_levels[0] + "." + ec_levels[1]
# Build the sub-subclass from the already-prefixed subclass plus a separator.
# Prepending ecClass again and omitting the "." (as was done previously)
# produced identifiers such as "1.1.23" for EC 1.2.3.4.
ec_nodes["ecSubSubClass"] = ec_nodes["ecSubClass"] + "." + ec_levels[2]

In [991]:
# Node tables for every level of the EC hierarchy, each de-duplicated and
# given the header name used for the import.
ec_id_nodes = ec_nodes[["ecID", "ecName"]].drop_duplicates().rename(columns={"ecID": "ecID:ID(ec-id)"})
ec_nodes_class = ec_nodes["ecClass"].drop_duplicates().rename("ecClass:ID")
ec_nodes_subclass = ec_nodes["ecSubClass"].drop_duplicates().rename("ecSubClass:ID")
ec_nodes_subsubclass = ec_nodes["ecSubSubClass"].drop_duplicates().rename("ecSubSubClass:ID")

# Write every node table as a tab-separated file without the index.
for node_table, out_path in (
    (pdb_ec_nodes, "pdb_ec_nodes.csv"),
    (ec_id_nodes, "ec_id_nodes.csv"),
    (ec_nodes_class, "ec_nodes_class.csv"),
    (ec_nodes_subclass, "ec_nodes_subclass.csv"),
    (ec_nodes_subsubclass, "ec_nodes_subsubclass.csv"),
):
    node_table.to_csv(out_path, sep="\t", index=False)

In [992]:
# Relationship tables for the EC hierarchy. In every table the more specific
# level is the start node and the broader level is the end node.

# Subclass -> class (columns ordered start, end).
ec_class_subclass_rel = (
    ec_nodes[["ecSubClass", "ecClass"]]
    .drop_duplicates()
    .rename(columns={"ecSubClass": ":START_ID", "ecClass": ":END_ID"})
)

# Sub-subclass -> subclass.
ec_subclass_subsubclass_rel = (
    ec_nodes[["ecSubClass", "ecSubSubClass"]]
    .drop_duplicates()
    .rename(columns={"ecSubClass": ":END_ID", "ecSubSubClass": ":START_ID"})
)

# Full EC number -> sub-subclass.
ec_subsubclass_id_rel = (
    ec_nodes[["ecSubSubClass", "ecID"]]
    .drop_duplicates()
    .rename(columns={"ecSubSubClass": ":END_ID", "ecID": ":START_ID(ec-id)"})
)

# pdb_entry value -> EC number.
pdb_ec_rels = (
    ec_nodes[["pdb_entry", "ecID"]]
    .drop_duplicates()
    .rename(columns={"pdb_entry": ":START_ID(pdb-ec-id)", "ecID": ":END_ID(ec-id)"})
)

# Write every relationship table as a tab-separated file without the index.
for rel_table, out_path in (
    (ec_class_subclass_rel, "ec_class_subclass_rel.csv"),
    (ec_subclass_subsubclass_rel, "ec_subclass_subsubclass_rel.csv"),
    (ec_subsubclass_id_rel, "ec_subsubclass_id_rel.csv"),
    (pdb_ec_rels, "pdb_ec_rel.csv"),
):
    rel_table.to_csv(out_path, sep="\t", index=False)

In [993]:
# Both domain-ownership tables are read with the same missing-value rules:
# only the literal strings "NaN" and "None" count as missing, and pandas'
# default NA markers are switched off.
domain_csv_kwargs = {"na_values": ["NaN", "None"], "keep_default_na": False}

cath_domains = pd.read_csv("../domain_ownership/cath_domain_ownership.csv", **domain_csv_kwargs)
scop_domains = pd.read_csv("../domain_ownership/scop_domain_ownership.csv", **domain_csv_kwargs)

In [994]:
# PDB entry nodes: the sorted union of the pdb_id values seen in either
# domain-ownership table.
cath_pdb_nodes = cath_domains["pdb_id"].unique()
scop_pdb_nodes = scop_domains["pdb_id"].unique()

pdb_nodes = np.union1d(cath_pdb_nodes, scop_pdb_nodes)

# Single-column file; comments='' stops numpy from prefixing the header with "# ".
np.savetxt(
    "pdb_entry_nodes.csv",
    pdb_nodes,
    fmt='%s',
    delimiter='\t',
    header='pdbEntry:ID',
    comments='',
)

In [995]:
# Protein chain nodes: the sorted union of the protein_entity_id values seen
# in either domain-ownership table.
cath_protein_entities = cath_domains["protein_entity_id"].unique()
scop_protein_entities = scop_domains["protein_entity_id"].unique()

protein_entities = np.union1d(cath_protein_entities, scop_protein_entities)

# Single-column file; comments='' stops numpy from prefixing the header with "# ".
np.savetxt(
    "pdb_protein_chain_nodes.csv",
    protein_entities,
    fmt='%s',
    delimiter='\t',
    header='pdbProteinChain:ID',
    comments='',
)

In [1009]:
# ---- SCOP node tables: one de-duplicated (identifier, description) row per node ----

# Domain nodes.
scop_domains_nodes = (
    scop_domains[["scop_id", "dm_description"]]
    .drop_duplicates()
    .rename(columns={"scop_id": "scopDomain:ID(scop-domain-id)", "dm_description": "domainDescription"})
)
scop_domains_nodes.to_csv("scop_domains_nodes.csv", sep="\t", index=False)

# Family nodes.
# NOTE(review): the family description is taken from sf_description, the same
# column used for the superfamily nodes below - confirm that no family-specific
# description column was intended.
scop_family_nodes = (
    scop_domains[["scop_sunid", "sf_description"]]
    .drop_duplicates()
    .rename(columns={"scop_sunid": "scopFamily:ID(scop-family-id)", "sf_description": "familyDescription"})
)
scop_family_nodes.to_csv("scop_family_nodes.csv", sep="\t", index=False)

# Superfamily nodes.
scop_superfamily_nodes = (
    scop_domains[["sf_id", "sf_description"]]
    .drop_duplicates()
    .rename(columns={"sf_id": "scopSuperfamily:ID(scop-superfam-id)", "sf_description": "superfamilyDescription"})
)
scop_superfamily_nodes.to_csv("scop_superfamily_nodes.csv", sep="\t", index=False)

# Class nodes.
scop_class_nodes = (
    scop_domains[["cl_id", "cl_description"]]
    .drop_duplicates()
    .rename(columns={"cl_id": "scopClass:ID(scop-class-id)", "cl_description": "classDescription"})
)
scop_class_nodes.to_csv("scop_class_nodes.csv", sep="\t", index=False)

# Fold nodes.
scop_fold_nodes = (
    scop_domains[["cf_id", "cf_description"]]
    .drop_duplicates()
    .rename(columns={"cf_id": "scopFold:ID(scop-fold-id)", "cf_description": "foldDescription"})
)
scop_fold_nodes.to_csv("scop_fold_nodes.csv", sep="\t", index=False)

# ---- SCOP hierarchy relationships: each level (start) points to its parent level (end) ----

# Domain -> family.
scop_domain_family_rels = (
    scop_domains[["scop_id", "scop_sunid"]]
    .drop_duplicates()
    .rename(columns={"scop_id": ":START_ID(scop-domain-id)", "scop_sunid": ":END_ID(scop-family-id)"})
)
scop_domain_family_rels.to_csv("scop_domain_family_rels.csv", sep="\t", index=False)

# Family -> superfamily.
scop_family_superfamily_rels = (
    scop_domains[["scop_sunid", "sf_id"]]
    .drop_duplicates()
    .rename(columns={"scop_sunid": ":START_ID(scop-family-id)", "sf_id": ":END_ID(scop-superfam-id)"})
)
scop_family_superfamily_rels.to_csv("scop_family_superfam_rels.csv", sep="\t", index=False)

# Superfamily -> fold.
scop_superfamily_fold_rels = (
    scop_domains[["sf_id", "cf_id"]]
    .drop_duplicates()
    .rename(columns={"sf_id": ":START_ID(scop-superfam-id)", "cf_id": ":END_ID(scop-fold-id)"})
)
scop_superfamily_fold_rels.to_csv("scop_superfam_fold_rels.csv", sep="\t", index=False)

# Fold -> class.
scop_fold_class_rels = (
    scop_domains[["cf_id", "cl_id"]]
    .drop_duplicates()
    .rename(columns={"cf_id": ":START_ID(scop-fold-id)", "cl_id": ":END_ID(scop-class-id)"})
)
scop_fold_class_rels.to_csv("scop_fold_class_rels.csv", sep="\t", index=False)

In [997]:
# CATH domain nodes: one de-duplicated (domain identifier, name) row per node.
cath_domains_nodes = cath_domains[["cath_domain", "cath_name"]].drop_duplicates()
cath_domains_nodes.rename(columns = {"cath_domain": "cathDomain:ID(cath-domain-ID)", "cath_name": "cathName"}, inplace = True)
cath_domains_nodes.to_csv("cath_domains_nodes.csv", sep = "\t", index = False)

# Distinct identifiers for each level of the CATH hierarchy.
cath_class_nodes = cath_domains.cath_class.unique()
cath_architecture_nodes = cath_domains.cath_architecture.unique()
cath_topology_nodes = cath_domains.cath_topology.unique()
cath_homology_nodes = cath_domains.cath_homology.unique()

# Single-column node files; comments='' stops numpy from prefixing the header with "# ".
np.savetxt("cath_class_nodes.csv", cath_class_nodes, delimiter='\t',fmt='%s', header='cathClass:ID(cath-class-ID)', comments='')
np.savetxt("cath_architecture_nodes.csv", cath_architecture_nodes, delimiter='\t',fmt='%s', header='cathArchitecture:ID(cath-architecture-ID)', comments='')
np.savetxt("cath_topology_nodes.csv", cath_topology_nodes, delimiter='\t',fmt='%s', header='cathTopology:ID(cath-topology-ID)', comments='')
np.savetxt("cath_homology_nodes.csv", cath_homology_nodes, delimiter='\t',fmt='%s', header='cathHomology:ID(cath-homology-ID)', comments='')

# Hierarchy relationships. In every table the more specific level is the
# start node and its parent level is the end node.
cath_class_architecture_rels = cath_domains[["cath_class", "cath_architecture"]].rename(columns = {"cath_class": ":END_ID(cath-class-ID)", "cath_architecture" : ":START_ID(cath-architecture-ID)"}).drop_duplicates()
cath_architecture_topology_rels = cath_domains[["cath_architecture", "cath_topology"]].rename(columns = {"cath_architecture": ":END_ID(cath-architecture-ID)", "cath_topology" : ":START_ID(cath-topology-ID)"}).drop_duplicates()
cath_topology_homology_rels = cath_domains[["cath_topology", "cath_homology"]].rename(columns = {"cath_topology": ":END_ID(cath-topology-ID)", "cath_homology" : ":START_ID(cath-homology-ID)"}).drop_duplicates()
# The domain is the start node and its homology group the end node, matching the
# direction of the other hierarchy relationships. Previously the two roles were
# swapped, so this edge pointed from the homology group down to the domain.
cath_homology_domain_rels = cath_domains[["cath_homology", "cath_domain"]].rename(columns = {"cath_domain": ":START_ID(cath-domain-ID)", "cath_homology" : ":END_ID(cath-homology-ID)"}).drop_duplicates()

cath_class_architecture_rels.to_csv("cath_class_architecture_rels.csv", sep = "\t", index = False)
cath_architecture_topology_rels.to_csv("cath_architecture_topology_rels.csv", sep = "\t", index = False)
cath_topology_homology_rels.to_csv("cath_topology_homology_rels.csv", sep = "\t", index = False)
cath_homology_domain_rels.to_csv("cath_homology_domain_rels.csv", sep = "\t", index = False)

In [998]:
# Distinct (bound ligand id, name) pairs from each domain table, then combined.
bound_ligand_columns = ["bound_ligand_id", "bound_ligand_name"]
cath_bl_entities = cath_domains[bound_ligand_columns].drop_duplicates().reset_index(drop=True)
scop_bl_entities = scop_domains[bound_ligand_columns].drop_duplicates().reset_index(drop=True)

domain_bl_entities = pd.concat([cath_bl_entities, scop_bl_entities]).reset_index(drop=True).drop_duplicates()

# Ligand descriptors keyed by bl_id.
pdb_ligands = pd.read_pickle("../pdbe_graph_files/chains_domains_chem_descriptors_cath_scop.pkl")
pdb_ligands_nodes = (
    pdb_ligands[["bl_id", "descriptor"]]
    .drop_duplicates()
    .rename(columns={"bl_id": "entityID:ID"})
)

# Right merge: keep exactly the ligands that appear in the domain tables and
# attach their descriptor. The indicator column records where each row came from.
pdb_ligands_nodes = pdb_ligands_nodes.merge(
    domain_bl_entities,
    how="right",
    left_on="entityID:ID",
    right_on="bound_ligand_id",
    indicator=True,
)
# Every domain-table ligand must have found a descriptor row.
unmatched_rows = pdb_ligands_nodes["_merge"] != "both"
assert unmatched_rows.sum() == 0

pdb_ligands_nodes = pdb_ligands_nodes.drop(columns=["bound_ligand_id", "_merge"]).rename(
    columns={"bound_ligand_name": "entityName"}
)
pdb_ligands_nodes.to_csv("pdb_ligands.csv", sep="\t", index=False)

In [999]:
import re
def return_partial_EC_list(ec, total_ec_list):
    """Expand a possibly partial EC number into the full EC numbers it covers.

    Parameters
    ----------
    ec : str or missing value
        An EC number such as "1.1.1.1", or a partial one in which unknown
        levels are written as "-" (e.g. "1.1.-.-").
    total_ec_list : iterable of str
        All known EC numbers to match partial entries against.

    Returns
    -------
    list of str or float
        - ``np.nan`` when ``ec`` is not a string (NaN, None, ...).
        - ``[ec]`` when ``ec`` contains no "-" wildcard.
        - otherwise every item of ``total_ec_list`` (in its original order)
          whose leading levels match ``ec``, with each "-" matching one whole
          level of any length. The list is empty when nothing matches.
    """
    if not isinstance(ec, str):
        # Missing values come through as float NaN or None; neither can be expanded.
        return np.nan
    if "-" not in ec:
        return [ec]

    # Build the pattern level by level: literal levels are escaped, and a "-"
    # becomes "one or more non-dot characters" so that multi-digit levels such
    # as the "10" in 1.1.10.2 are matched. (A single-character wildcard would
    # silently skip them.)
    level_patterns = ["[^.]+" if level == "-" else re.escape(level) for level in ec.split(".")]
    # The lookahead keeps the match aligned on a level boundary while still
    # allowing prefix matches when ``ec`` has fewer levels than the candidate.
    pattern = re.compile(r"\.".join(level_patterns) + r"(?=\.|$)")
    return [candidate for candidate in total_ec_list if pattern.match(candidate)]

# Load the XML file containing explorenz ec data.
# ET is xml.etree.ElementTree, imported in the notebook's import cell.
xml_file_path = '../kegg_information/enzyme-data.xml'
tree = ET.parse(xml_file_path)

# Find all table_data elements and parse their data, keyed by the table's name attribute.
# NOTE(review): parse_table_data is not defined anywhere in the visible notebook;
# it must be defined (or imported) before this cell or a fresh-kernel run fails
# with NameError. From its use below it has to return a dict that
# pd.DataFrame.from_dict turns into a frame whose transpose has an "ec_num"
# column for the "entry" table - confirm against the original helper.
data = {}
for table_data_elem in tree.findall(".//table_data"):
    table_name = table_data_elem.attrib['name']
    data[table_name] = parse_table_data(table_data_elem)

# Convert the parsed data into one DataFrame per table.
# The loop variables are named differently from `data` to avoid shadowing it.
dfs = {name: pd.DataFrame.from_dict(table_rows) for name, table_rows in data.items()}

# Transpose the "entry" table so that "ec_num" can be selected as a column,
# then collect the distinct, non-missing EC numbers.
total_ec_nums = dfs["entry"].T
total_ec_nums_list = total_ec_nums["ec_num"].dropna().unique().tolist()

In [1000]:
def _split_entity_ec(domains, id_column):
    """Return distinct (id_column, single EC string) rows from a domain table.

    The comma-separated ``protein_entity_ec`` column is split and exploded so
    that each row carries one EC entry. Entries are not stripped here.
    """
    entity_ec = domains[[id_column, "protein_entity_ec"]].copy()
    entity_ec["protein_entity_ec"] = entity_ec["protein_entity_ec"].str.split(",")
    # The de-duplicated frame is returned; calling drop_duplicates() without
    # keeping its result (as was done previously) has no effect.
    return entity_ec.explode("protein_entity_ec").drop_duplicates()


def _resolve_entity_ec_rels(split_frames, id_column):
    """Combine split EC tables into an (entity -> pdbEC node) relationship table.

    EC entries are stripped, partial entries (containing "-") are expanded with
    ``return_partial_EC_list`` against ``total_ec_nums_list``, and only EC
    values present in ``pdb_ec_nodes`` are kept.
    """
    rels = pd.concat(split_frames).drop_duplicates()
    rels["protein_entity_ec"] = rels["protein_entity_ec"].str.strip()
    rels["protein_entity_ec"] = rels["protein_entity_ec"].apply(
        lambda ec: return_partial_EC_list(ec, total_ec_nums_list)
    )
    rels = rels.explode("protein_entity_ec")
    rels["protein_entity_ec"] = rels["protein_entity_ec"].str.strip()
    rels = rels.loc[rels["protein_entity_ec"].isin(pdb_ec_nodes)]
    # Stripping whitespace and expanding partial EC numbers can both recreate
    # pairs that were distinct before, so de-duplicate again to avoid writing
    # the same relationship more than once.
    rels = rels.drop_duplicates()
    return rels.rename(columns = {id_column : ":START_ID", "protein_entity_ec" : ":END_ID(pdb-ec-id)"})


# Bound ligand -> EC relationships.
cath_ligand_ec_rel = _split_entity_ec(cath_domains, "bound_ligand_id")
scop_ligand_ec_rel = _split_entity_ec(scop_domains, "bound_ligand_id")
ligand_ec_rel = _resolve_entity_ec_rels([cath_ligand_ec_rel, scop_ligand_ec_rel], "bound_ligand_id")

ligand_ec_rel.to_csv("ligand_ec_rel.csv", sep = "\t" , index = False)

# Protein chain -> EC relationships.
cath_protein_ec_rel = _split_entity_ec(cath_domains, "protein_entity_id")
scop_protein_ec_rel = _split_entity_ec(scop_domains, "protein_entity_id")
protein_ec_rel = _resolve_entity_ec_rels([cath_protein_ec_rel, scop_protein_ec_rel], "protein_entity_id")

protein_ec_rel.to_csv("protein_ec_rel.csv", sep = "\t", index = False)

In [1001]:
# Protein chain -> PDB entry relationships, gathered from both domain tables.
entry_chain_columns = ["pdb_id", "protein_entity_id"]
cath_protein_entry_rels = cath_domains[entry_chain_columns]
scop_protein_entry_rels = scop_domains[entry_chain_columns]

protein_entry_rels = (
    pd.concat([cath_protein_entry_rels, scop_protein_entry_rels])
    .drop_duplicates()
    .rename(columns={"pdb_id": ":END_ID", "protein_entity_id": ":START_ID"})
)
protein_entry_rels.to_csv("protein_entry_rels.csv", sep="\t", index=False)

In [1002]:
# Ligand entity -> PDB entry relationships.
pdb_ligands_entry_rel = pdb_ligands[["pdb_entry_id", "bl_id"]].drop_duplicates()

# Keep only ligands that have a node in pdb_ligands_nodes.
# This is theoretically temporary until we remove the ligands with only clashes from the retrieved ligands.
has_ligand_node = pdb_ligands_entry_rel["bl_id"].isin(pdb_ligands_nodes["entityID:ID"])
pdb_ligands_entry_rel = pdb_ligands_entry_rel.loc[has_ligand_node].rename(
    columns={"pdb_entry_id": ":END_ID", "bl_id": ":START_ID"}
)
pdb_ligands_entry_rel.to_csv("ligand_entry_rel.csv", sep="\t", index=False)

In [1003]:
# Parity scores between bound ligands and compounds.
parity_scores = pd.read_pickle("../parity_calcs/parity_calcs_cath_scop/all_parity_calcs.pkl")

# Keep rows that name a compound and whose ligand has a node in pdb_ligands_nodes.
scored_rows = parity_scores["compound"].notna() & parity_scores["bl_id"].isin(pdb_ligands_nodes["entityID:ID"])
ligand_score_relationships = (
    parity_scores.loc[scored_rows, ["bl_id", "compound", "score", "isCognate"]]
    .rename(columns={"bl_id": ":START_ID", "compound": ":END_ID", "score": "parityScore"})
    .drop_duplicates()
)
ligand_score_relationships.to_csv("ligand_score_relationships.csv", sep="\t", index=False)

In [1004]:
# CATH domain -> bound ligand interaction relationships. The contact and
# hydrogen-bond statistics and the ownership category travel along as
# relationship properties.
cath_interaction_headers = {
    "cath_domain": ":START_ID(cath-domain-ID)",
    "domain_contact_counts": "domainContactCounts",
    "domain_contact_perc": "domainContactPerc",
    "domain_hbond_counts": "domainHbondCounts",
    "domain_hbond_perc": "domainHbondPerc",
    "domain_ownership": "interactionMode",
    "bound_ligand_id": ":END_ID",
}
cath_domain_ligand_interactions = (
    cath_domains[list(cath_interaction_headers)]
    .drop_duplicates()
    .rename(columns=cath_interaction_headers)
)
cath_domain_ligand_interactions.to_csv("cath_domain_ligand_interactions.csv", sep="\t", index=False)

In [1005]:
# SCOP domain -> bound ligand interaction relationships. The contact and
# hydrogen-bond statistics and the ownership category travel along as
# relationship properties.
scop_interaction_headers = {
    "scop_id": ":START_ID(scop-domain-id)",
    "domain_contact_counts": "domainContactCounts",
    "domain_contact_perc": "domainContactPerc",
    "domain_hbond_counts": "domainHbondCounts",
    "domain_hbond_perc": "domainHbondPerc",
    "domain_ownership": "interactionMode",
    "bound_ligand_id": ":END_ID",
}
scop_domain_ligand_interactions = (
    scop_domains[list(scop_interaction_headers)]
    .drop_duplicates()
    .rename(columns=scop_interaction_headers)
)
scop_domain_ligand_interactions.to_csv("scop_domain_ligand_interactions.csv", sep="\t", index=False)

In [1006]:
# CATH domain -> protein chain relationships.
cath_domain_protein_rels = (
    cath_domains[["cath_domain", "protein_entity_id"]]
    .drop_duplicates()
    .rename(columns={"cath_domain": ":START_ID(cath-domain-ID)", "protein_entity_id": ":END_ID"})
)
cath_domain_protein_rels.to_csv("cath_domain_protein_rels.csv", sep="\t", index=False)

In [1007]:
# SCOP domain -> protein chain relationships.
scop_domain_protein_rels = (
    scop_domains[["scop_id", "protein_entity_id"]]
    .drop_duplicates()
    .rename(columns={"scop_id": ":START_ID(scop-domain-id)", "protein_entity_id": ":END_ID"})
)
scop_domain_protein_rels.to_csv("scop_domain_protein_rels.csv", sep="\t", index=False)

In [None]:
# Reference notes only: the triple-quoted strings in this cell are bare string
# expressions. Nothing here is executed as a shell command or as Cypher.

# neo4j-admin bulk-import command. It lists the tab-separated node and
# relationship files produced by this notebook (referenced under import/),
# each paired with the label or relationship type it is loaded as.
"""
bin/neo4j-admin database import full --skip-bad-relationships \
--delimiter="\t" \
--nodes=pdbEC=import/pdb_ec_nodes.csv \
--nodes=ecID=import/ec_id_nodes.csv \
--nodes=ecClass=import/ec_nodes_class.csv \
--nodes=ecSubClass=import/ec_nodes_subclass.csv \
--nodes=ecSubSubClass=import/ec_nodes_subsubclass.csv \
--nodes=pdbLigandEntity=import/pdb_ligands.csv \
--nodes=biologicalLigand=import/biological_ligand_mols.csv \
--nodes=pdbEntry=import/pdb_entry_nodes.csv \
--nodes=pdbProteinChain=import/pdb_protein_chain_nodes.csv \
--nodes=cathClass=import/cath_class_nodes.csv \
--nodes=cathArchitecture=import/cath_architecture_nodes.csv \
--nodes=cathTopology=import/cath_topology_nodes.csv \
--nodes=cathHomology=import/cath_homology_nodes.csv \
--nodes=cathDomain=import/cath_domains_nodes.csv \
--nodes=scopDomain=import/scop_domains_nodes.csv \
--nodes=scopFamily=import/scop_family_nodes.csv \
--nodes=scopSuperfamily=import/scop_superfamily_nodes.csv \
--nodes=scopFold=import/scop_fold_nodes.csv \
--nodes=scopClass=import/scop_class_nodes.csv \
--relationships=IS_IN_SCOP_FAMILY=import/scop_domain_family_rels.csv \
--relationships=IS_IN_SCOP_SUPERFAMILY=import/scop_family_superfam_rels.csv \
--relationships=IS_IN_SCOP_FOLD=import/scop_superfam_fold_rels.csv \
--relationships=IS_IN_SCOP_CLASS=import/scop_fold_class_rels.csv \
--relationships=MAPS_TO_EC=import/pdb_ec_rel.csv \
--relationships=HAS_SIMILARITY=import/ligand_score_relationships.csv \
--relationships=IS_IN_EC_CLASS=import/ec_class_subclass_rel.csv \
--relationships=IS_IN_EC_SUBCLASS=import/ec_subclass_subsubclass_rel.csv \
--relationships=IS_IN_EC_SUBSUBCLASS=import/ec_subsubclass_id_rel.csv \
--relationships=IS_IN_CATH_CLASS=import/cath_class_architecture_rels.csv \
--relationships=IS_IN_CATH_ARCHITECTURE=import/cath_architecture_topology_rels.csv \
--relationships=IS_IN_CATH_TOPOLOGY=import/cath_topology_homology_rels.csv \
--relationships=IS_IN_CATH_HOMOLOGY=import/cath_homology_domain_rels.csv \
--relationships=INTERACTS_WITH_LIGAND=import/cath_domain_ligand_interactions.csv \
--relationships=INTERACTS_WITH_LIGAND=import/scop_domain_ligand_interactions.csv \
--relationships=HAS_PDB_EC=import/protein_ec_rel.csv \
--relationships=HAS_PDB_EC=import/ligand_ec_rel.csv \
--relationships=IS_IN_PDB=import/ligand_entry_rel.csv \
--relationships=IS_IN_PDB=import/protein_entry_rels.csv \
--relationships=IS_IN_PROTEIN_CHAIN=import/cath_domain_protein_rels.csv \
--relationships=IS_IN_PROTEIN_CHAIN=import/scop_domain_protein_rels.csv \
--relationships=IS_IN_EC=import/biological_ligand_ec_rels.csv \
--overwrite-destination graph.db
"""

# Example Cypher query: for PDB entry "2vyv", walk from the entry through its
# protein chains and CATH domains to the interacting ligand entities and on to
# similar biological ligands, ordered by ligand entity and descending parity score.
# NOTE(review): this query uses the IS_IN_LIGAND relationship and the pdbLigand
# label, neither of which appears in the import command above - it may be stale.
"""MATCH (n:pdbEntry {pdbEntry: "2vyv"})<-[l:IS_IN_PDB]-(p:pdbProteinChain)
<-[o:IS_IN_PROTEIN_CHAIN]-(c:cathDomain)-[k:INTERACTS_WITH_LIGAND]->
(g:pdbLigandEntity)-[q:IS_IN_LIGAND]->(h:pdbLigand)-[e:HAS_SIMILARITY]->
(w:biologicalLigand) RETURN c.cathDomain AS cath_domain, 
k.interactionMode as interaction_type, g.entityID AS ligand_entity,h.entityID AS pdb_ligand,
e.parityScore as parity_score, w.uniqueID AS biological_ligand
ORDER BY ligand_entity, parity_score DESCENDING"""


# Example Cypher query: for PDB entry "2vyv", list ligand entities together with
# the CATH domains that interact with them in one of three interaction modes,
# returning the contact and hydrogen-bond statistics stored on the relationship.
"""MATCH (p:pdbEntry)<-[a:IS_IN_PDB]-(e:pdbLigandEntity)<-[i:INTERACTS_WITH_LIGAND]-(cd:cathDomain) WHERE p.pdbEntry = "2vyv" AND i.interactionMode IN ["partner_binding_domain", "dominant_binding_domain", "uniquely_binding_domain"] RETURN e.entityID as ligand_entity_id, e.entityName as ligand_entity_name ,i.interactionMode as interaction_mode, i.domainContactCounts as contact_counts, i.domainContactPerc as contact_perc, i.domainHbondCounts as hbond_counts, i.domainHbondPerc as hbond_perc, cd.cathDomain as cath_domain"""

# Example Cypher query: walk the SCOP hierarchy from class down to domains that
# interact with a ligand entity in a minor or partner binding mode.
# NOTE(review): it returns sf.sunidDescription, but the scopFamily node file is
# written with a familyDescription column - confirm the property name.
"""MATCH (sc:scopClass)<-[:IS_IN_SCOP_CLASS]-(sff:scopFold)<-[:IS_IN_SCOP_FOLD]-(ssf:scopSuperfamily)<-[:IS_IN_SCOP_SUPERFAMILY]-(sf:scopFamily)<-[:IS_IN_SCOP_FAMILY]-(sd:scopDomain)-[a:INTERACTS_WITH_LIGAND]->(p:pdbLigandEntity) WHERE a.interactionMode IN ["minor_binding_domain", "partner_binding_domain"] RETURN DISTINCT sf.sunidDescription LIMIT 100"""