This notebook analyses the data available from PDBe-graph, combines it with data from KEGG and ligand similarity scoring, and benchmarks the results against the analysis performed in PARITY (2018).

In [3]:
from neo4j import __version__ as neo4j_version
print(f"Neo4j python package version: {neo4j_version}")
#class is from https://towardsdatascience.com/neo4j-cypher-python-7a919a372be7
import pandas as pd
from neo4j import GraphDatabase
class Neo4jConnection:
    """Thin convenience wrapper around a Neo4j driver.

    The driver is created in the constructor. Errors raised while creating the
    driver or while running a query are printed instead of being propagated.
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection settings and attempt to open a driver.

        If the driver cannot be created, the error is printed and the
        connection is left without a driver.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        try:
            credentials = (self.__user, self.__pwd)
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver when one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, db=None):
        """Run ``query`` and return everything it yields as a list.

        Parameters
        ----------
        query : the statement handed to ``session.run``.
        db : optional database name; when given it is passed to the driver as
            ``database=db``, otherwise the session is opened with no arguments.

        Returns
        -------
        list or None
            The materialised result, or ``None`` when opening the session or
            running the query raised (the error is printed).

        Raises
        ------
        AssertionError
            If no driver was created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session_kwargs = {} if db is None else {"database": db}
        session = None
        records = None
        try:
            session = self.__driver.session(**session_kwargs)
            records = list(session.run(query))
        except Exception as e:
            print("Query failed:", e)
        finally:
            # Always release the session, including after a failed query.
            if session is not None:
                session.close()
        return records

Neo4j python package version: 5.10.0


In [24]:
def return_partial_EC_list(ec, total_ec_list):
    """Expand a partial EC number into the complete EC numbers it covers.

    A dot-separated component written as ``-`` (e.g. ``"1.1.1.-"`` or
    ``"3.4.-.-"``) is a wildcard for one whole level of the EC hierarchy, so it
    matches components of any length (``"1.-.-.-"`` covers ``"1.10.1.1"``).

    Parameters
    ----------
    ec : str or missing value
        The EC number to expand. ``None``/NaN is treated as missing.
    total_ec_list : iterable of str
        The complete EC numbers to search.

    Returns
    -------
    list of str or float
        * NaN when ``ec`` is missing;
        * ``[ec]`` when ``ec`` contains no ``-``;
        * otherwise the entries of ``total_ec_list`` (original order) whose
          leading components match ``ec``; possibly an empty list.

    Raises
    ------
    TypeError
        If ``ec`` is neither a string nor a missing value.
    """
    if not isinstance(ec, str):
        # pd.isna covers None and float NaN without relying on numpy being imported.
        if pd.isna(ec):
            return float("nan")
        raise TypeError(f"EC number must be a string or missing value, got {type(ec).__name__}")

    if "-" not in ec:
        return [ec]

    wanted = ec.split(".")

    def _covers(candidate):
        """True when every component of the pattern matches the candidate's."""
        have = candidate.split(".")
        if len(have) < len(wanted):
            return False
        # zip stops at the pattern's length, so extra trailing levels are ignored.
        return all(w == h or (w == "-" and h != "") for w, h in zip(wanted, have))

    return [candidate for candidate in total_ec_list if _covers(candidate)]
    
def parse_table_data(elem):
    """Collect the ``row`` children of ``elem`` into a dict of dicts.

    Each row becomes a mapping from the ``name`` attribute of its ``field``
    children to that field's text. Rows are keyed by their 0-based position
    among the direct ``row`` children of ``elem``.
    """
    return {
        position: {field.attrib['name']: field.text for field in row.findall('field')}
        for position, row in enumerate(elem.findall('row'))
    }

import pandas as pd
import xml.etree.ElementTree as ET
import re

Connect to the database:

In [5]:
import getpass
import os

# Connection settings come from the environment so that no secret is stored in
# the notebook; the password falls back to an interactive prompt when
# NEO4J_PASSWORD is not set. The password that was previously committed here
# should be treated as compromised and rotated.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: "),
)

In [122]:
# Count the distinct label sets (`labels(n)`) found on nodes in the graph.
node_count_query = """
MATCH (n)
RETURN COUNT(DISTINCT labels(n)) AS totalUniqueNodeLabels"""
node_count = pd.DataFrame(
    [dict(record) for record in conn.query(node_count_query, db='neo4j')]
)

In [123]:
# Display the Cypher text of the node-label count query.
node_count_query

'\nMATCH (n)\nRETURN COUNT(DISTINCT labels(n)) AS totalUniqueNodeLabels'

In [121]:
# Count the distinct relationship types found in the graph.
rel_count_query = """
MATCH ()-[r]->()
RETURN COUNT(DISTINCT TYPE(r)) AS totalUniqueRelationshipTypes"""
rel_count = pd.DataFrame(
    [dict(record) for record in conn.query(rel_count_query, db='neo4j')]
)

In [125]:
# Display the Cypher text of the relationship-type count query.
rel_count_query

'\nMATCH ()-[r]->()\nRETURN COUNT(DISTINCT TYPE(r)) AS totalUniqueRelationshipTypes'

How many PDB entries are there in total in the dataset and what is the breakdown of these?

In [117]:
# Distinct-entry counts under progressively stricter filters. The sub-queries
# are combined with UNION so each result row is one labelled count ("Val").
# Ligand entities whose CHEM_COMP_LIST is "UNX" or "UNL" are excluded from the
# ligand-based counts.
ec_entries_query = """
MATCH (a:Entry) 
RETURN "PDB entries" as Val, count(DISTINCT(a.ID)) 
UNION
MATCH (a:Entry)-[:HAS_ENTITY]->(e:Entity) WHERE e.TYPE = "p" AND e.POLYMER_TYPE = "P" 
RETURN "Unique Entries with protein polymers" as Val, count(DISTINCT(a.ID)) 
UNION
MATCH (b:BoundLigand)<-[:IS_AN_INSTANCE_OF]-(p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity) WHERE NOT p.CHEM_COMP_LIST in ["UNX", "UNL"] AND e.TYPE = "p" AND e.POLYMER_TYPE = "P" 
RETURN "Unique Entries with protein polymers and bound ligands" as Val, count(DISTINCT(a.ID)) 
UNION
MATCH (b:BoundLigand)<-[:IS_AN_INSTANCE_OF]-(p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity) WHERE NOT p.CHEM_COMP_LIST in ["UNX", "UNL"] AND e.TYPE = "p" AND e.POLYMER_TYPE = "P" and e.EC IS NOT NULL 
RETURN "Unique Entries with protein polymers, EC number and BoundLigands" as Val, count(DISTINCT(a.ID))
UNION
MATCH (b:BoundLigand)<-[:IS_AN_INSTANCE_OF]-(p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity) WHERE NOT p.CHEM_COMP_LIST in ["UNX", "UNL"] AND e.TYPE = "p" AND e.POLYMER_TYPE = "P" and e.EC IS NOT NULL AND p.TYPE = "b" and p.POLYMER_TYPE = "B" 
RETURN "Unique Entries with protein polymers, EC number and BoundLigands which are B" as Val, count(DISTINCT(a.ID))
UNION
MATCH (b:BoundLigand)<-[:IS_AN_INSTANCE_OF]-(p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity) WHERE NOT p.CHEM_COMP_LIST in ["UNX", "UNL"] AND e.TYPE = "p" AND e.POLYMER_TYPE = "P" and e.EC IS NOT NULL AND p.TYPE = "s" and p.POLYMER_TYPE = "S" 
RETURN "Unique Entries with protein polymers, EC number and BoundLigands which are S" as Val, count(DISTINCT(a.ID))
"""
ec_pdb_entries = pd.DataFrame(
    [dict(record) for record in conn.query(ec_entries_query, db='neo4j')]
)

In [118]:
# Entries that have an EC-annotated protein polymer, with no bound-ligand requirement.
ec_entries_no_bl_query = """MATCH (a:Entry)-[:HAS_ENTITY]->(e:Entity) WHERE e.TYPE = "p" AND e.POLYMER_TYPE = "P" and e.EC IS NOT NULL 
RETURN "Unique Entries with protein polymers, EC number" as Val, count(DISTINCT(a.ID))"""
ec_entries_no_bl = pd.DataFrame(
    [dict(record) for record in conn.query(ec_entries_no_bl_query, db='neo4j')]
)

In [119]:
# Display the count of entries with an EC-annotated protein polymer.
ec_entries_no_bl

Unnamed: 0,Val,count(DISTINCT(a.ID))
0,"Unique Entries with protein polymers, EC number",92864


In [120]:
# Display the labelled entry counts returned by the UNION query.
ec_pdb_entries

Unnamed: 0,Val,count(DISTINCT(a.ID))
0,PDB entries,200708
1,Unique Entries with protein polymers,196495
2,Unique Entries with protein polymers and bound...,150497
3,"Unique Entries with protein polymers, EC numbe...",80796
4,"Unique Entries with protein polymers, EC numbe...",80409
5,"Unique Entries with protein polymers, EC numbe...",4173


In [51]:
# Save the entry-count table; to_csv writes comma-separated text (with the index) despite the .txt suffix.
ec_pdb_entries.to_csv("ec_pdb_entries_info.txt")

What are the types of entities in the PDBe-graph?

In [30]:
# List the distinct values of the TYPE property on Entity nodes.
entity_types_query = """
MATCH (e:Entity)
RETURN DISTINCT(e.TYPE)"""
entity_types_data = pd.DataFrame(
    [dict(record) for record in conn.query(entity_types_query, db='neo4j')]
)

In [31]:
# Display the distinct entity TYPE codes.
entity_types_data

Unnamed: 0,(e.TYPE)
0,p
1,b
2,s
3,w


**TODO:** Describe what each of these entity type codes means.

The chemical component descriptor nodes in the graph can be used to access InChI or SMILES string representations of the ligands. How many bound ligands in the graph have an InChI or SMILES representation?

In [26]:
# Distinct bound-ligand entity counts (by p.UNIQID) at each level of chemical
# annotation, combined with UNION so each result row is one labelled count.
bl_entries_query = """
MATCH (p:Entity)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand) 
RETURN "Bound Ligands" as Val, count(DISTINCT(p.UNIQID)) 
UNION
MATCH (cc:ChemicalComponent)<-[:IS_A]-(p:Entity)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand) 
RETURN "Bound Ligand Entities which are chem comps" as Val, count(DISTINCT(p.UNIQID))
UNION
MATCH (d:ChemicalComponentDesc)<-[:DESCRIBED_BY]-(cc:ChemicalComponent)<-[:IS_A]-(p:Entity),(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand)
RETURN "Bound ligands with chem comp desc" as Val, count(DISTINCT(p.UNIQID))
UNION
MATCH (d:ChemicalComponentDesc)<-[:DESCRIBED_BY]-(cc:ChemicalComponent)<-[:IS_A]-(p:Entity),(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand)
WHERE d.TYPE in ["InChI","SMILES"] AND d.DESCRIPTOR IS NOT NULL AND cc.ID <> "UNL"
RETURN "Bound ligands described by InChI or SMILES" as Val, count(DISTINCT(p.UNIQID))
UNION
MATCH (cc:ChemicalComponent)<-[:IS_A]-(p:Entity),(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand)
WHERE cc.ID = "UNL"
RETURN "Unknown Bound ligands" as Val, count(DISTINCT(p.UNIQID))
"""
bl_entries = pd.DataFrame(
    [dict(record) for record in conn.query(bl_entries_query, db='neo4j')]
)

In [27]:
# Display the bound-ligand annotation counts.
bl_entries

Unnamed: 0,Val,count(DISTINCT(p.UNIQID))
0,Bound Ligands,376331
1,Bound Ligand Entities which are chem comps,376331
2,Bound ligands with chem comp desc,375856
3,Bound ligands described by InChI or SMILES,375856
4,Unknown Bound ligands,475


What is the breakdown of protein domains in the dataset?

In [None]:
# FIXME: draft query that has never been executed. The Cypher below is not yet
# valid as written: the first pattern line is not followed by a comma, `p` is
# bound as a Pfam node and then used as the source of IS_AN_INSTANCE_OF, and
# the WHERE/RETURN clauses reference `e`, `d` and `a`, which are never bound in
# the MATCH. The query text is left unchanged until the intended pattern is decided.
domains_query_string = '''
MATCH
(c:CATH), (p:Pfam), (s:SCOP)
(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand)<-[:BINDS_LIGAND]-(c)
WHERE
e.TYPE = "p"
AND e.POLYMER_TYPE = "P"
AND e.EC IS NOT NULL
AND d.DESCRIPTOR STARTS WITH "InChI"
RETURN DISTINCT a.ID as pdb_entry_id, p.UNIQID as ligand_entity_id, d.DESCRIPTOR as inchi_descriptor, e.EC as protein_polymer_EC LIMIT 10
'''
# Pass the query defined above; the original referenced the undefined name `query_string`.
domains_entries_data = pd.DataFrame([dict(record) for record in conn.query(domains_query_string, db='neo4j')])

With the get_kegg_information.py script, we are able to get the cognate ligand information for EC entries. What is our coverage of the EC reactions annotated in the PDBe-graph?

Start with the number of EC reactions annotated in the PDBe-graph:

In [28]:
# Count the distinct EC annotation values (e.EC) on protein polymer entities.
unique_ec_reactions = '''
MATCH
(e:Entity)
WHERE
e.TYPE = "p"
AND e.POLYMER_TYPE = "P"
AND e.EC IS NOT NULL
RETURN COUNT(DISTINCT(e.EC)) as number_unique_protein_polymer_EC
'''
unique_ec_count = pd.DataFrame(
    [dict(record) for record in conn.query(unique_ec_reactions, db='neo4j')]
)

In [29]:
# Display the number of distinct EC annotation values.
unique_ec_count

Unnamed: 0,number_unique_protein_polymer_EC
0,4227


In [71]:
# The queries in this cell share one pattern: an Entry with an EC-annotated
# protein polymer entity `e` (TYPE 'p', POLYMER_TYPE 'P') and a ligand entity `p`
# that is an instance of a BoundLigand whose CHEM_COMP_ID is not "UNX"/"UNL".

# Protein chain -> CATH domain mapping, via the chain's PDB residues.
cath_chains_domains_mapping_query = """
MATCH (p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity)-[:HAS_PDB_RESIDUE]->(pr:PDBResidue),
(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand),
(pr)-[:IS_IN_CATH_DOMAIN]->(c:CATH)
WHERE
NOT bl.CHEM_COMP_ID in ["UNX", "UNL"] AND
e.TYPE = 'p' AND e.POLYMER_TYPE = 'P' AND e.EC IS NOT NULL 
RETURN DISTINCT e.UNIQID as chain, c.CATHCODE as cathcode, c.DOMAIN as CATH_domain, e.EC as EC_descriptor
"""

# Protein chain -> SCOP domain mapping, via the chain's PDB residues.
scop_chains_domains_mapping_query = """
MATCH (p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity)-[:HAS_PDB_RESIDUE]->(pr:PDBResidue),
(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand),
(pr)-[:IS_IN_SCOP_DOMAIN]->(sc:SCOP)
WHERE
NOT bl.CHEM_COMP_ID in ["UNX", "UNL"] AND
e.TYPE = 'p' AND e.POLYMER_TYPE = 'P' AND e.EC IS NOT NULL 
RETURN DISTINCT e.UNIQID as chain, sc.SCCS as SCCS, sc.SUNID as SUNID, e.EC as EC_descriptor
"""

# Protein chain -> Pfam mapping; HAS_PFAM is matched on the entity itself and
# without a direction, unlike the residue-level CATH/SCOP relationships above.
pfam_chains_domains_mapping_query = """
MATCH (p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity), 
(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand),
(e)-[:HAS_PFAM]-(pf:Pfam)
WHERE
NOT bl.CHEM_COMP_ID in ["UNX", "UNL"] AND
e.TYPE = 'p' AND e.POLYMER_TYPE = 'P' AND e.EC IS NOT NULL 
RETURN DISTINCT 
    e.UNIQID as chain, 
    pf.NAME as pfam_name, 
    pf.PFAM_ACCESSION as pfam_accession, 
    e.EC as EC_descriptor
"""

# Every chemical component descriptor (all descriptor types) of each bound
# ligand, paired with the EC annotation of protein polymers in the same entry.
# `cc.ID = bl.CHEM_COMP_ID` keeps only the component matching the bound ligand.
all_chem_descriptors_query = """
MATCH (d:ChemicalComponentDesc)<-[:DESCRIBED_BY]-(cc:ChemicalComponent)<-[:IS_A]-(p:Entity)<-[:HAS_ENTITY]-(a:Entry)-[:HAS_ENTITY]->(e:Entity),
(p)-[:IS_AN_INSTANCE_OF]->(bl:BoundLigand)
WHERE
NOT bl.CHEM_COMP_ID in ["UNX", "UNL"] AND
e.TYPE = 'p'
AND e.POLYMER_TYPE = 'P'
AND e.EC IS NOT NULL
AND cc.ID = bl.CHEM_COMP_ID
RETURN DISTINCT 
    a.ID as pdb_entry_id, 
    bl.UNIQID as bl_id,
    bl.CHEM_COMP_ID as bl_name,
    p.UNIQID as ligand_entity_id, 
    d.DESCRIPTOR as descriptor, 
    d.TYPE as descriptor_type, 
    e.EC as protein_polymer_EC
"""

In [25]:
# Residue-level contacts between bound ligands and residues of EC-annotated
# protein polymers that lie in a CATH domain.
#   * First MATCH: entry -> protein entity -> residue -> CATH domain.
#   * Second MATCH: HAS_ARP_CONTACT relationships from a bound ligand to that residue.
#   * Contacts whose CONTACT_TYPE is exactly "['vdw_clash']" or "['clash']" are dropped.
#   * The domain relationship and the contact must share a STRUCT_ASYM_ID.
#   * Bound ligands with CHEM_COMP_ID "UNX"/"UNL" are excluded.
cath_pdb_residue_interactions_query_distinct = """
MATCH
(a:Entry)-[:HAS_ENTITY]->(e:Entity)-[:HAS_PDB_RESIDUE]->(pr:PDBResidue)-[cd:IS_IN_CATH_DOMAIN]->(c:CATH) WHERE
e.TYPE = 'p' AND e.POLYMER_TYPE = 'P' AND e.EC IS NOT NULL
WITH e, pr ,cd, c, a 
MATCH
(pr)<-[Arp:HAS_ARP_CONTACT]-(bl:BoundLigand)<-[:IS_AN_INSTANCE_OF]-(p:Entity) WHERE
Arp.CONTACT_TYPE <> "['vdw_clash']" AND Arp.CONTACT_TYPE <> "['clash']" 
AND cd.STRUCT_ASYM_ID = Arp.STRUCT_ASYM_ID_2 AND
NOT bl.CHEM_COMP_ID in ["UNX", "UNL"]
RETURN DISTINCT
  a.ID as pdb_id, 
  e.UNIQID as protein_entity_id,
  e.BEST_CHAIN_ID as pdb_best_chain_id,
  e.EC as protein_entity_ec,
  p.UNIQID as ligand_entity_id, 
  p.TYPE as ligand_entity_type,
  p.BEST_CHAIN_ID as bound_ligand_best_chain_id, 
  p.POLYMER_TYPE as ligand_entity_polymer_type, 
  pr.UNIQID as pdb_residue_id,
  pr.CHEM_COMP_ID as pdb_residue_type,
  c.DOMAIN as cath_domain,
  c.CLASS as cath_class,
  c.ARCH as cath_architecture,
  c.TOPOL as cath_topology,
  c.HOMOL as cath_homology,
  c.NAME as cath_name,
  bl.UNIQID as bound_ligand_id, 
  bl.CHEM_COMP_ID as bound_ligand_name,
  Arp.AUTH_SEQ_ID_2 as pdb_residue_auth_id,
  Arp.AUTH_SEQ_ID_1 as bound_ligand_auth_id,
  Arp.CONTACT_TYPE as contact_type, 
  Arp.DISTANCE as contact_distance, 
  Arp.INTERACTION_TYPE as interaction_type, 
  Arp.ATOM_1 as atom_1, 
  Arp.ATOM_2 as atom_2"""

In [26]:
# Run the CATH residue-contact query and save the result as CSV.
cath_pdb_residue_interactions_distinct = pd.DataFrame(
    [dict(record) for record in conn.query(cath_pdb_residue_interactions_query_distinct, db='neo4j')]
)
cath_pdb_residue_interactions_distinct.to_csv("cath_pdb_residue_interactions_distinct.csv")

In [27]:
# Residue-level contacts between bound ligands and residues of EC-annotated
# protein polymers that lie in a SCOP domain. Same structure as the CATH
# contact query: clash-only contact types are dropped, the domain relationship
# and the contact must share a STRUCT_ASYM_ID, and "UNX"/"UNL" ligands are excluded.
# NOTE(review): `bound_ligand_name` is taken from bl.AUTH_COMP_ID here but from
# bl.CHEM_COMP_ID in the CATH query - confirm which property is intended.
# NOTE(review): bl.UNIQID is returned twice (as bound_ligand_entity_id and as
# bound_ligand_id) - confirm whether both columns are needed.
scop_pdb_residue_interactions_query_distinct = """
MATCH
(a:Entry)-[:HAS_ENTITY]->(e:Entity)-[:HAS_PDB_RESIDUE]->(pr:PDBResidue)-[sd:IS_IN_SCOP_DOMAIN]->(s:SCOP) WHERE
e.TYPE = 'p' AND e.POLYMER_TYPE = 'P' AND e.EC IS NOT NULL
WITH e, pr ,sd, s, a 
MATCH
(pr)<-[Arp:HAS_ARP_CONTACT]-(bl:BoundLigand)<-[:IS_AN_INSTANCE_OF]-(p:Entity) WHERE
Arp.CONTACT_TYPE <> "['vdw_clash']" AND Arp.CONTACT_TYPE <> "['clash']" 
AND sd.STRUCT_ASYM_ID = Arp.STRUCT_ASYM_ID_2 AND
NOT bl.CHEM_COMP_ID in ["UNX", "UNL"]
RETURN DISTINCT
  a.ID as pdb_id, 
  e.UNIQID as protein_entity_id,
  e.BEST_CHAIN_ID as pdb_best_chain_id,
  e.EC as protein_entity_ec,
  bl.UNIQID as bound_ligand_entity_id,
  p.UNIQID as ligand_entity_id, 
  p.TYPE as ligand_entity_type,
  p.BEST_CHAIN_ID as bound_ligand_best_chain_id, 
  p.POLYMER_TYPE as ligand_entity_polymer_type, 
  pr.UNIQID as pdb_residue_id,
  pr.CHEM_COMP_ID as pdb_residue_type,
  s.SUNID as scop_sunid,
  s.DESCRIPTION as scop_description,
  s.SCCS as scop_sccs,
  sd.CLASS_ID as scop_class_id,
  sd.FOLD_ID as scop_fold_id,
  sd.SUPERFAMILY_ID as scop_superfamily_id,
  sd.SCOP_ID as scop_id,
  bl.UNIQID as bound_ligand_id, 
  bl.AUTH_COMP_ID as bound_ligand_name,
  Arp.AUTH_SEQ_ID_2 as pdb_residue_auth_id,
  Arp.AUTH_SEQ_ID_1 as bound_ligand_auth_id,
  Arp.CONTACT_TYPE as contact_type, 
  Arp.DISTANCE as contact_distance, 
  Arp.INTERACTION_TYPE as interaction_type, 
  Arp.ATOM_1 as atom_1, 
  Arp.ATOM_2 as atom_2
"""

In [28]:
# Run the SCOP residue-contact query and save the result as CSV.
scop_pdb_residue_interactions_distinct = pd.DataFrame(
    [dict(record) for record in conn.query(scop_pdb_residue_interactions_query_distinct, db='neo4j')]
)
scop_pdb_residue_interactions_distinct.to_csv("scop_pdb_residue_interactions_distinct.csv")

In [28]:
# Read the ExplorEnz EC data dump (XML).
xml_file_path = 'kegg_information/enzyme-data.xml'
tree = ET.parse(xml_file_path)

# Parse every <table_data> element, keyed by its `name` attribute.
data = {}
for table_data_elem in tree.findall(".//table_data"):
    table_name = table_data_elem.attrib['name']
    data[table_name] = parse_table_data(table_data_elem)

# One DataFrame per table; each parsed row is a column at this point.
dfs = {name: pd.DataFrame.from_dict(rows) for name, rows in data.items()}

# Transpose the "entry" table so each row is one entry, then collect the
# distinct non-null values of its "ec_num" column.
total_ec_nums = dfs["entry"].T
total_ec_nums_list = total_ec_nums["ec_num"].dropna().unique().tolist()

In [21]:
# Run the three chain -> domain mapping queries.
cath_chains_domains = pd.DataFrame(
    [dict(record) for record in conn.query(cath_chains_domains_mapping_query, db='neo4j')]
)
scop_chains_domains = pd.DataFrame(
    [dict(record) for record in conn.query(scop_chains_domains_mapping_query, db='neo4j')]
)
pfam_chains_domains = pd.DataFrame(
    [dict(record) for record in conn.query(pfam_chains_domains_mapping_query, db='neo4j')]
)

# Save the raw mappings as CSV.
cath_chains_domains.to_csv("cath_chains_domains_mapping.csv")
scop_chains_domains.to_csv("scop_chains_domains_mapping.csv")
pfam_chains_domains.to_csv("pfam_chains_domains_mapping.csv")

In [22]:
# Collapse each (chain, EC) group into a single list of (code, identifier)
# tuples, one list column per domain database.
cath_chains_domains_grouped = (
    cath_chains_domains
    .groupby(["chain", "EC_descriptor"])
    .apply(lambda group: list(zip(group['cathcode'], group['CATH_domain'])))
    .rename("cath")
    .reset_index()
)
scop_chains_domains_grouped = (
    scop_chains_domains
    .groupby(["chain", "EC_descriptor"])
    .apply(lambda group: list(zip(group['SCCS'], group['SUNID'])))
    .rename("scop")
    .reset_index()
)
pfam_chains_domains_grouped = (
    pfam_chains_domains
    .groupby(["chain", "EC_descriptor"])
    .apply(lambda group: list(zip(group['pfam_name'], group['pfam_accession'])))
    .rename("pfam")
    .reset_index()
)

# Outer merges keep chains that are annotated in only some of the databases.
chains_domains = cath_chains_domains_grouped.merge(scop_chains_domains_grouped, on=["chain", "EC_descriptor"], how='outer')
chains_domains = chains_domains.merge(pfam_chains_domains_grouped, on=["chain", "EC_descriptor"], how='outer')

# The PDB id is the text before the first underscore of the chain id. The
# character class is spelled out as A-Za-z0-9: the previous `A-z` range also
# matched "_" (and "[", "\", "]", "^", "`"), so the greedy capture could run
# past the first underscore on ids containing more than one.
chains_domains["pdb"] = chains_domains["chain"].str.extract(r"^([A-Za-z0-9]+)_")

# One row per individual EC number: split the comma-separated annotation,
# explode the resulting lists, and trim the whitespace left by the split.
# (A former self-assignment of the non-null EC rows was a no-op and is removed.)
chains_domains["EC_descriptor"] = chains_domains.EC_descriptor.str.split(",")
chains_domains = chains_domains.explode("EC_descriptor")
chains_domains["EC_descriptor"] = chains_domains["EC_descriptor"].str.strip()

In [29]:
# Expand partial EC numbers (those containing "-") into the matching complete
# EC numbers from the ExplorEnz list, then give each expanded EC its own row.
chains_domains["filled_EC_descriptor"] = chains_domains["EC_descriptor"].apply(
    lambda ec: return_partial_EC_list(ec, total_ec_nums_list)
)
chains_domains = chains_domains.explode("filled_EC_descriptor")

# Replace non-list values in the CATH and SCOP columns with an empty list.
# NOTE(review): the "pfam" column is not normalised the same way - confirm
# whether that is intentional.
chains_domains["cath"] = chains_domains["cath"].apply(
    lambda value: value if isinstance(value, list) else []
)
chains_domains["scop"] = chains_domains["scop"].apply(
    lambda value: value if isinstance(value, list) else []
)

In [30]:
# Save the formatted chain/domain table without the (non-unique, post-explode) index.
chains_domains.to_csv("chains_domains_formatted.csv", index = False)

In [66]:
all_chem_descriptors = pd.DataFrame([dict(record) for record in conn.query(all_chem_descriptors_query, db='neo4j')])

# One row per (ligand descriptor, EC number): split the comma-separated EC
# annotation and explode it.
all_chem_descriptors["protein_polymer_EC"] = all_chem_descriptors["protein_polymer_EC"].str.split(",")
all_chem_descriptors = all_chem_descriptors.explode("protein_polymer_EC")
# Trim whitespace left around each EC number by the split, as is done for
# chains_domains["EC_descriptor"]; otherwise " 1.1.1.1" and "1.1.1.1" would be
# treated as different EC numbers when de-duplicating below.
all_chem_descriptors["protein_polymer_EC"] = all_chem_descriptors["protein_polymer_EC"].str.strip()
all_chem_descriptors.to_csv("all_chem_descriptors_ligands.csv")

all_chem_descriptors_inchi = all_chem_descriptors.loc[all_chem_descriptors.descriptor_type == "InChI"]
# Keep the first InChI row for each unique (bound ligand instance, EC number) pair.
all_chem_descriptors_inchi_unique_pairs = all_chem_descriptors_inchi.drop_duplicates(["bl_id","protein_polymer_EC"], keep='first')

# Restrict to ligands from entries that appear in the chain/domain table.
chains_domains_chem_descriptors = all_chem_descriptors_inchi_unique_pairs.loc[all_chem_descriptors_inchi_unique_pairs.pdb_entry_id.isin(chains_domains.pdb.unique())]

chains_domains_chem_descriptors.to_pickle("chains_domains_chem_descriptors.pkl")

In [153]:
# Keep ligands whose bound-ligand id appears in the SCOP or the CATH
# residue-contact table.
bl_in_scop_contacts = chains_domains_chem_descriptors.bl_id.isin(
    scop_pdb_residue_interactions_distinct.bound_ligand_id.unique()
)
bl_in_cath_contacts = chains_domains_chem_descriptors.bl_id.isin(
    cath_pdb_residue_interactions_distinct.bound_ligand_id.unique()
)
chains_domains_chem_descriptors_cath_scop = chains_domains_chem_descriptors.loc[bl_in_scop_contacts | bl_in_cath_contacts]
chains_domains_chem_descriptors_cath_scop.to_pickle("chains_domains_chem_descriptors_cath_scop.pkl")

# Unique (ligand, descriptor, EC) rows to score. Note that the column named
# "ligand_entity_id" here is the previous DataFrame index, not the graph's
# ligand entity UNIQID.
ligands_to_score = (
    chains_domains_chem_descriptors_cath_scop[["bl_id", "descriptor", "protein_polymer_EC"]]
    .drop_duplicates()
    .reset_index()
    .rename(columns = {"index" : "ligand_entity_id"})
)

ligands_to_score.to_pickle("cath_scop_ligands_to_score.pkl")

In [154]:
# Spot-check: rows of ligands_to_score whose descriptor is one of two specific InChI strings.
ligands_to_score.loc[
    ligands_to_score.descriptor.isin(
        [
            "InChI=1S/C4H12O8P2/c5-3-1-2-4-11-14(9,10)12-13(6,7)8/h5H,1-4H2,(H,9,10)(H2,6,7,8)",
            "InChI=1S/C5H12O8P2/c1-5(6)3-2-4-12-15(10,11)13-14(7,8)9/h2-4H2,1H3,(H,10,11)(H2,7,8,9)",
        ]
    )
]

Unnamed: 0,ligand_entity_id,bl_id,descriptor,protein_polymer_EC
500,4680,3uv6_3_D_1,"InChI=1S/C4H12O8P2/c5-3-1-2-4-11-14(9,10)12-13...",1.17.1.2
501,4694,3utd_3_D_1,"InChI=1S/C5H12O8P2/c1-5(6)3-2-4-12-15(10,11)13...",1.17.1.2


In [157]:
# `parity_scores` is otherwise only loaded by a later cell, so this cell raised
# NameError on a fresh top-to-bottom run; load it here to keep the cell self-contained.
# NOTE: unpickling can execute arbitrary code - only load pickles produced by this pipeline.
parity_scores = pd.read_pickle("parity_calcs/parity_calcs_cath_scop/all_parity_calcs.pkl")

# Count how often each descriptor occurs for one bound-ligand instance.
parity_scores.loc[parity_scores.bl_id == "3cw8_2_B_1"].descriptor.value_counts()

descriptor
InChI=1S/C17H17ClN5O8P/c18-9-3-1-8(2-4-9)17(26)31-32(27,28)29-5-10-12(24)13(25)16(30-10)23-7-22-11-14(19)20-6-21-15(11)23/h1-4,6-7,10,12-13,16,24-25H,5H2,(H,27,28)(H2,19,20,21)/t10-,12-,13-,16-/m1/s1    12
Name: count, dtype: int64

In [148]:
# Load the previously computed PARITY scores from a local pickle.
# NOTE: unpickling can execute arbitrary code - only load files produced by this pipeline.
parity_scores = pd.read_pickle("parity_calcs/parity_calcs_cath_scop/all_parity_calcs.pkl")

In [None]:
# TODO: filter the all-chem-descriptors file to remove ligands that have no interactions other than 'vdw_clash' or 'clash' contacts.