In [1]:
import pandas as pd
import lxml

### Splitting Known and Unknown Molecules

In this section, we split the PTFI dataset into known and unknown molecules based on their definitions.

This step is essential for focusing the analysis on known compounds

### Loading the PTFI Dictionary

The PTFI dictionary contains metadata for approximately 24,000 molecules in the dataset.

We will load this dictionary to extract relevant information for further analysis.

In [2]:
ptfi_dic = pd.read_csv("../databases/ptfi_dic.csv")

In [3]:
ptfi_unknown = ptfi_dic[ptfi_dic["definition"].str.contains("unknown")].copy()
ptfi_known = ptfi_dic[~ptfi_dic["definition"].str.contains("unknown")].copy()
ptfi_known["inchi_key"] = ptfi_known["element_id"].str.replace("MET_", "")

### Matching Known Molecules

In this section, we focus on matching known molecules from the PTFI dataset with external databases.

This step helps identify molecules that are already characterized in databases like DrugBank and PubChem.

### Extracting DrugBank Molecules and InChIKeys

DrugBank provides detailed information about drugs, including their InChIKeys, SMILES, and molecular formulas.

In this section, we parse the DrugBank XML file to extract relevant data for matching with the PTFI dataset.

In [4]:
import pandas as pd
from lxml import etree as ET

xml_path = '../databases/full database.xml'
tree     = ET.parse(xml_path)
root     = tree.getroot()

NS = {'db': 'http://www.drugbank.ca'}

records = []
for drug in root.xpath('.//db:drug', namespaces=NS):
    # ---------- core identifiers ----------
    db_id = drug.xpath('./db:drugbank-id[@primary="true"]/text()',
                       namespaces=NS)
    db_id = db_id[0] if db_id else None

    name  = drug.findtext('db:name', namespaces=NS)
    
    
    inchikey = drug.xpath(
        './/db:calculated-properties/db:property[db:kind="InChIKey"]/db:value/text()',
        namespaces=NS
    )
    inchikey = inchikey[0] if inchikey else None

    # Get SMILES
    smiles = drug.xpath(
        './/db:calculated-properties/db:property[db:kind="SMILES"]/db:value/text()',
        namespaces=NS
    )
    smiles = smiles[0] if smiles else None
    
    # Get InChI
    inchi = drug.xpath(
        './/db:calculated-properties/db:property[db:kind="InChI"]/db:value/text()',
        namespaces=NS
    )
    inchi = inchi[0] if inchi else None

    formula = drug.xpath(
        './/db:calculated-properties/db:property[db:kind="Formula"]/db:value/text()',
        namespaces=NS
    )
    formula = formula[0] if formula else None

    # ---------- all category names ----------
    # Path: <categories><category><category>Category Name</category> … </category></categories>
    cat_names = drug.xpath('.//db:categories/db:category/db:category/text()',
                           namespaces=NS)
    categories = '|'.join(sorted(set(cat_names)))        # deduplicate & pack

    # optional: top-level “superclass” attribute (if present)
    cat_roots = drug.xpath('.//db:categories/db:category/@mesh-id', namespaces=NS)
    cat_roots = '|'.join(sorted(set(cat_roots)))

    # Find all <group> tags and get their text content
    groups = drug.xpath('.//db:groups/db:group/text()', namespaces=NS)

    # Join them into a pipe-separated string, similar to categories
    drug_groups = '|'.join(sorted(set(groups)))

    records.append({
        'drugbank_id': db_id,
        'name':        name,
        'inchikey':    inchikey,
        'formula':     formula,
        'categories':  categories,     # pipe-separated list of category names
        'mesh_ids':    cat_roots ,
        'groups':      drug_groups,
        'inchi':       inchi,
        'smiles':      smiles 
    })

drugbank_df = pd.DataFrame(records)
print(f'Parsed {len(drugbank_df):,} DrugBank entries with categories')

# ------------------------------------------------------------------

drugbank_df[['drugbank_id','name','categories', 'groups']].head()

Parsed 73,687 DrugBank entries with categories


Unnamed: 0,drugbank_id,name,categories,groups
0,DB00001,Lepirudin,"Amino Acids, Peptides, and Proteins|Anticoagul...",approved|withdrawn
1,,Lepirudin,,
2,,Phylloquinone,,
3,,Calcium,,
4,DB00002,Cetuximab,"Amino Acids, Peptides, and Proteins|Antibodies...",approved


### Merging DrugBank Data with PTFI

Here, we merge the extracted DrugBank data with the known molecules from the PTFI dataset.

This step helps identify which known molecules in the PTFI dataset are present in DrugBank.

In [5]:
ptfi_known_merge_drug_bank = ptfi_known.merge(
    drugbank_df,
    left_on="inchi_key",
    right_on="inchikey",
    how="inner")
ptfi_known_merge_drug_bank.head()

Unnamed: 0,element_id,element_name,definition,entity_id,vocabulary_encoding_scheme,obligation,module,datatype,occurrence,comment,...,inchi_key,drugbank_id,name,inchikey,formula,categories,mesh_ids,groups,inchi,smiles
0,MET_IBGBGRVKPALMCQ-UHFFFAOYSA-N,Protocatechuic aldehyde,Total abundance of molecule annotated as Proto...,MET_RP25-19720,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,IBGBGRVKPALMCQ-UHFFFAOYSA-N,DB11268,Protocatechualdehyde,IBGBGRVKPALMCQ-UHFFFAOYSA-N,,Aldehydes|Anticoagulants|Benzene Derivatives|H...,,experimental,"InChI=1S/C7H6O3/c8-4-5-1-2-6(9)7(10)3-5/h1-4,9...",[H]C(=O)C1=CC(O)=C(O)C=C1
1,MET_ZOAMBXDOGPRZLP-UHFFFAOYSA-N,1H-Indole-3-acetamide,Total abundance of molecule annotated as 1H-In...,MET_RP25-19721,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,ZOAMBXDOGPRZLP-UHFFFAOYSA-N,DB08652,Indoleacetamide,ZOAMBXDOGPRZLP-UHFFFAOYSA-N,,"Heterocyclic Compounds, Fused-Ring|Indoles",,experimental,InChI=1S/C10H10N2O/c11-10(13)5-7-6-12-9-4-2-1-...,NC(=O)CC1=CNC2=CC=CC=C12
2,MET_ZTHYODDOHIVTJV-UHFFFAOYSA-N,Propyl gallate,Total abundance of molecule annotated as Propy...,MET_RP25-19733,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,ZTHYODDOHIVTJV-UHFFFAOYSA-N,DB12450,Propyl Gallate,ZTHYODDOHIVTJV-UHFFFAOYSA-N,,"Acids, Carbocyclic|Antioxidants|Benzene Deriva...",,investigational,InChI=1S/C10H12O5/c1-2-3-15-10(14)6-4-7(11)9(1...,CCCOC(=O)C1=CC(O)=C(O)C(O)=C1
3,MET_YBJHBAHKTGYVGT-ZKWXMUAHSA-N,Biotin,Total abundance of molecule annotated as Bioti...,MET_RP25-19737,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,YBJHBAHKTGYVGT-ZKWXMUAHSA-N,DB00121,Biotin,YBJHBAHKTGYVGT-ZKWXMUAHSA-N,,Alimentary Tract and Metabolism|Coenzymes|Diet...,,approved|investigational|nutraceutical,InChI=1S/C10H16N2O3S/c13-8(14)4-2-1-3-7-9-6(5-...,[H][C@]12CS[C@@H](CCCCC(O)=O)[C@@]1([H])NC(=O)N2
4,MET_CXMXRPHRNRROMY-UHFFFAOYSA-N,NP-002989,Total abundance of molecule annotated as NP-00...,MET_RP25-19743,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,CXMXRPHRNRROMY-UHFFFAOYSA-N,DB07645,Sebacic acid,CXMXRPHRNRROMY-UHFFFAOYSA-N,,"Acids, Acyclic|Fatty Acids|Lipids",,experimental,InChI=1S/C10H18O4/c11-9(12)7-5-3-1-2-4-6-8-10(...,OC(=O)CCCCCCCCC(O)=O


Pubchem drug matches

In [6]:
pubchem_drugs_df = pd.read_csv("../databases/pubchem_drugs.csv", sep=",")
ptfi_pubchem_drugs_match = ptfi_known.merge(
    pubchem_drugs_df,
    left_on="inchi_key",
    right_on="InChIKey",
    how="inner")
ptfi_pubchem_drugs_match.head()

Unnamed: 0,element_id,element_name,definition,entity_id,vocabulary_encoding_scheme,obligation,module,datatype,occurrence,comment,...,Linked_PubChem_Patent_Count,Linked_PubChem_Patent_Family_Count,MeSH_Headings,Annotation_Content,Annotation_Type_Count,Linked_BioAssays,Create_Date,Data_Source,Data_Source_Category,Tagged_by_PubChem
0,MET_IBGBGRVKPALMCQ-UHFFFAOYSA-N,Protocatechuic aldehyde,Total abundance of molecule annotated as Proto...,MET_RP25-19720,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,12172,4986,,Biological Test Results|Interactions and Pathw...,17,155|157|161|165|167|175|200|202|206|212|220|24...,20050326,001Chemical|10X CHEM|1st Scientific|3WAY PHARM...,Chemical Vendors|Curation Efforts|Governmental...,D006401 - Hematologic Agents > D000925 - Antic...
1,MET_ZTHYODDOHIVTJV-UHFFFAOYSA-N,Propyl gallate,Total abundance of molecule annotated as Propy...,MET_RP25-19733,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,92035,27650,Propyl Gallate,Biological Test Results|Interactions and Pathw...,18,1|3|5|7|9|13|15|19|21|23|25|29|31|33|35|37|39|...,20050325,001Chemical|10X CHEM|3WAY PHARM INC|A&J Pharmt...,Chemical Vendors|Curation Efforts|Governmental...,D020011 - Protective Agents > D000975 - Antiox...
2,MET_YBJHBAHKTGYVGT-ZKWXMUAHSA-N,Biotin,Total abundance of molecule annotated as Bioti...,MET_RP25-19737,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,98795,41922,Biotin,Biological Test Results|Interactions and Pathw...,19,1195|1332|1376|1385|1422|1465|1511|1529|1530|1...,20040916,001Chemical|10X CHEM|3WAY PHARM INC|A&J Pharmt...,Chemical Vendors|Curation Efforts|Governmental...,A - Alimentary tract and metabolism > A11 - Vi...
3,MET_HCZHHEIFKROPDY-UHFFFAOYSA-N,Kynurenic acid,Total abundance of molecule annotated as Kynur...,MET_RP25-19746,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,8343,2192,Kynurenic Acid,Biological Test Results|Interactions and Pathw...,17,155|157|161|165|167|175|256|357|410|411|422|42...,20040916,001Chemical|10X CHEM|3WAY PHARM INC|A&J Pharmt...,Chemical Vendors|Curation Efforts|Governmental...,D018377 - Neurotransmitter Agents > D018683 - ...
4,MET_FBZONXHGGPHHIY-UHFFFAOYSA-N,Xanthurenic acid,Total abundance of molecule annotated as Xanth...,MET_RP25-19747,,,ANALYTE_MET,float or NA,unique,0 if not detected,...,1920,597,,Biological Test Results|Interactions and Pathw...,14,155|157|161|165|167|175|179|608|875|880|881|88...,20050325,001Chemical|10X CHEM|3WAY PHARM INC|A&J Pharmt...,Chemical Vendors|Curation Efforts|Governmental...,D009676 - Noxae > D000963 - Antimetabolites|D0...


Pubchem + Drugbank drug matches

In [7]:
db_matches = set(ptfi_known_merge_drug_bank["element_id"].unique())
pubchem_drug_matches = set(ptfi_pubchem_drugs_match["element_id"].unique())
drug_matches = db_matches.union(pubchem_drug_matches)
len(drug_matches)

119

Agrochemical drug matches

In [8]:
pubchem_agro_df = pd.read_csv("../databases/pubchem_agrochemicals.csv", sep=",")
ptfi_agro_match = ptfi_known.merge(
    pubchem_agro_df,
    left_on="inchi_key",
    right_on="InChIKey",
    how="inner")
ptfi_agro_match.head()
ptfi_agro_matches = set(ptfi_agro_match["element_id"].unique())
len(ptfi_agro_matches)

9

FCCdb Feature matches

In [9]:
fccdb_df = pd.read_csv("../databases/fcc_cas_with_formulas_and_inchikey.csv", sep=",")
ptfi_fcc_match = ptfi_known.merge(
    fccdb_df,
    left_on="inchi_key",
    right_on="inchikey",
    how="inner")
ptfi_fcc_match.head()
ptfi_fcc_matches = set(ptfi_fcc_match["element_id"].unique())
len(ptfi_fcc_matches)

34

Build External database matching files

In [10]:
ptfi_match = ptfi_known[["element_id","element_name", "inchi_key"]].copy()
ptfi_match["drugbank"] = ptfi_match["element_id"].isin(db_matches)
ptfi_match["pubchem_drug"] = ptfi_match["element_id"].isin(pubchem_drug_matches)
ptfi_match["agro"] = ptfi_match["element_id"].isin(ptfi_agro_matches)
ptfi_match["fcc"] = ptfi_match["element_id"].isin(ptfi_fcc_matches)
ptfi_match.to_csv("../export_ptfi/ptfi_external_db_matches.csv", index=False)

Build external database with Drugbank categories

In [11]:
ptfi_drugbank_categories  = ptfi_known_merge_drug_bank[["element_id", "element_name", "name", "categories", "groups"]]
ptfi_drugbank_categories.to_csv("../export_ptfi/drugbank_categories_groups.csv", index=False)