In [1]:
import json
import os

import pandas as pd
import requests

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType

# Start (or reuse) a Spark session running locally on all available cores.
spark_builder = SparkSession.builder.master('local[*]')
spark = spark_builder.getOrCreate()



## Reading molecules


The following Parquet dataset contains the molecules available in Open Targets.

In [3]:
# Location of the molecule parquet dataset. The OT_MOLECULE_PATH environment
# variable overrides the original local default, so the notebook can be run
# on other machines without editing this cell.
drugfile = os.environ.get('OT_MOLECULE_PATH', '/Users/dsuveges/project_data/molecule/')

# Keep only the identifier, the drug type, the synonyms and the two
# cross-reference lists that are used further down.
drug_df = (
    spark.read.parquet(drugfile)
    .select(
        F.col('id'),
        F.col('drugType'),
        F.col('synonyms'),
        F.col('crossReferences.PubChem'),
        F.col('crossReferences.drugbank')
    )
)

# Compute the three summary numbers in a single aggregation instead of three
# separate count() actions. F.count(<column>) only counts non-null values,
# which matches filtering on isNotNull() followed by count().
molecule_counts = drug_df.agg(
    F.count(F.lit(1)).alias('total'),
    F.count(F.col('PubChem')).alias('with_pubchem'),
    F.count(F.col('drugbank')).alias('with_drugbank'),
).first()

print(f'Number of molecules: {molecule_counts["total"]}')
print(f'Number of molecules with PubChem link: {molecule_counts["with_pubchem"]}')
print(f'Number of molecules with drugbank link: {molecule_counts["with_drugbank"]}')

Number of molecules: 12594
Number of molecules with PubChem link: 3681
Number of molecules with drugbank link: 7618


In [4]:
# Preview the first 20 rows; long cell values are truncated for readability.
drug_df.show(n=20, truncate=True)

+-------------+--------------+--------------------+--------------------+---------+
|           id|      drugType|            synonyms|             PubChem| drugbank|
+-------------+--------------+--------------------+--------------------+---------+
|CHEMBL1079742|Small molecule|[CP-358, CP-358,7...|            [534851]|     null|
|CHEMBL1083993|Small molecule|[Amiodarone, Amio...|[144207027, 14421...|     null|
| CHEMBL110458|Small molecule|        [Migalastat]|          [11114003]|[DB05018]|
|CHEMBL1169388|Small molecule|    [(S)-Fluoxetine]|[11111168, 111111...|[DB08544]|
|CHEMBL1173055|Small molecule|[AG-014699, AG-14...|[103905261, 13727...|[DB12332]|
|CHEMBL1200443|Small molecule|[Merethoxylline p...|                null|     null|
|CHEMBL1200910|Small molecule|[Acetylsulfisoxaz...|[170465159, 17046...|[DB14033]|
|CHEMBL1200949|Small molecule|[Ametazole, Betaz...|[144204280, 17046...|     null|
|CHEMBL1200979|Small molecule|[(+)-panthenol, D...|         [170464647]|[DB09357]|
|CHE

In [6]:
# Inspect a single, well-known molecule: aspirin.
aspirin = 'CHEMBL25'

# One field per line and no truncation, so the full synonym and
# cross-reference lists are visible.
drug_df.where(F.col('id') == aspirin).show(truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id       | CHEMBL25                                                                                                                                                                                                                                                                                     
 drugType | Small molecule                                                                                                                                                                                                                                                                               
 synonyms | [Acetylsalicylic Acid, Acetylsalicylic acid, Aspirin, BAY1019036, NSC-27223, NSC-406186]      

We have several cross-references, but here we focus on the PubChem substance identifiers (SIDs). These can be mapped to PubChem compound identifiers (CIDs), which can in turn be mapped to PDB identifiers.

In [113]:
# Take the PubChem substance IDs of the first row as an example input:
first_rows = drug_df.take(1)
substance_ids = first_rows[0]['PubChem']

# Alternative, manually supplied example input:
# substance_ids = ['938420398']


def get_CID(SIDs, timeout=30):
    """Map PubChem substance IDs (SIDs) to PubChem compound IDs (CIDs).

    All SIDs are sent to the PubChem PUG REST service in a single request.

    Args:
        SIDs: iterable of substance identifiers (strings or integers). May be
            None or empty, and may contain None elements, which are skipped.
        timeout: seconds to wait for the PubChem response before
            ``requests`` raises a timeout error. Without a timeout a single
            stalled request would block the calling task indefinitely.

    Returns:
        A list with the unique CIDs found for the given SIDs (unordered).
        An empty list when there is nothing to look up or PubChem reports
        a fault (e.g. nothing found).
    """
    if not SIDs:
        return []

    # Drop null entries and make sure every identifier is a string, so the
    # join below cannot fail on None or integer values.
    sid_strings = [str(sid) for sid in SIDs if sid is not None]
    if not sid_strings:
        return []

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/{','.join(sid_strings)}/cids/JSON"

    response = requests.get(url, timeout=timeout)
    mapping = response.json()

    # No CID values found, return empty array:
    if 'Fault' in mapping:
        return []

    # Collect the CIDs of every substance. An entry for a substance without
    # any associated compound carries no 'CID' key, so default to no CIDs.
    compound_ids = set()
    for entry in mapping.get('InformationList', {}).get('Information', []):
        compound_ids.update(entry.get('CID') or [])

    # Return with the unique list of CIDs:
    return list(compound_ids)

def get_pdb_ids(CIDs, timeout=30):
    """Collect the PDB identifiers listed by PubChem for a set of compounds.

    One request is sent to the PubChem PUG View structure endpoint per CID.

    Args:
        CIDs: iterable of PubChem compound identifiers. May be None or empty,
            and may contain None elements, which are skipped.
        timeout: seconds to wait for each PubChem response before
            ``requests`` raises a timeout error. Without a timeout a single
            stalled request would block the calling task indefinitely.

    Returns:
        A list with the unique PDB identifiers found across all CIDs
        (unordered). Compounds for which PubChem reports a fault, or which
        have no structure records, contribute nothing.
    """
    if not CIDs:
        return []

    pdbs = set()
    for CID in CIDs:
        # A null identifier cannot be looked up.
        if CID is None:
            continue

        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/structure/compound/{CID}/JSON'

        structure_view = requests.get(url, timeout=timeout)
        data = structure_view.json()

        if 'Fault' in data:
            continue

        # Both levels are optional in the response; treat a missing or null
        # value as "no structures" rather than failing.
        structures = (data.get('Structure') or {}).get('Structures') or []

        pdbs.update(x['PDB_ID'] for x in structures if 'PDB_ID' in x)

    return list(pdbs)


# Wrap the two lookup functions as Spark UDFs so they can be applied to columns.
# SIDs -> list of integer compound identifiers:
get_CID_udf = F.udf(f=get_CID, returnType=ArrayType(IntegerType()))

# CIDs -> list of PDB identifier strings:
get_PDB_udf = F.udf(f=get_pdb_ids, returnType=ArrayType(StringType()))
# Annotate every molecule with its PubChem compound IDs and the PDB IDs found
# for those compounds, keeping only the molecules with at least one PDB ID.
# For a quick trial run, subsample first with `.sample(False, 0.1)` or `.limit(1000)`.
drug_df_annotated = (
    drug_df
    .withColumn('CIDs', get_CID_udf('PubChem'))
    .withColumn('PDBs', get_PDB_udf('CIDs'))
    .where(F.size('PDBs') > 0)
    .persist()
)

# Save the annotated molecules as gzip-compressed JSON, replacing any earlier output.
(
    drug_df_annotated
    .write
    .mode('overwrite')
    .option('compression', 'gzip')
    .json('full_drugs_w_pdb.json.gz')
)




Mapping 1000 randomly selected molecules takes 13 minutes.

In [126]:
# Repeat the SID -> CID -> PDB annotation for everything that is not a small
# molecule, again keeping only the rows with at least one PDB ID.
non_small_molecules = (
    drug_df
    .where(F.col('drugType') != 'Small molecule')
    .withColumn('CIDs', get_CID_udf('PubChem'))
    .withColumn('PDBs', get_PDB_udf('CIDs'))
    .where(F.size('PDBs') > 0)
    .persist()
)

non_small_molecules.show(n=20, truncate=False)


+-------------+---------------+--------------------+--------------------+---------+---------+--------------------+
|           id|       drugType|            synonyms|             PubChem| drugbank|     CIDs|                PDBs|
+-------------+---------------+--------------------+--------------------+---------+---------+--------------------+
|    CHEMBL160|        Protein|[27-400, ANTIBIOT...|[11532934, 144204...|[DB00091]|[5284373]|        [2RMC, 1CYN]|
|   CHEMBL1554|        Protein|[Actinomycin d, D...|[144205550, 14420...|[DB00970]| [457193]|        [1MNV, 1I3W]|
| CHEMBL408403|        Protein|[Angiotensin ii, ...|          [50112427]|[DB11842]| [172198]|[6JOD, 3CK0, 6OS0...|
|   CHEMBL1174|        Protein|      [Eptifibatide]|[144206056, 17046...|     null| [448812]|              [2VDN]|
|   CHEMBL1566|Oligosaccharide|[Acarbose, BAY G ...|[144205150, 17046...|     null| [444254]|[1UKT, 1KXH, 2QPU...|
|CHEMBL1230813|Oligosaccharide|[.alpha.-cyclodex...|[144204566, 14420...|[DB0190

In [130]:
# Show only the columns relevant for the mapping, without truncating values.
non_small_molecules.select(['id', 'drugType', 'synonyms', 'CIDs', 'PDBs']).show(n=20, truncate=False)


+-------------+---------------+--------------------------------------------------------------------------------------------------------------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id           |drugType       |synonyms                                                                                                            |CIDs     |PDBs                                                                                                                                                                    |
+-------------+---------------+--------------------------------------------------------------------------------------------------------------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CHEMBL160    |P

In [118]:
# Print the schema of the annotated molecules. The name used here before,
# `drug_df_small`, is not defined anywhere in this notebook and would raise a
# NameError on a fresh kernel.
drug_df_annotated.printSchema()

root
 |-- id: string (nullable = true)
 |-- drugType: string (nullable = true)
 |-- synonyms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- PubChem: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- drugbank: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- CIDs: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- PDBs: array (nullable = true)
 |    |-- element: string (containsNull = true)



## 