Fetching molecule file from google bucket:

In [None]:
%%bash

DRUGS_FOLDER='/Users/dsuveges/project_data/drugs'

gsutil cp -r 'gs://ot-snapshots/etl/outputs/21.04.2/parquet/molecule' ${DRUGS_FOLDER}

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)


drugfile = '/Users/dsuveges/project_data/drugs/molecule/'

drug_df = spark.read.parquet(drugfile)
drug_df.show(n=2, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Out all the available information we only need the name, chemblId, type and cross references. The cross references will also be flattned.

In [5]:
print(f'Number of entries in the molecule dataset: {drug_df.count()}')
print(f'Number of entries in the molecule dataset without cross-references: {drug_df.filter(F.col("crossReferences").isNull()).count()}')
print(f'Available cross-references:')
print(
    drug_df
    .filter(F.col("crossReferences").isNotNull())
    .select(F.explode(F.map_keys(F.col("crossReferences"))))
    .distinct()
    .show()
)

Number of entries in the molecule dataset: 13076
Number of entries in the molecule dataset without cross-references: 3509
Available cross-references:
+-----------+
|        col|
+-----------+
|   TG-GATEs|
|DrugCentral|
|  Wikipedia|
|    PubChem|
|   drugbank|
|      chEBI|
|   DailyMed|
+-----------+

None


In [6]:
molecule_cross_refs = (
    drug_df
    
    # Exploding all cross references by type:
    .select(
        F.col('id'),
        F.col('name'),
        F.col('drugType'),
        F.col('synonyms'),
        F.explode(F.col('crossReferences')).alias('xref_source','xref_id')
    )
    
    # Exploding all cross references by id:
    .select(
        F.col('id'),
        F.col('name'),
        F.col('drugType'),
        F.col('synonyms'),
        F.col('xref_source'),
        F.explode(F.col('xref_id'))
    )
)


molecule_cross_refs.show()

+-------------+--------------+--------------+--------------------+-----------+--------------+
|           id|          name|      drugType|            synonyms|xref_source|           col|
+-------------+--------------+--------------+--------------------+-----------+--------------+
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|   DailyMed|   simvastatin|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|    PubChem|     144204247|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|    PubChem|     144210718|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|    PubChem|     164339438|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|    PubChem|     170464984|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|    PubChem|        496592|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-0733...|    PubChem|      50086525|
|   CHEMBL1064|   SIMVASTATIN|Small molecule|[C10AA01, MK-07

In [40]:
# Number of unique molecules:
print(f"Number of unique molecules: {molecule_cross_refs.select('id').distinct().count()}")
print(f"Molecules with PubChem cross-ref: {molecule_cross_refs.filter(F.col('xref_source') == 'PubChem').select('id').distinct().count()}")
print(f"Number of PubChem cross-refs: {molecule_cross_refs.filter(F.col('xref_source') == 'PubChem').select('col').distinct().count()}")

Number of unique molecules: 9567
Molecules with PubChem cross-ref: 3951
Number of PubChem cross-refs: 13884


In [20]:
# What are the distribution of drug types:
original_type_distribution = (
    drug_df
    .groupby('drugType')
    .count()
    .orderBy('count', ascending=False)
    .withColumnRenamed('count', 'count_original')
)

pubchem_type_distribution = (
    molecule_cross_refs
    .filter(F.col('xref_source') == 'PubChem')
    .select(F.col('drugType'), F.col('name'))
    .distinct()
    .groupby('drugType')
    .count()
    .orderBy('count', ascending=False)
    .withColumnRenamed('count', 'count_pubchem')
)

(
    pubchem_type_distribution
    .join(original_type_distribution, on='drugType', how='outer')
    .show()
)

+---------------+-------------+--------------+
|       drugType|count_pubchem|count_original|
+---------------+-------------+--------------+
|         Enzyme|         null|            83|
|        unknown|         null|            30|
|           null|         null|             6|
|        Unknown|            4|           546|
| Small molecule|         3872|         10972|
|       Antibody|         null|           770|
|Oligosaccharide|            5|            47|
|Oligonucleotide|         null|            59|
|           Gene|         null|            37|
|        Protein|           27|           497|
|           Cell|         null|            29|
+---------------+-------------+--------------+



Wast majority of the available cross-reference to PubChem is available for small molecules. Some important goups not represented at all.

In [55]:
import requests


def fetch_compound_id(sid=[]):
    
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/{",".join(sid)}/cids/JSON'
    
    response = requests.get(url)
    json_data = response.json()

    if 'Fault' in json_data:
        return []

    else:
        return [structure['PDB_ID'] for structure in json_data['Structure']['Structures']]

    
def fetch_structure(cid):
    """
    Given the provided compound ID, retrieving the list of available structures
    """

    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/structure/compound/{cid}/JSON'

    response = requests.get(url)
    json_data = response.json()

    if 'Fault' in json_data:
        return []

    else:
        return [structure['PDB_ID'] for structure in json_data['Structure']['Structures']]
    


fetch_structure_udf = F.udf(
    fetch_structure,
    F.StringType()
)

In [56]:
(
    molecule_cross_refs
    .filter(
        (F.col('xref_source') == 'PubChem')  &
        (F.col('name') == 'ASPIRIN')
        
    )
    .select(F.col('id'), F.col('col'))
    .withColumnRenamed('col', 'xref')
    .withColumn('structures', fetch_structure_udf(F.col('xref')))
    .show()
)

+--------+---------+----------+
|      id|     xref|structures|
+--------+---------+----------+
|CHEMBL25|144203627|        []|
|CHEMBL25|144209315|        []|
|CHEMBL25|144210466|        []|
|CHEMBL25|170465039|        []|
|CHEMBL25| 17389202|        []|
|CHEMBL25| 17390036|        []|
|CHEMBL25|174007205|        []|
|CHEMBL25| 26747283|        []|
|CHEMBL25| 26752858|        []|
|CHEMBL25| 47193676|        []|
|CHEMBL25| 50105490|        []|
|CHEMBL25| 85230910|        []|
|CHEMBL25|    87798|        []|
|CHEMBL25| 90340586|        []|
+--------+---------+----------+



In [57]:
(
    molecule_cross_refs
    .filter(
        (F.col('xref_source') == 'PubChem')  &
        (F.col('name') == 'ASPIRIN')
        
    )
    .select(F.col('id'), F.col('col'))
    .withColumnRenamed('col', 'xref')
    .toPandas()
    .xref
    .apply(fetch_structure)
)

0     []
1     []
2     []
3     []
4     []
5     []
6     []
7     []
8     []
9     []
10    []
11    []
12    []
13    []
Name: xref, dtype: object

In [60]:
molecule_cross_refs.filter(F.col('col') == 2244).show()

+---+----+--------+--------+-----------+---+
| id|name|drugType|synonyms|xref_source|col|
+---+----+--------+--------+-----------+---+
+---+----+--------+--------+-----------+---+



In [62]:
molecule_cross_refs.filter(F.col('name') == 'ASPIRIN').show()

+--------+-------+--------------+--------------------+-----------+---------+
|      id|   name|      drugType|            synonyms|xref_source|      col|
+--------+-------+--------------+--------------------+-----------+---------+
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|   DailyMed|  aspirin|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem|144203627|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem|144209315|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem|144210466|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem|170465039|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem| 17389202|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem| 17390036|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem|174007205|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem| 26747283|
|CHEMBL25|ASPIRIN|Small molecule|[Acetylsalicylic ...|    PubChem| 26752858|

In [74]:



df_1 = (
    spark
    .read.options(delimiter='\t', header=True)
    .csv('/Users/dsuveges/project_data/side_project/wholeSourceMapping/src_id1/src1src2.txt.gz')
)
df_1.show()






+-------------+----------+
| From src:'1'|To src:'2'|
+-------------+----------+
|CHEMBL1185242|   DB06994|
|CHEMBL1201270|   DB06799|
|    CHEMBL522|   DB02464|
|CHEMBL3989948|   DB14814|
| CHEMBL313405|   DB07337|
|CHEMBL1328913|   DB08996|
|CHEMBL2402737|   DB12325|
|  CHEMBL39221|   DB04228|
|   CHEMBL1410|   DB06804|
|  CHEMBL35228|   DB12975|
| CHEMBL290744|   DB03810|
|CHEMBL3301593|   DB11923|
|CHEMBL2104344|   DB13404|
| CHEMBL334167|   DB03294|
|   CHEMBL1364|   DB00165|
| CHEMBL328910|   DB02839|
|CHEMBL1231160|   DB11759|
| CHEMBL489657|   DB07693|
|    CHEMBL614|   DB00339|
|CHEMBL3184454|   DB09221|
+-------------+----------+
only showing top 20 rows



In [75]:
(
    df_1
    .withColumnRenamed("From src:'1'", "chemblId")
    .withColumnRenamed("To src:'2'", "otherId")
    .filter(F.col('chemblId') == 'CHEMBL25')
    .show()
)

+--------+-------+
|chemblId|otherId|
+--------+-------+
|CHEMBL25|DB00945|
+--------+-------+

