In [92]:
import json
import os
from collections import defaultdict
from functools import reduce
from json import JSONDecodeError

import pandas as pd
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set, struct, 
    regexp_replace, min as pyspark_min, explode, when,
    array_contains, count, first, element_at, size, sum as pyspark_sum
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField, BooleanType, StringType

# Start a local Spark session (or reuse the one already running in this kernel);
# 'local[*]' is the master setting passed to the builder.
spark = SparkSession.builder.master('local[*]').getOrCreate()

def get_structure(pdb_compound_id: str) -> list:
    """Fetch the PDB structure identifiers that contain a compound (PDBe REST API).

    The function never raises on an unusable payload: it always returns a list,
    so the result can be stored directly in a dataframe column.

    Args:
        pdb_compound_id: string, a single compound identifier
    Returns:
        List of PDB structure identifiers where the compound can be found.
        An empty list is returned when the response body is not valid JSON or
        when the payload has no entry for the requested compound.
    """
    url = f'https://www.ebi.ac.uk/pdbe/api/pdb/compound/in_pdb/{pdb_compound_id}'
    response = requests.get(url)

    # Decode the body once. JSONDecodeError is a ValueError subclass, so this
    # also covers requests versions that raise a plain ValueError. The body must
    # not be decoded again inside the handler: that would raise the same error.
    try:
        data = response.json()
    except ValueError:
        print(f'Failed to return structures to: {pdb_compound_id}')
        return []

    # A payload without the compound key (or one that is not a mapping at all)
    # means there are no structures to report.
    try:
        return data[pdb_compound_id]
    except (KeyError, TypeError):
        print(f'Empty data was returned for: {pdb_compound_id}')
        return []

# Input locations. The defaults are the original local paths; each can be
# overridden with an environment variable so the notebook can run on another
# machine without editing the code.
# Molecule index, read below with spark.read.parquet.
molecule_index = os.environ.get('OT_MOLECULE_INDEX', '/Users/dsuveges/project_data/molecule/')
# Tab-separated UniChem mapping (src1 -> src3), read below with spark.read.csv.
unichem_map = os.environ.get('UNICHEM_SRC1_SRC3_MAP', '/Users/dsuveges/Downloads/src1src3.txt')
# Comma-separated PDB chain to Ensembl gene mapping, read below with spark.read.csv.
chain_map = os.environ.get('PDB_CHAIN_ENSEMBL_MAP', '/Users/dsuveges/Downloads/pdb_chain_ensembl.csv')


In [90]:
# Load the molecule index and keep only the columns used downstream;
# the `id` column is exposed under the name `chembl_id`.
molecule_columns = [col('id').alias('chembl_id'), 'name', 'linkedTargets', 'linkedDiseases']
molecules_df = spark.read.parquet(molecule_index).select(*molecule_columns)

# Number of molecules in the index:
print(molecules_df.count())

# Load the UniChem mapping (tab separated, first line is the header) and give
# the two source columns descriptive names.
unichem_raw = spark.read.csv(unichem_map, sep='\t', header=True)
unichem_df = (
    unichem_raw
    .withColumnRenamed('From src:\'1\'', 'chembl_id')
    .withColumnRenamed('To src:\'3\'', 'pdb_compound_id')
)

# Number of rows in the mapping:
print(unichem_df.count())

# Keep only the molecules that have a PDB compound identifier (inner join on
# chembl_id) and collect the result to pandas.
molecules_joined_pdf = (
    molecules_df
    .join(unichem_df, on='chembl_id', how='inner')
    .toPandas()
)

# Look up the structures of every compound; one get_structure call per row.
molecules_w_structures_pdf = molecules_joined_pdf.assign(
    pdb_structure_id=molecules_joined_pdf.pdb_compound_id.apply(get_structure)
)

# Back to Spark for the downstream processing:
molecules_w_pdb = spark.createDataFrame(molecules_w_structures_pdf)

print(molecules_w_pdb.count())
molecules_w_pdb.show()

# Reading the file with the chain map to Ensembl gene identifiers.
# Source column -> name used in this notebook (order defines the output order):
chain_map_columns = {
    'PDB': 'pdb_structure_id',
    'CHAIN': 'chain',
    'SP_PRIMARY': 'uniprot_id',
    'GENE_ID': 'ensembl_gene_id',
}

pdb_chain_to_gene = (
    spark.read.csv(chain_map, sep=',', header=True, comment='#')
    .select(*[col(source).alias(target) for source, target in chain_map_columns.items()])
    .persist()
)

12594
14452
Empty data was returned for: 6LG
Empty data was returned for: C4C
Empty data was returned for: MEG
Empty data was returned for: FRR
Empty data was returned for: 72Q
Empty data was returned for: THZ
Empty data was returned for: MXL
Empty data was returned for: BTP
Empty data was returned for: ECT
Empty data was returned for: LIG
Empty data was returned for: DB0
Empty data was returned for: HSF
Empty data was returned for: EMR
Empty data was returned for: CXN
3657
+-------------+-----------------+--------------------+--------------------+---------------+--------------------+
|    chembl_id|             name|       linkedTargets|      linkedDiseases|pdb_compound_id|    pdb_structure_id|
+-------------+-----------------+--------------------+--------------------+---------------+--------------------+
| CHEMBL110458|       MIGALASTAT|{[ENSG00000102393...| {[Orphanet_324], 1}|            DGJ|[3s5y, 3thd, 3tv8...|
|CHEMBL1169388|   (S)-Fluoxetine|                null|               

In [113]:
# Per-molecule annotation:
#   structure_count - number of entries in pdb_structure_id
#   hasTarget       - True when linkedTargets is not null
#   hasDisease      - True when linkedDiseases is not null
#   example         - first entry of pdb_structure_id
# Rows are ordered by structure_count, largest first.
structure_ids = col('pdb_structure_id')

annotated_molecules = (
    molecules_w_pdb
    .select(
        'chembl_id',
        'name',
        'pdb_compound_id',
        size(structure_ids).alias('structure_count'),
        col('linkedTargets').isNotNull().alias('hasTarget'),
        col('linkedDiseases').isNotNull().alias('hasDisease'),
        element_at(structure_ids, 1).alias('example'),
    )
    .orderBy(col('structure_count').desc())
    .persist()
)

# Despite the name, this is not a "top 100" list: it holds every molecule
# found in at least 100 structures.
top_100 = annotated_molecules.where(col('structure_count') >= 100).persist()

In [111]:
# Summary of the molecules in top_100.
# hasDisease is derived from linkedDiseases, so it drives the "indications"
# count; hasTarget is derived from linkedTargets, so it drives the
# "mechanism of action" count. The two flags were previously swapped.
# The structure total is unpacked from the Row so a plain number is printed.
print(f'Number of molecules: {top_100.count()}')
print(f'Number of structures: {top_100.select(pyspark_sum(col("structure_count")).alias("counts")).first()["counts"]}')
print(f'Number of molecules without indications: {top_100.filter(~col("hasDisease")).count()}')
print(f'Number of molecules without mechanism of action: {top_100.filter(~col("hasTarget")).count()}')


Number of molecules: 129
Number of structures: [Row(counts=117548)]
Number of molecules without indications: 126
Number of molecules without mechanism of action: 126


In [114]:
# Summary of all molecules in annotated_molecules.
# hasDisease is derived from linkedDiseases, so it drives the "indications"
# count; hasTarget is derived from linkedTargets, so it drives the
# "mechanism of action" count. The two flags were previously swapped.
# The structure total is unpacked from the Row so a plain number is printed.
print(f'Number of molecules: {annotated_molecules.count()}')
print(f'Number of structures: {annotated_molecules.select(pyspark_sum(col("structure_count")).alias("counts")).first()["counts"]}')
print(f'Number of molecules without indications: {annotated_molecules.filter(~col("hasDisease")).count()}')
print(f'Number of molecules without mechanism of action: {annotated_molecules.filter(~col("hasTarget")).count()}')


Number of molecules: 3657
Number of structures: [Row(counts=137431)]
Number of molecules without indications: 3041
Number of molecules without mechanism of action: 3041


In [115]:
# Export the annotated molecules as JSON lines.
# - compression='gzip': the output name ends in .json.gz, so the part files are
#   actually gzip-compressed instead of being written as plain text.
# - mode='overwrite': the default save mode raises when the path already
#   exists, which would break re-running the notebook.
annotated_molecules.write.json('annotated_molecules.json.gz', mode='overwrite', compression='gzip')

In [125]:
# Spot check: display the annotation row of a single molecule.
annotated_molecules.where(col('chembl_id') == 'CHEMBL3353410').show()

+-------------+-----------+---------------+---------------+---------+----------+-------+
|    chembl_id|       name|pdb_compound_id|structure_count|hasTarget|hasDisease|example|
+-------------+-----------+---------------+---------------+---------+----------+-------+
|CHEMBL3353410|OSIMERTINIB|            YY3|             10|     true|      true|   4zau|
+-------------+-----------+---------------+---------------+---------+----------+-------+

