# Mapping drugs to structures to genes

Function of the script:
* Reading molecule index
* Joining molecule index with pdbe dataset by inchi key -> ligand identifier is now provided
* Extract pdb-ligand mapping from API
* Explode pdb column 
* join with chain identifier



Input:
* OT molecule index
* PDB chemical components table (`components_inchikeys.csv`) for mapping InChIKeys to PDB ligand (CCD) identifiers
* SIFTS dataset to map pdb chains to target identifier



In [9]:
import json
from json import JSONDecodeError

import requests
from functools import reduce
import pandas as pd
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set, struct, 
    regexp_replace, min as pyspark_min, explode, when,
    array_contains, count, first, element_at, size, sum as pyspark_sum
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField, BooleanType, StringType
from pyspark.sql import SparkSession
from collections import defaultdict

# Start (or reuse) a Spark session with the 'local[*]' master.
spark = SparkSession.builder.master('local[*]').getOrCreate()

def get_structure(pdb_compound_id: str) -> list:
    """Fetch the PDB structures containing a compound from the PDBe REST API.

    Args:
        pdb_compound_id: string, a single PDB compound (ligand) identifier.
    Returns:
        List of PDB structure identifiers where the compound can be found.
        An empty list is returned when the response body is not valid JSON
        or when the response holds no entry for the requested compound.
    Raises:
        requests.exceptions.RequestException: if the HTTP request itself
            fails (connection error, timeout, ...).
    """
    url = f'https://www.ebi.ac.uk/pdbe/api/pdb/compound/in_pdb/{pdb_compound_id}'

    # A timeout prevents one stalled connection from blocking the whole mapping run.
    response = requests.get(url, timeout=30)

    try:
        data = response.json()
    except JSONDecodeError:
        # The body could not be parsed; parsing it a second time (as was done
        # before) would only raise again, so always fall back to an empty list.
        print(f'Failed to return structures to: {pdb_compound_id}')
        return []

    try:
        return data[pdb_compound_id]
    except KeyError:
        # Valid JSON, but no entry keyed by this compound identifier.
        print(f'Empty data was returned for: {pdb_compound_id}')
        return []

# Input locations. These are absolute paths on the author's machine and must be
# adjusted before the notebook is re-run elsewhere.
# Molecule index (read as parquet):
molecule_index = '/Users/dsuveges/project_data/molecule/'
# UniChem source-1 to source-3 mapping (read as a tab-separated file with a header):
unichem_map = '/Users/dsuveges/Downloads/src1src3.txt'
# PDB chain to Ensembl gene mapping (comma-separated, '#' comment lines):
chain_map = '/Users/dsuveges/project/random_notebooks/issue-1891_extracting_drug-ligand_complex/pdb_chain_ensembl.csv'
# PDB components table providing the 'InChIKey' and 'CCD_ID' columns:
pdb_chem_file = '/Users/dsuveges/project/random_notebooks/issue-1891_extracting_drug-ligand_complex/components_inchikeys.csv'


## Generate molecule table

1. Reading molecule index
2. Reading the PDB chemical components file and joining by InChIKey
3. Mapping compound identifier using pdb API

In [10]:
# Molecule index, trimmed to the columns used in this notebook:
molecules_df = spark.read.parquet(molecule_index).select(
    col('id').alias('chembl_id'),
    col('inchiKey'),
    col('name'),
    col('linkedTargets'),
    col('linkedDiseases'),
)

print(molecules_df.count())

# NOTE: an earlier version mapped compounds through the UniChem file
# (chembl_id -> pdb_compound_id); the InChIKey-based components table
# read below is used instead.

# Components table: InChIKey -> PDB compound identifier.
pdb_chem_df = (
    spark.read.csv(pdb_chem_file, header=True, sep=',')
    .withColumnRenamed('InChIKey', 'inchiKey')
    .withColumnRenamed('CCD_ID', 'pdb_compound_id')
)

print(pdb_chem_df.count())

# Molecules joined to their compound identifier by InChIKey, then annotated
# with the list of structures returned by get_structure for each compound.
molecules_w_pdb = spark.createDataFrame(
    molecules_df.join(pdb_chem_df, on='inchiKey', how='inner')
    # Collected to pandas so that get_structure can be applied row by row:
    .toPandas()
    .assign(pdb_structure_id=lambda pdf: pdf['pdb_compound_id'].apply(get_structure))
)

print(molecules_w_pdb.count())
molecules_w_pdb.show()

12594
36500
Empty data was returned for: 1ZT
Empty data was returned for: 3IH
Empty data was returned for: 3KV
Empty data was returned for: 3SA
Empty data was returned for: 53P
Empty data was returned for: 6LG
Empty data was returned for: 72Q
Empty data was returned for: 9HE
Empty data was returned for: AA3
Empty data was returned for: ABG
Empty data was returned for: ACX
Empty data was returned for: ATG
Empty data was returned for: ATW
Empty data was returned for: B4M
Empty data was returned for: B7D
Empty data was returned for: BA4
Empty data was returned for: BCD
Empty data was returned for: BTP
Empty data was returned for: C4C
Empty data was returned for: CHA
Empty data was returned for: COR
Empty data was returned for: CSE
Empty data was returned for: CSW
Empty data was returned for: CTO
Empty data was returned for: CXN
Empty data was returned for: DB0
Empty data was returned for: DMR
Empty data was returned for: DR2
Empty data was returned for: DR5
Empty data was returned for: EC

## Mapping chains to genes

1. Reading pdb2chain file.
2. Join with the above generated dataset.
3. Save file as json.

In [29]:
# Reading file with chain map to Ensembl gene identifier.
# Only the structure, gene, chain and UniProt columns are kept, de-duplicated:
pdb_chain_to_gene = (
    spark.read.csv(chain_map, sep=',', header=True, comment='#')
    .select(
        col('PDB').alias('pdb_structure_id'),
        col('GENE_ID').alias('ensembl_gene_id'),
        col('CHAIN').alias('chain_id'),
        col('SP_PRIMARY').alias('uniprot')
    )
    .distinct()
    .persist()
)

# One row per structure: the compounds found in it and its gene-mapped chains.
gene_mapped_structures = (
    molecules_w_pdb

    # Explode the pdb structure column and aggregate compounds by structure:
    .select(
        'pdb_compound_id',
        explode(col('pdb_structure_id')).alias('pdb_structure_id')
    )
    .groupby('pdb_structure_id')
    .agg(collect_set(col('pdb_compound_id')).alias('compound_ids'))

    # Joining with genes by pdb_structure id. Structures without a matching
    # chain get a null gene id from the left join and are dropped by the filter.
    .join(pdb_chain_to_gene, on='pdb_structure_id', how='left')
    # Raw string: '\d' is not a valid escape sequence in a regular string literal.
    .filter(col('ensembl_gene_id').rlike(r'ENSG\d+'))

    # Reorganizing the dataset: collect the chains of each structure into an array
    # of structs; compound_ids is identical on every row of a structure.
    .groupby('pdb_structure_id')
    .agg(
        collect_set(struct(
            col('ensembl_gene_id'),
            col('chain_id'),
            col('uniprot')
        )).alias('chains'),
        first(col('compound_ids')).alias('compound_ids')
    )
    .persist()
)

gene_mapped_structures.count()

26585

### Filter out unwanted compounds

1. Fetch data from `https://zhanggroup.org/BioLiP/ligand_list`
2. Explode compound
3. Do antijoin

In [50]:
# List of compound identifiers to exclude, fetched from BioLiP.
# Every field is read as a plain string with NA detection disabled: with pandas'
# defaults an identifier such as 'NA' would be parsed as a missing value and end
# up as the string 'nan', so it could never match in the anti-join below.
excluded_compounds = spark.createDataFrame(
    pd.read_csv(
        'https://zhanggroup.org/BioLiP/ligand_list',
        header=None,
        dtype=str,
        keep_default_na=False,
    )
    .rename(columns={0: 'compound_id'})
).persist()

# Row counts recorded from an earlier run:
# exploded, no filter: 38633
# exploded, filtered: 21750
(
    gene_mapped_structures
    # One row per (structure, compound) pair:
    .withColumn('compound_id', explode(col('compound_ids')))
    # Keep only the pairs whose compound is not on the exclusion list:
    .join(excluded_compounds, on='compound_id', how='left_anti')
    .show()
)

+-----------+----------------+--------------------+--------------+
|compound_id|pdb_structure_id|              chains|  compound_ids|
+-----------+----------------+--------------------+--------------+
|        AS4|            2aa2|[{ENSG00000151623...|    [AS4, GOL]|
|        AS4|            6hgj|[{ENSG00000196136...|    [EDO, AS4]|
|        CCK|            3e3b|[{ENSG00000070770...|         [CCK]|
|        CCK|            3at4|[{ENSG00000101266...|         [CCK]|
|        CK9|            2a0c|[{ENSG00000123374...|         [CK9]|
|        CRS|            4gbc|[{ENSG00000254647...|     [CRS, ZN]|
|        CRS|            1zeh|[{ENSG00000254647...|     [CRS, ZN]|
|        CRS|            4gbi|[{ENSG00000254647...|     [CRS, ZN]|
|        CRS|            4gbl|[{ENSG00000254647...|     [CRS, ZN]|
|        CRS|            6gnq|[{ENSG00000254647...|[EDO, CRS, ZN]|
|        CRS|            1uz9|[{ENSG00000254647...|     [CRS, ZN]|
|        CRS|            4gbk|[{ENSG00000254647...|     [CRS, 

In [33]:
# Save the structure table as JSON, replacing any previous output.
gene_mapped_structures.write.mode('overwrite').json('gene_mapped_structures.json')

In [34]:
%%bash

# Concatenate the JSON part files of the output directory into one gzip-compressed file.
cat gene_mapped_structures.json/*json | gzip > gene_mapped_structures.json.gz

# Count the lines of the compressed file.
gzcat gene_mapped_structures.json.gz | wc -l

   26585


In [17]:
# Chain map reduced to unique (structure, Ensembl gene) pairs.
# This rebinds pdb_chain_to_gene without the chain and UniProt columns.
pdb_chain_to_gene = (
    spark.read.csv(chain_map, header=True, sep=',', comment='#')
    .selectExpr('PDB AS pdb_structure_id', 'GENE_ID AS ensembl_gene_id')
    .dropDuplicates()
    .persist()
)

# For every (compound, structure) pair, the set of genes mapped to the structure.
# The left join keeps pairs without any gene; they end up with an empty array.
structure_to_genes = (
    molecules_w_pdb
    .withColumn('pdb_structure_id', explode(col('pdb_structure_id')))
    .select('pdb_compound_id', 'pdb_structure_id')
    .join(pdb_chain_to_gene, on='pdb_structure_id', how='left')
    .distinct()
    .groupBy('pdb_compound_id', 'pdb_structure_id')
    .agg(collect_set(col('ensembl_gene_id')).alias('ensembl_gene_id'))
    .persist()
)

# Show the first record vertically, without truncating values:
structure_to_genes.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------
 pdb_compound_id  | 1FL               
 pdb_structure_id | 6e78              
 ensembl_gene_id  | [ENSG00000118271] 
only showing top 1 row



In [67]:
# Count the compounds having at least one structure with a mapped gene.
# NOTE(review): the filter keeps structures WITH at least one gene, yet the
# aggregated column is called 'structures_not_in_sifts' - confirm whether
# `== 0` was the intended condition.
(
    structure_to_genes
    .where(size(col('ensembl_gene_id')) > 0)
    .groupBy(col('pdb_compound_id').alias('CCD_id'))
    .agg(collect_set(col('pdb_structure_id')).alias('structures_not_in_sifts'))
    .count()
)

                                                                                

2670

In [4]:
%%bash

# Print the first lines of the gzip-compressed files in the output directory.
# NOTE(review): the glob only matches names ending in 'gz'; in the recorded run
# it matched nothing - confirm the directory holds compressed part files.
gzcat annotated_molecules.json.gz/*gz | head 
 

gzcat: can't stat: annotated_molecules.json.gz/*gz (annotated_molecules.json.gz/*gz.gz): No such file or directory


In [115]:
# Write the annotated molecules as gzip-compressed JSON part files. Without the
# compression option the directory named '*.json.gz' would hold plain JSON files.
annotated_molecules.write.json('annotated_molecules.json.gz', compression='gzip')

In [1]:
# Display the annotated rows of a single molecule:
annotated_molecules.filter(col('chembl_id') == 'CHEMBL3353410').show()

NameError: name 'annotated_molecules' is not defined

# Testing targets using sifts

Data fetched from the SIFTS flat files directory: `/pub/databases/msd/sifts/flatfiles/csv` (presumably on the EBI FTP server; the host is not recorded here).

In [6]:
%%bash

# Alternative row counts tried earlier (source directory of each file in brackets):
#   gzcat pdb_chain_ensembl.csv.gz | wc -l   [/pub/databases/msd/sifts/flatfiles/flatfiles/csv]
#   wc -l pdb_chain_ensembl.csv              [/pub/databases/msd/sifts/flatfiles/csv]

# Number of distinct values of the first two comma-separated columns:
cut -d, -f1,2 pdb_chain_ensembl.csv | sort -u | wc -l

# First lines of the file:
head pdb_chain_ensembl.csv

  215134
# 2022/02/20 - 18:52 | PDB: 07.22 | UniProt: 2022.01
PDB,CHAIN,SP_PRIMARY,GENE_ID,TRANSCRIPT_ID,TRANSLATION_ID,EXON_ID
10gs,A,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003491308
10gs,A,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003514541
10gs,A,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003556693
10gs,A,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003615072
10gs,A,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003750846
10gs,A,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003822108
10gs,B,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003491308
10gs,B,P09211,ENSG00000084207,ENST00000398606,ENSP00000381607,ENSE00003514541


In [32]:
# Compound identifier <-> InChIKey lookup from the components table:
id_map = spark.read.csv('components_inchikeys.csv', header=True, sep=',').select(
    col('CCD_ID').alias('compound_id'),
    col('InChIKey').alias('inchiKey'),
)

# UniChem map with its two source columns renamed to chembl_id / pdb_compound_id:
unichem_df = (
    spark.read.csv(unichem_map, header=True, sep='\t')
    .withColumnRenamed("From src:'1'", 'chembl_id')
    .withColumnRenamed("To src:'3'", 'pdb_compound_id')
)

# Molecules annotated through both routes:
#   compound_id      <- components table, matched on inchiKey
#   pdb_compound_id  <- UniChem, matched on chembl_id
molecules = (
    spark.read.parquet(molecule_index)
    .select(col('id').alias('chembl_id'), 'inchiKey', 'name')
    .join(id_map, on='inchiKey', how='left')
    .join(unichem_df, on='chembl_id', how='left')
    # Drop the molecules for which both routes came back empty:
    .filter(~(col('compound_id').isNull() & col('pdb_compound_id').isNull()))
    .persist()
)

molecules.show()

+-------------+--------------------+-----------------+-----------+---------------+
|    chembl_id|            inchiKey|             name|compound_id|pdb_compound_id|
+-------------+--------------------+-----------------+-----------+---------------+
| CHEMBL110458|LXBIFEVIBLOUGU-DP...|       MIGALASTAT|        DGJ|            DGJ|
|CHEMBL1169388|RTHCYVBBDHJXIQ-IN...|   (S)-Fluoxetine|        SFX|            SFX|
|CHEMBL1173055|HMABYWSNWIZPAG-UH...|        RUCAPARIB|        RPB|            RPB|
|CHEMBL1231606|GHQCCHWTDLTMJT-UH...|    CHEMBL1231606|        C4F|            C4F|
|CHEMBL1232381|AUTOLBMXDDTRRT-JG...|    CHEMBL1232381|        DTB|            DTB|
|CHEMBL1234354|XDLYKKIQACFMJG-WK...|      PF-04691502|        ML9|            ML9|
|CHEMBL1236282|OTVAEFIXJLOWRX-NX...|    THIAMPHENICOL|        TH8|            TH8|
|   CHEMBL1515|PMRYVIKBURPHAH-UH...|      METHIMAZOLE|        MMZ|            MMZ|
|   CHEMBL1560|FAKRSMQSSFJEIM-RQ...|        CAPTOPRIL|        X8Z|            X8Z|
|  C

In [33]:
# Number of rows in the molecules dataframe.
molecules.count()

3806

In [23]:
# Preview the compound identifier / InChIKey lookup table.
id_map.show()

+-----------+--------------------+
|compound_id|            inchiKey|
+-----------+--------------------+
|        000|CXHHBNMLPJOKQD-UH...|
|        001|NBYCDVVSYOMFMS-VM...|
|        002|MWZOULASPWUGJJ-NF...|
|        003|NNZDBCPMOOEFTE-UH...|
|        004|ZGUNAGUHMKGQNY-ZE...|
|        005|LDSJMFGYNFIFRK-IU...|
|        006|KKTYZYHUPKXLPL-RI...|
|        007|SJWOFBVBNFLWLP-UH...|
|        008|OEVYDSSAPNIURZ-AE...|
|        009|DCJGHBWTJFHQCR-UE...|
|        00A|RVDNKWBGRIIRML-XN...|
|        00B|PMQQFSDIECYOQV-SC...|
|        00C|XVOYSCVBGLVSOL-UW...|
|        00D|DXSUORGKJZADET-RX...|
|        00E|VIWZVFVJPXTXPA-UH...|
|        00F|SMNGNHRXUGKQDP-MW...|
|        00G|NSZDJRLPCLOQAM-UH...|
|        00H|CZMVVDBXWOZCRC-SN...|
|        00I|XRFHGJLVHMSQFX-IZ...|
|        00J|LGXVKMDGSIWEHL-UH...|
+-----------+--------------------+
only showing top 20 rows



In [27]:
# UniChem map joined to the compound/InChIKey lookup. The outer join keeps
# the compounds present in only one of the two tables.
unichem_df = (
    spark.read.csv(unichem_map, header=True, sep='\t')
    .withColumnRenamed("From src:'1'", 'chembl_id')
    .withColumnRenamed("To src:'3'", 'compound_id')
    .join(id_map, on=['compound_id'], how='outer')
)

unichem_df.show()

+-----------+-------------+--------------------+
|compound_id|    chembl_id|            inchiKey|
+-----------+-------------+--------------------+
|        000|         null|CXHHBNMLPJOKQD-UH...|
|        001|         null|NBYCDVVSYOMFMS-VM...|
|        002|         null|MWZOULASPWUGJJ-NF...|
|        003| CHEMBL401874|NNZDBCPMOOEFTE-UH...|
|        004| CHEMBL378605|ZGUNAGUHMKGQNY-ZE...|
|        005| CHEMBL302932|LDSJMFGYNFIFRK-IU...|
|        006| CHEMBL231522|KKTYZYHUPKXLPL-RI...|
|        007| CHEMBL382127|SJWOFBVBNFLWLP-UH...|
|        008| CHEMBL381806|OEVYDSSAPNIURZ-AE...|
|        009| CHEMBL406317|DCJGHBWTJFHQCR-UE...|
|        00A|         null|RVDNKWBGRIIRML-XN...|
|        00B|CHEMBL1229504|PMQQFSDIECYOQV-SC...|
|        00C|         null|XVOYSCVBGLVSOL-UW...|
|        00D|         null|DXSUORGKJZADET-RX...|
|        00E|         null|VIWZVFVJPXTXPA-UH...|
|        00F|         null|SMNGNHRXUGKQDP-MW...|
|        00G|         null|NSZDJRLPCLOQAM-UH...|
|        00H|       

In [34]:
# Molecule index restricted to identifier, InChIKey and name:
molecules = (
    spark.read.parquet(molecule_index)
    .select(col('id').alias('chembl_id'), 'inchiKey', 'name')
    .persist()
)

# Molecules that can be mapped through the InChIKey lookup:
im = molecules.join(id_map, on=['inchiKey'])

# Molecules that can be mapped through UniChem:
um = molecules.join(unichem_df, on=['chembl_id'])

# Size of the union of the two mappings, matched on chembl_id:
im.join(um, on=['chembl_id'], how='outer').count()

3806

In [None]:
%%bash

# List the working directory: long format, hidden files included, human-readable sizes.
ls -lah