In [213]:
import pandas as pd

pdb_list_file = 'pdb_list.txt'
data = '/Users/dsuveges/project/random_notebooks/issue-1891_extracting_drug-ligand_complex/molecules_w_targets/'

pdb_ids = pd.read_csv(pdb_list_file, sep=',', header=None, names=['pdbId'])
print(pdb_ids.head())
print(len(pdb_ids))


  pdbId
0  13gs
1  1avd
2  1b86
3  1bzm
4  1bzs
988


In [214]:
import json
from json import JSONDecodeError

import requests
from functools import reduce
import pandas as pd
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set, struct, 
    regexp_replace, min as pyspark_min, explode, when,
    array_contains, count, first, element_at, size, sum as pyspark_sum, array
)
from pyspark.sql.types import (
    FloatType, ArrayType, StructType, StructField, BooleanType, StringType, IntegerType
)
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from collections import defaultdict
from pyspark.context import SparkContext

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Dataset witht all the details, produced earlier:
input_dataset = (
    spark.read.parquet(data)
    .persist()
)

# This dataset is grouped by pdb id to get all the compounds:
data_to_look_up = (
    input_dataset
    .filter(col('ensembl_gene_id').startswith('ENSG'))
    .groupby('pdb_structure_id')
    .agg(collect_set(col('pdb_compound_id')).alias('coumpound_ids'))
    .persist()
)

data_to_look_up.show()
print(data_to_look_up.count())

+----------------+-------------+
|pdb_structure_id|coumpound_ids|
+----------------+-------------+
|            1avd|        [BTN]|
|            1d5m|        [ALC]|
|            1d6q|        [GOL]|
|            1e9b|        [ATM]|
|            1ere|        [EST]|
|            1j3z|        [CMO]|
|            1jan|         [ZN]|
|            1ln2|        [MSE]|
|            1lq8|   [NDG, IPA]|
|            1ozj|         [ZN]|
|            1qxe|   [FUX, OXY]|
|            1raz|         [ZN]|
|            1t2v|        [SEP]|
|            1t9s|    [ZN, 5GP]|
|            1y8q|    [ATP, ZN]|
|            1ydb|    [AZM, ZN]|
|            1yxu|        [AMP]|
|            1z0f|        [GDP]|
|            1z89|   [62P, NAP]|
|            2b02|        [MSE]|
+----------------+-------------+
only showing top 20 rows

27315


In [215]:
pdb_w_compound = (
    data_to_look_up
    .withColumnRenamed('pdb_structure_id', 'pdbId')
    .toPandas()
    .merge(pdb_ids, on='pdbId', how='right')
)

print(pdb_w_compound.head())
print(len(pdb_w_compound))

  pdbId    coumpound_ids
0  13gs  [SAS, MES, GSH]
1  1avd            [BTN]
2  1b86            [OXY]
3  1bzm             [ZN]
4  1bzs             [ZN]
988


In [253]:
from plip.structure.preparation import PDBComplex
from plip.exchange.report import BindingSiteReport
from plip.basic import config

import json

class GetPDB:
    
    PDB_URL = 'https://www.ebi.ac.uk/pdbe/entry-files/download/pdb{}.ent'
    
    def __init__(self, data_folder: str) -> None:
        self.data_folder = data_folder
        
    
    def get_pdb(self, pdb_structure_id: str) -> str:
        """Reading file from a given loaction fetch and save if not found"""
        try:
            # Readind data from the given location:
            with open(f'{self.data_folder}/pdb{pdb_structure_id}.ent', 'rt') as f:
                data = f.read()
            
        except FileNotFoundError:
            # Fetch data from the web
            data = self.fetch_pdb(pdb_structure_id)
            
            # Save file
            with open(f'{self.data_folder}/pdb{pdb_structure_id}.ent', 'wt') as f:
                f.write(data)
    
        return data
    

    def fetch_pdb(self, pdb_structure_id: str)-> str:
        """This function fetches the pdb file from ePDB server as a string

        Args:
            pdb_structure_id (str)
        Returns:
            structure data in pdb format as string eg 'AIN:A:1202'
        """
        if not pdb_structure_id:
            return ''

        try:
            response = requests.get(self.PDB_URL.format(pdb_structure_id))
            data = response.text
        except ConnectionError:
            data = ''

        return data


def parse_interaction(interaction: PLInteraction, compound_id:str, pdb_id:str) -> dict:

    interaction_type = interaction.__doc__.split('(')[0]
    
    if interaction_type == 'waterbridge':
        return {}

    # Parsing data form the interaction:
    return {
        'pdb_structure_id': pdb_id,
        'compound_id': compound_id,
        'interaction_type': interaction_type,
        'prot_residue_number': interaction.resnr,
        'prot_residue_type': interaction.restype,
        'prot_chain_id': interaction.reschain
    }

def characerize_complex(pdb_id, compounds, gpdb) -> list:
    # Get pdb data:    
    pdb_data = gpdb.get_pdb(pdb_id)
    if not pdb_data.startswith('HEADER'):
        return []

    # Load into plip:
    mol_complex = PDBComplex()
    mol_complex.load_pdb(pdb_data, as_string=True)

    # Filtering out only the relevant ligands:
    ligands_of_interest = [ligand for ligand in mol_complex.ligands if ligand.hetid in compounds]

    # Characterizing relevant complex:
    [mol_complex.characterize_complex(ligand) for ligand in ligands_of_interest]

    # Extract details from ligands:
    data = [parse_interaction(interaction, compound.split(':')[0], pdb_id) for compound, interaction_set in mol_complex.interaction_sets.items() for interaction in interaction_set.all_itypes]
    if data == []:
        return None
    
    with open(f'output/{pdb_id}.json', 'wt') as f:
        for document in data:
            if data != {}:
                json.dump(document, f)
    

    


In [263]:
import dask.dataframe as dd

pdb_w_compound =(
    pd.read_csv('structure_for_plip.csv',) # <= 99 structures    
    # Creating arrays of compounds:
    .groupby('pdbStructureId')
    .agg({
        'pdbCompoundId': lambda s: s.to_list()
    })
    .reset_index()
    
    # Rename columns:
    .rename(columns={
        'pdbStructureId': 'pdb_structure_id',
        'pdbCompoundId': 'coumpound_ids'
    })
)

# Converting pandas to dask dataframe:
ddf = dd.from_pandas(pdb_w_compound, npartitions=8)
gpdb = GetPDB(data_folder='pdbs/')

# Execute plip parsing:
res_df = (
    ddf
    .assign(
        new_col = ddf.map_partitions(
            lambda df: df.apply(lambda row: characerize_complex(*row, gpdb), axis=1), meta=(None, 'f8')
        )
    )
       .compute(scheduler='processes')
)

res_df.head()


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [239]:
from itertools import chain

pd.DataFrame(list(chain.from_iterable(
    res_df
    .loc[lambda df: df.new_col.apply(lambda x: len(x) >0)]
    .assign(new_col = lambda df: df.new_col.apply(lambda l: [value for value in l if value != {}]))
    .new_col
    .to_list()
)))



Unnamed: 0,pdb_structure_id,compound_id,interaction_type,prot_residue_number,prot_residue_type,prot_chain_id
0,1cul,3PO,saltbridge,484,ARG,A
1,1cul,3PO,saltbridge,1029,ARG,B
2,1cul,3PO,saltbridge,1029,ARG,B
3,1cul,3PO,saltbridge,1029,ARG,B
4,1cul,3PO,saltbridge,1065,LYS,B
...,...,...,...,...,...,...
1486,7voe,9SC,hydroph_interaction,339,PHE,A
1487,7voe,9SC,hydroph_interaction,340,PHE,A
1488,7voe,9SC,hydroph_interaction,151,TRP,A
1489,7voe,9SC,hydroph_interaction,243,PHE,A


In [256]:
(
    spark.read.json('output/')
    .show()
)

+-----------+----------------+----------------+-------------+-------------------+-----------------+
|compound_id|interaction_type|pdb_structure_id|prot_chain_id|prot_residue_number|prot_residue_type|
+-----------+----------------+----------------+-------------+-------------------+-----------------+
|        A3P|      saltbridge|            5x2b|            C|                 47|              LYS|
|        3AT|      saltbridge|            1k90|            A|                329|              ARG|
|        29S|      saltbridge|            4xi3|            A|                351|              ASP|
|        3QZ|           hbond|            3qz1|            A|                232|              ARG|
|        6RL|           hbond|            5kby|            A|                125|              ARG|
|        0LI|           hbond|            3ik3|            A|                318|              MET|
|        3AT|      saltbridge|            4lt6|            B|                227|              LYS|


In [262]:
%tb

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.