In [8]:
from plip.structure.preparation import PDBComplex, Ligand
import requests
import pandas as pd



In [3]:
import json
from json import JSONDecodeError

import requests
from functools import reduce
import pandas as pd
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set, struct, 
    regexp_replace, min as pyspark_min, explode, when,
    array_contains, count, first, element_at, size, sum as pyspark_sum
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField, BooleanType, StringType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from collections import defaultdict

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)



def get_ligand_identifier(ligand) -> str:
    """For a ligand object it parses the ligand identifier
    
    Args:
        ligands: plip ligand object
    Returns:
        ligand identifier eg. 
    """
    
    return f'{ligand.hetid}:{ligand.chain}:{ligand.position}'


def fetch_pdb(pdb_structure_id: str)-> str:
    """This function fetches the pdb file from ePDB server as a string
    
    Args:
        pdb_structure_id (str)
    Returns:
        structure data in pdb format as string eg 'AIN:A:1202'
    """
    if not pdb_structure_id:
        return ''
    
    url = f'https://www.ebi.ac.uk/pdbe/entry-files/download/pdb{pdb_structure_id}.ent'
    
    try:
        response = requests.get(url)
        data = response.text
    except ConnectionError:
        data = ''
        
    return data


## Fetching structure from web

In [9]:
pdb_structure_id = '1ed4'
pdb_compound_id = 'IPU'

# Columns of interest:
interaction_columns = {
    'a_orig_idx': 'acceptor_index',
    'd_orig_idx': 'donor_index',
    'type': 'type',
    'resnr': 'residue_number',
    'restype': 'resitude_type', 
    'reschain': 'residue_chain',
    'restype_l': 'pdb_compound_id'
}

# Managing complex:
mol_complex = PDBComplex()

# Fetch and load structure:
pdb_string = fetch_pdb(pdb_structure_id)
mol_complex.load_pdb(pdb_string, as_string=True)

# Filtering out only the relevant ligands:
ligands_of_interest = [ligand for ligand in mol_complex.ligands if ligand.hetid == pdb_compound_id ]

# Characterizing relevant complex:
[mol_complex.characterize_complex(ligand) for ligand in ligands_of_interest]

# Extract data in the right shape:
(
    pd.DataFrame([x._asdict() for x in mol_complex.interaction_sets['AIN:A:1202'].all_itypes])
    [interaction_columns.keys()]
    .rename(columns=interaction_columns)
)

KeyError: 'AIN:A:1202'

In [7]:
interactions = []
for molecule, interaction_set in mol_complex.interaction_sets.items():
    if pdb_compound_id not in molecule:
        continue

    interactions += [interaction._asdict() for interaction in interaction_set.all_itypes]

df = (
    pd.DataFrame(interactions)
    [interaction_columns.keys()]
    .rename(columns=interaction_columns)
)

df

NameError: name 'mol_complex' is not defined

In [6]:
pyspark.__version__

NameError: name 'pyspark' is not defined

In [92]:
mol_complex.interaction_sets['IPU:A:1830'].hbonds_ldon #all_itypes[0]._asdict()

[hbond(a=<openbabel.pybel.Atom object at 0x168229700>, a_orig_idx=2354, d=<openbabel.pybel.Atom object at 0x16866afd0>, d_orig_idx=6654, h=<openbabel.pybel.Atom object at 0x16866edf0>, distance_ah=1.7314022722175801, distance_ad=2.714410801628963, angle=176.04977730747, type='strong', protisdon=False, resnr=363, restype='GLU', reschain='A', resnr_l=1830, restype_l='IPU', reschain_l='A', sidechain=True, atype='O2', dtype='Ng+')]

## Reading structure for file

In [3]:
pdb_compound_id = 'AIN'
pdb_structure_id = '3iaz'

pdb_file = f'/Users/dsuveges/Downloads/pdb{pdb_structure_id}.ent'
pdb_file = '/Users/dsuveges/project_data//pdb2qqt.ent'

# Managing complex:
mol_complex = PDBComplex()

# Fetch and load structure:
mol_complex.load_pdb(pdb_file)


In [7]:
# Filtering out only the relevant ligands:
ligands_of_interest = [ligand for ligand in mol_complex.ligands if ligand.hetid == pdb_compound_id ]

# Characterizing relevant complex:
[mol_complex.characterize_complex(ligand) for ligand in ligands_of_interest]

# Extract data in the right shape:
# (
#     pd.DataFrame([x._asdict() for x in mol_complex.interaction_sets['AIN:A:1202'].all_itypes])
#     [interaction_columns.keys()]
#     .rename(columns=interaction_columns)
# )

[None]

In [46]:
parsed_interaction_list = [interaction._asdict() for interaction_set in mol_complex.interaction_sets.values() for interaction in interaction_set.all_itypes]
# spark.createDataFrame(parsed_interaction_list).show()
[print(x['a']) for x in parsed_interaction_list if 'a' in x]



Atom: 8 (3.81 13.30 33.95)
Atom: 7 (5.28 3.68 56.83)
Atom: 7 (2.54 5.86 55.48)
Atom: 7 (3.38 5.17 53.45)
Atom: 8 (7.45 9.76 45.23)
Atom: 7 (8.98 12.19 43.45)
Atom: 8 (-1.69 11.79 63.75)


[None, None, None, None, None, None, None]

In [43]:
interaction_set = list(mol_complex.interaction_sets.values())[0]
interaction_set.all_itypes

<plip.structure.preparation.PLInteraction at 0x165878fa0>

In [142]:
type(mol_complex.ligands[1])

plip.structure.preparation.ligand

In [48]:
source_data_file = '/Users/dsuveges/project/random_notebooks/_/molecules_w_pdb.json.gz'

source_df = spark.read.json('/Users/dsuveges/project/random_notebooks/_/')

source_df.show()




+-------------+-------+----------+---------+--------------------+---------------+---------------+
|    chembl_id|example|hasDisease|hasTarget|                name|pdb_compound_id|structure_count|
+-------------+-------+----------+---------+--------------------+---------------+---------------+
|CHEMBL1231606|   3c4f|     false|    false|       CHEMBL1231606|            C4F|              1|
|CHEMBL1236282|   2jkj|     false|    false|       THIAMPHENICOL|            TH8|              1|
|CHEMBL2107333|   6lrz|      true|     true|   DIMETHYL FUMARATE|            EOU|              1|
| CHEMBL227711|   1udc|     false|    false|        CHEMBL227711|            UFM|              1|
|CHEMBL3039513|   4yti|      true|     true|        DECERNOTINIB|            VJK|              1|
| CHEMBL355048|   1yvz|     false|    false|        CHEMBL355048|            JPC|              1|
| CHEMBL392452|   2rbe|     false|    false|        CHEMBL392452|            ZMG|              1|
| CHEMBL403772|   1w