# Exploring PLIP application


**Source data**: annotated drugs, where the drug id (`chembl_id`) is annotated with pdb structure and compound id + the corresponding target identifier based on the SIFTS dataset. 

**Schema**:

```
root
 |-- pdb_structure_id: string (nullable = true)
 |-- chembl_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pdb_compound_id: string (nullable = true)
 |-- ensembl_gene_id: string (nullable = true)
 |-- pdb_chain_id: string (nullable = true)
```

In [26]:
import argparse
import json
import logging
from json import JSONDecodeError
import urllib

import requests
from functools import reduce
import pandas as pd
from pyspark.sql.functions import (
    col, udf, struct, lit, split, expr, collect_set,
    regexp_replace, min as pyspark_min, explode, when,
    array_contains, count, first, element_at, size, sum as pyspark_sum
)
from pyspark.sql.types import FloatType, ArrayType, StructType, StructField, BooleanType, StringType
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from collections import defaultdict

def fetch_pdb(pdb_strcture_id: str, folder: str) -> None:
    """This function fetches a single PDB structure from PDBEurope
    
    Args:
        pdb_strcture_id: string, a single PDB structure identifier
        folder: string, a folder to save the structure
    """
    try:
        urllib.request.urlretrieve(f'https://www.ebi.ac.uk/pdbe/entry-files/download/pdb{pdb_strcture_id}.ent', f'{folder}/{pdb_strcture_id}.pdb')
    except:
        # logging.warning(f'Failed to fetch PDB structure: {pdb_strcture_id}')
        print(f'Failed to fetch PDB structure: {pdb_strcture_id}')


spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

source_data = 'molecules_w_targets'

# Reading data:
annotated_molecules = (
    spark.read.parquet(source_data)
    .filter(col('ensembl_gene_id').isNotNull())
    .sample(False, 0.01)
    .distinct()
    .persist()
)

# Extracting a list of strcutures:
pdbs = [row.pdb_structure_id for row in annotated_molecules.select(col('pdb_structure_id')).distinct().collect()]
len(pdbs)

# Dowlading the structures:
folder = './pdb_structures'
[fetch_pdb(pdb, folder) for pdb in pdbs]

Failed to fetch PDB structure: 4u51
Failed to fetch PDB structure: 7l20
Failed to fetch PDB structure: 6zsg
Failed to fetch PDB structure: 6tpy
Failed to fetch PDB structure: 6ftj
Failed to fetch PDB structure: 5tga
Failed to fetch PDB structure: 7pd3
Failed to fetch PDB structure: 6lqv
Failed to fetch PDB structure: 7a5g
Failed to fetch PDB structure: 7d5s
Failed to fetch PDB structure: 7nqh
Failed to fetch PDB structure: 7jgl
Failed to fetch PDB structure: 7ajt
Failed to fetch PDB structure: 7of4
Failed to fetch PDB structure: 6qc7
Failed to fetch PDB structure: 6zsb
Failed to fetch PDB structure: 4u4n
Failed to fetch PDB structure: 6zvj
Failed to fetch PDB structure: 5mrc
Failed to fetch PDB structure: 7ohu
Failed to fetch PDB structure: 5tgm
Failed to fetch PDB structure: 6zsc
Failed to fetch PDB structure: 6zmi
Failed to fetch PDB structure: 5xti
Failed to fetch PDB structure: 6o2s
Failed to fetch PDB structure: 6vlz
Failed to fetch PDB structure: 6msg
Failed to fetch PDB structur

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

Conclusions:
* ~100 structures: 1minute, 
* the test contained 1.7k structures out of which only 1.4k could be downloaded
* Some larger compexes has no pdb, but tar.gz files. -> no, we don't go that directino
* It seems we still need to pre-filter some ligands that are not relevant.
* Timewise estimated time constraint: ~6hours. We need to find simpler way to get the data.
* Also the full dataset is around 55GB

In [32]:
spark.read.parquet(source_data).printSchema()

root
 |-- pdb_structure_id: string (nullable = true)
 |-- chembl_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pdb_compound_id: string (nullable = true)
 |-- ensembl_gene_id: string (nullable = true)
 |-- pdb_chain_id: string (nullable = true)



In [31]:
2.6 * 36000 / 1700 

55.05882352941177