In [9]:
import pandas as pd
import json
import requests

from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Local Spark session using every core available on this machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)


# Collapse the PLIP output to one row per (structure, compound, chain, residue)
# and gather the distinct interaction types seen for that residue into a list.
# NOTE: no schema is supplied/inferred, so every CSV column is read as a string.
df = (
    spark.read.csv('/Users/dsuveges/project_data/marine/plip_output.csv', sep=',', header=True)
    .distinct()
    .groupBy(['pdb_structure_id', 'compound_id', 'prot_residue_number','prot_chain_id', 'prot_residue_type'])
    .agg(
        f.collect_set(f.col('interaction_type')).alias('interaction_types')
    )
)

# Spark's default save mode raises if the target directory already exists, which
# made this cell fail on every re-run; overwrite so "Restart & Run All" works.
df.write.mode('overwrite').json('/Users/dsuveges/project_data/marine/plip_output_aggregated.json')

In [10]:
%%bash
# Concatenate the Spark part files into a single gzipped JSON-lines file.
# pipefail: without it a failing `cat` (no part files, wrong path) would still
# leave a truncated/empty archive behind and the cell would report success.
set -euo pipefail

cat /Users/dsuveges/project_data/marine/plip_output_aggregated.json/*json \
    | gzip > /Users/dsuveges/project_data/marine/plip_output_aggregated.json.gz


In [29]:
import pandas as pd
from functools import reduce

# Load the gzipped JSON-lines export of the aggregated PLIP table into pandas.
# 319408 -> unique 265603  (NOTE(review): presumably row counts before/after
# de-duplication -- confirm which step these numbers refer to)
df = pd.read_json(
    '/Users/dsuveges/project_data/marine/plip_output_aggregated.json.gz',
    orient='records',
    lines=True,
)

df.head()

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types
0,1abi,DPN,53,I,ASN,[hbond]
1,1agn,ZN,174,C,CYS,[metal_complex]
2,1ai0,ZN,10,L,HIS,[metal_complex]
3,1am4,GNP,660,F,LEU,[hbond]
4,1an0,GDP,118,A,ASP,[saltbridge]


In [86]:
# Restrict to a single structure (3e7g) while developing the residue mapping.
# This definition used to be commented out, so `test_df` only existed as
# leftover kernel state and the cell failed on a fresh kernel.
test_df = df.groupby('pdb_structure_id').get_group('3e7g')

test_df.query('prot_chain_id == "A"').query('compound_id == "AT2"')

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types
26321,3e7g,AT2,350,A,PRO,[hydroph_interaction]
112137,3e7g,AT2,369,A,PHE,[hydroph_interaction]
209266,3e7g,AT2,377,A,GLU,[hbond]


In [119]:
def get_pdb_sifts_mapping(pdb_id: str, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch the PDBe SIFTS Ensembl mappings of a PDB entry as a per-residue table.

    Every mapped segment returned by the API is expanded into one row per
    UniProt position. The author (PDB) residue number of each position is
    derived by shifting the UniProt position with the segment's
    ``author_start - unp_start`` offset, i.e. numbering is assumed to run
    contiguously inside a segment.

    Args:
        pdb_id: PDB entry identifier; surrounding whitespace and letter case
            are ignored.
        timeout: Seconds to wait for the PDBe API before giving up.

    Returns:
        De-duplicated DataFrame with the columns ``accession``,
        ``prot_chain_id``, ``uniprot_position`` and ``prot_residue_number``.
        The frame is empty (same columns) when the response holds no usable
        mapping segment for the entry.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    output_columns = ['accession', 'prot_chain_id', 'uniprot_position', 'prot_residue_number']

    # Query and look up with the same lower-cased id so an upper-case input
    # cannot miss the entry key in the response.
    pdb_id = pdb_id.strip().lower()
    url = f'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/{pdb_id}'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Flatten the per-gene segment lists in one pass.
    segments = [
        segment
        for gene in data.get(pdb_id, {}).get('Ensembl', {}).values()
        for segment in gene['mappings']
    ]
    if not segments:
        return pd.DataFrame(columns=output_columns)

    mappings = pd.DataFrame(segments)

    # Segments without an author residue number cannot be placed on the
    # structure numbering; coerce missing values to NaN and drop them.
    # NOTE(review): assumes author_residue_number is integer-like when present.
    mappings = (
        mappings
        .assign(
            author_start=pd.to_numeric(
                mappings['start'].map(lambda position: position['author_residue_number']),
                errors='coerce',
            )
        )
        .dropna(subset=['author_start'])
    )
    if mappings.empty:
        return pd.DataFrame(columns=output_columns)

    return (
        mappings
        .assign(
            # Constant shift between author and UniProt numbering in the segment.
            offset=lambda df: df['author_start'].astype(int) - df['unp_start'],
            uniprot_position=lambda df: pd.Series(
                [list(range(start, end + 1)) for start, end in zip(df['unp_start'], df['unp_end'])],
                index=df.index,
            ),
        )
        .explode('uniprot_position')
        # An empty range explodes to a single NaN row; it carries no position.
        .dropna(subset=['uniprot_position'])
        # explode() leaves an object column; restore an integer dtype.
        .astype({'uniprot_position': int})
        .assign(prot_residue_number=lambda df: df['uniprot_position'] + df['offset'])
        .rename(columns={'chain_id': 'prot_chain_id'})
        [output_columns]
        .drop_duplicates()
    )


# TODO: unfinished stub kept for reference -- wrap the steps below in a function:
# def map2uniprot(plip_df: pd.DataFrame) -> pd.DataFrame:
#     pdb_id = plip_df.pdb_id.iloc

# Take the structure id from the first row of the test frame and fetch its
# SIFTS residue mapping.
pdb_id = test_df['pdb_structure_id'].iloc[0]
sifts_df = get_pdb_sifts_mapping(pdb_id)


In [120]:
# Display the table returned by get_pdb_sifts_mapping for the test structure.
sifts_df


Unnamed: 0,accession,prot_chain_id,uniprot_position,prot_residue_number
0,P35228-2,A,82,82
0,P35228-2,A,83,83
0,P35228-2,A,84,84
0,P35228-2,A,85,85
0,P35228-2,A,86,86
...,...,...,...,...
272,P35228,D,501,501
272,P35228,D,502,502
272,P35228,D,503,503
272,P35228,D,504,504


In [121]:
# Left-join the PLIP residues of the test structure onto the SIFTS table by
# chain and author residue number, then look at compound AT2 on chain A.
# Left as the cell's last expression (instead of print) so the notebook renders
# a table rather than plain text with wrapped columns.
(
    test_df
    .merge(sifts_df, on=['prot_chain_id', 'prot_residue_number'], how='left')
    .query('prot_chain_id == "A" & compound_id == "AT2"')
    .head(1000)
)



    pdb_structure_id compound_id  prot_residue_number prot_chain_id  \
12              3e7g         AT2                  350             A   
13              3e7g         AT2                  350             A   
14              3e7g         AT2                  350             A   
54              3e7g         AT2                  369             A   
55              3e7g         AT2                  369             A   
56              3e7g         AT2                  369             A   
105             3e7g         AT2                  377             A   
106             3e7g         AT2                  377             A   
107             3e7g         AT2                  377             A   

    prot_residue_type      interaction_types       accession uniprot_position  
12                PRO  [hydroph_interaction]        P35228-2              311  
13                PRO  [hydroph_interaction]  PRO_0000170930              350  
14                PRO  [hydroph_interaction]     

In [118]:
# Raw SIFTS payload for the structure under investigation. It is fetched here
# explicitly: this cell previously relied on a module-level `data` variable
# that no cell defines, so it failed on a fresh kernel.
response = requests.get(
    f'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/{pdb_id}',
    timeout=30,
)
response.raise_for_status()
data = response.json()

# Segment-level view (one row per mapped segment) for chain A, keeping the
# UniProt and author start/end of every segment.
mappings_df = (
    pd.DataFrame([
        segment
        for gene in data[pdb_id]['Ensembl'].values()
        for segment in gene['mappings']
    ])
    .assign(
        author_start = lambda df: df['start'].map(lambda start: start['author_residue_number']),
        author_end = lambda df: df['end'].map(lambda end: end['author_residue_number']),
    )
    [['accession', 'chain_id', 'unp_start', 'author_start', 'unp_end', 'author_end']]
    .query('chain_id == "A"')
    .drop_duplicates()
)

# Segments whose author numbering covers the residue of interest.
value = 377
(
    mappings_df
    .query('author_start <= @value & author_end >= @value')
)

Unnamed: 0,accession,chain_id,unp_start,author_start,unp_end,author_end
45,PRO_0000170930,A,335,335,393,393
46,P35228,A,335,335,393,393
47,P35228-2,A,296,335,354,393


In [104]:
# Segments of gene ENSG00000007171 that start at UniProt position 335 or 296,
# limited to translation ENSP00000327251 on chain A; printed through Spark.
(
    pd.DataFrame(data['3e7g']['Ensembl']['ENSG00000007171']['mappings'])
    .loc[:, [
        'accession', 'translation_id', 'transcript_id',
        'chain_id', 'unp_start', 'unp_end', 'coverage',
    ]]
    .query('unp_start == 335 | unp_start == 296')
    .query('translation_id == "ENSP00000327251" & chain_id == "A"')
    .pipe(spark.createDataFrame)
    .show()
)

+--------------+---------------+---------------+--------+---------+-------+--------+
|     accession| translation_id|  transcript_id|chain_id|unp_start|unp_end|coverage|
+--------------+---------------+---------------+--------+---------+-------+--------+
|PRO_0000170930|ENSP00000327251|ENST00000313735|       A|      335|    393|     1.0|
|        P35228|ENSP00000327251|ENST00000313735|       A|      335|    393|     1.0|
|      P35228-2|ENSP00000327251|ENST00000313735|       A|      296|    354|     1.0|
+--------------+---------------+---------------+--------+---------+-------+--------+



In [114]:
# Chain A segments whose UniProt range contains `value`, excluding the
# accession PRO_0000170930; printed through Spark.
value = 115
(
    pd.DataFrame(data['3e7g']['Ensembl']['ENSG00000007171']['mappings'])
    .loc[:, [
        'accession', 'translation_id', 'transcript_id', 'exon_id',
        'chain_id', 'unp_start', 'unp_end', 'coverage', 'genome_start', 'genome_end',
    ]]
    .query('unp_start <= @value & unp_end >= @value')
    .query('chain_id == "A"')
    .query('accession != "PRO_0000170930"')
    .pipe(spark.createDataFrame)
    .show()
)

+---------+---------------+---------------+---------------+--------+---------+-------+--------+------------+----------+
|accession| translation_id|  transcript_id|        exon_id|chain_id|unp_start|unp_end|coverage|genome_start|genome_end|
+---------+---------------+---------------+---------------+--------+---------+-------+--------+------------+----------+
| P35228-2|ENSP00000482291|ENST00000621962|ENSE00001347946|       A|      107|    156|     1.0|    27787826|  27787973|
|   P35228|ENSP00000482291|ENST00000621962|ENSE00001347946|       A|      107|    156|     1.0|    27787826|  27787973|
| P35228-2|ENSP00000327251|ENST00000313735|ENSE00001347946|       A|      107|    156|     1.0|    27787826|  27787973|
|   P35228|ENSP00000327251|ENST00000313735|ENSE00001347946|       A|      107|    156|     1.0|    27787826|  27787973|
+---------+---------------+---------------+---------------+--------+---------+-------+--------+------------+----------+



In [108]:
# Distinct (translation, transcript) pairs present in the mappings of gene
# ENSG00000007171; printed through Spark.
(
    pd.DataFrame(data['3e7g']['Ensembl']['ENSG00000007171']['mappings'])
    .loc[:, ['translation_id', 'transcript_id']]
    .drop_duplicates()
    .pipe(spark.createDataFrame)
    .show()
)

+---------------+---------------+
| translation_id|  transcript_id|
+---------------+---------------+
|ENSP00000327251|ENST00000313735|
|ENSP00000482291|ENST00000621962|
+---------------+---------------+



In [115]:
# genome_start minus genome_end of exon ENSE00001347946, using the values
# printed in the exon table output earlier in this notebook.
27787826 -27787973

-147

In [116]:
# 27787826 is the genome_start of exon ENSE00001347946 from the exon table output.
# NOTE(review): 27787678 does not appear in any output shown in this notebook --
# confirm where it comes from.
27787678 - 27787826

-148