This step is only an aggregation done in Spark. It is independent of the downstream processing, which is done entirely in pandas:

In [9]:
import pandas as pd
import json
import requests

from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Input table and the directory that receives the aggregated JSON partitions.
plip_csv_path = '/Users/dsuveges/project_data/marine/plip_output.csv'
plip_aggregated_path = '/Users/dsuveges/project_data/marine/plip_output_aggregated.json'

# Establish the Spark connection (local mode, all available cores).
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Collapse the table to one row per structure / compound / residue, gathering
# every interaction type seen for that residue into a single array column.
df = (
    spark.read.csv(plip_csv_path, sep=',', header=True)
    # The input contains duplicated rows; drop them before aggregating.
    .distinct()
    .groupBy(['pdb_structure_id', 'compound_id', 'prot_residue_number', 'prot_chain_id', 'prot_residue_type'])
    .agg(
        f.collect_set(f.col('interaction_type')).alias('interaction_types')
    )
)

# Overwrite any previous export: with the default save mode the write fails as
# soon as the output directory exists, so the cell could only ever run once.
df.write.mode('overwrite').json(plip_aggregated_path)

Concatenating the Spark output partitions into a single gzipped file:

In [10]:
%%bash 

# Stop on the first failing command, including failures inside the pipeline.
# Without this, a failing `cat` (e.g. no partition files) is hidden by gzip's
# exit status and an empty or truncated archive is written without any error.
set -euo pipefail

# Join all JSON partition files and compress them into one archive.
cat /Users/dsuveges/project_data/marine/plip_output_aggregated.json/*json \
    | gzip > /Users/dsuveges/project_data/marine/plip_output_aggregated.json.gz


In [29]:
import pandas as pd
from functools import reduce

# Load the gzipped, line-delimited JSON export into pandas.
# Row counts noted by the author: 319408 -> unique 265603
df = pd.read_json(
    '/Users/dsuveges/project_data/marine/plip_output_aggregated.json.gz',
    orient='records',
    lines=True,
)

# Preview the first rows.
df.head()

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types
0,1abi,DPN,53,I,ASN,[hbond]
1,1agn,ZN,174,C,CYS,[metal_complex]
2,1ai0,ZN,10,L,HIS,[metal_complex]
3,1am4,GNP,660,F,LEU,[hbond]
4,1an0,GDP,118,A,ASP,[saltbridge]


In [122]:
# Split the interaction table into one group per PDB structure.
grouped = df.groupby(by='pdb_structure_id')

# Pick a single structure to prototype the residue mapping on.
test_df = grouped.get_group('3e7g')
test_df.head()

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types
332,3e7g,H4B,479,C,GLU,[hydroph_interaction]
5304,3e7g,AT2,373,C,TYR,[hbond]
5305,3e7g,ZN,110,A,CYS,[metal_complex]
12503,3e7g,AT2,347,B,TYR,[hbond]
26321,3e7g,AT2,350,A,PRO,[hydroph_interaction]


In [123]:
def get_pdb_sifts_mapping(pdb_id: str) -> pd.DataFrame:
    """Fetch the Ensembl mappings of a PDB entry and expand them to one row per residue.

    The PDBe graph-API response is flattened into a list of mapping segments.
    Each segment is expanded to every UniProt position between ``unp_start``
    and ``unp_end`` (inclusive). The author residue number of each position is
    derived by shifting it with the segment's constant offset
    ``author_start - unp_start``; the code therefore assumes the author
    numbering runs contiguously within a segment.

    Args:
        pdb_id: PDB identifier, used both in the request URL and as the
            top-level key of the JSON response.

    Returns:
        DataFrame with the columns ``accession``, ``prot_chain_id``,
        ``uniprot_position`` and ``prot_residue_number``, de-duplicated.
        The index is not reset, so it repeats the index of the originating
        segment. An empty frame with these columns is returned when the
        response lists no mapping segments.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
        KeyError: if the response does not contain ``pdb_id`` / ``'Ensembl'``.
    """
    url = f'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/{pdb_id}'
    output_columns = ['accession', 'prot_chain_id', 'uniprot_position', 'prot_residue_number']

    # A timeout keeps a stalled connection from blocking the notebook forever, and
    # raise_for_status() surfaces HTTP failures instead of parsing an error body.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Flatten the per-gene mapping lists in a single pass.
    mappings = [
        mapping
        for gene in data[pdb_id]['Ensembl'].values()
        for mapping in gene['mappings']
    ]

    # Nothing to expand: return an empty frame with the documented columns
    # rather than failing on the missing `start` column further down.
    if not mappings:
        return pd.DataFrame(columns=output_columns)

    return (
        pd.DataFrame(mappings)
        .assign(
            author_start = lambda frame: frame.start.apply(lambda start: start['author_residue_number']),
            # Every UniProt position covered by the segment (inclusive range).
            uniprot_position = lambda frame: pd.Series(
                [list(range(first, last + 1)) for first, last in zip(frame['unp_start'], frame['unp_end'])],
                index=frame.index,
            ),
            # Constant shift between author and UniProt numbering for the segment.
            diff = lambda frame: frame['author_start'] - frame['unp_start'],
        )
        .explode('uniprot_position')
        .assign(
            # explode() leaves an object column; convert it for the arithmetic only.
            prot_residue_number = lambda frame: pd.to_numeric(frame['uniprot_position']) + frame['diff']
        )
        [['accession', 'chain_id', 'uniprot_position', 'prot_residue_number']]
        .rename(columns={'chain_id': 'prot_chain_id'})
        .drop_duplicates()
    )


# TODO: wrap the steps below in a reusable function, e.g.
# def map2uniprot(plip_df: pd.DataFrame) -> pd.DataFrame:
#     pdb_id = plip_df.pdb_structure_id.iloc[0]

# Take the PDB id from the first row of the example frame and fetch its mapping.
pdb_id = test_df['pdb_structure_id'].iloc[0]
sifts_df = get_pdb_sifts_mapping(pdb_id)

sifts_df.head()

Unnamed: 0,accession,prot_chain_id,uniprot_position,prot_residue_number
0,P35228-2,A,82,82
0,P35228-2,A,83,83
0,P35228-2,A,84,84
0,P35228-2,A,85,85
0,P35228-2,A,86,86


In [126]:
# Attach accession and UniProt position to each interacting residue. The left
# join keeps residues without a mapping; a residue that matches several
# accessions produces one output row per accession.
test_df.merge(
    sifts_df,
    how='left',
    on=['prot_chain_id', 'prot_residue_number'],
)

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types,accession,uniprot_position
0,3e7g,H4B,479,C,GLU,[hydroph_interaction],PRO_0000170930,479
1,3e7g,H4B,479,C,GLU,[hydroph_interaction],P35228,479
2,3e7g,H4B,479,C,GLU,[hydroph_interaction],P35228-2,440
3,3e7g,AT2,373,C,TYR,[hbond],P35228-2,334
4,3e7g,AT2,373,C,TYR,[hbond],PRO_0000170930,373
...,...,...,...,...,...,...,...,...
115,3e7g,ZN,115,C,CYS,[metal_complex],PRO_0000170930,115
116,3e7g,ZN,115,C,CYS,[metal_complex],P35228,115
117,3e7g,AT2,377,C,GLU,[hbond],P35228-2,338
118,3e7g,AT2,377,C,GLU,[hbond],PRO_0000170930,377
