In [1]:
import os
import json
import requests
import pandas as pd
import numpy as np
import networkx
# conda install -c biobuilds obonet
import obonet
import math

In [2]:
go_obo_file = "data/disprot/go-basic.obo"
disprot_old_file = "data/disprot/entries_2022_06.json"
disprot_new_file = "data/disprot/entries_2022_06_c.json"
sifts_file = "data/sifts/uniprot_segments_observed.tsv.gz"
alphafold_dir = "data/alphafold"
gene3d_file = ""

In [3]:
def expand_region(df_:pd.DataFrame, start_col:str='start', end_col:str='end', res_col:str='reg_position') -> pd.DataFrame:
    df_[res_col] = list(range(int(df_[start_col]), int(df_[end_col]) + 1, 1))
    return df_

def expand_sequence(df_:pd.DataFrame, seq_column:str='sequence', res_col:str='seq_aa') -> pd.DataFrame:
    df_[res_col] = [(i+1, aa) for i, aa in enumerate(df_[seq_column])]
    return df_

## Associate DisProt annotation terms to CAID challenges

In [4]:
data_idpo = [('IDPO:00076', 'disorder'), ('IDPO:00077', 'disorder'), ('IDPO:00078', 'disorder'), 
                   ('IDPO:00501', 'linker'), ('IDPO:00502', 'linker'), ('IDPO:00503', 'linker'), 
                   ('IDPO:00504', 'linker'), ('IDPO:00049', 'transition'), ('IDPO:00050', 'transition'), 
                   ('IDPO:00051', 'transition'), ('IDPO:00052', 'transition'), ('IDPO:00053', 'transition'), 
                   ('IDPO:00060', 'transition'), ('IDPO:00055', 'transition'), ('IDPO:00056', 'transition'), 
                   ('IDPO:00061', 'transition'), ('IDPO:00054', 'transition'), ('IDPO:00057', 'transition'), 
                   ('IDPO:00058', 'transition'), ('IDPO:00059', 'transition')]

In [5]:
# Ancestor terms corresponding to CAID2 challenges
ancestors = {'GO:0005488': 'binding', 'GO:0003676': 'nucleic acid binding', 'GO:0005515': 'protein binding'}

# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(go_obo_file)
to_remove = []
for e in graph.edges:
    if e[2] != 'is_a':
        to_remove.append((e[0], e[1]))

for ele in to_remove:
    graph.remove_edge(*ele)
    

# Create children table
data_go = []    
for node in graph.nodes(data=True):
    challenge = ancestors.get(node[0])
    if challenge is not None:
        data_go.append([node[0], challenge])
        for children in networkx.ancestors(graph, node[0]): 
            data_go.append([children, challenge])

In [6]:
df_challenge = pd.DataFrame(data=data_idpo + data_go, columns=['term_id', 'challenge'])
df_challenge

Unnamed: 0,term_id,challenge
0,IDPO:00076,disorder
1,IDPO:00077,disorder
2,IDPO:00078,disorder
3,IDPO:00501,linker
4,IDPO:00502,linker
...,...,...
3082,GO:0031850,protein binding
3083,GO:0001968,protein binding
3084,GO:0042021,protein binding
3085,GO:0005131,protein binding


## Process DisProt annotations

In [7]:
# Get DisProt annotations
disprot_old = {}
with open(disprot_old_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_old[obj["disprot_id"]] = obj
        
disprot_new = {}
with open(disprot_new_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_new[obj["disprot_id"]] = obj

In [8]:
# Get new annotations
dataset = []
for disprot_id in disprot_new:
    if disprot_id not in disprot_old and "obsolete" not in disprot_new[disprot_id] and "X" not in disprot_new[disprot_id]["sequence"]:
        # Filter out obsolete regions
        disprot_new[disprot_id]["regions"] = [region for region in disprot_new[disprot_id]["regions"] if "obsolete" not in region]
        if disprot_new[disprot_id]["regions"]:
            dataset.append(disprot_new[disprot_id])

In [9]:
entry_columns = ['disprot_id', 'acc', 'ncbi_taxon_id', 'organism', 'sequence']
region_columns = ["start", "end", "term_id"]

df = pd.json_normalize(data=dataset, record_path=['regions'], meta=entry_columns, meta_prefix='', record_prefix='')
df = df.loc[:, entry_columns + region_columns]
df

Unnamed: 0,disprot_id,acc,ncbi_taxon_id,organism,sequence,start,end,term_id
0,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078
1,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0051179
2,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0098772
3,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078
4,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0005515
...,...,...,...,...,...,...,...,...
1163,DP03744,Q9VVJ7,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,53,178,GO:0045454
1164,DP03745,Q8VHC3,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,25,34,IDPO:00076
1165,DP03745,Q8VHC3,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,121,145,IDPO:00076
1166,DP03745,Q8VHC3,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,24,145,GO:0045454


In [10]:
df_regions = df.apply(expand_region, axis=1).loc[:, ["disprot_id", "term_id", "reg_position"]].copy(deep=True)
df_regions = pd.merge(left=df_regions, right=df_challenge, how="inner", left_on="term_id", right_on="term_id").drop(columns=["term_id"])
df_regions = df_regions.explode("reg_position").drop_duplicates()
df_regions['has_region'] = 1
df_regions

Unnamed: 0,disprot_id,reg_position,challenge,has_region
0,DP02342,1,disorder,1
0,DP02342,2,disorder,1
0,DP02342,3,disorder,1
0,DP02342,4,disorder,1
0,DP02342,5,disorder,1
...,...,...,...,...
1105,DP03743,576,binding,1
1105,DP03743,577,binding,1
1105,DP03743,578,binding,1
1105,DP03743,579,binding,1


In [11]:
# Create the pivot table
df_regions = pd.pivot_table(
    df_regions,
    columns="challenge",
    index=['disprot_id', 'reg_position'],
    values='has_region')
df_regions

Unnamed: 0_level_0,challenge,binding,disorder,linker,nucleic acid binding,protein binding,transition
disprot_id,reg_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DP02342,1,,1.0,,,,
DP02342,2,,1.0,,,,
DP02342,3,,1.0,,,,
DP02342,4,,1.0,,,,
DP02342,5,,1.0,,,,
...,...,...,...,...,...,...,...
DP03746,382,,1.0,,,,
DP03746,383,,1.0,,,,
DP03746,384,,1.0,,,,
DP03746,385,,1.0,,,,


In [12]:
# Get dataset sequences (1 residue per row)
df_sequence = df.apply(expand_sequence, axis=1).copy(deep=True).drop(columns=["acc", "ncbi_taxon_id", "organism", "start", "end", "sequence", "term_id"])
df_sequence = df_sequence.explode("seq_aa")
df_sequence[['pos', 'aa']] = pd.DataFrame(df_sequence['seq_aa'].tolist(), index=df_sequence.index)
df_sequence.drop(columns='seq_aa', inplace=True)
df_sequence

Unnamed: 0,disprot_id,pos,aa
0,DP02342,1,M
0,DP02342,2,L
0,DP02342,3,C
0,DP02342,4,C
0,DP02342,5,M
...,...,...,...
1167,DP03746,1280,Q
1167,DP03746,1281,L
1167,DP03746,1282,L
1167,DP03746,1283,I


In [13]:
# Add sequence positions not mapping to any DisProt region
df_regions = pd.merge(left=df_regions, right=df_sequence, how="right", left_on=["disprot_id", "reg_position"], right_on=["disprot_id", "pos"])
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,pos,aa
0,DP02342,,1.0,,,,,1,M
1,DP02342,,1.0,,,,,2,L
2,DP02342,,1.0,,,,,3,C
3,DP02342,,1.0,,,,,4,C
4,DP02342,,1.0,,,,,5,M
...,...,...,...,...,...,...,...,...,...
963294,DP03746,,,,,,,1280,Q
963295,DP03746,,,,,,,1281,L
963296,DP03746,,,,,,,1282,L
963297,DP03746,,,,,,,1283,I


## Map PDB onbserved positions using SIFTS

In [14]:
df_sifts = pd.read_csv(sifts_file, sep="\t", header=1)
# Filter for dataset entries
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(df['acc'])]
# Explode observed regions 
df_sifts = df_sifts.apply(expand_region, start_col="SP_BEG", end_col="SP_END", axis=1)
df_sifts = df_sifts.explode("reg_position")
# Get the DisProt ID
df_sifts = pd.merge(df_sifts, df, how="left", left_on="SP_PRIMARY", right_on="acc").dropna(subset="disprot")
df_sifts = df_sifts.loc[:, ["disprot_id", "reg_position"]]
df_sifts

Unnamed: 0,disprot_id,reg_position
0,DP02728,2
1,DP02728,2
2,DP02728,3
3,DP02728,3
4,DP02728,4
...,...,...
3485449,DP03623,3563
3485450,DP03623,3563
3485451,DP03623,3564
3485452,DP03623,3564


In [15]:
df_regions = pd.merge(df_regions, df_sifts, left_on=["disprot_id", "pos"], right_on=["disprot_id", "reg_position"], how="left")
df_regions.rename(columns={"reg_position": "pdb_observed"}, inplace=True)
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,pos,aa,pdb_observed
0,DP02342,,1.0,,,,,1,M,
1,DP02342,,1.0,,,,,2,L,
2,DP02342,,1.0,,,,,3,C,
3,DP02342,,1.0,,,,,4,C,
4,DP02342,,1.0,,,,,5,M,
...,...,...,...,...,...,...,...,...,...,...
16731191,DP03746,,,,,,,1280,Q,
16731192,DP03746,,,,,,,1281,L,
16731193,DP03746,,,,,,,1282,L,
16731194,DP03746,,,,,,,1283,I,


## Add AlphaFold prediction

In [16]:
df_list = []
for af_file in os.listdir(alphafold_dir):
    df_list.append(pd.read_csv("{}/{}".format(alphafold_dir, af_file), sep='\t'))
df_af = pd.concat(df_list, ignore_index=True)
del df_list

In [17]:
# WARNING: rename depends on the version (only full lenght predictions should be used)
df_af['name'] = df_af['name'].apply(lambda x: x.replace('AF-','').replace('-F1-model_v3',''))
df_af = pd.merge(df_af, df, how="left", left_on="name", right_on="acc").dropna(subset="disprot_id")
df_af = df_af.loc[:, ["disprot_id", "pos", "aa", "lddt", "disorder", "disorder-25", "binding-25-0.581"]]
df_af['pos'] = df_af['pos'].astype('int')
df_af

Unnamed: 0,disprot_id,pos,aa,lddt,disorder,disorder-25,binding-25-0.581
0,DP02342,1,M,0.730,0.270,0.897,0.887
1,DP02342,2,L,0.734,0.266,0.891,0.889
2,DP02342,3,C,0.744,0.256,0.885,0.893
3,DP02342,4,C,0.704,0.296,0.878,0.876
4,DP02342,5,M,0.708,0.292,0.873,0.878
...,...,...,...,...,...,...,...
677949,DP03744,174,R,0.578,0.422,0.933,0.823
677950,DP03744,175,T,0.585,0.415,0.937,0.826
677951,DP03744,176,N,0.497,0.503,0.932,0.789
677952,DP03744,177,R,0.616,0.384,0.937,0.839


In [None]:
df_regions = pd.merge(df_regions, df_af, left_on=["disprot_id", "pos", "aa"], right_on=["disprot_id", "pos", "aa"], how="left")
df_regions