In [1]:
import os
import json
import pandas as pd
import networkx
import numpy as np
import obonet  # conda install -c biobuilds obonet

In [2]:
go_obo_file = "data/disprot/go-basic.obo"
disprot_old_file = "data/disprot/entries_2022_06.json"
disprot_new_file = "data/disprot/entries_2022_06_c.json"
sifts_file = "data/sifts/uniprot_segments_observed.tsv.gz"
alphafold_dir = "data/alphafold"
gene3d_file = ""
references_dir = "data/references"  # output

In [3]:
def expand_region(df_:pd.DataFrame, start_col:str='start', end_col:str='end', res_col:str='reg_position') -> pd.DataFrame:
    df_[res_col] = list(range(int(df_[start_col]), int(df_[end_col]) + 1, 1))
    return df_

def expand_sequence(df_:pd.DataFrame, seq_column:str='sequence', res_col:str='seq_aa') -> pd.DataFrame:
    df_[res_col] = [(i+1, aa) for i, aa in enumerate(df_[seq_column])]
    return df_

## Associate DisProt annotation terms to CAID challenges

In [4]:
# IDPO terms
data_idpo = [('IDPO:00076', 'disorder'), ('IDPO:00077', 'disorder'), ('IDPO:00078', 'disorder'), 
                   ('IDPO:00501', 'linker'), ('IDPO:00502', 'linker'), ('IDPO:00503', 'linker'), 
                   ('IDPO:00504', 'linker'), ('IDPO:00049', 'transition'), ('IDPO:00050', 'transition'), 
                   ('IDPO:00051', 'transition'), ('IDPO:00052', 'transition'), ('IDPO:00053', 'transition'), 
                   ('IDPO:00060', 'transition'), ('IDPO:00055', 'transition'), ('IDPO:00056', 'transition'), 
                   ('IDPO:00061', 'transition'), ('IDPO:00054', 'transition'), ('IDPO:00057', 'transition'), 
                   ('IDPO:00058', 'transition'), ('IDPO:00059', 'transition')]

# GO ancestor terms coresponding to CAID2 challenges
ancestors = {'GO:0005488': 'binding', 'GO:0003676': 'nucleic acid binding', 'GO:0005515': 'protein binding'}

In [5]:
# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(go_obo_file)

# Remove all edges which are not "is_a"
to_remove = []
for e in graph.edges:
    if e[2] != 'is_a':
        to_remove.append((e[0], e[1]))
for ele in to_remove:
    graph.remove_edge(*ele)
    
# Create children table
data_go = []    
for node in graph.nodes(data=True):
    challenge = ancestors.get(node[0])
    if challenge is not None:
        data_go.append([node[0], challenge])
        for children in networkx.ancestors(graph, node[0]): 
            data_go.append([children, challenge])

In [6]:
df_challenge = pd.DataFrame(data=data_idpo + data_go, columns=['term_id', 'challenge']).drop_duplicates()
df_challenge

Unnamed: 0,term_id,challenge
0,IDPO:00076,disorder
1,IDPO:00077,disorder
2,IDPO:00078,disorder
3,IDPO:00501,linker
4,IDPO:00502,linker
...,...,...
3082,GO:0031710,protein binding
3083,GO:0031435,protein binding
3084,GO:0045518,protein binding
3085,GO:0032183,protein binding


## Process DisProt annotations

In [7]:
# Get DisProt annotations
disprot_old = {}
with open(disprot_old_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_old[obj["disprot_id"]] = obj
        
disprot_new = {}
with open(disprot_new_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_new[obj["disprot_id"]] = obj

In [8]:
# Get new annotations (delta = new - old)
dataset = []
for disprot_id in disprot_new:
    if disprot_id not in disprot_old and "obsolete" not in disprot_new[disprot_id] and "X" not in disprot_new[disprot_id]["sequence"]:
        # Filter out obsolete regions
        disprot_new[disprot_id]["regions"] = [region for region in disprot_new[disprot_id]["regions"] if "obsolete" not in region]
        if disprot_new[disprot_id]["regions"]:
            dataset.append(disprot_new[disprot_id])

In [9]:
entry_columns = ['disprot_id', 'acc', 'ncbi_taxon_id', 'organism', 'sequence']
region_columns = ["start", "end", "term_id"]

df = pd.json_normalize(data=dataset, record_path=['regions'], meta=entry_columns, meta_prefix='', record_prefix='')
df = df.loc[:, entry_columns + region_columns]
df

Unnamed: 0,disprot_id,acc,ncbi_taxon_id,organism,sequence,start,end,term_id
0,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078
1,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0051179
2,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0098772
3,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078
4,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0005515
...,...,...,...,...,...,...,...,...
1163,DP03744,Q9VVJ7,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,53,178,GO:0045454
1164,DP03745,Q8VHC3,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,25,34,IDPO:00076
1165,DP03745,Q8VHC3,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,121,145,IDPO:00076
1166,DP03745,Q8VHC3,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,24,145,GO:0045454


In [10]:
# Transform the per-protein dataframe into a per-residue dataframe 
df_regions = df.apply(expand_region, axis=1).loc[:, ["disprot_id", "term_id", "reg_position"]].copy(deep=True)
df_regions = pd.merge(left=df_regions, right=df_challenge, how="inner", left_on="term_id", right_on="term_id").drop(columns=["term_id"])
df_regions = df_regions.explode("reg_position").drop_duplicates()
df_regions['has_region'] = 1
df_regions

Unnamed: 0,disprot_id,reg_position,challenge,has_region
0,DP02342,1,disorder,1
0,DP02342,2,disorder,1
0,DP02342,3,disorder,1
0,DP02342,4,disorder,1
0,DP02342,5,disorder,1
...,...,...,...,...
1105,DP03743,576,binding,1
1105,DP03743,577,binding,1
1105,DP03743,578,binding,1
1105,DP03743,579,binding,1


In [11]:
# Create the pivot table. Transpose challenge values into columns 
df_regions = pd.pivot_table(
    df_regions,
    columns="challenge",
    index=['disprot_id', 'reg_position'],
    values='has_region')
df_regions = df_regions.reset_index()
df_regions

challenge,disprot_id,reg_position,binding,disorder,linker,nucleic acid binding,protein binding,transition
0,DP02342,1,,1.0,,,,
1,DP02342,2,,1.0,,,,
2,DP02342,3,,1.0,,,,
3,DP02342,4,,1.0,,,,
4,DP02342,5,,1.0,,,,
...,...,...,...,...,...,...,...,...
35754,DP03746,382,,1.0,,,,
35755,DP03746,383,,1.0,,,,
35756,DP03746,384,,1.0,,,,
35757,DP03746,385,,1.0,,,,


In [12]:
# Get dataset sequences (1 residue per row)
df_sequence = df.apply(expand_sequence, axis=1).copy(deep=True).drop(columns=["ncbi_taxon_id", "organism", "start", "end", "sequence", "term_id"])
df_sequence = df_sequence.explode("seq_aa")
df_sequence[['pos', 'aa']] = pd.DataFrame(df_sequence['seq_aa'].tolist(), index=df_sequence.index)
df_sequence.drop(columns='seq_aa', inplace=True)
df_sequence

Unnamed: 0,disprot_id,acc,pos,aa
0,DP02342,P06837,1,M
0,DP02342,P06837,2,L
0,DP02342,P06837,3,C
0,DP02342,P06837,4,C
0,DP02342,P06837,5,M
...,...,...,...,...
1167,DP03746,Q9QUH6-2,1280,Q
1167,DP03746,Q9QUH6-2,1281,L
1167,DP03746,Q9QUH6-2,1282,L
1167,DP03746,Q9QUH6-2,1283,I


In [14]:
# Add sequence positions not mapping to any DisProt region
df_regions = pd.merge(left=df_regions, right=df_sequence, how="right", left_on=["disprot_id", "reg_position"], right_on=["disprot_id", "pos"])
df_regions.drop(columns="reg_position", inplace=True)
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,acc,pos,aa
0,DP02342,,1.0,,,,,P06837,1,M
1,DP02342,,1.0,,,,,P06837,2,L
2,DP02342,,1.0,,,,,P06837,3,C
3,DP02342,,1.0,,,,,P06837,4,C
4,DP02342,,1.0,,,,,P06837,5,M
...,...,...,...,...,...,...,...,...,...,...
963294,DP03746,,,,,,,Q9QUH6-2,1280,Q
963295,DP03746,,,,,,,Q9QUH6-2,1281,L
963296,DP03746,,,,,,,Q9QUH6-2,1282,L
963297,DP03746,,,,,,,Q9QUH6-2,1283,I


## Map PDB onbserved positions using SIFTS

In [17]:
df_sifts = pd.read_csv(sifts_file, sep="\t", header=1)
# Filter for dataset entries
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(df_regions['acc'])]
# Explode observed regions 
df_sifts = df_sifts.apply(expand_region, start_col="SP_BEG", end_col="SP_END", axis=1)
df_sifts = df_sifts.explode("reg_position")
df_sifts = df_sifts.loc[:, ['SP_PRIMARY', 'reg_position']].drop_duplicates().reset_index()
df_sifts

Unnamed: 0,index,SP_PRIMARY,reg_position
0,1082,P42166,2
1,1082,P42166,3
2,1082,P42166,4
3,1082,P42166,5
4,1082,P42166,6
...,...,...,...
98769,958662,Q9Y4D1,600
98770,958662,Q9Y4D1,656
98771,958662,Q9Y4D1,657
98772,958662,Q9Y4D1,658


In [20]:
df_regions = pd.merge(df_regions, df_sifts, left_on=["acc", "pos"], right_on=["SP_PRIMARY", "reg_position"], how="left")
df_regions.rename(columns={"reg_position": "pdb"}, inplace=True)
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,acc,pos,aa,index,SP_PRIMARY,pdb
0,DP02342,,1.0,,,,,P06837,1,M,,,
1,DP02342,,1.0,,,,,P06837,2,L,,,
2,DP02342,,1.0,,,,,P06837,3,C,,,
3,DP02342,,1.0,,,,,P06837,4,C,,,
4,DP02342,,1.0,,,,,P06837,5,M,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
963294,DP03746,,,,,,,Q9QUH6-2,1280,Q,,,
963295,DP03746,,,,,,,Q9QUH6-2,1281,L,,,
963296,DP03746,,,,,,,Q9QUH6-2,1282,L,,,
963297,DP03746,,,,,,,Q9QUH6-2,1283,I,,,


## Add AlphaFold prediction

In [22]:
df_list = []
for af_file in os.listdir(alphafold_dir):
    df_list.append(pd.read_csv("{}/{}".format(alphafold_dir, af_file), sep='\t'))
df_af = pd.concat(df_list, ignore_index=True)
del df_list

In [23]:
# WARNING: rename depends on the version (only full lenght predictions should be used)
df_af['name'] = df_af['name'].apply(lambda x: x.replace('AF-','').replace('-F1-model_v3',''))
df_af.rename(columns={"disorder": "af-disorder", "disorder-25": "af-rsa", "binding-25-0.581": "af-binding"}, inplace=True)
df_af['pos'] = df_af['pos'].astype('int')
df_af

Unnamed: 0,name,pos,aa,lddt,af-disorder,rsa,ss,af-rsa,af-binding
0,Q9JZ10,1,M,0.413,0.587,1.000,-,0.280,0.280
1,Q9JZ10,2,G,0.535,0.465,0.429,-,0.271,0.271
2,Q9JZ10,3,N,0.859,0.141,0.478,-,0.278,0.278
3,Q9JZ10,4,F,0.920,0.080,0.345,E,0.290,0.290
4,Q9JZ10,5,L,0.910,0.090,0.000,E,0.295,0.295
...,...,...,...,...,...,...,...,...,...
218152,E9Q4Y4,2510,S,0.860,0.140,0.038,E,0.353,0.353
218153,E9Q4Y4,2511,L,0.775,0.225,0.476,E,0.355,0.355
218154,E9Q4Y4,2512,G,0.707,0.293,0.405,-,0.323,0.323
218155,E9Q4Y4,2513,K,0.512,0.488,0.727,-,0.324,0.324


In [24]:
df_regions = pd.merge(df_regions, df_af, left_on=["acc", "pos", "aa"], right_on=["name", "pos", "aa"], how="left")
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,acc,pos,aa,index,SP_PRIMARY,pdb,name,lddt,af-disorder,rsa,ss,af-rsa,af-binding
0,DP02342,,1.0,,,,,P06837,1,M,,,,P06837,0.730,0.270,1.000,-,0.897,0.887
1,DP02342,,1.0,,,,,P06837,2,L,,,,P06837,0.734,0.266,0.994,G,0.891,0.889
2,DP02342,,1.0,,,,,P06837,3,C,,,,P06837,0.744,0.256,0.919,G,0.885,0.893
3,DP02342,,1.0,,,,,P06837,4,C,,,,P06837,0.704,0.296,0.874,G,0.878,0.876
4,DP02342,,1.0,,,,,P06837,5,M,,,,P06837,0.708,0.292,0.888,G,0.873,0.878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963294,DP03746,,,,,,,Q9QUH6-2,1280,Q,,,,,,,,,,
963295,DP03746,,,,,,,Q9QUH6-2,1281,L,,,,,,,,,,
963296,DP03746,,,,,,,Q9QUH6-2,1282,L,,,,,,,,,,
963297,DP03746,,,,,,,Q9QUH6-2,1283,I,,,,,,,,,,


## Write files

Challenge definitions

- The first list are the columns to be considered as positive (any)
- The second list (mask) are the columns to be considered as negative (any)
- If the second list is not provided all non-positives are considered negatives
- In case of conflicts, the positives always overwrite the negatives
- If mask is provided proteins without at least one residue that could be masked (even when overwritten by a positive) are excluded (e.g. only proteins with PDB observed residues are considered) 


In [33]:
challenges = [[['linker'], []], 
              [['disorder'], []], 
              [['protein binding'], []], 
              [['nucleic acid binding'], []], 
              [['binding'], []],
              [['disorder'], ['pdb']],
             ]

for challenge, mask in challenges:
    
    file_name = "-".join(["_".join(c.split()) for c in challenge])
    if mask:
        file_name = file_name + "@" + "-".join(["-".join(m.split()) for m in mask])
    
    with open("{}/{}.fasta".format(references_dir, file_name), "w") as fout:
        for disprot_id, df_g in df_regions.groupby('disprot_id'):

            if mask:
                df_g['output'] = '-'
                df_g.loc[df_g[mask].notnull().any(axis='columns'), 'output'] = '0'
            else:
                df_g['output'] = '0'
                
            df_g.loc[df_g[challenge].notnull().any(axis='columns'), 'output'] = '1'
            
            # If mask is provided also check the protein has a at least one residue that could be masked (even when overwritten by a positive) 
            if df_g[challenge].notnull().any(axis='columns').any() and (not mask or df_g[mask].notnull().any(axis='columns').any()):  
                fout.write(">{}\n{}\n{}\n".format(disprot_id, "".join(df_g['aa']), "".join(df_g['output'])))
