# Generate CAID-2 references

DisProt data can be obtained directly exporting the relevant database collections (ask the developers): 

```bash
# 18 Aug 2022
mongoexport -d disprot8 -c entries_2022_06 -o disprot_entries_2022_06.mjson
mongoexport -d disprot8 -c entries_2022_12_c -o disprot_entries_2022_12_c.mjson
scp moros:disprot_entries* .
```
Or using the download service from the website (lastest annotations might not be available to the public). Note the formats are slightly different.

AlphaFold (processed) predictions can be obtained using the code in the [AlphaFold-disorder](https://github.com/BioComputingUP/AlphaFold-disorder) repository.

Preliminary steps:
```bash
# Generate the folder structure
mkdir -p ../data/{disprot,sifts,alphafold,output/references}
    
# Download data (18 Aug 2022)
wget -O ../data/sifts/ ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz
wget -O ../data/disprot/ http://purl.obolibrary.org/obo/go/go-basic.obo
```

In [1]:
import os
import json
import pandas as pd
import networkx
import numpy as np
import obonet  # conda install -c biobuilds obonet

In [2]:
go_obo_file = "../data/disprot/go-basic.obo"
disprot_old_file = "../data/disprot/disprot_entries_2022_06.mjson"
disprot_new_file = "../data/disprot/disprot_entries_2022_12_c.mjson"
sifts_file = "../data/sifts/uniprot_segments_observed.tsv.gz"

# Optional
alphafold_dir = "../data/alphafold"
gene3d_file = ""

# Output
references_dir = "../data/output/references"
raw_dataset_file = "../data/output/raw_dataset.tsv"
dataset_file = "../data/output/dataset.tsv"

In [3]:
def expand_region(df_:pd.DataFrame, start_col:str='start', end_col:str='end', res_col:str='reg_position') -> pd.DataFrame:
    df_[res_col] = list(range(int(df_[start_col]), int(df_[end_col]) + 1, 1))
    return df_

def expand_sequence(df_:pd.DataFrame, seq_column:str='sequence', res_col:str='seq_aa') -> pd.DataFrame:
    df_[res_col] = [(i+1, aa) for i, aa in enumerate(df_[seq_column])]
    return df_

## Associate DisProt annotation terms to CAID challenges

In [4]:
# IDPO terms
data_idpo = [('IDPO:00076', 'disorder'), ('IDPO:00077', 'disorder'), ('IDPO:00078', 'disorder'), 
                   ('IDPO:00501', 'linker'), ('IDPO:00502', 'linker'), ('IDPO:00503', 'linker'), 
                   ('IDPO:00504', 'linker'), ('IDPO:00049', 'transition'), ('IDPO:00050', 'transition'), 
                   ('IDPO:00051', 'transition'), ('IDPO:00052', 'transition'), ('IDPO:00053', 'transition'), 
                   ('IDPO:00060', 'transition'), ('IDPO:00055', 'transition'), ('IDPO:00056', 'transition'), 
                   ('IDPO:00061', 'transition'), ('IDPO:00054', 'transition'), ('IDPO:00057', 'transition'), 
                   ('IDPO:00058', 'transition'), ('IDPO:00059', 'transition')]

# GO ancestor terms coresponding to CAID2 challenges
ancestors = {'GO:0005488': 'binding', 'GO:0003676': 'nucleic acid binding', 'GO:0005515': 'protein binding'}

In [5]:
# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(go_obo_file)

# Remove all edges which are not "is_a"
to_remove = []
for e in graph.edges:
    if e[2] != 'is_a':
        to_remove.append((e[0], e[1]))
for ele in to_remove:
    graph.remove_edge(*ele)
    
# Create children table
data_go = []    
for node in graph.nodes(data=True):
    challenge = ancestors.get(node[0])
    if challenge is not None:
        data_go.append([node[0], challenge])
        for children in networkx.ancestors(graph, node[0]): 
            data_go.append([children, challenge])

In [6]:
df_challenge = pd.DataFrame(data=data_idpo + data_go, columns=['term_id', 'challenge']).drop_duplicates()
df_challenge

Unnamed: 0,term_id,challenge
0,IDPO:00076,disorder
1,IDPO:00077,disorder
2,IDPO:00078,disorder
3,IDPO:00501,linker
4,IDPO:00502,linker
...,...,...
3082,GO:0031711,protein binding
3083,GO:0031866,protein binding
3084,GO:0042009,protein binding
3085,GO:0005147,protein binding


## Process DisProt annotations

In [7]:
# Get DisProt annotations
disprot_old = {}
with open(disprot_old_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_old[obj["disprot_id"]] = obj
        
disprot_new = {}
with open(disprot_new_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_new[obj["disprot_id"]] = obj

In [8]:
# Get new annotations (delta = new - old)
dataset = []
for disprot_id in disprot_new:
    if disprot_id not in disprot_old and "obsolete" not in disprot_new[disprot_id] and "X" not in disprot_new[disprot_id]["sequence"]:
        # Filter out obsolete regions
        disprot_new[disprot_id]["regions"] = [region for region in disprot_new[disprot_id]["regions"] if "obsolete" not in region]
        if disprot_new[disprot_id]["regions"]:
            dataset.append(disprot_new[disprot_id])

In [9]:
entry_columns = ['disprot_id', 'acc', 'ncbi_taxon_id', 'organism', 'sequence']
df = pd.json_normalize(data=dataset, record_path=['regions'], meta=entry_columns, meta_prefix='', record_prefix='')
df.to_csv(raw_dataset_file, sep="\t", index=False)
df

Unnamed: 0,region_id,unpublished,ec_ontology,end,term_id,start,version,statement,term_name,ec_name,...,sequence_construct,construct_alterations,states_connection,conditions,annotation_extensions,disprot_id,acc,ncbi_taxon_id,organism,sequence
0,DP02342r003,True,ECO,227,IDPO:00078,1,3,[{'text': 'CD spectra of Nm and Ng showed an i...,pre-molten globule,far-UV circular dichroism evidence used in man...,...,,,,,,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...
1,DP02342r006,True,ECO,57,GO:0051179,34,3,[{'text': 'One proposed biochemical function o...,localization,isothermal titration calorimetry evidence used...,...,,,,,,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...
2,DP02342r007,True,ECO,57,GO:0098772,34,3,[{'text': 'One proposed biochemical function o...,molecular function regulator,isothermal titration calorimetry evidence used...,...,,,,,,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...
3,DP02342r009,True,ECO,227,IDPO:00078,1,2,[{'text': 'The NMR spectra of full-length Nm a...,pre-molten globule,nuclear magnetic resonance spectroscopy eviden...,...,,,,,,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...
4,DP02342r010,True,ECO,57,GO:0005515,34,4,[{'text': 'ITC experiments revealed that the f...,protein binding,isothermal titration calorimetry evidence used...,...,,,,,,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1146,DP03758r001,True,ECO,468,IDPO:00076,459,0,"[{'text': 'Here, we identified a common binary...",disorder,nuclear magnetic resonance spectroscopy eviden...,...,,,,,,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...
1147,DP03758r002,True,ECO,458,IDPO:00050,449,0,"[{'text': 'Here, we identified a common binary...",disorder to order,nuclear magnetic resonance spectroscopy eviden...,...,,,,,,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...
1148,DP03758r003,True,ECO,458,GO:0005515,449,0,"[{'text': 'Here, we identified a common binary...",protein binding,nuclear magnetic resonance spectroscopy eviden...,...,,,,,,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...
1149,DP03758r004,True,ECO,460,GO:0005515,449,0,[{'text': 'To characterize and validate TND-TI...,protein binding,mammalian 2-hybrid assay evidence used in manu...,...,,,,,,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...


In [10]:
region_columns = ["start", "end", "term_id"]
df = df.loc[:, entry_columns + region_columns]
df

Unnamed: 0,disprot_id,acc,ncbi_taxon_id,organism,sequence,start,end,term_id
0,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078
1,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0051179
2,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0098772
3,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078
4,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,34,57,GO:0005515
...,...,...,...,...,...,...,...,...
1146,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...,459,468,IDPO:00076
1147,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...,449,458,IDPO:00050
1148,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...,449,458,GO:0005515
1149,DP03758,Q96ST2,9606,Homo sapiens,MDSEYYSGDQSDDGGATPVQDERDSGSDGEDDVNEQHSGSDTGSVE...,449,460,GO:0005515


In [11]:
# Transform the per-protein dataframe into a per-residue dataframe 
df_regions = df.apply(expand_region, axis=1).loc[:, ["disprot_id", "term_id", "reg_position"]].copy(deep=True)
df_regions = pd.merge(left=df_regions, right=df_challenge, how="inner", left_on="term_id", right_on="term_id").drop(columns=["term_id"])
df_regions = df_regions.explode("reg_position").drop_duplicates()
df_regions['has_region'] = 1
df_regions

Unnamed: 0,disprot_id,reg_position,challenge,has_region
0,DP02342,1,disorder,1
0,DP02342,2,disorder,1
0,DP02342,3,disorder,1
0,DP02342,4,disorder,1
0,DP02342,5,disorder,1
...,...,...,...,...
1098,DP03708,36,protein binding,1
1098,DP03708,37,protein binding,1
1098,DP03708,38,protein binding,1
1098,DP03708,39,protein binding,1


In [12]:
# Create the pivot table. Transpose challenge values into columns 
df_regions = pd.pivot_table(
    df_regions,
    columns="challenge",
    index=['disprot_id', 'reg_position'],
    values='has_region')
df_regions = df_regions.reset_index()
df_regions

challenge,disprot_id,reg_position,binding,disorder,linker,nucleic acid binding,protein binding,transition
0,DP02342,1,,1.0,,,,
1,DP02342,2,,1.0,,,,
2,DP02342,3,,1.0,,,,
3,DP02342,4,,1.0,,,,
4,DP02342,5,,1.0,,,,
...,...,...,...,...,...,...,...,...
30899,DP03758,464,,1.0,,,,
30900,DP03758,465,,1.0,,,,
30901,DP03758,466,,1.0,,,,
30902,DP03758,467,,1.0,,,,


In [13]:
# Get dataset sequences (1 residue per row)
df_sequence = df.apply(expand_sequence, axis=1).copy(deep=True).drop(columns=["ncbi_taxon_id", "organism", "start", "end", "sequence", "term_id"])
df_sequence = df_sequence.explode("seq_aa")
df_sequence[['pos', 'aa']] = pd.DataFrame(df_sequence['seq_aa'].tolist(), index=df_sequence.index)
df_sequence = df_sequence.drop(columns='seq_aa').drop_duplicates()
df_sequence

Unnamed: 0,disprot_id,acc,pos,aa
0,DP02342,P06837,1,M
0,DP02342,P06837,2,L
0,DP02342,P06837,3,C
0,DP02342,P06837,4,C
0,DP02342,P06837,5,M
...,...,...,...,...
1146,DP03758,Q96ST2,815,N
1146,DP03758,Q96ST2,816,K
1146,DP03758,Q96ST2,817,M
1146,DP03758,Q96ST2,818,P


In [14]:
# Add sequence positions not mapping to any DisProt region
df_regions = pd.merge(left=df_regions, right=df_sequence, how="right", left_on=["disprot_id", "reg_position"], right_on=["disprot_id", "pos"])
df_regions.drop(columns="reg_position", inplace=True)
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,acc,pos,aa
0,DP02342,,1.0,,,,,P06837,1,M
1,DP02342,,1.0,,,,,P06837,2,L
2,DP02342,,1.0,,,,,P06837,3,C
3,DP02342,,1.0,,,,,P06837,4,C
4,DP02342,,1.0,,,,,P06837,5,M
...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,,,,,,,Q96ST2,815,N
297692,DP03758,,,,,,,Q96ST2,816,K
297693,DP03758,,,,,,,Q96ST2,817,M
297694,DP03758,,,,,,,Q96ST2,818,P


## Map PDB onbserved positions using SIFTS

In [15]:
df_sifts = pd.read_csv(sifts_file, sep="\t", header=1)
# Filter for dataset entries
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(df_regions['acc'])]
# Explode observed regions 
df_sifts = df_sifts.apply(expand_region, start_col="SP_BEG", end_col="SP_END", axis=1)
df_sifts = df_sifts.explode("reg_position")
df_sifts = df_sifts.loc[:, ['SP_PRIMARY', 'reg_position']].drop_duplicates().reset_index(drop=True).rename(columns={"SP_PRIMARY": "acc"})
df_sifts

Unnamed: 0,acc,reg_position
0,Q9BY41,14
1,Q9BY41,15
2,Q9BY41,16
3,Q9BY41,17
4,Q9BY41,18
...,...,...
101402,Q96QZ7,1386
101403,Q96QZ7,1387
101404,Q96QZ7,1388
101405,Q96QZ7,1389


In [16]:
df_regions = pd.merge(df_regions, df_sifts, left_on=["acc", "pos"], right_on=["acc", "reg_position"], how="left")
df_regions.rename(columns={"reg_position": "pdb"}, inplace=True)
df_regions.loc[df_regions['pdb'].notnull(), 'pdb'] = 1.0
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,acc,pos,aa,pdb
0,DP02342,,1.0,,,,,P06837,1,M,
1,DP02342,,1.0,,,,,P06837,2,L,
2,DP02342,,1.0,,,,,P06837,3,C,
3,DP02342,,1.0,,,,,P06837,4,C,
4,DP02342,,1.0,,,,,P06837,5,M,
...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,,,,,,,Q96ST2,815,N,
297692,DP03758,,,,,,,Q96ST2,816,K,
297693,DP03758,,,,,,,Q96ST2,817,M,
297694,DP03758,,,,,,,Q96ST2,818,P,


## Add AlphaFold prediction

In [17]:
df_list = []
for af_file in os.listdir(alphafold_dir):
    df_list.append(pd.read_csv("{}/{}".format(alphafold_dir, af_file), sep='\t'))
df_af = pd.concat(df_list, ignore_index=True)
del df_list

In [18]:
# WARNING: rename depends on the version (only full lenght predictions should be used)
df_af['acc'] = df_af['name'].apply(lambda x: x.replace('AF-','').replace('-F1-model_v3',''))
df_af = df_af.rename(columns={"disorder": "af-disorder", "disorder-25": "af-rsa", "binding-25-0.581": "af-binding"})
df_af = df_af[["acc", "pos", "aa", "af-disorder", "af-rsa", "af-binding"]]
df_af

Unnamed: 0,acc,pos,aa,af-disorder,af-rsa,af-binding
0,Q9JZ10,1,M,0.587,0.280,0.280
1,Q9JZ10,2,G,0.465,0.271,0.271
2,Q9JZ10,3,N,0.141,0.278,0.278
3,Q9JZ10,4,F,0.080,0.290,0.290
4,Q9JZ10,5,L,0.090,0.295,0.295
...,...,...,...,...,...,...
218152,E9Q4Y4,2510,S,0.140,0.353,0.353
218153,E9Q4Y4,2511,L,0.225,0.355,0.355
218154,E9Q4Y4,2512,G,0.293,0.323,0.323
218155,E9Q4Y4,2513,K,0.488,0.324,0.324


In [19]:
df_regions = pd.merge(df_regions, df_af, on=["acc", "pos", "aa"], how="left")
df_regions

Unnamed: 0,disprot_id,binding,disorder,linker,nucleic acid binding,protein binding,transition,acc,pos,aa,pdb,af-disorder,af-rsa,af-binding
0,DP02342,,1.0,,,,,P06837,1,M,,0.270,0.897,0.887
1,DP02342,,1.0,,,,,P06837,2,L,,0.266,0.891,0.889
2,DP02342,,1.0,,,,,P06837,3,C,,0.256,0.885,0.893
3,DP02342,,1.0,,,,,P06837,4,C,,0.296,0.878,0.876
4,DP02342,,1.0,,,,,P06837,5,M,,0.292,0.873,0.878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,,,,,,,Q96ST2,815,N,,,,
297692,DP03758,,,,,,,Q96ST2,816,K,,,,
297693,DP03758,,,,,,,Q96ST2,817,M,,,,
297694,DP03758,,,,,,,Q96ST2,818,P,,,,


## Write files

Challenge definitions

- The first list are the columns to be considered as positive (any)
- The second list (mask) are the columns to be considered as negative (any)
- If the second list is not provided all non-positives are considered negatives
- In case of conflicts, the positives always overwrite the negatives
- If mask is provided proteins without at least one residue that could be masked (even when overwritten by a positive) are excluded (e.g. only proteins with PDB observed residues are considered) 


In [20]:
# Reorder the columns
head_cols = ['disprot_id', 'acc', 'pos', 'aa']
disprot_cols = list(df_challenge['challenge'].unique())
other_cols = sorted(list((set(df_regions.columns.tolist()) - set(head_cols)) - set(disprot_cols)))
cols = head_cols + disprot_cols + other_cols
print(cols)

df_regions = df_regions[cols]
df_regions

['disprot_id', 'acc', 'pos', 'aa', 'disorder', 'linker', 'transition', 'nucleic acid binding', 'binding', 'protein binding', 'af-binding', 'af-disorder', 'af-rsa', 'pdb']


Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,af-binding,af-disorder,af-rsa,pdb
0,DP02342,P06837,1,M,1.0,,,,,,0.887,0.270,0.897,
1,DP02342,P06837,2,L,1.0,,,,,,0.889,0.266,0.891,
2,DP02342,P06837,3,C,1.0,,,,,,0.893,0.256,0.885,
3,DP02342,P06837,4,C,1.0,,,,,,0.876,0.296,0.878,
4,DP02342,P06837,5,M,1.0,,,,,,0.878,0.292,0.873,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,Q96ST2,815,N,,,,,,,,,,
297692,DP03758,Q96ST2,816,K,,,,,,,,,,
297693,DP03758,Q96ST2,817,M,,,,,,,,,,
297694,DP03758,Q96ST2,818,P,,,,,,,,,,


In [21]:
# Write the dataframe
df_regions.to_csv(dataset_file, sep="\t", index=False)

In [22]:
# Write the fastas
challenges = [[['linker'], []], 
              [['disorder'], []], 
              [['protein binding'], []], 
              [['nucleic acid binding'], []], 
              [['binding'], []],
              [['disorder'], ['pdb']],
             ]

for challenge, mask in challenges:
    
    file_name = "-".join(["_".join(c.split()) for c in challenge])
    if mask:
        file_name = file_name + "@" + "-".join(["-".join(m.split()) for m in mask])
    
    with open("{}/{}.fasta".format(references_dir, file_name), "w") as fout:
        for disprot_id, df_g in df_regions.groupby('disprot_id'):

            if mask:
                df_g['output'] = '-'
                df_g.loc[df_g[mask].notnull().any(axis='columns'), 'output'] = '0'
            else:
                df_g['output'] = '0'
                
            df_g.loc[df_g[challenge].notnull().any(axis='columns'), 'output'] = '1'
            
            # If mask is provided also check the protein has a at least one residue that could be masked (even when overwritten by a positive) 
            if df_g[challenge].notnull().any(axis='columns').any() and (not mask or df_g[mask].notnull().any(axis='columns').any()):  
                fout.write(">{}\n{}\n{}\n".format(disprot_id, "".join(df_g['aa']), "".join(df_g['output'])))
