# Generate CAID-2 references

DisProt data can be obtained directly exporting the relevant database collections (ask the developers): 

```bash
# 20 Nov 2022
mongoexport -d disprot8 -c entries_2022_06 -o disprot_entries_2022_06.mjson
mongoexport -d disprot8 -c entries_2022_12_c -o disprot_entries_2022_12_c.mjson
scp moros:disprot_entries* .
```
Or using the download service from the website (lastest annotations might not be available to the public). Note the formats are slightly different.

AlphaFold (processed) predictions can be obtained using the code in the [AlphaFold-disorder](https://github.com/BioComputingUP/AlphaFold-disorder) repository.

Preliminary steps:
```bash
# Generate the folder structure
mkdir -p ../data/{disprot,sifts,alphafold,output/references}
    
# Download data (20 Nov 2022)
wget -O data/sifts/uniprot_segments_observed.tsv.gz ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz
wget -O data/disprot/go-basic.obo http://purl.obolibrary.org/obo/go/go-basic.obo
```

## TODO
Additional references
* disorder without PDB missing residues regions

In [1]:
import os
import json
import pandas as pd
import networkx
import numpy as np
import obonet  # conda install -c biobuilds obonet

In [2]:
go_obo_file = "../data/disprot/go-basic.obo"
disprot_old_file = "../data/disprot/disprot_entries_2022_06.mjson"
disprot_new_file = "../data/disprot/disprot_entries_2022_12_c.mjson"
sifts_file = "../data/sifts/uniprot_segments_observed.tsv.gz"

# Output
references_dir = "../data/output/references"
dataset_raw_file = "../data/output/dataset_raw.tsv"
dataset_ec_file = "../data/output/dataset_ec.tsv"
dataset_file = "../data/output/dataset.tsv"
fasta_new_file = "../data/output/disprot_new.fasta"
fasta_old_file = "../data/output/disprot_old.fasta"

In [3]:
def expand_region(df_:pd.DataFrame, start_col:str='start', end_col:str='end', res_col:str='reg_position') -> pd.DataFrame:
    df_[res_col] = list(range(int(df_[start_col]), int(df_[end_col]) + 1, 1))
    return df_

def expand_sequence(df_:pd.DataFrame, seq_column:str='sequence', res_col:str='seq_aa') -> pd.DataFrame:
    df_[res_col] = [(i+1, aa) for i, aa in enumerate(df_[seq_column])]
    return df_

## Associate DisProt annotation terms to CAID challenges

In [4]:
# IDPO terms
data_idpo = [
            ('IDPO:00076', 'disorder'), ('IDPO:00077', 'disorder'), ('IDPO:00078', 'disorder'), 
            ('IDPO:00502', 'linker'),
            ('IDPO:00049', 'transition'), ('IDPO:00050', 'transition'), 
            ('IDPO:00051', 'transition'), ('IDPO:00052', 'transition'), ('IDPO:00053', 'transition'), 
            ('IDPO:00060', 'transition'), ('IDPO:00055', 'transition'), ('IDPO:00056', 'transition'), 
            ('IDPO:00061', 'transition'), ('IDPO:00054', 'transition'), ('IDPO:00057', 'transition'), 
            ('IDPO:00058', 'transition'), ('IDPO:00059', 'transition')]

# ('IDPO:00501', 'linker'),  ('IDPO:00503', 'linker'), ('IDPO:00504', 'linker')

# GO ancestor terms corresponding to CAID2 challenges
ancestors = {'GO:0005488': 'binding', 'GO:0003676': 'binding nucleic acid', 'GO:0005515': 'binding protein'}

In [5]:
# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(go_obo_file)

# Remove all edges which are not "is_a"
to_remove = []
for e in graph.edges:
    if e[2] != 'is_a':
        to_remove.append((e[0], e[1]))
for ele in to_remove:
    graph.remove_edge(*ele)
    
# Create children table
data_go = []    
for node in graph.nodes(data=True):
    challenge = ancestors.get(node[0])
    if challenge is not None:
        data_go.append([node[0], challenge])
        for children in networkx.ancestors(graph, node[0]): 
            data_go.append([children, challenge])

In [6]:
df_challenge = pd.DataFrame(data=data_idpo + data_go, columns=['term_id', 'challenge']).drop_duplicates()
df_challenge

Unnamed: 0,term_id,challenge
0,IDPO:00076,disorder
1,IDPO:00077,disorder
2,IDPO:00078,disorder
3,IDPO:00502,linker
4,IDPO:00049,transition
...,...,...
3082,GO:0050811,binding protein
3083,GO:0031803,binding protein
3084,GO:1990311,binding protein
3085,GO:0032183,binding protein


## Process DisProt annotations

In [7]:
# Get DisProt annotations
disprot_old = {}
with open(disprot_old_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_old[obj["disprot_id"]] = obj
        
disprot_new = {}
with open(disprot_new_file, "r") as f:
    for line in f:
        obj = json.loads(line)
        disprot_new[obj["disprot_id"]] = obj

In [8]:
# Get new annotations (delta = new - old)
dataset = []  # New valid annotations
for disprot_id in disprot_new:
    if disprot_id not in disprot_old and "obsolete" not in disprot_new[disprot_id]:
        if "X" not in disprot_new[disprot_id]["sequence"]:
            # Filter out obsolete regions
            disprot_new[disprot_id]["regions"] = [region for region in disprot_new[disprot_id]["regions"] if "obsolete" not in region]
            if disprot_new[disprot_id]["regions"]:
                dataset.append(disprot_new[disprot_id])
            else:
                print("{} excluded, only obsolete regions".format(disprot_id))
        else:
            print("{} excluded, contain X".format(disprot_id))

DP02362 excluded, only obsolete regions
DP02450 excluded, only obsolete regions
DP02902 excluded, only obsolete regions
DP02906 excluded, only obsolete regions
DP02932 excluded, only obsolete regions
DP02938 excluded, only obsolete regions
DP02942 excluded, only obsolete regions
DP02952 excluded, only obsolete regions
DP02955 excluded, only obsolete regions
DP02963 excluded, only obsolete regions
DP02974 excluded, only obsolete regions
DP02997 excluded, only obsolete regions
DP02998 excluded, only obsolete regions
DP03009 excluded, only obsolete regions
DP03036 excluded, only obsolete regions
DP03051 excluded, contain X
DP03077 excluded, only obsolete regions
DP03084 excluded, only obsolete regions
DP03097 excluded, only obsolete regions
DP03103 excluded, only obsolete regions
DP03128 excluded, only obsolete regions
DP03146 excluded, only obsolete regions
DP03165 excluded, only obsolete regions
DP03188 excluded, only obsolete regions
DP03208 excluded, only obsolete regions
DP03244 excl

In [9]:
# Write fasta for homology calculation
with open(fasta_new_file, "w") as fout:
    for obj in dataset:
        fout.write(">{}|{}\n{}\n".format(obj['disprot_id'], obj['acc'], obj['sequence']))

with open(fasta_old_file, "w") as fout:
    for disprot_id, obj in disprot_old.items():
        if "obsolete" not in obj:
            fout.write(">{}|{}\n{}\n".format(obj['disprot_id'], obj['acc'], obj['sequence']))


In [10]:
# Convert json to dataframe
entry_columns = ['disprot_id', 'acc', 'ncbi_taxon_id', 'organism', 'sequence']
df = pd.json_normalize(data=dataset, record_path=['regions'], meta=entry_columns, meta_prefix='', record_prefix='')
df = pd.merge(left=df, right=df_challenge, how="inner", on="term_id")
df.to_csv(dataset_raw_file, sep="\t", index=False)
df.columns

Index(['region_id', 'unpublished', 'ec_ontology', 'end', 'term_id', 'start',
       'version', 'statement', 'term_name', 'ec_name', 'curator_orcid',
       'released', 'term_ontology', 'curator_name', 'reference_id', 'date',
       'reference_source', 'term_namespace', 'ec_id', 'curator_id',
       'reference_html', 'ec_go', 'disprot_namespace', 'validated.curator_id',
       'validated.timestamp', 'validated.curator_name', 'term_not_annotate',
       'term_is_obsolete', 'term_comment', 'term_def', 'term_is_binding',
       'interaction_partner', 'cross_refs', 'confidence', 'sample',
       'term_xref', 'construct_alterations', 'sequence_construct',
       'term_go_domain', 'states_connection', 'conditions',
       'annotation_extensions', 'disprot_id', 'acc', 'ncbi_taxon_id',
       'organism', 'sequence', 'challenge'],
      dtype='object')

In [11]:
region_columns = ["start", "end", "term_id", "ec_id", "challenge"]
df = df.loc[:, entry_columns + region_columns]
df

Unnamed: 0,disprot_id,acc,ncbi_taxon_id,organism,sequence,start,end,term_id,ec_id,challenge
0,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078,ECO:0006204,disorder
1,DP02342,P06837,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,1,227,IDPO:00078,ECO:0006165,disorder
2,DP03000,Q997F2,121791,Nipah virus,MDKLELVNDGLNIIDFIQKNQKEIQKTYGRSSIQQPSIKDQTKAWE...,1,456,IDPO:00078,ECO:0007680,disorder
3,DP03000,Q997F2,121791,Nipah virus,MDKLELVNDGLNIIDFIQKNQKEIQKTYGRSSIQQPSIKDQTKAWE...,1,456,IDPO:00078,ECO:0006204,disorder
4,DP03001,O55777,928303,Hendra virus (isolate Horse/Autralia/Hendra/1994),MDKLDLVNDGLDIIDFIQKNQKEIQKTYGRSSIQQPSTKDRTRAWE...,1,457,IDPO:00078,ECO:0007680,disorder
...,...,...,...,...,...,...,...,...,...,...
1450,DP03708,Q8IGP5,7227,Drosophila melanogaster,MEDLTKNIIFTNAINGQPATIQYQTADGTILKQPKIEGQKTEQQPT...,1,86,GO:0042803,ECO:0006210,binding
1451,DP03708,Q8IGP5,7227,Drosophila melanogaster,MEDLTKNIIFTNAINGQPATIQYQTADGTILKQPKIEGQKTEQQPT...,1,86,GO:0042803,ECO:0006210,binding protein
1452,DP03739,P24588,9606,Homo sapiens,METTISEIHVENKDEKRSAEGSPGAERQKEKASMLCFKRRKKAAKA...,390,417,GO:0005516,ECO:0006165,binding
1453,DP03739,P24588,9606,Homo sapiens,METTISEIHVENKDEKRSAEGSPGAERQKEKASMLCFKRRKKAAKA...,390,417,GO:0005516,ECO:0006165,binding protein


In [12]:
# Get dataset sequences (1 residue per row)
df_sequence = df.apply(expand_sequence, axis=1).copy(deep=True).drop(columns=["ncbi_taxon_id", "organism", "start", "end", "sequence", "term_id", "ec_id", 'challenge'])
df_sequence = df_sequence.explode("seq_aa")
df_sequence[['pos', 'aa']] = pd.DataFrame(df_sequence['seq_aa'].tolist(), index=df_sequence.index)
df_sequence = df_sequence.drop(columns='seq_aa').drop_duplicates()
df_sequence

Unnamed: 0,disprot_id,acc,pos,aa
0,DP02342,P06837,1,M
0,DP02342,P06837,2,L
0,DP02342,P06837,3,C
0,DP02342,P06837,4,C
0,DP02342,P06837,5,M
...,...,...,...,...
1361,DP02544,Q04410,368,Q
1361,DP02544,Q04410,369,S
1361,DP02544,Q04410,370,S
1361,DP02544,Q04410,371,S


## Map PDB observed positions using SIFTS

In [13]:
df_sifts = pd.read_csv(sifts_file, sep="\t", header=1)
# Filter for dataset entries
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(df_sequence['acc'])]
# Explode observed regions 
df_sifts = df_sifts.apply(expand_region, start_col="SP_BEG", end_col="SP_END", axis=1)
df_sifts = df_sifts.explode("reg_position")
df_sifts = df_sifts.loc[:, ['SP_PRIMARY', 'reg_position']].drop_duplicates().reset_index(drop=True).rename(columns={"SP_PRIMARY": "acc"})
df_sifts

Unnamed: 0,acc,reg_position
0,A0A024B7W1,291
1,A0A024B7W1,292
2,A0A024B7W1,293
3,A0A024B7W1,294
4,A0A024B7W1,295
...,...,...
154860,P01015,437
154861,P01015,438
154862,P01015,439
154863,P01015,440


In [14]:
df_sequence = pd.merge(df_sequence, df_sifts, left_on=["acc", "pos"], right_on=["acc", "reg_position"], how="left")
df_sequence.rename(columns={"reg_position": "pdb"}, inplace=True)
df_sequence.loc[df_sequence['pdb'].notnull(), 'pdb'] = 1.0
df_sequence.loc[df_sequence['pdb'].notnull()]

Unnamed: 0,disprot_id,acc,pos,aa,pdb
33,DP02342,P06837,34,A,1.0
34,DP02342,P06837,35,A,1.0
35,DP02342,P06837,36,T,1.0
36,DP02342,P06837,37,K,1.0
37,DP02342,P06837,38,I,1.0
...,...,...,...,...,...
378433,DP02544,Q04410,140,Y,1.0
378434,DP02544,Q04410,141,I,1.0
378435,DP02544,Q04410,142,S,1.0
378436,DP02544,Q04410,143,I,1.0


## Define regions

Transform the per-protein dataframe into a per-residue dataframe 


In [15]:
df_regions = df.apply(expand_region, axis=1).loc[:, ["disprot_id", "reg_position", "ec_id", "challenge"]].copy(deep=True)
# df_regions = pd.merge(left=df_regions, right=df_challenge, how="inner", left_on="term_id", right_on="term_id").drop(columns=["term_id"])
df_regions

Unnamed: 0,disprot_id,reg_position,ec_id,challenge
0,DP02342,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006204,disorder
1,DP02342,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006165,disorder
2,DP03000,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0007680,disorder
3,DP03000,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006204,disorder
4,DP03001,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0007680,disorder
...,...,...,...,...
1450,DP03708,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006210,binding
1451,DP03708,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006210,binding protein
1452,DP03739,"[390, 391, 392, 393, 394, 395, 396, 397, 398, ...",ECO:0006165,binding
1453,DP03739,"[390, 391, 392, 393, 394, 395, 396, 397, 398, ...",ECO:0006165,binding protein


In [16]:
# ECO:0006220, X-ray crystallography-based structural model with missing residue coordinates used in manual assertion  
df_ = df_regions.loc[(df_regions['challenge'] == 'disorder') & (df_regions['ec_id'] != 'ECO:0006220')]
df_.loc[:, 'challenge'] = 'disorder_nox'
df_regions = pd.concat([df_regions, df_])
df_regions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_.loc[:, 'challenge'] = 'disorder_nox'


Unnamed: 0,disprot_id,reg_position,ec_id,challenge
0,DP02342,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006204,disorder
1,DP02342,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006165,disorder
2,DP03000,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0007680,disorder
3,DP03000,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006204,disorder
4,DP03001,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0007680,disorder
...,...,...,...,...
1361,DP02544,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0007689,disorder_nox
1362,DP02544,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0007680,disorder_nox
1363,DP02544,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",ECO:0006204,disorder_nox
1364,DP03507,"[1146, 1147, 1148, 1149, 1150, 1151, 1152, 115...",ECO:0006204,disorder_nox


In [17]:
df_regions_all = df_regions.drop(columns=['ec_id']).explode("reg_position").drop_duplicates()
df_regions_all['has_region'] = 1
df_regions_all

Unnamed: 0,disprot_id,reg_position,challenge,has_region
0,DP02342,1,disorder,1
0,DP02342,2,disorder,1
0,DP02342,3,disorder,1
0,DP02342,4,disorder,1
0,DP02342,5,disorder,1
...,...,...,...,...
1364,DP03507,1908,disorder_nox,1
1364,DP03507,1909,disorder_nox,1
1364,DP03507,1910,disorder_nox,1
1364,DP03507,1911,disorder_nox,1


In [18]:
# Create the pivot table. Transpose challenge values into columns 
df_regions_all = pd.pivot_table(
    df_regions_all,
    columns="challenge",
    index=['disprot_id', 'reg_position'],
    values='has_region')
df_regions_all = df_regions_all.reset_index()
df_regions_all

challenge,disprot_id,reg_position,binding,binding nucleic acid,binding protein,disorder,disorder_nox,linker,transition
0,DP02342,1,,,,1.0,1.0,,
1,DP02342,2,,,,1.0,1.0,,
2,DP02342,3,,,,1.0,1.0,,
3,DP02342,4,,,,1.0,1.0,,
4,DP02342,5,,,,1.0,1.0,,
...,...,...,...,...,...,...,...,...,...
49282,DP03906,178,,,,1.0,1.0,,
49283,DP03906,179,,,,1.0,1.0,,
49284,DP03906,180,,,,1.0,1.0,,
49285,DP03906,181,,,,1.0,1.0,,


In [19]:
# Add sequence positions not mapping to any DisProt region
df_regions_all = pd.merge(left=df_regions_all, right=df_sequence, how="right", left_on=["disprot_id", "reg_position"], right_on=["disprot_id", "pos"])
df_regions_all.drop(columns="reg_position", inplace=True)
df_regions_all

Unnamed: 0,disprot_id,binding,binding nucleic acid,binding protein,disorder,disorder_nox,linker,transition,acc,pos,aa,pdb
0,DP02342,,,,1.0,1.0,,,P06837,1,M,
1,DP02342,,,,1.0,1.0,,,P06837,2,L,
2,DP02342,,,,1.0,1.0,,,P06837,3,C,
3,DP02342,,,,1.0,1.0,,,P06837,4,C,
4,DP02342,,,,1.0,1.0,,,P06837,5,M,
...,...,...,...,...,...,...,...,...,...,...,...,...
378661,DP02544,,,,1.0,1.0,,,Q04410,368,Q,
378662,DP02544,,,,1.0,1.0,,,Q04410,369,S,
378663,DP02544,,,,1.0,1.0,,,Q04410,370,S,
378664,DP02544,,,,1.0,1.0,,,Q04410,371,S,


## Write files

Challenge definitions

- The first list are the columns to be considered as positive (any)
- The second list (mask) are the columns to be considered as negative (any)
- If the second list is not provided all non-positives are considered negatives
- In case of conflicts, the positives always overwrite the negatives
- If mask is provided proteins without at least one residue that could be masked (even when overwritten by a positive) are excluded (e.g. only proteins with PDB observed residues are considered) 


In [21]:
# Reorder the columns
head_cols = ['disprot_id', 'acc', 'pos', 'aa']
disprot_cols = list(df_challenge['challenge'].unique())
other_cols = sorted(list((set(df_regions_all.columns.tolist()) - set(head_cols)) - set(disprot_cols)))
cols = head_cols + disprot_cols + other_cols
df_regions_all = df_regions_all[cols]
df_regions_all

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,binding nucleic acid,binding,binding protein,disorder_nox,pdb
0,DP02342,P06837,1,M,1.0,,,,,,1.0,
1,DP02342,P06837,2,L,1.0,,,,,,1.0,
2,DP02342,P06837,3,C,1.0,,,,,,1.0,
3,DP02342,P06837,4,C,1.0,,,,,,1.0,
4,DP02342,P06837,5,M,1.0,,,,,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
378661,DP02544,Q04410,368,Q,1.0,,,,,,1.0,
378662,DP02544,Q04410,369,S,1.0,,,,,,1.0,
378663,DP02544,Q04410,370,S,1.0,,,,,,1.0,
378664,DP02544,Q04410,371,S,1.0,,,,,,1.0,


In [22]:
# Write the dataframe
df_regions_all.to_csv(dataset_file, sep="\t", index=False)

### Write references (Fasta format)

* The next element overwrites the previous in the "class" list
* The "fill" field is used to fill unassigned positions
* Only proteins with at least a "1" are written to file

In [23]:
challenges = {'linker': {'class': [('linker', '1')], 'fill': '0'}, 
              'linker_disorder': {'class': [('disorder', '0'), ('linker', '1')], 'fill': '-'},
              'disorder': {'class': [('disorder', '1')], 'fill': '0'}, 
              'disorder_nox': {'class': [('disorder_nox', '1')], 'fill': '0'}, 
              'disorder_pdb': {'class': [('pdb', '0'), ('disorder', '1')], 'fill': '-'},
              'disorder_pdb_fill': {'class': [('pdb', '0'), ('disorder', '1')], 'fill': '1'},
              'binding': {'class': [('binding', '1')], 'fill': '0'},
              'binding_nucleic_acid': {'class': [('binding nucleic acid', '1')], 'fill': '0'},
              'binding_disorder': {'class': [('disorder', '0'), ('binding', '1')], 'fill': '-'},
             }

for file_name, challenge in challenges.items():
    with open("{}/{}.fasta".format(references_dir, file_name), "w") as fout:
        for disprot_id, df_g in df_regions_all.groupby('disprot_id'):
            df_g['output'] = np.nan
            # Assign class
            for column, value in challenge['class']:
                df_g.loc[df_g[column].notnull(), 'output'] = value
            # Fill
            if df_g['output'].notnull().any() and challenge.get('fill'):
                df_g.loc[df_g['output'].isnull(), 'output'] = challenge['fill'] 
            # Write proteins with at least one positive assignment
            if (df_g['output'] == '1').any():
                fout.write(">{}\n{}\n{}\n".format(disprot_id, "".join(df_g['aa']), "".join(df_g['output'])))
