# Generate CAID-2 references

DisProt data can be obtained by directly exporting the relevant database collections (ask the developers):

```bash
# 18 Aug 2022
mongoexport -d disprot8 -c entries_2022_06 -o disprot_entries_2022_06.mjson
mongoexport -d disprot8 -c entries_2022_12_c -o disprot_entries_2022_12_c.mjson
scp moros:disprot_entries* .
```
Or using the download service from the website (the latest annotations might not be available to the public). Note that the formats are slightly different.

AlphaFold (processed) predictions can be obtained using the code in the [AlphaFold-disorder](https://github.com/BioComputingUP/AlphaFold-disorder) repository.

Preliminary steps:
```bash
# Generate the folder structure
mkdir -p ../data/{disprot,sifts,alphafold,output/references}
    
# Download data (18 Aug 2022)
wget -P ../data/sifts/ ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz
wget -P ../data/disprot/ http://purl.obolibrary.org/obo/go/go-basic.obo
```

In [1]:
import os
import json
import pandas as pd
import networkx
import numpy as np
import obonet  # conda install -c biobuilds obonet

In [2]:
# Input files (see the preliminary steps at the top of the notebook)
go_obo_file = "../data/disprot/go-basic.obo"  # Gene Ontology, OBO format
disprot_old_file = "../data/disprot/disprot_entries_2022_06.mjson"  # older DisProt export (baseline)
disprot_new_file = "../data/disprot/disprot_entries_2022_12_c.mjson"  # newer DisProt export
sifts_file = "../data/sifts/uniprot_segments_observed.tsv.gz"  # SIFTS observed UniProt segments

# Optional
alphafold_dir = "../data/alphafold"  # folder read file by file as tab-separated tables
gene3d_file = ""  # NOTE(review): not referenced by any other cell visible in this notebook

# Output
references_dir = "../data/output/references"  # one reference FASTA per challenge
raw_dataset_file = "../data/output/raw_dataset.tsv"  # one row per DisProt region
dataset_file = "../data/output/dataset.tsv"  # one row per residue
fasta_new_file = "../data/output/disprot_new.fasta"  # sequences of the new (delta) entries
fasta_old_file = "../data/output/disprot_old.fasta"  # sequences of the non-obsolete old entries

In [3]:
def expand_region(df_: pd.Series, start_col: str = 'start', end_col: str = 'end', res_col: str = 'reg_position') -> pd.Series:
    """Return a copy of a row extended with the list of positions of its region.

    Intended for ``DataFrame.apply(expand_region, axis=1)``, where every row
    is handed over as a ``pd.Series``.

    Parameters
    ----------
    df_ : row holding the region boundaries (left untouched).
    start_col, end_col : labels of the first and last position (both inclusive).
    res_col : label under which the list of positions is stored.

    Returns
    -------
    A new row where ``res_col`` is ``[start, start + 1, ..., end]``.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object passed by ``apply``
    row = df_.copy()
    row[res_col] = list(range(int(row[start_col]), int(row[end_col]) + 1))
    return row

def expand_sequence(df_: pd.Series, seq_column: str = 'sequence', res_col: str = 'seq_aa') -> pd.Series:
    """Return a copy of a row extended with the residues of its sequence.

    Intended for ``DataFrame.apply(expand_sequence, axis=1)``, where every row
    is handed over as a ``pd.Series``.

    Parameters
    ----------
    df_ : row holding the sequence (left untouched).
    seq_column : label of the sequence string.
    res_col : label under which the residues are stored.

    Returns
    -------
    A new row where ``res_col`` is a list of ``(position, amino acid)`` tuples,
    with positions starting at 1.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object passed by ``apply``
    row = df_.copy()
    row[res_col] = list(enumerate(row[seq_column], start=1))
    return row

## Associate DisProt annotation terms to CAID challenges

In [4]:
# IDPO terms, grouped by the CAID challenge they are assigned to
_idpo_terms_by_challenge = {
    'disorder': ['IDPO:00076', 'IDPO:00077', 'IDPO:00078'],
    'linker': ['IDPO:00501', 'IDPO:00502', 'IDPO:00503', 'IDPO:00504'],
    'transition': ['IDPO:00049', 'IDPO:00050', 'IDPO:00051', 'IDPO:00052', 'IDPO:00053',
                   'IDPO:00060', 'IDPO:00055', 'IDPO:00056', 'IDPO:00061', 'IDPO:00054',
                   'IDPO:00057', 'IDPO:00058', 'IDPO:00059'],
}

# Flatten into (term_id, challenge) pairs, keeping the order listed above
data_idpo = [
    (idpo_id, idpo_challenge)
    for idpo_challenge, idpo_ids in _idpo_terms_by_challenge.items()
    for idpo_id in idpo_ids
]

# GO ancestor terms corresponding to CAID2 challenges
ancestors = dict([
    ('GO:0005488', 'binding'),
    ('GO:0003676', 'nucleic acid binding'),
    ('GO:0005515', 'protein binding'),
])

In [5]:
# Load the Gene Ontology graph
# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(go_obo_file)

# Remove all edges which are not "is_a".
# The graph can hold parallel edges between the same pair of terms, told apart
# by their key (the relationship type). The key must be passed to remove_edge:
# without it an arbitrary parallel edge is removed, so an "is_a" edge could be
# dropped while the unwanted one survives.
to_remove = [(source, target, key) for source, target, key in graph.edges(keys=True) if key != 'is_a']
for source, target, key in to_remove:
    graph.remove_edge(source, target, key)

# Create children table: [term_id, challenge] for each challenge root term
# and for every term that reaches it through "is_a" edges.
# networkx.ancestors returns the nodes having a path to the given node;
# presumably edges point from the child term to its parent (confirm against obonet).
data_go = []
for term_id in graph.nodes:
    challenge = ancestors.get(term_id)
    if challenge is None:
        continue
    data_go.append([term_id, challenge])
    data_go.extend([child_id, challenge] for child_id in networkx.ancestors(graph, term_id))

In [6]:
# Term -> challenge lookup table: IDPO pairs first, GO pairs after, without repeated rows
df_challenge = pd.DataFrame(data_idpo + data_go, columns=['term_id', 'challenge'])
df_challenge = df_challenge.drop_duplicates()
df_challenge

Unnamed: 0,term_id,challenge
0,IDPO:00076,disorder
1,IDPO:00077,disorder
2,IDPO:00078,disorder
3,IDPO:00501,linker
4,IDPO:00502,linker
...,...,...
3082,GO:0051010,protein binding
3083,GO:0086080,protein binding
3084,GO:0017058,protein binding
3085,GO:0035373,protein binding


## Process DisProt annotations

In [7]:
def load_disprot_entries(path: str) -> dict:
    """Read a DisProt export (one JSON document per line) into a dict.

    Parameters
    ----------
    path : file with one JSON object per line, each having a "disprot_id" key.

    Returns
    -------
    Dictionary mapping each "disprot_id" to its full entry. If an identifier
    appears more than once, the last occurrence wins.
    """
    entries = {}
    with open(path, "r") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line), which json.loads rejects
            if not line.strip():
                continue
            obj = json.loads(line)
            entries[obj["disprot_id"]] = obj
    return entries


# Get DisProt annotations
disprot_old = load_disprot_entries(disprot_old_file)
disprot_new = load_disprot_entries(disprot_new_file)

In [8]:
# Get new annotations (delta = new - old)
dataset = []
for disprot_id, entry in disprot_new.items():
    # Skip entries already present in the old release, entries flagged as
    # obsolete and entries whose sequence contains "X"
    if disprot_id in disprot_old or "obsolete" in entry or "X" in entry["sequence"]:
        continue

    # Filter out obsolete regions (the entry stored in disprot_new is updated in place)
    entry["regions"] = [region for region in entry["regions"] if "obsolete" not in region]

    # Keep only the entries that still have at least one region
    if entry["regions"]:
        dataset.append(entry)

In [10]:
# Write fasta
# Record layout: ">disprot_id|acc" header line followed by the sequence line
fasta_record = ">{}|{}\n{}\n"

# New (delta) entries
with open(fasta_new_file, "w") as fout:
    for obj in dataset:
        fout.write(fasta_record.format(obj['disprot_id'], obj['acc'], obj['sequence']))

# Old entries, leaving out the obsolete ones
with open(fasta_old_file, "w") as fout:
    for obj in disprot_old.values():
        if "obsolete" in obj:
            continue
        fout.write(fasta_record.format(obj['disprot_id'], obj['acc'], obj['sequence']))


In [None]:
# Flatten the entries into one row per region; the entry-level fields listed
# in entry_columns are repeated on every region row
entry_columns = ['disprot_id', 'acc', 'ncbi_taxon_id', 'organism', 'sequence']
df = pd.json_normalize(
    dataset,
    record_path=['regions'],
    meta=entry_columns,
    meta_prefix='',
    record_prefix='',
)

# Keep a copy of the unfiltered table on disk
df.to_csv(raw_dataset_file, sep="\t", index=False)
df

In [None]:
# Keep only the entry-level columns plus the region boundaries and term
region_columns = ["start", "end", "term_id"]
df = df.loc[:, [*entry_columns, *region_columns]]
df

In [None]:
# Transform the per-protein dataframe into a per-residue dataframe
df_regions = (
    df.apply(expand_region, axis=1)                      # add the list of positions of each region
    .loc[:, ["disprot_id", "term_id", "reg_position"]]
    .copy(deep=True)
    .merge(df_challenge, how="inner", left_on="term_id", right_on="term_id")  # term -> challenge
    .drop(columns=["term_id"])
    .explode("reg_position")                             # one row per residue
    .drop_duplicates()
    .assign(has_region=1)                                # flag used as value by the pivot table
)
df_regions

In [None]:
# Create the pivot table. Transpose challenge values into columns
# (one row per protein position; no aggfunc is passed, so the pandas default applies)
df_regions = df_regions.pivot_table(
    values='has_region',
    index=['disprot_id', 'reg_position'],
    columns="challenge",
).reset_index()
df_regions

In [None]:
# Get dataset sequences (1 residue per row)
# df has one row per region, so the same sequence appears once per region:
# drop the region-level columns and de-duplicate BEFORE expanding, instead of
# expanding every copy and de-duplicating afterwards (same result, far less work)
df_sequence = (
    df.drop(columns=["ncbi_taxon_id", "organism", "start", "end", "term_id"])
    .drop_duplicates()
    .apply(expand_sequence, axis=1)   # adds the list of (position, amino acid) tuples
    .drop(columns=["sequence"])
    .explode("seq_aa")                # one row per residue
)
# Split the (position, amino acid) tuples into two columns
df_sequence[['pos', 'aa']] = pd.DataFrame(df_sequence['seq_aa'].tolist(), index=df_sequence.index)
df_sequence = df_sequence.drop(columns='seq_aa').drop_duplicates()
df_sequence

In [None]:
# Add sequence positions not mapping to any DisProt region
# (right join: every residue of df_sequence is kept, annotated or not)
df_regions = (
    df_regions
    .merge(
        df_sequence,
        how="right",
        left_on=["disprot_id", "reg_position"],
        right_on=["disprot_id", "pos"],
    )
    .drop(columns="reg_position")
)
df_regions

## Map PDB observed positions using SIFTS

In [None]:
# header=1: the column names are read from the second line of the file
df_sifts = (
    pd.read_csv(sifts_file, sep="\t", header=1)
    # Filter for dataset entries
    .loc[lambda sifts: sifts['SP_PRIMARY'].isin(df_regions['acc'])]
    # Explode observed regions
    .apply(expand_region, start_col="SP_BEG", end_col="SP_END", axis=1)
    .explode("reg_position")
    # One row per observed (accession, position)
    .loc[:, ['SP_PRIMARY', 'reg_position']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"SP_PRIMARY": "acc"})
)
df_sifts

In [None]:
# Flag the residues observed in a PDB structure: after the left join the
# "pdb" column is set to 1.0 where a SIFTS position matched and stays empty elsewhere
df_regions = (
    df_regions
    .merge(df_sifts, how="left", left_on=["acc", "pos"], right_on=["acc", "reg_position"])
    .rename(columns={"reg_position": "pdb"})
)
df_regions.loc[df_regions['pdb'].notnull(), 'pdb'] = 1.0
df_regions

## Add AlphaFold prediction

In [None]:
# Collect the prediction tables found in alphafold_dir.
# - sorted: os.listdir returns the entries in arbitrary order
# - hidden entries and sub-folders (e.g. ".ipynb_checkpoints") are skipped,
#   since they cannot be parsed as tables
af_paths = [
    os.path.join(alphafold_dir, af_file)
    for af_file in sorted(os.listdir(alphafold_dir))
    if not af_file.startswith(".")
]
af_paths = [af_path for af_path in af_paths if os.path.isfile(af_path)]
if not af_paths:
    # Fail with an explicit message instead of the generic pd.concat error
    raise ValueError("No AlphaFold prediction files found in {}".format(alphafold_dir))

# Read each tab-separated table and stack them into a single dataframe
df_list = [pd.read_csv(af_path, sep='\t') for af_path in af_paths]
df_af = pd.concat(df_list, ignore_index=True)
del df_list

In [None]:
# Derive the accession from the model name by stripping the "AF-" prefix and
# the "-F1-model_v<N>" suffix (presumably names look like "AF-<acc>-F1-model_v3").
# Any model version is accepted; only fragment F1 is handled, because only
# full length predictions should be used.
df_af['acc'] = (
    df_af['name']
    .str.replace('AF-', '', regex=False)
    .str.replace(r'-F1-model_v\d+', '', regex=True)
)
# Prefix the prediction columns so they are not mistaken for DisProt challenge columns
df_af = df_af.rename(columns={"disorder": "af-disorder", "disorder-25": "af-rsa", "binding-25-0.581": "af-binding"})
df_af = df_af[["acc", "pos", "aa", "af-disorder", "af-rsa", "af-binding"]]
df_af

In [None]:
# Attach the AlphaFold columns to each residue (left join: residues without
# a prediction are kept and left empty)
df_regions = df_regions.merge(
    df_af,
    how="left",
    on=["acc", "pos", "aa"],
)
df_regions

## Write files

Challenge definitions

- The first list contains the columns to be considered as positives (a residue is positive if any of them is annotated)
- The second list (mask) contains the columns to be considered as negatives (a residue is negative if any of them is annotated); when a mask is provided, residues that are neither positive nor negative are written as `-`
- If the second list is not provided, all non-positive residues are considered negatives
- In case of conflicts, the positives always overwrite the negatives
- If a mask is provided, proteins without at least one residue that could be masked (even when overwritten by a positive) are excluded (e.g. only proteins with PDB observed residues are considered)


In [None]:
# Reorder the columns: identifiers first, then the DisProt challenges (in
# order of first appearance in df_challenge), then everything else sorted by name
head_cols = ['disprot_id', 'acc', 'pos', 'aa']
disprot_cols = df_challenge['challenge'].unique().tolist()
other_cols = sorted(set(df_regions.columns) - set(head_cols) - set(disprot_cols))
cols = [*head_cols, *disprot_cols, *other_cols]
print(cols)

df_regions = df_regions[cols]
df_regions

In [None]:
# Save the per-residue table (tab-separated, without the index column)
df_regions.to_csv(
    dataset_file,
    sep="\t",
    index=False,
)

In [None]:
# Write the fastas
# Each item is [positive columns, mask columns]
challenges = [[['linker'], []],
              [['disorder'], []],
              [['protein binding'], []],
              [['nucleic acid binding'], []],
              [['binding'], []],
              [['disorder'], ['pdb']],
             ]

for challenge, mask in challenges:

    # File name: words of a column joined by "_", columns joined by "-",
    # mask columns appended after "@" (same word separator for both lists,
    # so a multi-word mask column cannot be confused with two columns)
    file_name = "-".join("_".join(c.split()) for c in challenge)
    if mask:
        file_name = file_name + "@" + "-".join("_".join(m.split()) for m in mask)

    with open("{}/{}.fasta".format(references_dir, file_name), "w") as fout:
        for disprot_id, df_g in df_regions.groupby('disprot_id'):

            # Residues annotated in any of the positive columns
            is_positive = df_g[challenge].notnull().any(axis='columns')
            if not is_positive.any():
                # Proteins without positives are not part of this reference
                continue

            if mask:
                # Residues annotated in any of the mask columns
                is_maskable = df_g[mask].notnull().any(axis='columns')
                # If mask is provided also check the protein has at least one residue
                # that could be masked (even when overwritten by a positive)
                if not is_maskable.any():
                    continue
                # Positives win over negatives; everything else is undefined ("-")
                labels = np.where(is_positive, '1', np.where(is_maskable, '0', '-'))
            else:
                # Without a mask every non-positive residue is a negative
                labels = np.where(is_positive, '1', '0')

            # The labels are computed on the side instead of being written into
            # the group dataframe, which is only a slice of df_regions
            fout.write(">{}\n{}\n{}\n".format(disprot_id, "".join(df_g['aa']), "".join(labels)))
