# ENVs

**Comments**  
* Variables for directories always start with '`dir_`', and their values never end with '`/`'
* Variables for dataframes always start with '`df_`'

In [1]:
# Root folder that holds every project.
dir_pjs = "/home/martingb/Projects"

# caid-reference: project root
dir_main = dir_pjs + "/2022/caid2-reference"

# Input data, one sub-folder per source.
dir_data = dir_main + "/data"
dir_data_sifts, dir_data_alphafold, dir_data_disprot = (
    f"{dir_data}/{source}" for source in ("sifts", "alphafold", "disprot")
)

# Source code and scratch space.
dir_src = dir_main + "/src"
dir_src_modules = dir_src + "/modules"
dir_tmp = dir_main + "/tmp"

# Generated outputs.
dir_results = dir_main + "/results"
dir_results_tables = dir_results + "/tables"
dir_results_references = dir_results + "/references"

# Imports

In [2]:
import os
import json
import requests
import pandas as pd
import numpy as np
from pprint import pprint
# For GO and IDPO terms
import networkx
import obonet
import math

# Functions

In [3]:
def expand_region(adf: pd.DataFrame, database: str = 'disprot') -> pd.DataFrame:
    """
    Expand the closed range of a region into the list of positions it covers.

    The two bounds are read from 'adf', converted with int(), and every
    integer from the first bound to the second bound (both included) is
    stored as a list under a new key. 'adf' is modified in place and returned.

    NOTE(review): despite the annotation, 'adf' is indexed like a single
    record (each bound must be convertible with int()), so this is presumably
    meant to be applied row-wise, e.g. df.apply(expand_region, axis=1) - confirm.

    Parameters:
        adf:      record holding the bounds: 'start'/'end' when database is
                  'disprot', 'SP_BEG'/'SP_END' when database is 'sifts'.
        database: which naming scheme to use, 'disprot' (default) or 'sifts'.

    Returns:
        The same object, with 'reg_position' ('disprot') or 'sp_position'
        ('sifts') added.

    Raises:
        ValueError: if 'database' is neither 'disprot' nor 'sifts'.
    """
    # (first bound, last bound, output key) for each supported database.
    columns_by_database = {
        'disprot': ("start", "end", "reg_position"),
        'sifts':   ("SP_BEG", "SP_END", "sp_position"),
    }
    # Validate before touching 'adf' so an invalid call never mutates it.
    if database not in columns_by_database:
        raise ValueError(f"'{database}' is not a choice for parameter 'database'")
    col_first, col_last, col_positions = columns_by_database[database]
    # +1 because the range is closed: the last bound is part of the region.
    adf[col_positions] = list(range(int(adf[col_first]), int(adf[col_last]) + 1))
    return adf

def expand_seq(adf: pd.DataFrame) -> pd.DataFrame:
    """
    Number every element of adf['sequence'], starting at 1.

    A list of (position, element) tuples, one per element obtained by
    iterating over adf['sequence'], is stored under 'seq_position_aa'.
    'adf' is modified in place and returned.

    NOTE(review): 'adf' is presumably a single record (e.g. a row passed by
    df.apply(..., axis=1)) whose 'sequence' is a string - confirm.
    """
    numbered = list(enumerate(adf["sequence"], start=1))
    adf["seq_position_aa"] = numbered
    return adf

def get_closed_intervals(alist : list, min_elems : int) -> list:
    """
    Describe a list of closed intervals as tuples of two ints for each
    run of at least 'min_elems' consecutive elements.

    The elements are read in the given order and converted with int(); a run
    continues while each element equals the previous one plus 1. 'alist' is
    therefore expected to be in ascending order without duplicates.

    Parameters:
        alist:     positions, each convertible with int().
        min_elems: minimum number of consecutive elements a run needs in
                   order to be reported.

    Returns:
        A list of (first, last) int tuples, in order of appearance. Empty
        when 'alist' is empty or no run is long enough.

    Example:
    alist = [2,3,4,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23,24]
    min_elems = 10
    result: [(2,11), (15,24)]
    """
    if len(alist) == 0:
        return []
    # A closed interval (first, last) holds last - first + 1 elements.
    min_span = min_elems - 1
    intervals = []
    first = last = int(alist[0])
    for pos in alist[1:]:
        pos = int(pos)
        if pos == last + 1:
            last = pos
            continue
        # The run is broken: keep it if long enough, then start a new one.
        if last - first >= min_span:
            intervals.append((first, last))
        first = last = pos
    # The trailing run is held to the same length rule as every other run,
    # and is reported with the int-converted bounds.
    if last - first >= min_span:
        intervals.append((first, last))
    return intervals

# CAID2

## Load dataframes

### Dataframe for ontology terms

In [4]:
# Table mapping each ontology term (term_id) to its challenge.
# pd.read_csv raises FileNotFoundError by itself when the file is missing, so
# no separate existence check is needed (os.path.exists only returns a bool
# and never raises).
df_challenges = pd.read_csv(f'{dir_results_tables}/disprot_ontologies_for_caid2.csv', header=0)
df_challenges

Unnamed: 0,term_id,challenge
0,GO:0005488,binding
1,GO:0097016,binding
2,GO:0001223,binding
3,GO:0045295,binding
4,GO:0033414,binding
...,...,...
3082,IDPO:00061,transition
3083,IDPO:00054,transition
3084,IDPO:00057,transition
3085,IDPO:00058,transition


### Dataframe for private DisProt entries

In [5]:
# Private DisProt entries, one row per annotated region.
# pd.read_csv raises FileNotFoundError by itself when the file is missing, so
# no separate existence check is needed (os.path.exists only returns a bool
# and never raises).
df_disprot_private = pd.read_csv(f'{dir_results_tables}/disprot_private.csv', header=0)
df_disprot_private

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO
1,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",Disorder function,DP02342r006,2022-02-14T09:00:00.000Z,34,57,GO:0051179,localization,Biological process,GO
2,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",Disorder function,DP02342r007,2022-02-14T09:00:00.000Z,34,57,GO:0098772,molecular function regulator,Molecular function,GO
3,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",Structural state,DP02342r009,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO
4,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"['Eukaryota', 'Metazoa', 'Chordata', 'Craniata...",Disorder function,DP02342r010,2022-02-14T09:00:00.000Z,34,57,GO:0005515,protein binding,Molecular function,GO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,DP03744,Q9VVJ7,CG7484 protein,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,"Eukaryota, Metazoa, Ecdysozoa, Arthropoda, Hex...",Disorder function,DP03744r002,2022-06-17T10:48:39.874Z,53,178,GO:0045454,cell redox homeostasis,Biological process,GO
1164,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r001,2022-06-17T10:51:32.225Z,25,34,IDPO:00076,disorder,Structural state,IDPO
1165,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r002,2022-06-17T10:51:44.007Z,121,145,IDPO:00076,disorder,Structural state,IDPO
1166,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Disorder function,DP03745r003,2022-06-17T10:52:51.196Z,24,145,GO:0045454,cell redox homeostasis,Biological process,GO


### Dataframe for sequence

In [6]:
# Private DisProt sequences, one row per sequence position (seq_position, seq_aa).
# pd.read_csv raises FileNotFoundError by itself when the file is missing, so
# no separate existence check is needed (os.path.exists only returns a bool
# and never raises).
df_sequence = pd.read_csv(f'{dir_results_tables}/disprot_private_sequences.csv', header=0)
df_sequence

Unnamed: 0,disprot_id,acc,seq_position,seq_aa
0,DP02342,P06837,1,M
1,DP02342,P06837,2,L
2,DP02342,P06837,3,C
3,DP02342,P06837,4,C
4,DP02342,P06837,5,M
...,...,...,...,...
297441,DP03746,Q9QUH6-2,1280,Q
297442,DP03746,Q9QUH6-2,1281,L
297443,DP03746,Q9QUH6-2,1282,L
297444,DP03746,Q9QUH6-2,1283,I


### Dataframe for regions

In [7]:
# Private DisProt regions, one row per (entry, challenge, position).
# pd.read_csv raises FileNotFoundError by itself when the file is missing, so
# no separate existence check is needed (os.path.exists only returns a bool
# and never raises).
df_regions = pd.read_csv(f'{dir_results_tables}/disprot_private_regions.csv', header=0)
df_regions

Unnamed: 0,disprot_id,acc,challenge,reg_position
0,DP02342,P06837,disorder,1
1,DP02342,P06837,disorder,2
2,DP02342,P06837,disorder,3
3,DP02342,P06837,disorder,4
4,DP02342,P06837,disorder,5
...,...,...,...,...
57856,DP03746,Q9QUH6-2,disorder,382
57857,DP03746,Q9QUH6-2,disorder,383
57858,DP03746,Q9QUH6-2,disorder,384
57859,DP03746,Q9QUH6-2,disorder,385


### Dataframe DisProt sequences with wwPDB (SIFTS)

In [8]:
# Private DisProt sequences with the SIFTS column 'sp_position', one row per sequence position.
# pd.read_csv raises FileNotFoundError by itself when the file is missing, so
# no separate existence check is needed (os.path.exists only returns a bool
# and never raises).
df_sequence_sifts = pd.read_csv(f'{dir_results_tables}/disprot_private_sifts.csv', header=0)
df_sequence_sifts

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,sp_position
0,DP02342,P06837,1,M,
1,DP02342,P06837,2,L,
2,DP02342,P06837,3,C,
3,DP02342,P06837,4,C,
4,DP02342,P06837,5,M,
...,...,...,...,...,...
297441,DP03746,Q9QUH6-2,1280,Q,
297442,DP03746,Q9QUH6-2,1281,L,
297443,DP03746,Q9QUH6-2,1282,L,
297444,DP03746,Q9QUH6-2,1283,I,


### Dataframe DisProt sequences with Alphafold score

In [9]:
# Private DisProt sequences with the Alphafold 'lddt' score, one row per sequence position.
# pd.read_csv raises FileNotFoundError by itself when the file is missing, so
# no separate existence check is needed (os.path.exists only returns a bool
# and never raises).
df_sequence_af = pd.read_csv(f'{dir_results_tables}/disprot_private_alphafold.csv', header=0)
df_sequence_af

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,lddt
0,DP02342,P06837,1,M,0.730
1,DP02342,P06837,2,L,0.734
2,DP02342,P06837,3,C,0.744
3,DP02342,P06837,4,C,0.704
4,DP02342,P06837,5,M,0.708
...,...,...,...,...,...
297441,DP03746,Q9QUH6-2,1280,Q,
297442,DP03746,Q9QUH6-2,1281,L,
297443,DP03746,Q9QUH6-2,1282,L,
297444,DP03746,Q9QUH6-2,1283,I,


## Overlaps

### Alphafold-DisProt

In [10]:
# Overlaps between Alphafold-ordered positions and DisProt challenge regions.
# For every challenge, each DisProt entry gets the list of closed intervals in
# which at least MIN_OVERLAP_LENGTH consecutive annotated positions also have
# an lddt above AF_ORDER_LDDT_THRESHOLD. Result: one row per entry, one column
# per challenge, written to overlaps_disprot_private_af.csv.
AF_ORDER_LDDT_THRESHOLD = 0.7   # lddt strictly above this value is treated as AF order
MIN_OVERLAP_LENGTH      = 10    # minimum number of consecutive positions to report

df_overlaps_merge = pd.DataFrame(columns=['disprot_id'])
for challenge in ['disorder', 'transition', 'binding']:
    # Challenge + Alphafold: the left join keeps every sequence position;
    # 'challenge' / 'reg_position' are only filled where the position is annotated.
    df_sequence_af_challenge = pd.merge( left     = df_sequence_af
                                       , right    = df_regions.loc[df_regions['challenge'] == challenge]
                                       , left_on  = ["disprot_id", 'acc', 'seq_position']
                                       , right_on = ["disprot_id", 'acc', 'reg_position']
                                       , how      = 'left' )
    df_sequence_af_challenge['result'] = '-'
    df_sequence_af_challenge.loc[df_sequence_af_challenge['lddt'] > AF_ORDER_LDDT_THRESHOLD, 'result'] = '0'  # Assignment: 0 for AF order
    df_sequence_af_challenge.loc[df_sequence_af_challenge['challenge'].notnull(), 'result'] = '1' # Assignment: 1 for challenge
    # Keep only the positions that are both AF-ordered and annotated for the challenge.
    df_sequence_af_challenge = df_sequence_af_challenge.loc[
        (df_sequence_af_challenge['lddt'] > AF_ORDER_LDDT_THRESHOLD)
        & (df_sequence_af_challenge['result'] == '1')
    ]
    # Collect plain records and build the frame once instead of growing it
    # with pd.concat on every iteration.
    overlap_records = []
    # Group by the column name (not a one-element list) so the key is the
    # plain disprot_id value rather than a 1-tuple.
    for disprot_id, df_overlaps in df_sequence_af_challenge.groupby('disprot_id'):
        list_overlaps = get_closed_intervals(alist = df_overlaps.reg_position.tolist(), min_elems = MIN_OVERLAP_LENGTH)
        if len(list_overlaps) > 0:
            overlap_records.append({'disprot_id': disprot_id, challenge: list_overlaps})
    # The challenge column is always present, even when no entry has an overlap.
    df_disprot_ids_with_af_overlaps = pd.DataFrame(overlap_records, columns=['disprot_id', challenge])
    # Outer merge: keep the entries that have an overlap for any challenge.
    df_overlaps_merge = pd.merge( left=df_overlaps_merge
                                , right=df_disprot_ids_with_af_overlaps
                                , on='disprot_id'
                                , how='outer')

df_overlaps_merge.to_csv(f'{dir_results_tables}/overlaps_disprot_private_af.csv', index=False)

### wwPDB-DisProt

In [None]:
# Overlaps between SIFTS positions and DisProt challenge regions.
# For every challenge, each DisProt entry gets the list of closed intervals in
# which at least MIN_OVERLAP_LENGTH consecutive annotated positions also have
# a matching 'sp_position'. Result: one row per entry, one column per
# challenge, written to overlaps_disprot_private_pdb.csv.
MIN_OVERLAP_LENGTH = 10    # minimum number of consecutive positions to report

df_overlaps_merge = pd.DataFrame(columns=['disprot_id'])
for challenge in ['disorder', 'transition', 'binding']:
    # Challenge + SIFTS: the left join keeps every sequence position;
    # 'challenge' / 'reg_position' are only filled where the position is annotated.
    df_sequence_sifts_challenge = pd.merge( left     = df_sequence_sifts
                                          , right    = df_regions.loc[df_regions['challenge'] == challenge]
                                          , left_on  = ["disprot_id", 'acc', 'seq_position']
                                          , right_on = ["disprot_id", 'acc', 'reg_position']
                                          , how      = 'left' )
    # Keep only the positions where both columns are filled and equal
    # (rows with a missing value on either side never compare equal).
    df_sequence_sifts_challenge = df_sequence_sifts_challenge.query('sp_position == reg_position')
    # Collect plain records and build the frame once instead of growing it
    # with pd.concat on every iteration.
    overlap_records = []
    # Group by the column name (not a one-element list) so the key is the
    # plain disprot_id value rather than a 1-tuple.
    for disprot_id, df_overlaps in df_sequence_sifts_challenge.groupby('disprot_id'):
        list_overlaps = get_closed_intervals(alist = df_overlaps.reg_position.tolist(), min_elems = MIN_OVERLAP_LENGTH)
        if len(list_overlaps) > 0:
            overlap_records.append({'disprot_id': disprot_id, challenge: list_overlaps})
    # The challenge column is always present, even when no entry has an overlap.
    df_disprot_ids_with_pdb_overlaps = pd.DataFrame(overlap_records, columns=['disprot_id', challenge])
    # Outer merge: keep the entries that have an overlap for any challenge.
    df_overlaps_merge = pd.merge( left=df_overlaps_merge
                                , right=df_disprot_ids_with_pdb_overlaps
                                , on='disprot_id'
                                , how='outer')

df_overlaps_merge.to_csv(f'{dir_results_tables}/overlaps_disprot_private_pdb.csv', index=False)

## Curators

### Who curated the entries in the CAID dataset

For each curator, count how many regions he/she curated,
possibly broken down by type
(challenge).

#### Private DisProt json

# Load the private DisProt entries from the JSON dump.
# json.load() parses an open file object; json.loads() only accepts
# str/bytes and raises TypeError when given the file handle.
with open(f"{dir_data_disprot}/entries_private_caid2.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)
# The entries are stored under the top-level 'data' key.
json_private = json_data['data']
# Show only the size: a bare expression inside the 'with' block displays
# nothing, and dumping every entry would flood the output.
len(json_private)

# Display the private DisProt entries table.
df_disprot_private

## Plots

### Length

### Length with `negative` and `positive` residues