# ENVs

**Comments**  
* Variables for directories always start with '`dir_`' and end without a trailing '`/`'
* Variables for dataframes always start with '`df_`'

In [None]:
# Root folder of all projects
dir_pjs = "/home/martingb/Projects"

# caid-reference project root
dir_main = f"{dir_pjs}/2022/caid2-reference"

# Source code
dir_src = f"{dir_main}/src"
dir_src_modules = f"{dir_src}/modules"

# Temporary files
dir_tmp = f"{dir_main}/tmp"

# Data folders
dir_data = f"{dir_main}/data"
dir_data_sifts = f"{dir_data}/sifts"
dir_data_alphafold = f"{dir_data}/alphafold"
dir_data_disprot = f"{dir_data}/disprot"

# Result folders
dir_results = f"{dir_main}/results"
dir_results_tables = f"{dir_results}/tables"
dir_results_references = f"{dir_results}/references"
dir_results_imgs = f"{dir_results}/imgs"

# Imports

In [None]:
import os
import json
import requests
import pandas as pd
import numpy as np
from pprint import pprint

# Functions

In [None]:
def expand_region(adf: pd.DataFrame, database: str = 'disprot') -> pd.DataFrame:
    """
    Store the list of positions covered by a region in 'adf' and return it.

    The inclusive integer range between the two boundary fields is written
    under a new key; 'adf' is modified in place:
      * database == 'disprot': "start"  .. "end"    -> "reg_position"
      * database == 'sifts'  : "SP_BEG" .. "SP_END" -> "sp_position"

    Any other value of 'database' raises ValueError.

    NOTE(review): both boundaries are passed through int(), so 'adf' looks
    like a single record (e.g. a row handed over by DataFrame.apply) rather
    than a whole DataFrame as annotated -- confirm against the callers.
    """
    if database == 'disprot':
        target, first_key, last_key = "reg_position", "start", "end"
    elif database == 'sifts':
        target, first_key, last_key = "sp_position", "SP_BEG", "SP_END"
    else:
        raise ValueError(f"'{database}' is not a choise for parameter 'database'")

    first = int(adf[first_key])
    last = int(adf[last_key])
    # last + 1 because the region is a closed interval
    adf[target] = list(range(first, last + 1))
    return adf

def expand_seq(adf: pd.DataFrame) -> pd.DataFrame:
    """
    Pair every element of adf["sequence"] with its 1-based position.

    The resulting list of (position, residue) tuples is stored under
    "seq_position_aa"; 'adf' is modified in place and returned.
    """
    numbered_residues = list(enumerate(adf["sequence"], start=1))
    adf["seq_position_aa"] = numbered_residues
    return adf

def get_closed_intervals(alist : list, min_elems : int) -> list:
    """
    Describe a list of closed intervals as tuples of two ints for each
    run with at least 'min_elems' consecutive elements.

    Parameters:
        alist     : positions (anything accepted by int()). A run is only
                    extended when the next value equals the previous one
                    plus 1, so the positions are expected in ascending
                    order and without duplicates.
        min_elems : minimum number of elements a run needs to be reported.

    Returns:
        List of (start, end) tuples of ints, in order of appearance.
        An empty 'alist' gives an empty list.

    Example:
    alist = [2,3,4,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23,24]
    min_elems = 10
    result: [(2,11), (15,24)]
    """
    if len(alist) == 0:
        return []

    intervals = []
    start = end = int(alist[0])
    for pos in alist[1:]:
        pos = int(pos)
        if pos == end + 1:
            # Still inside the current run: just move its right edge
            end = pos
            continue
        # Run broken: keep it if long enough, then start a new one at 'pos'
        if (end - start + 1) >= min_elems:
            intervals.append((start, end))
        start = end = pos

    # The last run is never closed inside the loop; apply the same length
    # rule as for the inner runs and report int bounds like everywhere else.
    if (end - start + 1) >= min_elems:
        intervals.append((start, end))
    return intervals

# CAID2

## Private DisProt json

In [None]:
# Load the private DisProt entries for CAID2.
# No explicit existence check is needed: open() raises FileNotFoundError by
# itself when the file is missing (os.path.exists only returns a bool).
with open(f"{dir_data_disprot}/entries_private_caid2.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# The entries are stored under the top-level 'data' key
json_private = json_data['data']

## Dataframe for private DisProt entries

### Columns to take into account

In [None]:
# Columns describing the DisProt entry
main_columns = [
    'disprot_id',
    'acc',
    'name',
    'ncbi_taxon_id',  # int
    'organism',
    'sequence',
    'taxonomy',       # list of str (organisms)
]

# Columns describing each annotated region
region_columns = [
    "disprot_namespace",  # str
    "region_id",          # str
    "date",               # str
    "start",              # int
    "end",                # int
    "term_id",            # str (GO and IDPO terms)
    "term_name",          # str (GO and IDPO description)
    "term_namespace",     # str (GO and IDPO namespace)
    "term_ontology",      # str (type of ontology, e.g. GO, IDPO)
]

#### New columns to be used

In [None]:
# Curator information attached to each region
region_stats_columns = [
    "curator_name",   # str (Full name)
    "curator_orcid",  # str (ORCID ID of curator)
]

In [None]:
# Flatten the private entries: one row per element of each entry's 'regions'
# list, with the entry-level fields of 'main_columns' repeated on every row.
# Empty prefixes are passed for both the record and the meta column names.
df_disprot_private = pd.json_normalize(
    json_private,
    record_path=['regions'],
    meta=main_columns,
    record_prefix='',
    meta_prefix='',
)

# Keep only the columns of interest, entry columns first
columns_to_keep = main_columns + region_columns + region_stats_columns
df_disprot_private = df_disprot_private.loc[:, columns_to_keep]
df_disprot_private

## Load dataframes

### Dataframe for ontology terms

In [None]:
# No explicit existence check: pd.read_csv raises FileNotFoundError by itself
# when the table is missing (os.path.exists only returns a bool).
df_challenges = pd.read_csv(f'{dir_results_tables}/disprot_ontologies_for_caid2.csv', header=0)
df_challenges

### Dataframe for private DisProt entries

In [None]:
# No explicit existence check: pd.read_csv raises FileNotFoundError by itself
# when the table is missing (os.path.exists only returns a bool).
df_disprot_private = pd.read_csv(f'{dir_results_tables}/disprot_private.csv', header=0)
df_disprot_private

### Dataframe for sequence

In [None]:
# No explicit existence check: pd.read_csv raises FileNotFoundError by itself
# when the table is missing (os.path.exists only returns a bool).
df_sequence = pd.read_csv(f'{dir_results_tables}/disprot_private_sequences.csv', header=0)
df_sequence

### Dataframe for regions

In [None]:
# No explicit existence check: pd.read_csv raises FileNotFoundError by itself
# when the table is missing (os.path.exists only returns a bool).
df_regions = pd.read_csv(f'{dir_results_tables}/disprot_private_regions.csv', header=0)
df_regions

### Dataframe DisProt sequences with wwPDB (SIFTS)

In [None]:
# No explicit existence check: pd.read_csv raises FileNotFoundError by itself
# when the table is missing (os.path.exists only returns a bool).
df_sequence_sifts = pd.read_csv(f'{dir_results_tables}/disprot_private_sifts.csv', header=0)
df_sequence_sifts

### Dataframe DisProt sequences with Alphafold score

In [None]:
# No explicit existence check: pd.read_csv raises FileNotFoundError by itself
# when the table is missing (os.path.exists only returns a bool).
df_sequence_af = pd.read_csv(f'{dir_results_tables}/disprot_private_alphafold.csv', header=0)
df_sequence_af

## Overlaps

### Alphafold-DisProt

In [None]:
# For each challenge, collect per DisProt entry the closed intervals (at least
# 10 consecutive positions) that are annotated for the challenge and also have
# lddt > 0.7 in the AlphaFold table. The result has one row per entry and one
# column per challenge that produced at least one interval.
df_overlaps_merge = pd.DataFrame(columns=['disprot_id'])
for challenge in ['disorder', 'transition', 'binding']:
    # Challenge + Alphafold: left join keeps every AlphaFold position; the
    # region columns stay NaN where the position is not annotated.
    df_sequence_af_challenge = pd.merge(
        left=df_sequence_af,
        right=df_regions.loc[df_regions['challenge'] == challenge],
        left_on=["disprot_id", 'acc', 'seq_position'],
        right_on=["disprot_id", 'acc', 'reg_position'],
        how='left',
    )
    df_sequence_af_challenge['result'] = '-'
    # Assignment: 0 for AF order
    df_sequence_af_challenge.loc[df_sequence_af_challenge['lddt'] > 0.7, 'result'] = '0'
    # Assignment: 1 for challenge (overrides the AF tag)
    df_sequence_af_challenge.loc[df_sequence_af_challenge['challenge'].notnull(), 'result'] = '1'
    df_sequence_af_challenge = df_sequence_af_challenge.query('lddt > 0.7 & result == "1"')

    # Group by a plain column name (not a one-element list) so the group key
    # is the scalar id on every pandas version, and gather the rows in a list
    # instead of growing a DataFrame with pd.concat on every iteration.
    overlap_rows = []
    for disprot_id, df_overlaps in df_sequence_af_challenge.groupby(by='disprot_id'):
        list_overlaps = get_closed_intervals(alist=df_overlaps.reg_position.tolist(), min_elems=10)
        if len(list_overlaps) > 0:
            overlap_rows.append({'disprot_id': disprot_id, challenge: list_overlaps})

    if overlap_rows:
        df_disprot_ids_with_af_overlaps = pd.DataFrame(overlap_rows, columns=['disprot_id', challenge])
    else:
        # No overlap for this challenge: no challenge column is created
        df_disprot_ids_with_af_overlaps = pd.DataFrame(columns=['disprot_id'])

    if not df_overlaps_merge.empty:
        df_overlaps_merge = pd.merge(
            left=df_overlaps_merge,
            right=df_disprot_ids_with_af_overlaps,
            left_on='disprot_id',
            right_on='disprot_id',
            how='outer',
        )
    else:
        df_overlaps_merge = df_disprot_ids_with_af_overlaps.copy(deep=True)

df_overlaps_merge.to_csv(f'{dir_results_tables}/overlaps_disprot_private_af.csv', index=False)

### wwPDB-DisProt

In [None]:
# For each challenge, collect per DisProt entry the closed intervals (at least
# 10 consecutive positions) that are annotated for the challenge and also
# present in the SIFTS (wwPDB) table. The result has one row per entry and
# one column per challenge that produced at least one interval.
df_overlaps_merge = pd.DataFrame(columns=['disprot_id'])
for challenge in ['disorder', 'transition', 'binding']:
    # Challenge + SIFTS: left join keeps every SIFTS position; the region
    # columns stay NaN where the position is not annotated.
    df_sequence_sifts_challenge = pd.merge(
        left=df_sequence_sifts,
        right=df_regions.loc[df_regions['challenge'] == challenge],
        left_on=["disprot_id", 'acc', 'seq_position'],
        right_on=["disprot_id", 'acc', 'reg_position'],
        how='left',
    )
    # Keep the positions where the SIFTS position equals the region position
    # (rows with NaN on either side never compare equal and are dropped)
    df_sequence_sifts_challenge = df_sequence_sifts_challenge.query('sp_position == reg_position')

    # Group by a plain column name (not a one-element list) so the group key
    # is the scalar id on every pandas version, and gather the rows in a list
    # instead of growing a DataFrame with pd.concat on every iteration.
    overlap_rows = []
    for disprot_id, df_overlaps in df_sequence_sifts_challenge.groupby(by='disprot_id'):
        list_overlaps = get_closed_intervals(alist=df_overlaps.reg_position.tolist(), min_elems=10)
        if len(list_overlaps) > 0:
            overlap_rows.append({'disprot_id': disprot_id, challenge: list_overlaps})

    if overlap_rows:
        df_disprot_ids_with_pdb_overlaps = pd.DataFrame(overlap_rows, columns=['disprot_id', challenge])
    else:
        # No overlap for this challenge: no challenge column is created
        df_disprot_ids_with_pdb_overlaps = pd.DataFrame(columns=['disprot_id'])

    if not df_overlaps_merge.empty:
        df_overlaps_merge = pd.merge(
            left=df_overlaps_merge,
            right=df_disprot_ids_with_pdb_overlaps,
            left_on='disprot_id',
            right_on='disprot_id',
            how='outer',
        )
    else:
        df_overlaps_merge = df_disprot_ids_with_pdb_overlaps.copy(deep=True)

df_overlaps_merge.to_csv(f'{dir_results_tables}/overlaps_disprot_private_pdb.csv', index=False)

## Plots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Make sure the destination folder for the plots exists
os.makedirs(dir_results_imgs, exist_ok=True)

### Load Dataframe of DisProt entries and tag challenges

In [None]:
# Left join of the private DisProt regions with the CAID2 ontology table on
# "term_id". Regions whose term was not considered part of a CAID2 challenge
# find no match in df_challenges, for example:
#    IDPO:00024 (molecular recognition display site) and its children
#    IDPO:00505 (self-regulatory activity) and its children
#    GO terms that are not children of the challenge ancestors
df_disprot_private_challenges = df_disprot_private.merge(
    df_challenges,
    on="term_id",
    how="left",
)
df_disprot_private_challenges

In [None]:
# Independent copy of the rows that were matched to a challenge
df_disprot_private_challenges_core = (
    df_disprot_private_challenges[df_disprot_private_challenges['challenge'].notnull()]
    .copy(deep=True)
)

### Curators

#### Who curated the DisProt entries for the CAID2 dataset?

In [None]:
# Number of challenge-tagged rows per curator
df_disprot_private_challenges_core.loc[
    df_disprot_private_challenges_core['challenge'].notnull(), 'curator_name'
].value_counts()

In [None]:
# Per curator, count how many rows belong to each challenge
challenges_by_curator = df_disprot_private_challenges_core.groupby('curator_name')['challenge']
s_curators = challenges_by_curator.apply(lambda challenges: challenges.value_counts())

In [None]:
sns.set_theme(style="whitegrid")

# One row per curator, one column per challenge. The columns are sorted
# explicitly because the 'sort_columns' argument of DataFrame.plot is
# deprecated since pandas 1.5 and no longer accepted by pandas 2.
df_curators = s_curators.unstack().sort_index(axis=1)

# Stacked horizontal bars: one bar per curator, split by challenge
ax = df_curators.plot(kind='barh', stacked=True, figsize=(10, 15))
ax.set(ylabel="")  # the curator names on the axis need no extra label

fig = ax.get_figure()
fig.savefig(f"{dir_results_imgs}/barh_curators.png")

### Length

### Length with `negative` and `positive` residues