# ENVs

In [207]:
dir_pjs         = "/home/martingb/Projects/"
# caid-reference
dir_main        = f"{dir_pjs}2022/caid2-reference/"
dir_data        = f"{dir_main}data/"
dir_data_lists  = f"{dir_data}lists/"
dir_data_tables = f"{dir_data}tables/"
dir_src         = f"{dir_main}src/"
dir_src_modules = f"{dir_src}modules/"
dir_tmp         = f"{dir_main}tmp/"

## f-string variables

In [208]:
nl  = "\n"
tab = "\t"

# Imports

In [209]:
import os
import json
import requests
import pandas as pd
from pprint import pprint

# Functions

In [210]:
from typing import Any, Literal, get_args
from requests import RequestException
from enum import Enum

class Disprot_versions(Enum):
    DATE10 = "2022_06"
    DATE9 = "2022_03"
    DATE8 = "2021_12"
    DATE7 = "2021_08"
    DATE6 = "2021_06"
    DATE5 = "2020_12"
    DATE4 = "2020_06"
    DATE3 = "2019_09"
    DATE2 = "2018_11"
    DATE1 = "2016_10"

class Disprot_namespaces(Enum):
    NS5 = "structural_state"
    NS4 = "transition_state"
    NS3 = "interaction_partner"
    NS2 = "disorder_function"
    NS1 = "all"


def get_disprot_response( version  : Literal[ "2022_06"
                                            , "2022_03"
                                            , "2021_12"
                                            , "2021_08"
                                            , "2021_06"
                                            , "2020_12"
                                            , "2020_06"
                                            , "2019_09"
                                            , "2018_11"
                                            , "2016_10" ] = "latest"
                        , namespace: Literal[ "structural_state"
                                            , "transition_state"
                                            , "interaction_partner"
                                            , "disorder_function"
                                            , "all" ] = "all"
                        , ambiguos : str="false"
                        , obsolete : str="false"
                        , format   : Literal[ "json"
                                            , "fasta"
                                            , "tsv" ] = "json"
                        , consensus: str="true" ):
    """
    Get a DisProt response release. 
    Observations:
        * Uses DisProt API: 'https://disprot.org/api'
        * By default it returns the latest version,
          without ambiguos and obsolete, with consensus
          for 'all' namespace in json format.
    https://disprot.org/api/search?release=curator&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=structural_state&get_consensus=true

    https://mobidb.bio.unipd.it/mobidb3_datasets/latest/derived_disorder.mjson.gz
    """
    versions   = Literal[ "2022_06", "2022_03", "2021_12", "2021_08", "2021_06"
                        , "2020_12", "2020_06", "2019_09", "2018_11", "2016_10" ]
    namespaces = Literal[ "structural_state", "transition_state"
                        , "interaction_partner", "disorder_function", "all" ]
    formats    = Literal[ "json", "fasta", "tsv" ]

    if version == "latest":
        version = sorted(get_args(versions), reverse=True)[0]
    else:
        assert version   in get_args(versions), f"'{version}' not a valid value for version."
    assert namespace in get_args(namespaces), f"'{namespace}' not a valid value for namespace."
    assert format    in get_args(formats), f"'{format}' not a valid value for format."

    server = "https://disprot.org/api"
    endpoint = "search?"
    url = f"{server}/{endpoint}"

    r = requests.get(url, params = { "release"        : version
                                   , "show_ambiguous" : ambiguos
                                   , "show_obsolete"  : obsolete
                                   , "format"         : format
                                   , "namespace"      : namespace
                                   , "get_consensus"  : consensus })

    print(r.url) # FIXME remove these lines
    #print(r.text)

    try:
        r.status_code == 200
    except RequestException as e:
        raise e (f"Request error: {r.status_code}")

    if format == "json":
        return r.json()
    elif format == "fasta" or format == "tsv":
        return r.text


def save_disprot_response( response_content: Any
                         , filename: str
                         , dir_output: str ) -> None:
    """
    Save the DisProt response release to a file.
    """
    if dir_output.endswith("/"):
        dir_output = dir_output[:-1]
    os.makedirs(f"{dir_output}", exist_ok=True)
    with open(f"{dir_output}/{filename}", "w") as f:
        if isinstance(response_content, dict):
            f.writelines(json.dumps(response_content))
        else:
            f.writelines(response_content.strip())
    return


# Retrieve release file

The following files should be placed in <root-to-CAID2>/data/annotations`

### Retrieve `disprot-2018_11-structural_state.fasta`

In [211]:
aversion="2018_11"; anamespace="structural_state"; aformat="fasta"
arelease = get_disprot_response( version   = aversion
                               , namespace = anamespace
                               , format    = aformat )
save_disprot_response( response_content = arelease
                     , filename         = f"disprot-{aversion}-{anamespace}.{aformat}"
                     , dir_output       = dir_tmp )

https://disprot.org/api/search?release=2018_11&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=structural_state&get_consensus=true


### Retrieve `disprot-2022_06-structural_state.fasta`

In [212]:
aversion="2022_06"; anamespace="structural_state"; aformat="fasta"
arelease = get_disprot_response( version   = aversion
                               , namespace = anamespace
                               , format    = aformat )
save_disprot_response( response_content = arelease
                     , filename         = f"disprot-{aversion}-{anamespace}.{aformat}"
                     , dir_output       = dir_tmp )

https://disprot.org/api/search?release=2022_06&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=structural_state&get_consensus=true


### Retrieve `disprot-2022_06-interaction_partner.fasta`

In [213]:
aversion="2022_06"; anamespace="interaction_partner"; aformat="fasta"
arelease = get_disprot_response( version   = aversion
                               , namespace = anamespace
                               , format    = aformat )
save_disprot_response( response_content = arelease
                     , filename         = f"disprot-{aversion}-{anamespace}.{aformat}"
                     , dir_output       = dir_tmp )

https://disprot.org/api/search?release=2022_06&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=interaction_partner&get_consensus=true


### Retrieve `disprot-2022_06-all.json`

In [214]:
aversion="2022_06"; anamespace="all"; aformat="json"
arelease = get_disprot_response( version   = aversion
                               , namespace = anamespace
                               , format    = aformat )
save_disprot_response( response_content = arelease
                     , filename         = f"disprot-{aversion}-{anamespace}.{aformat}"
                     , dir_output       = dir_tmp )

https://disprot.org/api/search?release=2022_06&show_ambiguous=false&show_obsolete=false&format=json&namespace=all&get_consensus=true


## Retrieve annotations for CAID2

In [215]:
# CAID2
aversion = "latest"
for anamespace in ["structural_state", "interaction_partner"]:
    save_disprot_response( response_content = get_disprot_response( version   = aversion
                                                                  , namespace = anamespace
                                                                  , format    = "fasta" )
                            , filename         = f"disprot-{aversion}-{anamespace}.fasta"
                            , dir_output       = dir_tmp )
save_disprot_response( response_content = get_disprot_response( version   = "latest"
                                                              , namespace = "all"
                                                              , format    = "json" )
                        , filename         = f"disprot-{aversion}-all.json"
                        , dir_output       = dir_tmp )

# CAID, last version --> 2018_11
aversion = "2018_11"
save_disprot_response( response_content = get_disprot_response( version   = aversion
                                                              , namespace = "structural_state"
                                                              , format    = "fasta" )
                        , filename         = f"disprot-{aversion}-structural_state.fasta"
                        , dir_output       = dir_tmp )

https://disprot.org/api/search?release=2022_06&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=structural_state&get_consensus=true
https://disprot.org/api/search?release=2022_06&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=interaction_partner&get_consensus=true
https://disprot.org/api/search?release=2022_06&show_ambiguous=false&show_obsolete=false&format=json&namespace=all&get_consensus=true
https://disprot.org/api/search?release=2018_11&show_ambiguous=false&show_obsolete=false&format=fasta&namespace=structural_state&get_consensus=true


# Notes for CAID2

* Retrieve the latest DisProt release using the API as a json file
* Build a unify DataFrame using the `<disprot-release>.json": file


## Comments
- DisProt API has problems parsing:
    + Different versions
    + Regions for
        + Namespaces
    + Consensus

## DisProt cases to test

In [216]:
disprot_testing_cases = { "DP00005": "Antitermination protein N"
                        , "DP00009": "Transcription initiation factor IIA subunit 2"
                        , "DP00016": "Cyclin-dependent kinase inhibitor 1"
                        , "DP00040": "High mobility group protein HMG-I/HMG-Y"
                        , "DP00086": "Cellular tumor antigen p53" }

## KEYS from 2022_06 DisProt release

In [217]:
# DisProt JSON main Keys
[ 'UniParc'
, 'acc'
, 'creator'
, 'dataset'                 # list of str (could by empty, [])
, 'date'
, 'disorder_content'        # float
, 'disprot_consensus'       # dict
, 'disprot_id'
, 'features'                # dict
, 'length'                  # int
, 'name'
, 'ncbi_taxon_id'           # int
, 'organism'
, 'regions'                 # list of dict
, 'regions_counter'         # int
, 'released'                # DisProt release version
, 'sequence'
, 'taxonomy'                # list of str (organisms)
, 'uniparc'
, 'uniref100'
, 'uniref50'
, 'uniref90' ]

## KEYs for 'regions' (lista de dict. Cada json es una región)
[ 'cross_refs'              # list of dict
, 'curator_id'
, 'curator_name'
, 'curator_orcid'
, 'date'
, 'disprot_namespace'
, 'ec_go'
, 'ec_id'
, 'ec_name'
, 'ec_ontology'
, 'end'                     # int
, 'reference_html'
, 'reference_id'
, 'reference_source'
, 'region_id'
, 'released'
, 'start'                   # int
, 'statement'
, 'term_id'
, 'term_name'
, 'term_namespace'
, 'term_ontology'
, 'uniprot_changed'         # bool
, 'validated'               # dict
, 'version']                # int

# For string
dict_1letter = { "F": "function region"
               , "S": "order state"
               , "D": "disorder state"
               , "T": "transition"
               , "I": "transition with interaction (fold upon binding, ...)" }

# For merge cross
# challenges = [ ('GO:0005488', 'binding')
#              , ('GO:0005515', 'binding')
#              , ('GO:0005515', 'binding_protein')
#              , ('GO:0005515', 'binding')
#              , ('GO:1901363', 'heterocyclic compound binding')
#              , ('GO:0005515', 'binding')
#              , ('GO:0097159', 'organic cyclic compound binding')
#              , ('GO:0003676', 'binding')
#              , ('GO:0003676', 'organic cyclic compound binding')
#              , ('GO:0003676', 'nucleic acid binding')
#              , ('GO:0140666', 'binding')
#              , ('GO:0140666', 'organic cyclic compound binding')
#              , ('GO:0140666', 'nucleic acid binding')
#              , ('GO:0140666', 'annealing activity')
#              , ('GO:0003677', 'binding')
#              , ('GO:0003677', 'organic cyclic compound binding')
#              , ('GO:0003677', 'nucleic acid binding')
#              , ('GO:0003677', 'DNA binding')
#              , ('GO:0071667', 'binding')
#              , ('GO:0071667', 'organic cyclic compound binding')
#              , ('GO:0071667', 'nucleic acid binding')
#              , ('GO:0071667', 'DNA/RNA hybrid binding')
#              , ('GO:0003723', 'binding')
#              , ('GO:0003723', 'organic cyclic compound binding')
#              , ('GO:0003723', 'nucleic acid binding')
#              , ('GO:0003723', 'RNA binding')
#              , ('GO:0001067', 'binding')
#              , ('GO:0001067', 'organic cyclic compound binding')
#              , ('GO:0001067', 'nucleic acid binding')
#              , ('GO:0001067', 'transcription regulatory region nucleic acid binding')
#              , ('GO:0090079', 'binding')
#              , ('GO:0090079', 'organic cyclic compound binding')
#              , ('GO:0090079', 'nucleic acid binding')
#              , ('GO:0090079', 'translation regulator activity, nucleic acid binding') ]

challenges = [ ('GO:0005488', 'binding')
             , ('GO:0005515', 'binding')
             , ('GO:0005515', 'protein binding')
             , ('GO:1901363', 'binding')
             , ('GO:0005515', 'binding')
             , ('GO:0097159', 'binding')
             , ('GO:0003676', 'binding')
             , ('GO:0003676', 'nucleic acid binding')
             , ('GO:0140666', 'binding')
             , ('GO:0140666', 'nucleic acid binding')
             , ('GO:0003677', 'binding')
             , ('GO:0003677', 'nucleic acid binding')
             , ('GO:0071667', 'binding')
             , ('GO:0071667', 'nucleic acid binding')
             , ('GO:0003723', 'binding')
             , ('GO:0003723', 'nucleic acid binding')
             , ('GO:0001067', 'binding')
             , ('GO:0001067', 'nucleic acid binding')
             , ('GO:0090079', 'binding')
             , ('GO:0090079', 'nucleic acid binding') ]

# GO terms to find
# GO:0003674 molecular_function
#     GO:0005488 binding
#         GO:0005515 protein binding
#         GO:1901363 heterocyclic compound binding
#         GO:0097159 organic cyclic compound binding
#             GO:0003676 nucleic acid binding
#                 GO:0140666 annealing activity
#                 GO:0003677 DNA binding
#                 GO:0071667 DNA/RNA hybrid binding
#                 GO:0003723 RNA binding
#                 GO:0001067 transcription regulatory region nucleic acid binding
#                 GO:0090079 translation regulator activity, nucleic acid binding

## CAID2 Dataframe from json

In [218]:
main_columns = [ 'disprot_id'
               , 'features'                # dict
               , 'length'                  # int
               , 'name'
               , 'ncbi_taxon_id'           # int
               , 'organism'
               , 'regions'                 # list of dict
               , 'regions_counter'         # int
               , 'released'                # DisProt release version
               , 'sequence'
               , 'taxonomy' ]              # list of str (organisms)

region_columns = [ "disprot_namespace"  # str
                 , "region_id"          # str
                 , "date"               # str
                 , "start"              # int
                 , "end"                # int
                 , "term_id"            # str (el GO term)
                 , "term_name"          # str (GO description)
                 , "term_namespace"     # str (GO namespace)
                 , "term_ontology" ]    # str (type of ontology, e.i.: GO, IDPO)

# """
# Ontology
# DisProt relies on three different ontologies to annotate intrinsically disordered regions, the Intrinsically Disordered Proteins Ontology (IDPO), the Gene Ontology (GO) and the Evidence and Conclusion Ontology (ECO).
#     * IDPO is used to describe structural aspects of an IDP/IDR, self-functions and functions directly associated with their disordered state.
#     * GO is used to describe functional aspects of an IDP/IDR.
#     * ECO describes the technique or evidence associated with an annotation.
# """

### [OLD] - Parsing json to Dataframe manually 

In [219]:
def add_prefix(prefix: str, a_string: str) -> str:
    """
    """
    if not a_string.startswith("region"):
        return f"region_{a_string}"
    else:
        return a_string

def add_region_to_df( dp_region: dict
                    , a_dataframe: pd.DataFrame ) -> pd.DataFrame:
    """
    """
    return a_dataframe.append(dp_region, ignore_index=True)

def add_regions_to_df( dp_id: str
                     , dp_regions: list
                     , a_dataframe: pd.DataFrame ) -> pd.DataFrame:
    """
    """
    for a_region in dp_regions:
        a_dataframe = add_region_to_df(a_region | {"disprot_id":dp_id}, a_dataframe)
    return a_dataframe



#caid_columns = main_columns + [add_prefix('region', acol) for acol in region_columns]
caid_columns = sorted(main_columns + region_columns)

#print(caid_columns)

caid2_dataframe = pd.DataFrame(columns=region_columns)
caid2_dataframe.head()

Unnamed: 0,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology


In [220]:
with open(f"{dir_tmp}/disprot-2022_06-all.json", "r") as f:
    dict_all = json.load(f)
    for idx, an_entry in enumerate(dict_all["data"][:20]): # FIXME do not slice
        #print(f"an_entry: {type(an_entry)}") # a dict
        for acol in caid_columns:
            if acol == "regions":
                caid2_dataframe = add_regions_to_df( an_entry["disprot_id"]
                                                   , an_entry["regions"]
                                                   , caid2_dataframe )
                break
            else:
                continue
        #print(dict_all["data"][idx]['features'])
        #break

  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dataframe.append(dp_region, ignore_index=True)
  return a_dat

In [221]:
caid2_dataframe.head(6)

Unnamed: 0,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,cross_refs,...,term_not_annotate,interaction_partner,term_go_domain,term_xref,unpublished,construct_alterations,sequence_construct,states_connection,conditions,sample
0,Structural state,DP00003r002,2022-02-14T09:00:00.000Z,294,334,IDPO:00076,disorder,Structural state,IDPO,"[{'db': 'PDB', 'id': '1ADV'}, {'db': 'PDB', 'i...",...,,,,,,,,,,
1,Structural state,DP00003r004,2022-02-14T09:00:00.000Z,454,464,IDPO:00076,disorder,Structural state,IDPO,"[{'db': 'PDB', 'id': '1ADV'}, {'db': 'PDB', 'i...",...,,,,,,,,,,
2,Structural state,DP00004r001,2022-02-14T09:00:00.000Z,134,170,IDPO:00076,disorder,Structural state,IDPO,,...,,,,,,,,,,
3,Structural transition,DP00004r002,2022-02-14T09:00:00.000Z,134,170,IDPO:00050,disorder to order,Structural transition,IDPO,,...,,,,,,,,,,
4,Disorder function,DP00004r004,2022-05-27T08:17:13.860Z,134,170,GO:0019835,cytolysis,Biological process,GO,,...,False,,,,,,,,,
5,Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,,...,,,,,,,,,,


### [NEW] - Parsing json to Dataframe

In [222]:
with open(f"{dir_tmp}/disprot-2022_06-all.json", "r") as f:
    dict_all = json.load(f)
    #print(dict_all["size"])

In [223]:
# Taken from https://towardsdatascience.com/how-to-parse-json-data-with-python-pandas-f84fbd0b1025
multiple_level_data = pd.json_normalize( data          = dict_all['data']
                                       , record_path   = ['regions']
                                       , meta          = ['disprot_id', 'sequence']
                                       , meta_prefix   = ''
                                       , record_prefix = '' )


#### All the keys from the record `regions` in disprot json release 2022_06

In [224]:
sorted(set(multiple_level_data.columns.to_list()) - set(['disprot_id', 'sequence']))

['annotation_extensions',
 'conditions',
 'construct_alterations',
 'cross_refs',
 'curator_id',
 'curator_name',
 'curator_orcid',
 'date',
 'disprot_namespace',
 'ec_go',
 'ec_id',
 'ec_name',
 'ec_ontology',
 'end',
 'interaction_partner',
 'reference_html',
 'reference_id',
 'reference_source',
 'region_id',
 'released',
 'sample',
 'sequence_construct',
 'start',
 'statement',
 'states_connection',
 'term_comment',
 'term_def',
 'term_go_domain',
 'term_id',
 'term_is_binding',
 'term_is_obsolete',
 'term_name',
 'term_namespace',
 'term_not_annotate',
 'term_ontology',
 'term_xref',
 'uniprot_changed',
 'unpublished',
 'validated.curator_id',
 'validated.curator_name',
 'validated.timestamp',
 'version']

##### Regions

In [225]:
region_columns_all = [ 'annotation_extensions'
                     , 'conditions'
                     , 'construct_alterations'
                     , 'cross_refs'
                     , 'curator_id'
                     , 'curator_name'
                     , 'curator_orcid'
                     , 'date'
                     , 'disprot_namespace'
                     , 'ec_go'
                     , 'ec_id'
                     , 'ec_name'
                     , 'ec_ontology'
                     , 'end'
                     , 'interaction_partner'
                     , 'reference_html'
                     , 'reference_id'
                     , 'reference_source'
                     , 'region_id'
                     , 'released'
                     , 'sample'
                     , 'sequence_construct'
                     , 'start'
                     , 'statement'
                     , 'states_connection'
                     , 'term_comment'
                     , 'term_def'
                     , 'term_go_domain'
                     , 'term_id'
                     , 'term_is_binding'
                     , 'term_is_obsolete'
                     , 'term_name'
                     , 'term_namespace'
                     , 'term_not_annotate'
                     , 'term_ontology'
                     , 'term_xref'
                     , 'uniprot_changed'
                     , 'unpublished'
                     , 'validated.curator_id'
                     , 'validated.curator_name'
                     , 'validated.timestamp'
                     , 'version']

#### Subsetting disprot data to work only with testing cases

In [226]:
dp_testing_list = []
main_columns = [ 'disprot_id'
               , 'sequence'
               , 'length' ] # Maybe not necessary to expand the sequence
for idx, entry in enumerate(dict_all['data']):
    if entry['disprot_id'] in disprot_testing_cases.keys():
        dp_testing_list.append(dict_all['data'][idx])
multiple_level_data = pd.json_normalize( data          = dp_testing_list
                                       , record_path   = ['regions']
                                       , meta          = main_columns
                                       , meta_prefix   = ''
                                       , record_prefix = '' )

#### Droping columns from Dataframe

In [227]:
multiple_level_data = multiple_level_data[main_columns + region_columns]

In [228]:
def get_range(start: int, end: int) -> list: # [3, 6] --> [3,4,5,6]
    """
    """
    return [poss for poss in range(start, end + 1, 1)]

def expand_region(adf: pd.DataFrame) -> pd.DataFrame:
    """
    """
    adf["reg_position"] = list(range(int(adf["start"]), int(adf["end"]) + 1, 1))
    return adf

def expand_seq(adf: pd.DataFrame) -> pd.DataFrame:
    """
    """
    #adf["seq_position", "seq_aa"] = list(range(1, int(adf["length"]) + 1, 1))
    #adf["seq_position", "seq_aa"] = list(zip(enumerate(adf["sequence"])))
    adf["seq_position_aa"] = [(aa[0]+1, aa[1]) for aa in enumerate(adf["sequence"])]
    return adf


In [229]:
# To make a merge.
# Be aware that the column name for `term_id` is written correctly.
df_challenges = pd.DataFrame(data=challenges, columns=['term_id', 'challenge'])

In [230]:
def build_dp_entry_seq(dp_to_expand: pd.DataFrame) -> pd.DataFrame:
    columns_to_keep = ['disprot_id', 'sequence']
    columns_to_drop = dp_to_expand.columns.difference(columns_to_keep)
    dp_entry_seq = dp_to_expand.drop(columns=columns_to_drop, axis=1).copy(deep=True)
    dp_entry_seq = dp_entry_seq.apply(expand_seq, axis=1)
    dp_entry_seq = dp_entry_seq.explode("seq_position_aa")
    dp_entry_seq[['seq_position', 'seq_aa']] = pd.DataFrame(dp_entry_seq['seq_position_aa'].tolist(), index=dp_entry_seq.index)
    columns_to_drop = dp_entry_seq.columns.difference(['disprot_id', 'seq_position', 'seq_aa'])
    dp_entry_seq = dp_entry_seq.drop(columns=columns_to_drop, axis=1)
    dp_entry_seq.drop_duplicates(inplace=True)
    #dp_entry_seq.set_index(['disprot_id', 'seq_position', 'seq_aa'], inplace=True)
    return dp_entry_seq

def build_dp_entry_regions(dp_to_expand: pd.DataFrame) -> pd.DataFrame:
    dp_entry_regions = dp_to_expand.apply(expand_region, axis=1).copy(deep=True)
    dp_entry_regions = dp_entry_regions.explode("reg_position")
    dp_entry_regions = dp_entry_regions.drop(columns=['disprot_id', 'sequence', 'length', 'start', 'end', 'term_ontology'], axis=1)
    dp_entry_regions.drop_duplicates(inplace=True)
    #dp_entry = dp_entry.reset_index()
    return dp_entry_regions

#### Testing 'build_dp_entry_seq'

In [231]:
# dp_entry_seq = build_dp_entry_seq(multiple_level_data)
print(multiple_level_data.shape) # Only has 5 cases to test
# print(dp_entry_seq.shape)

(129, 12)


In [232]:
columns_to_keep = ['disprot_id', 'sequence']
columns_to_drop = multiple_level_data.columns.difference(columns_to_keep)

multiple_level_data.drop(columns=columns_to_drop, axis=1, inplace=True)
multiple_level_data = multiple_level_data.apply(expand_seq, axis=1)
multiple_level_data = multiple_level_data.explode("seq_position_aa")
multiple_level_data[['seq_position', 'seq_aa']] = pd.DataFrame(multiple_level_data['seq_position_aa'].tolist(), index=multiple_level_data.index)
columns_to_drop = multiple_level_data.columns.difference(['disprot_id', 'seq_position', 'seq_aa'])
multiple_level_data = multiple_level_data.drop(columns=columns_to_drop, axis=1)
multiple_level_data.drop_duplicates(inplace=True)
multiple_level_data


Unnamed: 0,disprot_id,seq_position,seq_aa
0,DP00005,1,M
0,DP00005,2,D
0,DP00005,3,A
0,DP00005,4,Q
0,DP00005,5,T
...,...,...,...
72,DP00086,389,G
72,DP00086,390,P
72,DP00086,391,D
72,DP00086,392,S


#### Testing 'build_dp_entry_regions'

In [233]:
dp_entry_regions = build_dp_entry_regions(multiple_level_data)
print(multiple_level_data.shape) # Only has 5 cases to test
print(dp_entry_regions.shape)

KeyError: 'start'

#### Looping DisProt entries

In [None]:
for key, dp_entry in multiple_level_data.groupby(by=["disprot_id"], axis=0):
    #print(key)
    print(f"without merge: {dp_entry.shape}")
    dp_entry_seq = build_dp_entry_seq(dp_entry)

    # Not used in dp_entry_regions
    #dp_entry.drop(columns=['seq_position_aa'], inplace=True)

    dp_entry_regions = build_dp_entry_regions(dp_entry)

    # 
    dp_entry_merge = pd.merge( left  = dp_entry_regions
                             , right = df_challenges
                             , how   = "left"
                             , on    = "term_id" )
    dp_entry_merge = pd.merge( right    = dp_entry_merge
                             , left     = dp_entry_seq
                             , right_on = "reg_position"
                             , left_on  = "seq_position" )
    dp_entry_merge.drop_duplicates(inplace=True)
    print(dp_entry_merge)
    #if not os.path.isfile(f"{dir_tmp}{key}.fasta"): # TODO check first?
    with open(f"{dir_tmp}{key}.fasta", "w+") as f:
        f.write(f">{key}{nl}")
        f.write(f"{''.join(dp_entry_seq['seq_aa'].tolist())}{nl}")
        f.write()
    break
    for key, dp_position in dp_entry_regions.groupby(by=['disprot_id','position']):
        # print(f"without merge: {dp_position.shape}")
        # print(dp_position)
        #print(dp_position)
        # dp_position = dp_position.reset_index()
        # dp_entry_regions = pd.merge( right=df_challenges
        #                            , left=dp_entry_regions
        #                            , how='left'
        #                            , on="term_id")
        #print(f"with merge: {dp_entry_regions.shape}")
        #print(dp_to_expand)
        dp_entry_regions.drop_duplicates(inplace=True)
        dp_entry_regions.set_index(['disprot_id','position'], inplace=True)
        # #dp_position.drop(columns=['term_id_x'], inplace=True)
        # #dp_position.rename(columns ={'term_id_y':'term_id'}, inplace = True)
        # dp_position.drop_duplicates(inplace=True)
        #print(f"with merge and drop: {dp_position.shape}")
        print(dp_position)
        break
    break

without merge: (16, 12)
      disprot_namespace    region_id                      date     term_id  \
0      Structural state  DP00005r001  2022-02-14T09:00:00.000Z  IDPO:00076   
1      Structural state  DP00005r004  2022-02-14T09:00:00.000Z  IDPO:00076   
2      Structural state  DP00005r005  2022-02-14T09:00:00.000Z  IDPO:00076   
3     Disorder function  DP00005r006  2022-02-14T09:00:00.000Z  GO:0003727   
4      Structural state  DP00005r007  2022-02-14T09:00:00.000Z  IDPO:00076   
...                 ...          ...                       ...         ...   
1273  Disorder function  DP00005r008  2022-02-14T09:00:00.000Z  GO:0006357   
1274  Disorder function  DP00005r009  2022-02-14T09:00:00.000Z  GO:0006357   
1275  Disorder function  DP00005r010  2022-02-14T09:00:00.000Z  GO:0006357   
1276  Disorder function  DP00005r011  2022-02-14T09:00:00.000Z  GO:0070063   
1277   Structural state  DP00005r017  2022-02-14T09:00:00.000Z  IDPO:00076   

                                       