# Analyze MultiXrank results

In [2]:
import xml.etree.ElementTree as ET
from typing import Optional

import numpy as np
import pandas as pd
from pyhpo import Ontology

In [4]:
# Parse the XML file whose Disorder / OrphaCode / Name elements are searched
# by find_orpha_name; `root` is read there as a module-level name.
tree = ET.parse("../data/en_product4.xml")
root = tree.getroot()

# Initialize the Ontology once; the returned object is discarded because the
# code below goes through the `Ontology` name itself (Ontology.get_hpo_object).
_ = Ontology()

In [5]:
# Cached OrphaCode -> Name index together with the XML root it was built from,
# so the tree is scanned once instead of once per lookup.
_ORPHA_INDEX_CACHE = {"root": None, "index": {}}


def _build_orpha_index(xml_root) -> dict:
    """Map every OrphaCode found under ``xml_root`` to its disorder name."""
    index = {}
    for disorder in xml_root.iter('Disorder'):
        code_element = disorder.find('OrphaCode')
        if code_element is None:
            # A Disorder without a code can never match a lookup: skip it.
            continue
        name_element = disorder.find('Name')
        name = name_element.text if name_element is not None else None
        # setdefault keeps the first occurrence, i.e. the entry a linear scan
        # of the tree would have returned.
        index.setdefault(code_element.text, name)
    return index


def find_orpha_name(orpha_code: str) -> Optional[str]:
    """Return the Orphanet name of a disease given its Orphanet code.

    The lookup uses an index built from the module-level ``root`` element.
    The index is rebuilt whenever ``root`` is rebound to another element;
    in-place modifications of the same tree are not picked up.

    Args:
        orpha_code (str): Orphanet code of the disease, without the
            ``ORPHA:`` prefix.

    Returns:
        Optional[str]: the Orphanet name of the disease, or ``None`` when
        no ``Disorder`` element carries this code.
    """
    if _ORPHA_INDEX_CACHE["root"] is not root:
        _ORPHA_INDEX_CACHE["index"] = _build_orpha_index(root)
        _ORPHA_INDEX_CACHE["root"] = root
    return _ORPHA_INDEX_CACHE["index"].get(orpha_code)

In [6]:
def create_table_diseases_seeds(mapping_file: str, table_name: str) -> dict:
    """Build the mapping between disease names and multixrank seed numbers.

    The mapping file is read as a ``;``-separated CSV with a header row and
    must contain the columns ``Rx`` and ``Orphanet``. Rows are numbered from
    1 in file order, and the resulting table (columns ``RARE-X``,
    ``ORPHANET``, ``SEED NUMBER``) is written to ``table_name`` as a
    tab-separated file.

    Args:
        mapping_file (str): path of the mapping file to read
        table_name (str): path of the output table to write

    Returns:
        dict: a dictionary mapping each RARE-X disease name to the seed
        number used in multixrank to identify it
    """
    df_mapping_file = pd.read_csv(mapping_file, sep=";", header=0)
    diseases = df_mapping_file["Rx"].tolist()
    # One seed per row of the mapping file, numbered from 1, instead of a
    # fixed count that only works for a 27-row file.
    seed_numbers = list(range(1, len(diseases) + 1))

    df_table = pd.DataFrame(
        {
            "RARE-X": df_mapping_file["Rx"],
            "ORPHANET": df_mapping_file["Orphanet"],
            "SEED NUMBER": seed_numbers,
        }
    )
    df_table.to_csv(table_name, sep="\t", header=True, index=False)
    return dict(zip(diseases, seed_numbers))

# Write the disease / seed-number table to disk and keep the
# RARE-X disease name -> seed number dictionary for the cells below.
dico_diseases_seeds = create_table_diseases_seeds(mapping_file="../Diseases_Rx_orpha_corres.csv", table_name="../Diseases_names_and_seeds_numbering.tsv")

In [14]:
def _describe_orpha_node(term: str) -> Optional[str]:
    """Return the Orphanet name of a node labelled ``ORPHA:<code>``."""
    # Drop the 6-character "ORPHA:" prefix to get the bare Orphanet code.
    return find_orpha_name(orpha_code=term[6:])


def _describe_layer2_node(term: str) -> Optional[str]:
    """Return the description of a layer 2 node (Orphanet disease or HPO term).

    Every node gets exactly one description, so the description column
    always lines up with the node column: a node that is neither an
    ``ORPHA`` nor an ``HP`` identifier is described as ``None``.
    """
    if term.startswith("ORPHA"):
        return _describe_orpha_node(term)
    if term.startswith("HP"):
        try:
            hpo_term = Ontology.get_hpo_object(term)
        except RuntimeError:
            # No HPO term matches this identifier.
            return "None"
        return hpo_term.name
    return None


def _read_layer_with_description(path: str, describe, cache: dict) -> pd.DataFrame:
    """Read one multixrank layer file and append a ``description`` column.

    ``describe`` maps a node identifier to its description; ``cache``
    memoises its results so a node shared by several seeds is resolved once.
    """
    layer = pd.read_csv(path, header=0, sep="\t")
    # Node identifiers are taken from the second column of the file.
    nodes = layer[layer.columns[1]].to_list()
    descriptions = []
    for term in nodes:
        if term not in cache:
            cache[term] = describe(term)
        descriptions.append(cache[term])
    description_column = pd.DataFrame(descriptions, columns=['description'])
    # read_csv yields a 0..n-1 index, the same as the description frame, so
    # the two frames align row by row.
    return pd.concat([layer, description_column], axis=1)


def create_results_table(
    dico_diseases_seeds: dict,
    input_dir: str = "../multixrank_RARE_X_diseases/output_multixrank",
    output_dir: str = "../multixrank_RARE_X_diseases/analysis_results",
) -> None:
    """Write, for each seed, one table gathering the scores of the 3 layers.

    For every seed number the files ``multiplex_1.tsv``, ``multiplex_2.tsv``
    and ``multiplex_3.tsv`` are read from ``<input_dir>/output_<seed>/``.
    Layer 1 is kept as is; layer 2 gets a ``description`` column with
    Orphanet disease names and HPO term names; layer 3 gets a
    ``description`` column with Orphanet disease names. The three tables
    are placed side by side (shorter ones padded with empty rows) and
    written to ``<output_dir>/results_disease_<seed>.tsv``.

    Args:
        dico_diseases_seeds (dict): a dictionary of the RARE-X diseases
            and their seed numbers used in multixrank; only the seed
            numbers are used here
        input_dir (str): directory containing the ``output_<seed>``
            folders produced by multixrank
        output_dir (str): existing directory where the result tables
            are written
    """
    # Descriptions do not depend on the seed: share them across iterations.
    layer2_descriptions: dict = {}
    layer3_descriptions: dict = {}

    for seed in dico_diseases_seeds.values():
        seed_dir = f"{input_dir}/output_{seed}"

        # Layer 1 (rarex): no description to add, its nodes are already
        # RARE-X disease names, symptom names and patient IDs.
        multiplex_layer1 = pd.read_csv(f"{seed_dir}/multiplex_1.tsv", header=0, sep="\t")

        # Layer 2 (orpha-hpo): Orphanet diseases and HPO phenotypes.
        multiplex_2_with_description = _read_layer_with_description(
            f"{seed_dir}/multiplex_2.tsv", _describe_layer2_node, layer2_descriptions
        )

        # Layer 3 (disease similarity): every node is treated as an
        # Orphanet disease.
        multiplex_layer3_with_description = _read_layer_with_description(
            f"{seed_dir}/multiplex_3.tsv", _describe_orpha_node, layer3_descriptions
        )

        # Pad every layer to the longest one and put them side by side.
        layers = [
            multiplex_layer1,
            multiplex_2_with_description,
            multiplex_layer3_with_description,
        ]
        max_rows = max(len(layer) for layer in layers)
        table_results = pd.concat(
            [layer.reindex(range(max_rows)) for layer in layers], axis=1
        )
        table_results.to_csv(
            f"{output_dir}/results_disease_{seed}.tsv",
            sep="\t",
            header=True,
            index=False,
        )

# Generate one concatenated results file per seed number.
create_results_table(dico_diseases_seeds=dico_diseases_seeds)

12545
       multiplex          node     score
0              2  ORPHA:289494  0.073607
1              2    HP:0001260  0.000694
2              2    HP:0001251  0.000693
3              2    HP:0001332  0.000687
4              2    HP:0000164  0.000682
...          ...           ...       ...
12540          2    HP:0100134  0.000000
12541          2    HP:0100529  0.000000
12542          2  ORPHA:140933  0.000000
12543          2    HP:0003538  0.000000
12544          2    HP:0003260  0.000000

[12545 rows x 3 columns]
2771
2771
12545
       multiplex         node     score
0              2  ORPHA:96092  0.073698
1              2   HP:0001263  0.000381
2              2   HP:0001249  0.000380
3              2   HP:0000750  0.000370
4              2   HP:0001999  0.000369
...          ...          ...       ...
12540          2   HP:0031871  0.000000
12541          2   HP:0032436  0.000000
12542          2   HP:0040130  0.000000
12543          2   HP:0003538  0.000000
12544          2   H