# Analyze MultiXrank results

MultiXrank is a Random Walk with Restart algorithm designed for multilayer networks. Starting from a seed node, it assigns a score to every node in the network. These scores indicate how closely connected each node is to the seed.

Our multilayer network consists of two layers: the Rare-X layer containing diseases, patients, and symptom nodes, and the Orphanet layer containing diseases and Human Phenotype Ontology (HPO) nodes.

Our hypothesis is that MultiXrank can uncover previously unknown phenotypes associated with rare diseases. Iteratively taking each rare disease as a seed, we hypothesise that MultiXrank can identify symptoms that are strongly connected to the seed disease but are not represented among the HPO terms associated with that disease. These symptoms, absent from the disease's HPO annotation, might indicate new and unrecognized aspects of the disease's phenotype, potentially leading to valuable insights for diagnosis and treatment.

In [11]:
import pandas as pd
import xml.etree.ElementTree as ET
from pyhpo import Ontology
import numpy as np
import os

In [3]:
# Parse the Orphanet XML file; `root` is the element tree root that
# find_orpha_name() iterates over to find <Disorder> entries.
tree = ET.parse("../data/en_product4.xml")
root = tree.getroot()

# Initialize the pyhpo Ontology. The returned object is not used, hence `_`.
# NOTE(review): presumably this must run once before
# Ontology.get_hpo_object() can be called — confirm against the pyhpo docs.
_ = Ontology()

In [4]:
def find_orpha_name(orpha_code: str, xml_root: "ET.Element | None" = None) -> "str | None":
    """Return the Orphanet name of a disease given its Orphanet code.

    The XML tree is scanned linearly and the first ``<Disorder>`` whose
    ``<OrphaCode>`` text equals ``orpha_code`` wins.

    Args:
        orpha_code (str): Orphanet code of the disease. Non-string values
            (e.g. ``58``) are converted with ``str()`` before comparison.
        xml_root: root element of the XML tree to search. Defaults to the
            module-level ``root``.

    Returns:
        The text of the matching disorder's ``<Name>`` element, or ``None``
        when no disorder has this code (or the matching disorder has no
        ``<Name>`` element).
    """
    if xml_root is None:
        xml_root = root
    wanted = str(orpha_code)
    for disorder in xml_root.iter('Disorder'):
        code_node = disorder.find('OrphaCode')
        # Skip malformed entries instead of raising AttributeError on `.text`.
        if code_node is None or code_node.text != wanted:
            continue
        # Only look the name up once the code matched.
        name_node = disorder.find('Name')
        return None if name_node is None else name_node.text
    return None

In [6]:
def create_table_diseases_seeds(mapping_file: str, table_name: str) -> dict:
    """Generate the mapping table between disease names and the seed
    numbers used in multixrank to identify diseases.

    Seed numbers are assigned in file order, starting at 1, one per row of
    the mapping file (so the table works for any number of diseases).

    Args:
        mapping_file (str): path of the ";"-separated mapping file; it must
            have a header row with the columns "Rx" and "Orphanet".
        table_name (str): path of the tab-separated output table, written
            with the columns "RARE-X", "ORPHANET" and "SEED NUMBER".

    Returns:
        dict: RARE-X disease name -> seed number. If a disease name appears
        on several rows, the last seed number is kept.
    """
    df_mapping_file = pd.read_csv(mapping_file, sep=";", header=0)
    diseases = df_mapping_file["Rx"].tolist()
    # One seed per row, derived from the file instead of a fixed count.
    seed_numbers = list(range(1, len(diseases) + 1))

    df_table = pd.DataFrame(
        {
            "RARE-X": df_mapping_file["Rx"],
            "ORPHANET": df_mapping_file["Orphanet"],
            "SEED NUMBER": seed_numbers,
        }
    )
    df_table.to_csv(table_name, sep="\t", header=True, index=False)
    return dict(zip(diseases, seed_numbers))

# Write the disease/seed correspondence table to disk and keep the
# RARE-X disease name -> seed number dictionary for the cells below.
dico_diseases_seeds = create_table_diseases_seeds(mapping_file="../data/Diseases_Rx_orpha_corres.csv", table_name="../Diseases_names_and_seeds_numbering.tsv")

In [7]:
# Build the correspondence dictionary from the bipartite file:
# first column (as a string) -> second column.
mapping = pd.read_csv("../network/bipartite/bipartite_RARE_X_orpha_diseases.tsv", header=None, sep="\t")
dico_mapping = {str(record[0]): record[1] for _, record in mapping.iterrows()}

In [13]:
# Sub-directory of ../multixrank_RARE_X_diseases/ holding the per-seed
# multixrank outputs (output_<seed>/) read by create_results_table.
outdir = "output_DiseaseDisease_Phenotype_Weighted"
# Sub-directory of ../multixrank_RARE_X_diseases/ where create_results_table
# writes its results_disease_<seed>.tsv files.
resultsdir = "results_output_DiseaseDisease_Phenotype_Weighted"

In [None]:
def _describe_rarex_node(term) -> str:
    """Return the Orphanet correspondence of a Rare-X layer node, or 'None'."""
    return dico_mapping.get(term, 'None')


def _describe_orpha_node(term, orpha_name_cache: dict):
    """Return the description of an Orphanet layer node.

    ORPHA nodes get their Orphanet name, HP nodes get their HPO phenotype
    name, and anything else gets the placeholder "None" so that every node
    always receives exactly one description.
    """
    term = str(term)
    if term.startswith("ORPHA"):
        # Drop the 6-character prefix (e.g. "ORPHA:") to get the bare code.
        orpha_code = term[6:]
        # find_orpha_name scans the whole XML tree, so look each code up
        # only once across all seeds.
        if orpha_code not in orpha_name_cache:
            orpha_name_cache[orpha_code] = find_orpha_name(orpha_code=orpha_code)
        # May be None when the code is absent from the XML (cell left empty).
        return orpha_name_cache[orpha_code]
    if term.startswith("HP"):
        try:
            hpo_phenotype = Ontology.get_hpo_object(term)
        except RuntimeError:
            # No HPO phenotype matches this term.
            return "None"
        # Skip the first 13 characters of the string form to keep only the
        # phenotype name.
        # NOTE(review): assumes str(term) looks like "HP:0000000 | name" — confirm.
        return str(hpo_phenotype)[13:]
    # Unknown node type: keep the row aligned instead of silently skipping it.
    return "None"


def _with_descriptions(layer: pd.DataFrame, descriptions: list, column: str) -> pd.DataFrame:
    """Return ``layer`` with ``descriptions`` appended as a new column."""
    return pd.concat([layer, pd.DataFrame(descriptions, columns=[column])], axis=1)


def create_results_table(dico_diseases_seeds: dict) -> None:
    """Generate, for each multixrank output (= each RARE-X disease), a
    results file that concatenates the scores of the 2 layers (Rare-X and
    Orphanet) side by side, each followed by a description column.

    For every seed number, the files
    ``../multixrank_RARE_X_diseases/{outdir}/output_{seed}/multiplex_Rare_X_layer.tsv``
    and ``.../multiplex_Orpha_layer.tsv`` are read and
    ``../multixrank_RARE_X_diseases/{resultsdir}/results_disease_{seed}.tsv``
    is written. The node identifier is taken from the second column of
    each input file.

    Args:
        dico_diseases_seeds (dict): a dictionary of the RARE-X
        diseases and their seed numbers used in multixrank
    """
    base_dir = "../multixrank_RARE_X_diseases"
    os.makedirs(f"{base_dir}/{resultsdir}/", exist_ok=True)
    # Orphanet code -> name, shared by all seeds to avoid repeated XML scans.
    orpha_name_cache = {}

    for seed in dico_diseases_seeds.values():
        seed_dir = f"{base_dir}/{outdir}/output_{seed}"

        # Layer 1 (Rare-X): disease names, symptom names and patient IDs;
        # only the Orphanet correspondence of disease names is added.
        layer1 = pd.read_csv(f"{seed_dir}/multiplex_Rare_X_layer.tsv", header=0, sep="\t")
        layer1_descriptions = [_describe_rarex_node(term) for term in layer1.iloc[:, 1]]
        layer1_described = _with_descriptions(layer1, layer1_descriptions, 'OrphaCorres')

        # Layer 2 (Orphanet-HPO): add Orphanet disease names and HPO
        # phenotype names.
        layer2 = pd.read_csv(f"{seed_dir}/multiplex_Orpha_layer.tsv", header=0, sep="\t")
        layer2_descriptions = [
            _describe_orpha_node(term, orpha_name_cache) for term in layer2.iloc[:, 1]
        ]
        layer2_described = _with_descriptions(layer2, layer2_descriptions, 'description')

        # Put the two layers side by side; the shorter one is padded with
        # empty rows so both have the same length.
        max_rows = max(len(layer1_described), len(layer2_described))
        table_results = pd.concat(
            [
                layer1_described.reindex(range(max_rows)),
                layer2_described.reindex(range(max_rows)),
            ],
            axis=1,
        )
        table_results.to_csv(f"{base_dir}/{resultsdir}/results_disease_{seed}.tsv", sep="\t", header=True, index=False)

# Write one results_disease_<seed>.tsv file per disease/seed pair.
create_results_table(dico_diseases_seeds=dico_diseases_seeds)