# Analyze MultiXrank results

In [2]:
import pandas as pd
import xml.etree.ElementTree as ET
from pyhpo import Ontology
import numpy as np

In [4]:
tree = ET.parse("../data/en_product4.xml")
root = tree.getroot()

# initilize the Ontology ()
_ = Ontology()

In [5]:
def find_orpha_name(orpha_code: str):
    for disorder in root.iter('Disorder'):
        orpha_code_in_tree = disorder.find('OrphaCode').text
        orpha_name = disorder.find('Name').text
        if orpha_code_in_tree == orpha_code:
            return orpha_name

In [6]:
def create_table_diseases_seeds(mapping_file: str, table_name: str) -> dict:
    dico_diseases_seeds = dict()
    df_mapping_file = pd.read_csv(mapping_file, sep=";", header=0)
    df_table = pd.DataFrame(columns=["RARE-X", "ORPHANET", "SEED NUMBER"])
    df_table["RARE-X"] = df_mapping_file["Rx"]
    diseases = df_table["RARE-X"].tolist()
    df_table["ORPHANET"] = df_mapping_file["Orphanet"]
    seed_numbers = [i for i in range(1, 28)]
    df_table["SEED NUMBER"] = seed_numbers
    df_table.to_csv(table_name, sep="\t", header=True, index=False)
    for disease, seed in zip(diseases, seed_numbers):
        dico_diseases_seeds[disease] = seed
    return dico_diseases_seeds

dico_diseases_seeds = create_table_diseases_seeds(mapping_file="../Diseases_Rx_orpha_corres.csv", table_name="../Diseases_names_and_seeds_numbering.tsv")

In [13]:
def create_results_table(dico_diseases_seeds: dict) -> None:
    for disease, seed in dico_diseases_seeds.items():
        # Read layer 1 (rarex): no terms description to add
        multiplex_layer1 = pd.read_csv(f"../multixrank_RARE_X_diseases/output_multixrank/output_{seed}/multiplex_1.tsv", header=0, sep="\t")
        
        # Read layer 2 (orpha-hpo) and fetch term description
        multiplex_layer2 = pd.read_csv(f"../multixrank_RARE_X_diseases/output_multixrank/output_{seed}/multiplex_2.tsv", header=0, sep="\t")
        nodes_layer2 = multiplex_layer2[multiplex_layer2.columns[1]].to_list()
        list_description_layer2 = list()
        # browse terms in mutliplex 2 to add description
        for term in nodes_layer2:
            if term[:5] == "ORPHA":
                orpha_code = term[6:]
                orpha_name = find_orpha_name(orpha_code=orpha_code)
                list_description_layer2.append(orpha_name)
            elif term[:2] == "HP":
                try:
                    hpo_phenotype = Ontology.get_hpo_object(term)
                    list_description_layer2.extend([str(hpo_phenotype)[13:]])
                except RuntimeError:
                    list_description_layer2.append("None")
        # check that the description list and the dataframe have the same length
        assert len(list_description_layer2) == len(multiplex_layer2.index)
        # create new description columns for the terms
        description_layer2 = pd.DataFrame(list_description_layer2, columns=['description'])
        print(len(list_description_layer2))
        print(multiplex_layer2)
        multiplex_2_with_description = pd.concat([multiplex_layer2.reindex(range(len(multiplex_layer2))), description_layer2.reindex(range(len(multiplex_layer2)))], axis=1)

        # read layer 3 (disease similaruty) and fetch orpha diseases descriptions
        multiplex_layer3 = pd.read_csv(f"../multixrank_RARE_X_diseases/output_multixrank/output_{seed}/multiplex_3.tsv", header=0, sep="\t")
        nodes_layer3 = multiplex_layer3[multiplex_layer3.columns[1]].to_list()
        list_description_layer3 = list()
        for term in nodes_layer3:
                orpha_code = term[6:]
                orpha_name = find_orpha_name(orpha_code=orpha_code)
                list_description_layer3.append(orpha_name)
        # check lengths
        assert len(list_description_layer3) == len(multiplex_layer3.index)
        # create new column for description of orpha diseases
        description_layer3 = pd.DataFrame(list_description_layer3, columns=['description'])
        print(len(list_description_layer3))
        print(len(multiplex_layer3))
        multiplex_layer3_with_description = pd.concat([multiplex_layer3.reindex(range(len(multiplex_layer3))), description_layer3.reindex(range(len(multiplex_layer3)))], axis=1)
        
        # concatenate the three dataframes
        max_rows = max(len(multiplex_layer1), len(multiplex_2_with_description), len(multiplex_layer3_with_description))
        table_results = pd.concat([multiplex_layer1.reindex(range(max_rows)), multiplex_2_with_description.reindex(range(max_rows)), multiplex_layer3_with_description.reindex(range(max_rows))], axis=1)
        
        table_results.to_csv(f"../multixrank_RARE_X_diseases/analysis_results/results_disease_{seed}.tsv", sep="\t", header=True, index=False)

create_results_table(dico_diseases_seeds=dico_diseases_seeds)

12545
RangeIndex(start=0, stop=12545, step=1)
2771
RangeIndex(start=0, stop=2771, step=1)
2771


KeyboardInterrupt: 