# Analyze MultiXrank results

In [2]:
import pandas as pd
import xml.etree.ElementTree as ET
from pyhpo import Ontology

In [3]:
tree = ET.parse("../data/en_product4.xml")
root = tree.getroot()

# initilize the Ontology ()
_ = Ontology()

In [4]:
def find_orpha_name(orpha_code: str):
    for disorder in root.iter('Disorder'):
        orpha_code_in_tree = disorder.find('OrphaCode').text
        orpha_name = disorder.find('Name').text
        if orpha_code_in_tree == orpha_code:
            return orpha_name

In [5]:
def create_table_diseases_seeds(mapping_file: str, table_name: str) -> dict:
    dico_diseases_seeds = dict()
    df_mapping_file = pd.read_csv(mapping_file, sep=";", header=0)
    df_table = pd.DataFrame(columns=["RARE-X", "ORPHANET", "SEED NUMBER"])
    df_table["RARE-X"] = df_mapping_file["Rx"]
    diseases = df_table["RARE-X"].tolist()
    df_table["ORPHANET"] = df_mapping_file["Orphanet"]
    seed_numbers = [i for i in range(1, 28)]
    df_table["SEED NUMBER"] = seed_numbers
    df_table.to_csv(table_name, sep="\t", header=True, index=False)
    for disease, seed in zip(diseases, seed_numbers):
        dico_diseases_seeds[disease] = seed
    return dico_diseases_seeds

dico_diseases_seeds = create_table_diseases_seeds(mapping_file="../Diseases_Rx_orpha_corres.csv", table_name="../Diseases_names_and_seeds_numbering.tsv")

In [18]:
def create_results_table(dico_diseases_seeds: dict) -> None:
    for disease, seed in dico_diseases_seeds.items():
        list_terms_description = list()
        print(disease)
        multiplex_orpha_hpo_ranking = pd.read_csv(f"../multixrank_RARE_X_diseases/output_{seed}/multiplex_1.tsv", header=0, sep="\t")
        print(multiplex_orpha_hpo_ranking.index)
        #multiplex_orpha_hpo_ranking.insert(loc=2, column='description', value=['' for i in range(multiplex_orpha_hpo_ranking.shape[0])])
        nodes = multiplex_orpha_hpo_ranking[multiplex_orpha_hpo_ranking.columns[1]].to_list()
        for term in nodes:
            if term[:5] == "ORPHA":
                orpha_code = term[6:]
                orpha_name = find_orpha_name(orpha_code=orpha_code)
                list_terms_description.append(orpha_name)
            elif term[:2] == "HP":
                try:
                    hpo_phenotype = Ontology.get_hpo_object(term)
                    list_terms_description.extend([str(hpo_phenotype)[13:]])
                except RuntimeError:
                    list_terms_description.append("None")
        description = pd.DataFrame(list_terms_description, columns=['description'])
        #multiplex_orpha_hpo_ranking["description"] = list_terms_description
        print(len(list_terms_description))
        print(description.index)
        max_rows1 = max(len(multiplex_orpha_hpo_ranking), len(description))
        multiplex_orpha_hpo_ranking_with_description = pd.concat([multiplex_orpha_hpo_ranking.reindex(range(max_rows1)), description.reindex(range(max_rows1))], axis=1)
        print(multiplex_orpha_hpo_ranking_with_description)
        
        multiplex_rarex_ranking = pd.read_csv(f"../multixrank_RARE_X_diseases/output_{seed}/multiplex_2.tsv", header=0, sep="\t")
        max_rows2 = max(len(multiplex_orpha_hpo_ranking_with_description), len(multiplex_rarex_ranking))
        table_results = pd.concat([multiplex_rarex_ranking.reindex(range(max_rows2)), multiplex_orpha_hpo_ranking_with_description.reindex(range(max_rows2))], axis=1)
        

        table_results.to_csv(f"../multixrank_RARE_X_diseases/analysis_results/results_disease_{seed}.tsv", sep="\t", header=True, index=False)

create_results_table(dico_diseases_seeds=dico_diseases_seeds)

4H Leukodystrophy
RangeIndex(start=0, stop=12545, step=1)
12545
RangeIndex(start=0, stop=12545, step=1)
       multiplex          node         score  \
0              1  ORPHA:289494  1.107732e-01   
1              1    HP:0001250  5.383525e-04   
2              1    HP:0001260  5.361803e-04   
3              1    HP:0001251  5.347369e-04   
4              1    HP:0004322  5.308398e-04   
...          ...           ...           ...   
12540          1    HP:0006089  1.911770e-10   
12541          1    HP:0012455  1.907952e-10   
12542          1    HP:0005250  1.448314e-10   
12543          1    HP:0040138  1.405330e-10   
12544          1    HP:0003329  1.341202e-10   

                                             description  
0                                      4H leukodystrophy  
1                                                Seizure  
2                                             Dysarthria  
3                                                 Ataxia  
4                       