# Build a disease-disease similarity network from ORPHANET data

This code builds a disease-disease similarity network that connects two ORPHANET diseases if they share at least one causative gene.

Data from Orphanet: https://www.orphadata.com/data/xml/en_product6.xml

In [22]:
import xml.etree.ElementTree as ET
import csv
import pandas as pd

In [23]:
# Parse the local copy of the Orphanet XML export and keep its root element,
# which is what the extraction function below walks.
tree = ET.parse("../data/en_product6.xml")
root = tree.getroot()

In [25]:
def generate_dico_diseases_genes(root: ET.Element) -> dict:
    """Map each Orphanet disorder to the symbols of its associated genes.

    Every ``Disorder`` element found under ``root`` is visited. For each one,
    the ``Symbol`` of every ``Gene`` nested in a ``DisorderGeneAssociation``
    is collected in document order.

    Args:
        root: Root element of the parsed Orphanet XML tree.

    Returns:
        A dict whose keys are ``"ORPHA:<OrphaCode>"`` strings and whose values
        are lists of gene symbols. A disorder without any gene association
        maps to an empty list. Disorders without an ``OrphaCode`` and genes
        without a ``Symbol`` are skipped.

    Side effects:
        Prints the resulting dict to standard output.
    """
    dico_diseases_genes = dict()
    for disorder in root.iter('Disorder'):
        code = disorder.findtext('OrphaCode')
        # A disorder without a usable code cannot be keyed: skip it instead of
        # crashing on a missing element.
        if not code:
            continue
        orpha_code = "ORPHA:" + code
        gene_symbols = list()
        for gda in disorder.iter('DisorderGeneAssociation'):
            for gene in gda.iter('Gene'):
                symbol = gene.findtext('Symbol')
                # Ignore genes with a missing/empty symbol: a None entry would
                # later be treated as a gene shared between diseases.
                if symbol:
                    gene_symbols.append(symbol)
        dico_diseases_genes[orpha_code] = gene_symbols
    print(dico_diseases_genes)
    return dico_diseases_genes

# Build the disease -> gene symbols mapping from the parsed XML root
# (the call also prints the whole mapping, shown in the cell output).
dico_diseases_genes = generate_dico_diseases_genes(root=root)

{'ORPHA:166024': ['KIF7'], 'ORPHA:93': ['AGA'], 'ORPHA:166035': ['CWC27'], 'ORPHA:585': ['SUMF1'], 'ORPHA:118': ['MANBA'], 'ORPHA:166063': ['TSEN54'], 'ORPHA:166078': ['VWF'], 'ORPHA:206': ['IL6', 'IL23R', 'NOD2', 'NCF4', 'ATG16L1', 'IRF5', 'IRGM', 'PTPN2'], 'ORPHA:166073': ['RARS2'], 'ORPHA:166084': ['VWF'], 'ORPHA:333': ['ASAH1'], 'ORPHA:349': ['FUCA1'], 'ORPHA:166090': ['VWF'], 'ORPHA:166087': ['VWF'], 'ORPHA:366': ['AGL'], 'ORPHA:166093': ['VWF'], 'ORPHA:368': ['PYGM'], 'ORPHA:166096': ['VWF'], 'ORPHA:166100': ['COL2A1', 'COL11A2'], 'ORPHA:371': ['PFKM'], 'ORPHA:166105': ['FASTKD2'], 'ORPHA:369': ['PYGL'], 'ORPHA:447': ['PIGA'], 'ORPHA:166108': ['KCNK9'], 'ORPHA:166119': ['LEMD3'], 'ORPHA:166260': ['DSPP'], 'ORPHA:166265': ['DSPP'], 'ORPHA:166272': ['TRIP11'], 'ORPHA:576': ['GNPTAB'], 'ORPHA:812': ['NEU1'], 'ORPHA:166282': ['SCN5A', 'HCN4', 'MYH6', 'GNB2'], 'ORPHA:578': ['MCOLN1'], 'ORPHA:166286': ['GJB2'], 'ORPHA:771': ['IL23R', 'NOD2', 'IRF5', 'PTPN2', 'ABCB1'], 'ORPHA:461': ['FL

In [35]:
def compare_two_gene_sets(gene_set_1: set, gene_set_2: set) -> bool:
    """Tell whether two gene sets have at least one gene in common.

    Args:
        gene_set_1: First set of gene symbols.
        gene_set_2: Second set of gene symbols.

    Returns:
        True if the sets share at least one element, False otherwise
        (including when either set is empty).
    """
    # isdisjoint stops at the first shared element instead of materialising
    # the whole intersection just to test whether it is empty.
    return not gene_set_1.isdisjoint(gene_set_2)

In [34]:
def compare_gene_sets_in_dict(dico_diseases_genes: dict) -> dict:
    """Find, for every disease, the later diseases that share a gene with it.

    Each unordered pair of diseases is examined once: a disease is only
    compared with the diseases that come after it in the input dict, so a
    pair (A, B) is reported under A only.

    Args:
        dico_diseases_genes: Mapping from disease identifier to an iterable
            of gene symbols.

    Returns:
        A dict with the same keys as the input. Each value is the list of
        diseases, in input order, that come later in the input and have at
        least one gene in common with the key disease.

    Side effects:
        Prints the resulting dict to standard output.
    """
    # Build each gene set once up front; rebuilding them for every pair
    # dominates the run time of the quadratic comparison below.
    gene_sets = {
        disease: set(genes) for disease, genes in dico_diseases_genes.items()
    }
    diseases = list(gene_sets)
    dico_diseases_similarity = dict()
    for position, disease_1 in enumerate(diseases):
        genes_1 = gene_sets[disease_1]
        # Same criterion as compare_two_gene_sets: a non-empty intersection.
        dico_diseases_similarity[disease_1] = [
            disease_2
            for disease_2 in diseases[position + 1:]
            if not genes_1.isdisjoint(gene_sets[disease_2])
        ]
    print(dico_diseases_similarity)
    return dico_diseases_similarity

# Compare every pair of diseases and record, for each disease, the diseases
# listed after it that share at least one gene (the result is also printed).
dico_diseases_similarity = compare_gene_sets_in_dict(dico_diseases_genes=dico_diseases_genes)

{'ORPHA:166024': ['ORPHA:36', 'ORPHA:2189', 'ORPHA:2754'], 'ORPHA:93': [], 'ORPHA:166035': [], 'ORPHA:585': [], 'ORPHA:118': [], 'ORPHA:166063': ['ORPHA:2524'], 'ORPHA:166078': ['ORPHA:166084', 'ORPHA:166090', 'ORPHA:166087', 'ORPHA:166093', 'ORPHA:166096'], 'ORPHA:206': ['ORPHA:771', 'ORPHA:536', 'ORPHA:379', 'ORPHA:117', 'ORPHA:186', 'ORPHA:220402', 'ORPHA:220393', 'ORPHA:90340', 'ORPHA:85410', 'ORPHA:85414', 'ORPHA:85408'], 'ORPHA:166073': [], 'ORPHA:166084': ['ORPHA:166090', 'ORPHA:166087', 'ORPHA:166093', 'ORPHA:166096'], 'ORPHA:333': ['ORPHA:2590'], 'ORPHA:349': [], 'ORPHA:166090': ['ORPHA:166087', 'ORPHA:166093', 'ORPHA:166096'], 'ORPHA:166087': ['ORPHA:166093', 'ORPHA:166096'], 'ORPHA:366': [], 'ORPHA:166093': ['ORPHA:166096'], 'ORPHA:368': [], 'ORPHA:166096': [], 'ORPHA:166100': ['ORPHA:166011', 'ORPHA:2380', 'ORPHA:137678', 'ORPHA:1427', 'ORPHA:2021', 'ORPHA:485', 'ORPHA:1856', 'ORPHA:209867', 'ORPHA:459051', 'ORPHA:93279', 'ORPHA:93346', 'ORPHA:93296', 'ORPHA:93297', 'ORPHA:

In [50]:
def generate_disease_sim_network(dico_diseases_similarity: dict, network_path: str) -> None:
    """Write the disease-similarity edge list to a tab-separated file.

    Each (disease, associated disease) pair of the input becomes one
    ``source<TAB>target`` line. An edge and its reverse are treated as the
    same association, so a pair is written only once, in the orientation in
    which it is first encountered.

    Args:
        dico_diseases_similarity: Mapping from a disease identifier to the
            list of disease identifiers it is associated with.
        network_path: Path of the output file. It is written without a header
            row and without an index column.

    Side effects:
        Prints the list of edges, their count and the resulting DataFrame,
        then writes the file at ``network_path``.
    """
    associations = list()
    # Edges already emitted, kept in a set so each duplicate check is O(1).
    seen = set()
    for disease, associated_diseases in dico_diseases_similarity.items():
        for associated_disease in associated_diseases:
            edge = (disease, associated_disease)
            # Skip the edge if it was already recorded in either orientation.
            if edge in seen or (associated_disease, disease) in seen:
                continue
            seen.add(edge)
            associations.append(edge)
    # Build the frame in one go rather than setting cells one by one.
    network = pd.DataFrame(associations, columns=["source", "target"])
    print(associations)
    print(len(associations))
    print(network)
    network.to_csv(network_path, sep="\t", header=False, index=False)

# Write the similarity edges as a tab-separated file at the network path below.
generate_disease_sim_network(dico_diseases_similarity=dico_diseases_similarity, network_path="../network/multiplex/Disease_Similarity/Disease_Similarity_Network.tsv")

[('ORPHA:166024', 'ORPHA:36'), ('ORPHA:166024', 'ORPHA:2189'), ('ORPHA:166024', 'ORPHA:2754'), ('ORPHA:166063', 'ORPHA:2524'), ('ORPHA:166078', 'ORPHA:166084'), ('ORPHA:166078', 'ORPHA:166090'), ('ORPHA:166078', 'ORPHA:166087'), ('ORPHA:166078', 'ORPHA:166093'), ('ORPHA:166078', 'ORPHA:166096'), ('ORPHA:206', 'ORPHA:771'), ('ORPHA:206', 'ORPHA:536'), ('ORPHA:206', 'ORPHA:379'), ('ORPHA:206', 'ORPHA:117'), ('ORPHA:206', 'ORPHA:186'), ('ORPHA:206', 'ORPHA:220402'), ('ORPHA:206', 'ORPHA:220393'), ('ORPHA:206', 'ORPHA:90340'), ('ORPHA:206', 'ORPHA:85410'), ('ORPHA:206', 'ORPHA:85414'), ('ORPHA:206', 'ORPHA:85408'), ('ORPHA:166084', 'ORPHA:166090'), ('ORPHA:166084', 'ORPHA:166087'), ('ORPHA:166084', 'ORPHA:166093'), ('ORPHA:166084', 'ORPHA:166096'), ('ORPHA:333', 'ORPHA:2590'), ('ORPHA:166090', 'ORPHA:166087'), ('ORPHA:166090', 'ORPHA:166093'), ('ORPHA:166090', 'ORPHA:166096'), ('ORPHA:166087', 'ORPHA:166093'), ('ORPHA:166087', 'ORPHA:166096'), ('ORPHA:166093', 'ORPHA:166096'), ('ORPHA:1661