In [1]:
import pandas as pd
import os
from query_phenomizer import utils

In [2]:
#Credits to Sasja Westgeest for the code in this code block.

class Update_HPO:
    """Bring lists of HPO term ids in line with one ``hp.obo`` ontology file.

    On construction the ontology file is read to build two lookups:

    * ``replacement_dict`` maps an id that should no longer be used (a term
      carrying a ``replaced_by`` tag, or an ``alt_id`` of a term) to the id
      that replaces it.
    * ``non_phenotype_nodes`` holds the seed ids listed in
      ``find_non_phenotype_nodes`` plus every term that reaches one of them
      through ``is_a`` links.

    ``update_phenotype`` applies both lookups to one patient's term list.
    """

    def __init__(self, obo_file):
        """Parse ``obo_file`` (path to an OBO file) and cache both lookups."""
        self.obo_file = obo_file
        self.replacement_dict = self.create_dictionary_replacements()
        self.non_phenotype_nodes = self.find_non_phenotype_nodes()

    def _read_tags(self):
        """Yield ``(tag, value)`` for every ``tag: value`` line of the file.

        The file is opened as UTF-8 (the encoding the OBO format prescribes)
        and is closed as soon as the last line has been consumed. Only the
        first ``': '`` separates tag from value, so values that themselves
        contain ``': '`` are returned complete. Lines without ``': '`` are
        skipped.
        """
        with open(self.obo_file, encoding='utf-8') as hpobo:
            for line in hpobo:
                tag, separator, value = line.rstrip('\n').partition(': ')
                if separator:
                    yield tag, value

    def find_non_phenotype_nodes(self):
        """Return the seed ids together with all of their ``is_a`` descendants.

        Returns:
            set of str: ids that ``delete_non_phenotype_nodes`` filters out.
        """
        # Presumably the roots of the non-phenotype branches of HPO (mode of
        # inheritance, clinical modifier, frequency) - verify against hp.obo.
        non_phenotype_nodes = {'HP:0000005', 'HP:0012823', 'HP:0040279'}

        # One pass over the file: remember for every parent which terms name
        # it in an is_a line.
        children = {}
        term = ''
        for tag, value in self._read_tags():
            if tag == 'id':
                term = value
            elif tag == 'is_a':
                # Drop the optional " ! human readable name" suffix; strip()
                # also covers is_a lines that carry no such comment.
                parent = value.split(' !', 1)[0].strip()
                children.setdefault(parent, []).append(term)

        # Walk downwards from the seeds until no new descendant turns up.
        pending = list(non_phenotype_nodes)
        while pending:
            for child in children.get(pending.pop(), ()):
                if child not in non_phenotype_nodes:
                    non_phenotype_nodes.add(child)
                    pending.append(child)
        return non_phenotype_nodes

    def create_dictionary_replacements(self):
        """Return a dict mapping an outdated id to the id that replaces it.

        A term with a ``replaced_by`` tag maps to the tag's value; every
        ``alt_id`` maps to the id of the term that lists it.
        """
        replacements = {}  # key is replaced by value
        term = ''
        for tag, value in self._read_tags():
            if tag == 'id':
                term = value
            elif tag == 'replaced_by':
                replacements[term] = value
            elif tag == 'alt_id':
                replacements[value] = term
        return replacements

    def create_dictionary_id_name(self):  # currently not used anywhere
        """Return a dict mapping each term id to the value of its ``name`` tag."""
        id_to_name = {}
        term_id = ''
        for tag, value in self._read_tags():
            if tag == 'id':
                term_id = value
            elif tag == 'name':
                id_to_name[term_id] = value
        return id_to_name

    def replace(self, term):
        """Return the replacement for ``term``, or ``term`` itself if it has none."""
        return self.replacement_dict.get(term, term)

    def delete_non_phenotype_nodes(self, term_list):
        """Return ``term_list`` without the ids in ``non_phenotype_nodes``."""
        return [term for term in term_list if term not in self.non_phenotype_nodes]

    def update_phenotype(self, patient):
        """Replace outdated ids in ``patient`` and drop non-phenotype ids.

        Args:
            patient: iterable of HPO id strings.

        Returns:
            list of str: updated ids in their original order (duplicates kept).
        """
        replaced = [self.replace(term) for term in patient]
        return self.delete_non_phenotype_nodes(replaced)

In [3]:
# Lift pandas' display limits so frames are shown without row/column truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Patient records. The columns 'hpo_all', 'hpo_all_with_parents' and 'label'
# are read by the cells below.
# NOTE: unpickling executes code stored in the file - only load pickles from a
# trusted source. The path is an absolute, machine-specific location.
data = pd.read_pickle('C:/Users/niels/Downloads/patienten.pkl')

In [4]:

# Path of the HPO ontology file that is handed to Update_HPO.
# The backslash is written as an explicit '\\' escape: a backslash followed by
# 'h' is not a valid escape sequence and only worked because Python currently
# keeps unknown escapes verbatim (with a warning). The resulting string is
# unchanged.
obo_file = 'C:/Users/niels/Downloads\\hp.obo'


In [5]:
#Credits to Sasja Westgeest for the code in this code block.

# Bring every patient's HPO annotations in line with the loaded ontology and
# strip underscores from the syndrome labels.
updater = Update_HPO(obo_file)

# Each column is rebuilt and assigned as a whole. Writing single cells through
# chained indexing (data[col][index] = ...) is not guaranteed to reach `data`
# in pandas and additionally requires the index labels to be 0..n-1.
for column in ('hpo_all', 'hpo_all_with_parents'):
    data[column] = [updater.update_phenotype(terms) for terms in data[column]]

data['label'] = [label.replace("_", "") for label in data['label']]

In [6]:
# OMIM disease id -> syndrome name as used for the patient labels.
# NOTE(review): 'spop_1' and 'spop_2' contain underscores, whereas the patient
# labels have their underscores removed in an earlier cell; make sure both
# sides are normalised the same way wherever they are compared.
omim_genes = {
    'OMIM:148050': 'ankrd',
    'OMIM:300958': 'ddx3x',
    'OMIM:610443': 'kansl1',
    'OMIM:611867': '22q11',
    'OMIM:614104': 'dyrk1a',
    'OMIM:615009': 'pacs1',
    'OMIM:615873': 'adnp',
    'OMIM:616158': 'pura',
    'OMIM:616708': 'wac',
    'OMIM:617140': 'son',
    'OMIM:618846': 'kdm3b',
    'OMIM:618829': 'spop_2',
    'OMIM:618828': 'spop_1',
    'OMIM:617854': 'cltc',
    'OMIM:617557': 'yy1',
    'OMIM:617450': 'ppm1d',
}

# The OMIM ids on their own, in the order they are listed above.
diseases = list(omim_genes)

In [7]:

# Query Phenomizer once per patient and keep the best-ranked candidates.

# Credentials can be supplied through the environment; the fallbacks are the
# values this notebook has always used.
PHENOMIZER_USER = os.environ.get('PHENOMIZER_USER', 'scout')
PHENOMIZER_PASSWORD = os.environ.get('PHENOMIZER_PASSWORD', 'scout123')

HEADER_LINES = 6   # leading lines of the response that are skipped
TOP_N = 100        # number of ranked candidates kept per patient
OMIM_COLUMN = 2    # tab-separated column holding the OMIM entry
GENE_COLUMN = 4    # tab-separated column holding the gene name(s)

diagnoses_database = {}  # patient index -> OMIM entries of the kept candidates
genes_database = {}      # patient index -> gene field of the kept candidates
for index in range(len(data['hpo_all'])):
    hpo_query = ', '.join(data['hpo_all'][index])
    result = utils.query_phenomizer(PHENOMIZER_USER, PHENOMIZER_PASSWORD, hpo_query)

    # Split the response once. Rows that are too short to hold both columns
    # (for example a trailing empty line) are skipped, and fewer than TOP_N
    # rows are accepted instead of raising an IndexError.
    rows = [row.split('\t') for row in result.text.split('\n')[HEADER_LINES:]]
    rows = [columns for columns in rows if len(columns) > GENE_COLUMN][:TOP_N]

    diagnoses_database[index] = [columns[OMIM_COLUMN] for columns in rows]
    genes_database[index] = [columns[GENE_COLUMN] for columns in rows]


In [11]:
# Share of patients whose own syndrome is among their Phenomizer candidates.

# Patient labels had their underscores removed earlier in the notebook, so the
# mapped syndrome names are normalised the same way before comparing;
# otherwise names such as 'spop_1' could never equal a label.
syndrome_by_omim = {omim_id: name.replace("_", "") for omim_id, name in omim_genes.items()}

counter = 0
for patient_index, diagnoses in diagnoses_database.items():
    label = data['label'][patient_index]
    # A patient counts once, however many candidates match.
    if any(syndrome_by_omim.get(omim_id) == label for omim_id in diagnoses):
        counter += 1

# NOTE(review): the original cell also contained a gene-symbol fallback
# (label.upper() looked up in genes_database), but it sat inside the branch
# that had already established a match and therefore never ran. It has been
# left out rather than guessed at - confirm whether a gene-based match should
# count as a correct prediction.

n_patients = len(data['hpo_all'])
# Guard against an empty data set instead of dividing by zero.
percentage = round(counter / n_patients * 100, 1) if n_patients else 0.0
print('Percentage of correct predictions in top 100 candidate diseases:', percentage, '%')

Percentage of correct predictions in top 100 candidate diseases: 4.2 %
