We provide all of the files to run this script (except the raw claims data) as well as the files that are generated from running this script in the Harvard Dataverse repo. Note that you may need to run the `Orphanet Convert to Flat Files.ipynb` first before running this script.

In [43]:
import pandas as pd
import numpy as np
import sys
import obonet
import networkx
import config
import logging
import seaborn as sns
import matplotlib.pyplot as plt

from simulation_pipeline.utils import preprocess 

sys.path.insert(0, '../') # add config to path


In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read in data

In [28]:
#Path to raw phenolyzer data
PHENOLYZER_LIB = config.KNOWLEDGE_GRAPH_PATH / 'raw_phenolyzer_2015'

PHENOLYZER_2015_OUTPUT_DIR = config.KNOWLEDGE_GRAPH_PATH / 'raw_normalized_phenolyzer_2015'

In [82]:
# 2015 KG
hpoa_phen_genes_2015 = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_HPO_PHENOTYPE_GENE', sep='\t', header=None)
hpoa_phen_genes_2015.columns = ['HPO_ID', 'HPO_NAME', 'GENE_LIST']
hpoa_phen_genes_2015['GENE_LIST'] = hpoa_phen_genes_2015['GENE_LIST'].str.replace(',$', '')
hpoa_phen_genes_2015['GENE_LIST'] = hpoa_phen_genes_2015['GENE_LIST'].str.replace('^,', '')
hpoa_phen_genes_2015['GENE_LIST'] = hpoa_phen_genes_2015['GENE_LIST'].str.split(',')
hpoa_phen_genes_2015 = hpoa_phen_genes_2015.explode('GENE_LIST')

hpoa_phen_diseases_2015 = pd.read_csv(PHENOLYZER_LIB / 'DB_HPO_ANNOTATION', sep='\t', dtype=str)
hpoa_phen_diseases_2015['HPO_ID'] = 'HP:' + hpoa_phen_diseases_2015['HPO_ID']

coba = pd.read_csv(PHENOLYZER_LIB / 'DB_COBA_NEUROCOMPLEX', sep='\t', dtype=str, header=None)
coba.columns = ['GENE_ID', 'DISEASE', 'SOURCE', 'SCORE', 'CATEGORY']

protein_interaction = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_BINARY_PROTEIN_INTERACTION_SCORE', sep='\t', dtype=str)
biosystem = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_BIOSYSTEM_SCORE', sep='\t', dtype=str)
gene_reviews = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_GENEREVIEWS', sep='\t', dtype=str)
gene_dx = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_GENE_DISEASE_SCORE', sep='\t', dtype=str)
gene_dx['GENE'] = gene_dx['GENE'].str.replace(',$', '')
gene_dx['GENE'] = gene_dx['GENE'].str.replace('^,', '')
gene_dx['GENE'] = gene_dx['GENE'].str.split(',')
gene_dx = gene_dx.explode('GENE')

disgenet = pd.read_csv(PHENOLYZER_LIB / 'DB_DISGENET_GENE_DISEASE_SCORE', sep='\t', dtype=str, header=None)
disgenet.columns = ['GENE_ID', 'DISEASE', 'DISEASE_ID', 'SCORE', 'SOURCE']
gad = pd.read_csv(PHENOLYZER_LIB / 'DB_GAD_GENE_DISEASE_SCORE', sep='\t', dtype=str, header=None)
gad.columns = ['GENE_ID', 'DISEASE', 'DISEASE_PUBMED', 'SCORE', 'SOURCE']

genecards = pd.read_csv(PHENOLYZER_LIB / 'DB_GENECARDS_GENE_DISEASE_SCORE', sep='\t', dtype=str, header=None)
genecards.columns = ['GENE_ID', 'DISEASE', 'UNK', 'SCORE', 'SOURCE']
hgnc = pd.read_csv(PHENOLYZER_LIB / 'DB_HGNC_GENE_FAMILY', sep='\t', dtype=str)
transcription = pd.read_csv(PHENOLYZER_LIB / 'DB_HTRI_TRANSCRIPTION_INTERACTION', sep='\t', dtype=str)

human_gene_id = pd.read_csv(PHENOLYZER_LIB / 'DB_HUMAN_GENE_ID', sep='\t', dtype=str, skiprows=1, header=None)
human_gene_id.columns= ['', 'GENE_ID', 'GENE_LIST']
human_gene_id['GENE_LIST'] = human_gene_id['GENE_LIST'].str.replace('|$', '')
human_gene_id['GENE_LIST'] = human_gene_id['GENE_LIST'].str.replace('^,|', '')
human_gene_id['GENE_LIST'] = human_gene_id['GENE_LIST'].str.split('|')
human_gene_id = human_gene_id.explode('GENE_LIST')


mentha = pd.read_csv(PHENOLYZER_LIB / 'DB_MENTHA_GENE_GENE_INTERACTION', sep='\t', dtype=str)

omim_gene_dx = pd.read_csv(PHENOLYZER_LIB / 'DB_OMIM_GENE_DISEASE', sep='\t', dtype=str)
omim_gene_dx['GENE'] = omim_gene_dx['GENE'].str.replace(',$', '')
omim_gene_dx['GENE'] = omim_gene_dx['GENE'].str.replace('^,', '')
omim_gene_dx['GENE'] = omim_gene_dx['GENE'].str.split('\s*,\s*')
omim_gene_dx = omim_gene_dx.explode('GENE')


orphanet = pd.read_csv(PHENOLYZER_LIB / 'DB_ORPHANET_GENE_DISEASE', sep='\t', dtype=str,encoding= 'unicode_escape')
gwas = pd.read_csv(PHENOLYZER_LIB / 'DB_GWAS_GENE_DISEASE', sep='\t', dtype=str, encoding= 'unicode_escape')
clinvar = pd.read_csv(PHENOLYZER_LIB / 'DB_CLINVAR_GENE_DISEASE', sep='\t',  encoding= 'unicode_escape')


#Nothing to do for the following
#ctd = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_CTD_DISEASES', sep='\t', dtype=str)
#ctd_used = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_CTD_DISEASES_USED', sep='\t', dtype=str)
#dx_count = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_DISEASE_COUNT', sep='\t', dtype=str)
#omim_dx = pd.read_csv(PHENOLYZER_LIB / 'DB_COMPILED_OMIM_ID_DISEASE', sep='\t', dtype=str)
#omim_desc = pd.read_csv(PHENOLYZER_LIB / 'DB_OMIM_DESCRIPTION', sep='\t', dtype=str)


In [None]:
# HPO Frequencies from claims database
hpo_infant = pd.read_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Infants.tsv", delimiter = "\t")
hpo_child = pd.read_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Children.tsv", delimiter = "\t")
hpo_adolescent = pd.read_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Adolescent.tsv", delimiter = "\t")
hpo_adult = pd.read_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Adult.tsv", delimiter = "\t")
hpo_elderly = pd.read_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Elderly.tsv", delimiter = "\t")

In [19]:
hpo_2019 = obonet.read_obo(config.HPO_FILE) 

# Map Phenotypes to 2019 HPO

In [45]:
preprocessor = preprocess.Preprocessor()

INFO:root:Initializing Gene Mappings....
INFO:root:Reading hgnc mappings....
INFO:root:Reading biomart mappings....
INFO:root:Reading shilpa mappings....
INFO:root:Reading ncbi mappings....
INFO:root:Retrieving unique lists of genes....


In [None]:
hpoa_phen_diseases_2015 = preprocessor.map_phenotypes(hpoa_phen_diseases_2015)
hpoa_phen_genes_2015 = preprocessor.map_phenotypes(hpoa_phen_genes_2015)

# Map Genes to Ensembl ID

## 2015 KG

In [157]:
human_gene_id = preprocessor.map_genes(human_gene_id, ['GENE_ID', 'GENE_LIST'])
print(len(human_gene_id))

human_gene_id_format = human_gene_id.sort_values('GENE_LIST').filter(['', 'GENE_ID_ensembl', 'GENE_LIST_ensembl'])\
    .groupby(['', 'GENE_ID_ensembl'], sort=False)['GENE_LIST_ensembl'].apply(list).reset_index()
human_gene_id_format['GENE_LIST_ensembl'] = human_gene_id_format['GENE_LIST_ensembl'].apply(lambda x: list(set([str(i) for i in x]))) #doing this to include null in list
human_gene_id_format['GENE_LIST_ensembl'] = human_gene_id_format['GENE_LIST_ensembl'].str.join('|')
human_gene_id_format.head()

ERROR:root:The following gene can not be converted to an Ensembl ID: CDAN4
ERROR:root:The following gene can not be converted to an Ensembl ID: Zdhhc21
ERROR:root:The following gene can not be converted to an Ensembl ID: Cyb5r1
ERROR:root:The following gene can not be converted to an Ensembl ID: Ermn
ERROR:root:The following gene can not be converted to an Ensembl ID: dJ439F8.1
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL22Q11.2
ERROR:root:The following gene can not be converted to an Ensembl ID: ENSMODG00000023111
ERROR:root:The following gene can not be converted to an Ensembl ID: bK963H5.1
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL10Q26
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL17Q12
ERROR:root:The following gene can not be converted to an Ensembl ID: HYPLIP2
ERROR:root:The following gene can not be converted to an Ensembl ID: Hoxd9
ERROR:root:The following gene can not be converted to an Ensembl

ERROR:root:The following gene can not be converted to an Ensembl ID: DEL1Q41Q42
ERROR:root:The following gene can not be converted to an Ensembl ID: OGS2
ERROR:root:The following gene can not be converted to an Ensembl ID: MCOPCB7
ERROR:root:The following gene can not be converted to an Ensembl ID: Celf4
ERROR:root:The following gene can not be converted to an Ensembl ID: hBD-3
ERROR:root:The following gene can not be converted to an Ensembl ID: POREN1
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL9P
ERROR:root:The following gene can not be converted to an Ensembl ID: FECD7
ERROR:root:The following gene can not be converted to an Ensembl ID: HALP2
ERROR:root:The following gene can not be converted to an Ensembl ID: PTOS2
ERROR:root:The following gene can not be converted to an Ensembl ID: hLAT1
ERROR:root:The following gene can not be converted to an Ensembl ID: H-IDHB
ERROR:root:The following gene can not be converted to an Ensembl ID: POROK5
ERROR:root:The f

ERROR:root:The following gene can not be converted to an Ensembl ID: CPFP
ERROR:root:The following gene can not be converted to an Ensembl ID: C1orf18
ERROR:root:The following gene can not be converted to an Ensembl ID: MLRL
ERROR:root:The following gene can not be converted to an Ensembl ID: AMMEC
ERROR:root:The following gene can not be converted to an Ensembl ID: dJ366L4.2
ERROR:root:The following gene can not be converted to an Ensembl ID: 50-DAG
ERROR:root:The following gene can not be converted to an Ensembl ID: GpMiIII
ERROR:root:The following gene can not be converted to an Ensembl ID: ETL2
ERROR:root:The following gene can not be converted to an Ensembl ID: col4a1
ERROR:root:The following gene can not be converted to an Ensembl ID: Arsg
ERROR:root:The following gene can not be converted to an Ensembl ID: DELXP11.3
ERROR:root:The following gene can not be converted to an Ensembl ID: mTERF2
ERROR:root:The following gene can not be converted to an Ensembl ID: MCOPCT1
ERROR:root:T

ERROR:root:The following gene can not be converted to an Ensembl ID: MLSM7
ERROR:root:The following gene can not be converted to an Ensembl ID: BMIQ16
ERROR:root:The following gene can not be converted to an Ensembl ID: Lcn9
ERROR:root:The following gene can not be converted to an Ensembl ID: ENSMUSG00000031781
ERROR:root:The following gene can not be converted to an Ensembl ID: Hoxd3
ERROR:root:The following gene can not be converted to an Ensembl ID: FBgn0021874
ERROR:root:The following gene can not be converted to an Ensembl ID: IPLA2(GAMMA)
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL16P11.2
ERROR:root:The following gene can not be converted to an Ensembl ID: hsa-mir-1254-1
ERROR:root:The following gene can not be converted to an Ensembl ID: HBFQTL6
ERROR:root:The following gene can not be converted to an Ensembl ID: Hoxb3
ERROR:root:The following gene can not be converted to an Ensembl ID: DUP22Q13
ERROR:root:The following gene can not be converted to a

ERROR:root:The following gene can not be converted to an Ensembl ID: DEL15Q25
ERROR:root:The following gene can not be converted to an Ensembl ID: bA535F17.1
ERROR:root:The following gene can not be converted to an Ensembl ID: Pe1Fe13
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL8Q12Q21
ERROR:root:The following gene can not be converted to an Ensembl ID: SPDA1
ERROR:root:The following gene can not be converted to an Ensembl ID: MGR1
ERROR:root:The following gene can not be converted to an Ensembl ID: PRO21339
ERROR:root:The following gene can not be converted to an Ensembl ID: SCLC1
ERROR:root:The following gene can not be converted to an Ensembl ID: POROK6
ERROR:root:The following gene can not be converted to an Ensembl ID: EA1.2
ERROR:root:The following gene can not be converted to an Ensembl ID: FMTLE
ERROR:root:The following gene can not be converted to an Ensembl ID: FKSG49
ERROR:root:The following gene can not be converted to an Ensembl ID: OTDD
ERROR:r

83224


Unnamed: 0,Unnamed: 1,GENE_ID,GENE_LIST,GENE_LIST_ensembl,GENE_LIST_mapping_status
0,1,A1BG,A1B,ENSG00000172164,mapped to ensembl
0,1,A1BG,ABG,ENSG00000121410,mapped to ensembl
0,1,A1BG,GAB,ENSG00000121410,mapped to ensembl
0,1,A1BG,HYST2477,ENSG00000121410,mapped to ensembl
1,2,A2M,A2MD,ENSG00000175899,mapped to ensembl


In [182]:
gene_dx = preprocessor.map_genes(gene_dx, ['GENE'])
gene_dx.head()

ERROR:root:The following gene can not be converted to an Ensembl ID: CDAN4
ERROR:root:The following gene can not be converted to an Ensembl ID: GALNAC-T14
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHE36
ERROR:root:The following gene can not be converted to an Ensembl ID: EVI3
ERROR:root:The following gene can not be converted to an Ensembl ID: BETA-GLOBIN
ERROR:root:The following gene can not be converted to an Ensembl ID: DGKDELTA
ERROR:root:The following gene can not be converted to an Ensembl ID: CAHbeta
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL16Q22
ERROR:root:The following gene can not be converted to an Ensembl ID: SF3B150
ERROR:root:The following gene can not be converted to an Ensembl ID: CALMBP1
ERROR:root:The following gene can not be converted to an Ensembl ID: MCMPMLTNA
ERROR:root:The following gene can not be converted to an Ensembl ID: PKC-BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: 

ERROR:root:The following gene can not be converted to an Ensembl ID: DJ353E16.2
ERROR:root:The following gene can not be converted to an Ensembl ID: ALPHA-PIX
ERROR:root:The following gene can not be converted to an Ensembl ID: P140-TRKA
ERROR:root:The following gene can not be converted to an Ensembl ID: HBRF
ERROR:root:The following gene can not be converted to an Ensembl ID: HDMP1
ERROR:root:The following gene can not be converted to an Ensembl ID: SLP5
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHE12
ERROR:root:The following gene can not be converted to an Ensembl ID: NM15
ERROR:root:The following gene can not be converted to an Ensembl ID: EIF3-P46
ERROR:root:The following gene can not be converted to an Ensembl ID: BFIS1
ERROR:root:The following gene can not be converted to an Ensembl ID: BA59I9.3
ERROR:root:The following gene can not be converted to an Ensembl ID: C11DELP15P14
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHB2

ERROR:root:The following gene can not be converted to an Ensembl ID: HHR21
ERROR:root:The following gene can not be converted to an Ensembl ID: VPS32-2
ERROR:root:The following gene can not be converted to an Ensembl ID: SETAGIN
ERROR:root:The following gene can not be converted to an Ensembl ID: HRFT2
ERROR:root:The following gene can not be converted to an Ensembl ID: HCPEB-1
ERROR:root:The following gene can not be converted to an Ensembl ID: NCKBETA
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHB11
ERROR:root:The following gene can not be converted to an Ensembl ID: P90-RSK2
ERROR:root:The following gene can not be converted to an Ensembl ID: CTRCT35
ERROR:root:The following gene can not be converted to an Ensembl ID: TRPRS
ERROR:root:The following gene can not be converted to an Ensembl ID: ATM1P
ERROR:root:The following gene can not be converted to an Ensembl ID: IL-1R-ALPHA
ERROR:root:The following gene can not be converted to an Ensembl ID: PJA923
ERR

ERROR:root:The following gene can not be converted to an Ensembl ID: DRW10
ERROR:root:The following gene can not be converted to an Ensembl ID: NCL-1
ERROR:root:The following gene can not be converted to an Ensembl ID: HIF-1-BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: CAMKN
ERROR:root:The following gene can not be converted to an Ensembl ID: UCK1-LIKE
ERROR:root:The following gene can not be converted to an Ensembl ID: SLIT-3
ERROR:root:The following gene can not be converted to an Ensembl ID: HCG_1745121
ERROR:root:The following gene can not be converted to an Ensembl ID: GM-CSF-R-ALPHA
ERROR:root:The following gene can not be converted to an Ensembl ID: HOLFD
ERROR:root:The following gene can not be converted to an Ensembl ID: RSC6P
ERROR:root:The following gene can not be converted to an Ensembl ID: Npas4
ERROR:root:The following gene can not be converted to an Ensembl ID: TPL-2
ERROR:root:The following gene can not be converted to an Ensembl ID: STSLC-

ERROR:root:The following gene can not be converted to an Ensembl ID: BA153I24.2
ERROR:root:The following gene can not be converted to an Ensembl ID: PNR-2
ERROR:root:The following gene can not be converted to an Ensembl ID: BA325O24.3
ERROR:root:The following gene can not be converted to an Ensembl ID: PMCA2I
ERROR:root:The following gene can not be converted to an Ensembl ID: IKK-ALPHA
ERROR:root:The following gene can not be converted to an Ensembl ID: GSG3
ERROR:root:The following gene can not be converted to an Ensembl ID: PLCETA2
ERROR:root:The following gene can not be converted to an Ensembl ID: HSPCA1
ERROR:root:The following gene can not be converted to an Ensembl ID: P19-INK4D
ERROR:root:The following gene can not be converted to an Ensembl ID: RAF-1
ERROR:root:The following gene can not be converted to an Ensembl ID: HELD
ERROR:root:The following gene can not be converted to an Ensembl ID: HEL-S-123M
ERROR:root:The following gene can not be converted to an Ensembl ID: FECD2


ERROR:root:The following gene can not be converted to an Ensembl ID: RHVIII
ERROR:root:The following gene can not be converted to an Ensembl ID: HFAT1
ERROR:root:The following gene can not be converted to an Ensembl ID: HTSP
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHE76
ERROR:root:The following gene can not be converted to an Ensembl ID: RHII
ERROR:root:The following gene can not be converted to an Ensembl ID: CISH2
ERROR:root:The following gene can not be converted to an Ensembl ID: HVPLA(2)
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ393D12.2
ERROR:root:The following gene can not be converted to an Ensembl ID: HHCP-6
ERROR:root:The following gene can not be converted to an Ensembl ID: HTSHR-I
ERROR:root:The following gene can not be converted to an Ensembl ID: PARP-5A
ERROR:root:The following gene can not be converted to an Ensembl ID: GABABRBP
ERROR:root:The following gene can not be converted to an Ensembl ID: H2B/L
ERROR:ro

ERROR:root:The following gene can not be converted to an Ensembl ID: CT6.2B
ERROR:root:The following gene can not be converted to an Ensembl ID: RETDSR4
ERROR:root:The following gene can not be converted to an Ensembl ID: P48.2
ERROR:root:The following gene can not be converted to an Ensembl ID: BA792D24.4
ERROR:root:The following gene can not be converted to an Ensembl ID: PTFALPHA
ERROR:root:The following gene can not be converted to an Ensembl ID: 9130404H11RIK
ERROR:root:The following gene can not be converted to an Ensembl ID: HALP2
ERROR:root:The following gene can not be converted to an Ensembl ID: HZWINT-1
ERROR:root:The following gene can not be converted to an Ensembl ID: TMP-21-I
ERROR:root:The following gene can not be converted to an Ensembl ID: MYD88-2
ERROR:root:The following gene can not be converted to an Ensembl ID: ATL1-BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: 100128607
ERROR:root:The following gene can not be converted to an Ensembl 

ERROR:root:The following gene can not be converted to an Ensembl ID: NPKC-THETA
ERROR:root:The following gene can not be converted to an Ensembl ID: Tnfrsf19
ERROR:root:The following gene can not be converted to an Ensembl ID: JDF2
ERROR:root:The following gene can not be converted to an Ensembl ID: COOKL
ERROR:root:The following gene can not be converted to an Ensembl ID: HKVBETA2.2
ERROR:root:The following gene can not be converted to an Ensembl ID: V-ABL
ERROR:root:The following gene can not be converted to an Ensembl ID: NMRK
ERROR:root:The following gene can not be converted to an Ensembl ID: PHEHA
ERROR:root:The following gene can not be converted to an Ensembl ID: MRV222
ERROR:root:The following gene can not be converted to an Ensembl ID: VEL
ERROR:root:The following gene can not be converted to an Ensembl ID: H-BCS
ERROR:root:The following gene can not be converted to an Ensembl ID: GPB.NY
ERROR:root:The following gene can not be converted to an Ensembl ID: HPAK3
ERROR:root:The

ERROR:root:The following gene can not be converted to an Ensembl ID: BETA-3-GX-T5
ERROR:root:The following gene can not be converted to an Ensembl ID: HSMAUG2
ERROR:root:The following gene can not be converted to an Ensembl ID: RCNCB
ERROR:root:The following gene can not be converted to an Ensembl ID: tubgcp3
ERROR:root:The following gene can not be converted to an Ensembl ID: ERDJ2
ERROR:root:The following gene can not be converted to an Ensembl ID: H-EAG
ERROR:root:The following gene can not be converted to an Ensembl ID: PRO0310P1
ERROR:root:The following gene can not be converted to an Ensembl ID: HST18960
ERROR:root:The following gene can not be converted to an Ensembl ID: ZP-4
ERROR:root:The following gene can not be converted to an Ensembl ID: CD42B-ALPHA
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ271M21.1.1
ERROR:root:The following gene can not be converted to an Ensembl ID: DUH1
ERROR:root:The following gene can not be converted to an Ensembl ID: BE

ERROR:root:The following gene can not be converted to an Ensembl ID: BA356B19.1
ERROR:root:The following gene can not be converted to an Ensembl ID: P120RASGAP
ERROR:root:The following gene can not be converted to an Ensembl ID: R-PTP-ZETA-2
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ322A24.1
ERROR:root:The following gene can not be converted to an Ensembl ID: HUML7-1
ERROR:root:The following gene can not be converted to an Ensembl ID: C5DELQ14.3
ERROR:root:The following gene can not be converted to an Ensembl ID: C1DELP32P31
ERROR:root:The following gene can not be converted to an Ensembl ID: BK150C2.1
ERROR:root:The following gene can not be converted to an Ensembl ID: PART14
ERROR:root:The following gene can not be converted to an Ensembl ID: RAC-GAMMA
ERROR:root:The following gene can not be converted to an Ensembl ID: NPTIIB
ERROR:root:The following gene can not be converted to an Ensembl ID: BA416N2.1
ERROR:root:The following gene can not be converted 

ERROR:root:The following gene can not be converted to an Ensembl ID: CHEMR23
ERROR:root:The following gene can not be converted to an Ensembl ID: DGK-ALPHA
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ421D16.1
ERROR:root:The following gene can not be converted to an Ensembl ID: SYTXIV
ERROR:root:The following gene can not be converted to an Ensembl ID: CDW217
ERROR:root:The following gene can not be converted to an Ensembl ID: TCP-1-GAMMA
ERROR:root:The following gene can not be converted to an Ensembl ID: BETA3GAL-T3
ERROR:root:The following gene can not be converted to an Ensembl ID: CD120A
ERROR:root:The following gene can not be converted to an Ensembl ID: HSRP1
ERROR:root:The following gene can not be converted to an Ensembl ID: P12DOC-1
ERROR:root:The following gene can not be converted to an Ensembl ID: PAPA6
ERROR:root:The following gene can not be converted to an Ensembl ID: APO-M
ERROR:root:The following gene can not be converted to an Ensembl ID: SW

ERROR:root:The following gene can not be converted to an Ensembl ID: HCE-1
ERROR:root:The following gene can not be converted to an Ensembl ID: HSA-MIR-147A
ERROR:root:The following gene can not be converted to an Ensembl ID: Celf1
ERROR:root:The following gene can not be converted to an Ensembl ID: TM4SF2B
ERROR:root:The following gene can not be converted to an Ensembl ID: N-CDASE
ERROR:root:The following gene can not be converted to an Ensembl ID: FZE7
ERROR:root:The following gene can not be converted to an Ensembl ID: NA(V)1.4
ERROR:root:The following gene can not be converted to an Ensembl ID: SP17-2
ERROR:root:The following gene can not be converted to an Ensembl ID: LGV2
ERROR:root:The following gene can not be converted to an Ensembl ID: P44-ERK1
ERROR:root:The following gene can not be converted to an Ensembl ID: BPG181B23.4
ERROR:root:The following gene can not be converted to an Ensembl ID: ARVD10
ERROR:root:The following gene can not be converted to an Ensembl ID: TAIFB
ER

ERROR:root:The following gene can not be converted to an Ensembl ID: HSCR7
ERROR:root:The following gene can not be converted to an Ensembl ID: CI-42K
ERROR:root:The following gene can not be converted to an Ensembl ID: HPTPETA
ERROR:root:The following gene can not be converted to an Ensembl ID: HSSWEET1
ERROR:root:The following gene can not be converted to an Ensembl ID: ENACB
ERROR:root:The following gene can not be converted to an Ensembl ID: EOCP
ERROR:root:The following gene can not be converted to an Ensembl ID: B(2)GCN
ERROR:root:The following gene can not be converted to an Ensembl ID: SPRED-2
ERROR:root:The following gene can not be converted to an Ensembl ID: ELOX-3
ERROR:root:The following gene can not be converted to an Ensembl ID: C11DELQ13
ERROR:root:The following gene can not be converted to an Ensembl ID: MPZL1B
ERROR:root:The following gene can not be converted to an Ensembl ID: WYATT
ERROR:root:The following gene can not be converted to an Ensembl ID: MYO16B
ERROR:roo

ERROR:root:The following gene can not be converted to an Ensembl ID: DJ831C21.1
ERROR:root:The following gene can not be converted to an Ensembl ID: FECD5
ERROR:root:The following gene can not be converted to an Ensembl ID: MAT-XA
ERROR:root:The following gene can not be converted to an Ensembl ID: FLJ45139
ERROR:root:The following gene can not be converted to an Ensembl ID: CGKI-BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: Lmx1b
ERROR:root:The following gene can not be converted to an Ensembl ID: HNRNPH
ERROR:root:The following gene can not be converted to an Ensembl ID: PRIL-16
ERROR:root:The following gene can not be converted to an Ensembl ID: HCAP-D3
ERROR:root:The following gene can not be converted to an Ensembl ID: NECL-1
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ591C20
ERROR:root:The following gene can not be converted to an Ensembl ID: PLC-DELTA-3
ERROR:root:The following gene can not be converted to an Ensembl ID: SWD

ERROR:root:The following gene can not be converted to an Ensembl ID: P45-BWR1A
ERROR:root:The following gene can not be converted to an Ensembl ID: HFAF1S
ERROR:root:The following gene can not be converted to an Ensembl ID: BGNT-2
ERROR:root:The following gene can not be converted to an Ensembl ID: K(VCA)BETA-1
ERROR:root:The following gene can not be converted to an Ensembl ID: MGCP
ERROR:root:The following gene can not be converted to an Ensembl ID: NCOA-2
ERROR:root:The following gene can not be converted to an Ensembl ID: ZACRP2
ERROR:root:The following gene can not be converted to an Ensembl ID: SYTVI
ERROR:root:The following gene can not be converted to an Ensembl ID: HGPCR37
ERROR:root:The following gene can not be converted to an Ensembl ID: FCRH6
ERROR:root:The following gene can not be converted to an Ensembl ID: HIK-1
ERROR:root:The following gene can not be converted to an Ensembl ID: KCNJ12X
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHE32
ERROR

ERROR:root:The following gene can not be converted to an Ensembl ID: B230215M10RIK
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL3Q22Q24
ERROR:root:The following gene can not be converted to an Ensembl ID: CAMKPASE
ERROR:root:The following gene can not be converted to an Ensembl ID: TAIFC
ERROR:root:The following gene can not be converted to an Ensembl ID: BA3J10.2
ERROR:root:The following gene can not be converted to an Ensembl ID: MAKAP
ERROR:root:The following gene can not be converted to an Ensembl ID: P73RHOGAP
ERROR:root:The following gene can not be converted to an Ensembl ID: PRB
ERROR:root:The following gene can not be converted to an Ensembl ID: KCA5
ERROR:root:The following gene can not be converted to an Ensembl ID: FSAP113
ERROR:root:The following gene can not be converted to an Ensembl ID: ZEP-1
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ422F24.1
ERROR:root:The following gene can not be converted to an Ensembl ID: HTIM

ERROR:root:The following gene can not be converted to an Ensembl ID: MYF-6
ERROR:root:The following gene can not be converted to an Ensembl ID: ERRA
ERROR:root:The following gene can not be converted to an Ensembl ID: LARGEG
ERROR:root:The following gene can not be converted to an Ensembl ID: GSD1A
ERROR:root:The following gene can not be converted to an Ensembl ID: 1810009N24RIK
ERROR:root:The following gene can not be converted to an Ensembl ID: CISH3
ERROR:root:The following gene can not be converted to an Ensembl ID: PE1FE6
ERROR:root:The following gene can not be converted to an Ensembl ID: HHR54
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ965G21.2
ERROR:root:The following gene can not be converted to an Ensembl ID: ATOD5
ERROR:root:The following gene can not be converted to an Ensembl ID: LMNT2
ERROR:root:The following gene can not be converted to an Ensembl ID: RHK562-II
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ583P15.7
ER

ERROR:root:The following gene can not be converted to an Ensembl ID: HUCDC7
ERROR:root:The following gene can not be converted to an Ensembl ID: P27BBP
ERROR:root:The following gene can not be converted to an Ensembl ID: CH-TOG
ERROR:root:The following gene can not be converted to an Ensembl ID: HEL-S-88N
ERROR:root:The following gene can not be converted to an Ensembl ID: HBFQTL2
ERROR:root:The following gene can not be converted to an Ensembl ID: HOBPIIA
ERROR:root:The following gene can not be converted to an Ensembl ID: SI-1-2-19
ERROR:root:The following gene can not be converted to an Ensembl ID: ZETA2-COP
ERROR:root:The following gene can not be converted to an Ensembl ID: EDG-7
ERROR:root:The following gene can not be converted to an Ensembl ID: SMALLTALK
ERROR:root:The following gene can not be converted to an Ensembl ID: TEMTYS
ERROR:root:The following gene can not be converted to an Ensembl ID: B3GALNAC-T2
ERROR:root:The following gene can not be converted to an Ensembl ID: M

ERROR:root:The following gene can not be converted to an Ensembl ID: CMS1A1
ERROR:root:The following gene can not be converted to an Ensembl ID: SRP38
ERROR:root:The following gene can not be converted to an Ensembl ID: BA145L22.2
ERROR:root:The following gene can not be converted to an Ensembl ID: 1190004M21RIK
ERROR:root:The following gene can not be converted to an Ensembl ID: BG174L6.2
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ857M17.2
ERROR:root:The following gene can not be converted to an Ensembl ID: KV1.9
ERROR:root:The following gene can not be converted to an Ensembl ID: BA218C14.1
ERROR:root:The following gene can not be converted to an Ensembl ID: P62B
ERROR:root:The following gene can not be converted to an Ensembl ID: ENACBETA
ERROR:root:The following gene can not be converted to an Ensembl ID: GLYBP
ERROR:root:The following gene can not be converted to an Ensembl ID: TILZ4C
ERROR:root:The following gene can not be converted to an Ensembl ID: 

ERROR:root:The following gene can not be converted to an Ensembl ID: F-SPONDIN
ERROR:root:The following gene can not be converted to an Ensembl ID: PRPP43P
ERROR:root:The following gene can not be converted to an Ensembl ID: C22DUPQ13
ERROR:root:The following gene can not be converted to an Ensembl ID: JERKY
ERROR:root:The following gene can not be converted to an Ensembl ID: NF110B
ERROR:root:The following gene can not be converted to an Ensembl ID: NTPDASE-1
ERROR:root:The following gene can not be converted to an Ensembl ID: M.HSAIIIB
ERROR:root:The following gene can not be converted to an Ensembl ID: S6K-BETA-1
ERROR:root:The following gene can not be converted to an Ensembl ID: PP1158
ERROR:root:The following gene can not be converted to an Ensembl ID: CDW116
ERROR:root:The following gene can not be converted to an Ensembl ID: HST2563
ERROR:root:The following gene can not be converted to an Ensembl ID: Cdh1
ERROR:root:The following gene can not be converted to an Ensembl ID: BA63

ERROR:root:The following gene can not be converted to an Ensembl ID: MGR4
ERROR:root:The following gene can not be converted to an Ensembl ID: HTAOK1
ERROR:root:The following gene can not be converted to an Ensembl ID: GLUK4
ERROR:root:The following gene can not be converted to an Ensembl ID: POU1F1A
ERROR:root:The following gene can not be converted to an Ensembl ID: GLUN3B
ERROR:root:The following gene can not be converted to an Ensembl ID: BA535F17.1
ERROR:root:The following gene can not be converted to an Ensembl ID: HEXOI
ERROR:root:The following gene can not be converted to an Ensembl ID: QKF
ERROR:root:The following gene can not be converted to an Ensembl ID: HNF1BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: Map3k11
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHA27
ERROR:root:The following gene can not be converted to an Ensembl ID: LEPQTL1
ERROR:root:The following gene can not be converted to an Ensembl ID: Birc2
ERROR:root

ERROR:root:The following gene can not be converted to an Ensembl ID: HTYW5
ERROR:root:The following gene can not be converted to an Ensembl ID: GNT-VB
ERROR:root:The following gene can not be converted to an Ensembl ID: SPLA2-IIE
ERROR:root:The following gene can not be converted to an Ensembl ID: TGF1A
ERROR:root:The following gene can not be converted to an Ensembl ID: MLTNG
ERROR:root:The following gene can not be converted to an Ensembl ID: CINC-2B
ERROR:root:The following gene can not be converted to an Ensembl ID: HOX2.1
ERROR:root:The following gene can not be converted to an Ensembl ID: KTWS
ERROR:root:The following gene can not be converted to an Ensembl ID: DIPP3BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHA31
ERROR:root:The following gene can not be converted to an Ensembl ID: HCCC6
ERROR:root:The following gene can not be converted to an Ensembl ID: TGFBETA-RII
ERROR:root:The following gene can not be converted to an Ensembl ID: B3GALTX
ERRO

ERROR:root:The following gene can not be converted to an Ensembl ID: P18-INK4C
ERROR:root:The following gene can not be converted to an Ensembl ID: M-SEMA-M
ERROR:root:The following gene can not be converted to an Ensembl ID: HSA-MIR-383
ERROR:root:The following gene can not be converted to an Ensembl ID: ALPHA-CHAIN
ERROR:root:The following gene can not be converted to an Ensembl ID: SEM2
ERROR:root:The following gene can not be converted to an Ensembl ID: HQKI
ERROR:root:The following gene can not be converted to an Ensembl ID: GALNAC-T5L
ERROR:root:The following gene can not be converted to an Ensembl ID: VTI1-RP2
ERROR:root:The following gene can not be converted to an Ensembl ID: P110GAMMA
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ794I6.3
ERROR:root:The following gene can not be converted to an Ensembl ID: C1DELQ41Q42
ERROR:root:The following gene can not be converted to an Ensembl ID: TAPS
ERROR:root:The following gene can not be converted to an Ensem

ERROR:root:The following gene can not be converted to an Ensembl ID: HST2298
ERROR:root:The following gene can not be converted to an Ensembl ID: CAMKP-N
ERROR:root:The following gene can not be converted to an Ensembl ID: 6330408P19RIK
ERROR:root:The following gene can not be converted to an Ensembl ID: P60-SRC
ERROR:root:The following gene can not be converted to an Ensembl ID: WISP1C
ERROR:root:The following gene can not be converted to an Ensembl ID: HP1HS-BETA
ERROR:root:The following gene can not be converted to an Ensembl ID: HSIAH2
ERROR:root:The following gene can not be converted to an Ensembl ID: C1DELQ43Q44
ERROR:root:The following gene can not be converted to an Ensembl ID: CDW128B
ERROR:root:The following gene can not be converted to an Ensembl ID: DAD4
ERROR:root:The following gene can not be converted to an Ensembl ID: ENACA
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ355M6.2
ERROR:root:The following gene can not be converted to an Ensembl ID:

ERROR:root:The following gene can not be converted to an Ensembl ID: SCS-BETAA
ERROR:root:The following gene can not be converted to an Ensembl ID: EIF-2BBETA
ERROR:root:The following gene can not be converted to an Ensembl ID: FGQTL6
ERROR:root:The following gene can not be converted to an Ensembl ID: TIIAC
ERROR:root:The following gene can not be converted to an Ensembl ID: HPD-1
ERROR:root:The following gene can not be converted to an Ensembl ID: P59OASL
ERROR:root:The following gene can not be converted to an Ensembl ID: HSAN
ERROR:root:The following gene can not be converted to an Ensembl ID: DJ520B18.2
ERROR:root:The following gene can not be converted to an Ensembl ID: ALPHA-MHC
ERROR:root:The following gene can not be converted to an Ensembl ID: Ugt1a9
ERROR:root:The following gene can not be converted to an Ensembl ID: PROSAP1
ERROR:root:The following gene can not be converted to an Ensembl ID: FSAPB
ERROR:root:The following gene can not be converted to an Ensembl ID: BA472E5.

ERROR:root:The following gene can not be converted to an Ensembl ID: HKCA4
ERROR:root:The following gene can not be converted to an Ensembl ID: MTRF1A
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHE1
ERROR:root:The following gene can not be converted to an Ensembl ID: P56DOK-2
ERROR:root:The following gene can not be converted to an Ensembl ID: BA145E8.1
ERROR:root:The following gene can not be converted to an Ensembl ID: BHLHA23
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL16P.11.2
ERROR:root:The following gene can not be converted to an Ensembl ID: HDC-STAMP
ERROR:root:The following gene can not be converted to an Ensembl ID: CGK
ERROR:root:The following gene can not be converted to an Ensembl ID: 5-OPASE
ERROR:root:The following gene can not be converted to an Ensembl ID: MI-ER2
ERROR:root:The following gene can not be converted to an Ensembl ID: HSP56
ERROR:root:The following gene can not be converted to an Ensembl ID: GSD1D
ERR

Unnamed: 0,GENE,DISEASE,DISEASE_ID,SCORE,SOURCE,GENE_ensembl,GENE_mapping_status
0,A2M,Alpha 2 macroglobulin deficiency,OMIM:614036,0.25,OMIM,ENSG00000175899,mapped to ensembl
0,A2MD,Alpha 2 macroglobulin deficiency,OMIM:614036,0.25,OMIM,ENSG00000175899,mapped to ensembl
0,CPAMD5,Alpha 2 macroglobulin deficiency,OMIM:614036,0.25,OMIM,ENSG00000175899,mapped to ensembl
0,FWP007,Alpha 2 macroglobulin deficiency,OMIM:614036,0.25,OMIM,ENSG00000175899,mapped to ensembl
0,S863-7,Alpha 2 macroglobulin deficiency,OMIM:614036,0.25,OMIM,ENSG00000175899,mapped to ensembl


In [184]:
gene_dx_formatted = gene_dx.sort_values('GENE').filter(['GENE_ensembl', 'DISEASE', 'DISEASE_ID', 'SCORE', 'SOURCE'])\
    .groupby(['DISEASE', 'DISEASE_ID', 'SCORE', 'SOURCE'], sort=False)['GENE_ensembl'].apply(list).reset_index()
gene_dx_formatted['GENE_ensembl'] = gene_dx_formatted['GENE_ensembl'].apply(lambda x: list(set([str(i) for i in x]))) #doing this to include null in list
gene_dx_formatted['GENE_ensembl'] = gene_dx_formatted['GENE_ensembl'].str.join(', ')
gene_dx_formatted.head()

Unnamed: 0,DISEASE,DISEASE_ID,SCORE,SOURCE,GENE_ensembl
0,Fibrinogen,PUBMED:20031577,0.24999999998,GWAS,"ENSG00000109065, ENSG00000109062, HNATL"
1,Thiazide induced adverse metabolic effects in ...,PUBMED:23400010,0.249999,GWAS,"GLUN2A, ENSG00000183454, ENSG00000019144, ENSG..."
2,Personality dimensions,PUBMED:23903073,0.249999825,GWAS,"GAL-NAC6S, ENSG00000121410, ST3GALA, ST3GALIA,..."
3,Autosomal dominant progressive external ophtha...,ORPHANET:254892,0.857142857142857,ORPHANET,"ENSG00000171428, ENSG00000121410, ENSG00000110..."
4,Mitochondrial DNA depletion syndrome 12,OMIM:615418,1.0,OMIM,"ENSG00000171428, ENSG00000121410, ENSG00000110..."


In [195]:
hpoa_phen_genes_2015 = preprocessor.map_genes(hpoa_phen_genes_2015, ['GENE_LIST'])
print(len(hpoa_phen_genes_2015))
hpoa_phen_genes_2015.head()


ERROR:root:The following gene can not be converted to an Ensembl ID: ITS


276162


Unnamed: 0,HPO_ID,HPO_NAME,GENE_LIST,HPO_ID_2019,GENE_LIST_ensembl,GENE_LIST_mapping_status
0,HP:0000878,11 pairs of ribs,HDAC6,HP:0000878,ENSG00000094631,mapped to ensembl
0,HP:0000878,11 pairs of ribs,LBR,HP:0000878,ENSG00000143815,mapped to ensembl
0,HP:0000878,11 pairs of ribs,SOX9,HP:0000878,ENSG00000125398,mapped to ensembl
0,HP:0000878,11 pairs of ribs,RNU4ATAC,HP:0000878,ENSG00000264229,mapped to ensembl
0,HP:0000878,11 pairs of ribs,ATR,HP:0000878,ENSG00000175054,mapped to ensembl


In [203]:
hpoa_phen_genes_2015_formatted = hpoa_phen_genes_2015.filter(['HPO_ID_2019','HPO_NAME','GENE_LIST_ensembl'])\
    .groupby(['HPO_ID_2019', 'HPO_NAME'], sort=False)['GENE_LIST_ensembl'].apply(list).reset_index()
hpoa_phen_genes_2015_formatted['GENE_LIST_ensembl'] = hpoa_phen_genes_2015_formatted['GENE_LIST_ensembl'].apply(lambda x: list(set([str(i) for i in x]))) #doing this to include null in list
hpoa_phen_genes_2015_formatted['GENE_LIST_ensembl'] = hpoa_phen_genes_2015_formatted['GENE_LIST_ensembl'].str.join(',')
hpoa_phen_genes_2015_formatted.head()

Unnamed: 0,HPO_ID_2019,HPO_NAME,GENE_LIST_ensembl
0,HP:0000878,11 pairs of ribs,"ENSG00000143815,ENSG00000264229,ENSG0000009463..."
1,HP:0001459,1-3 toe syndactyly,ENSG00000106571
2,HP:0006088,1-5 finger complete cutaneous syndactyly,ENSG00000105983
3,HP:0010708,1-5 finger syndactyly,ENSG00000105983
4,HP:0010713,1-5 toe syndactyly,ENSG00000106571


In [None]:
genecards = preprocessor.map_genes(genecards, ['GENE_ID'])
genecards.head()


In [None]:
disgenet = preprocessor.map_genes(disgenet, ['GENE_ID'])
disgenet.head()


In [None]:
orphanet = preprocessor.map_genes(orphanet, ['GENE'])
orphanet.head()


In [None]:
# 2015 KG
clinvar = preprocessor.map_genes(clinvar, ['GENE'])
clinvar.head()


In [None]:
gwas = preprocessor.map_genes(gwas, ['GENE'])
gwas.head()


In [None]:
gad = preprocessor.map_genes(gad, ['GENE_ID'])
gad.head()

In [None]:
gene_reviews = preprocessor.map_genes(gene_reviews, ['GENE'])
gene_reviews.head()


In [None]:
protein_interaction = preprocessor.map_genes(protein_interaction, ['PROTEIN_1', 'PROTEIN_2'])
protein_interaction.head()


In [None]:
biosystem = preprocessor.map_genes(biosystem, ['GENE'])
biosystem.head()


In [None]:
coba = preprocessor.map_genes(coba, ['GENE_ID'])
coba.head()

In [None]:
hgnc = preprocessor.map_genes(hgnc, ['GENE'])

hgnc.head()

In [None]:
mentha = preprocessor.map_genes(mentha, ['Gene A', 'Gene B'])

mentha.head()

In [None]:
transcription = preprocessor.map_genes(transcription, ['TF', 'TG'])
transcription.head()

In [189]:
omim_gene_dx = preprocessor.map_genes(omim_gene_dx, ['GENE'])
omim_gene_dx.head()


ERROR:root:The following gene can not be converted to an Ensembl ID: CDAN4
ERROR:root:The following gene can not be converted to an Ensembl ID: DER22t11-22
ERROR:root:The following gene can not be converted to an Ensembl ID: FRABIN
ERROR:root:The following gene can not be converted to an Ensembl ID: FSHD1
ERROR:root:The following gene can not be converted to an Ensembl ID: Zmpste24
ERROR:root:The following gene can not be converted to an Ensembl ID: FRTS1
ERROR:root:The following gene can not be converted to an Ensembl ID: MICRODEL3q29
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL10Q26
ERROR:root:The following gene can not be converted to an Ensembl ID: MSL1V1
ERROR:root:The following gene can not be converted to an Ensembl ID: ARVC10
ERROR:root:The following gene can not be converted to an Ensembl ID: PDE11A3
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL5Q12
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL2q

ERROR:root:The following gene can not be converted to an Ensembl ID: C9DELp24.3
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL15q11.2
ERROR:root:The following gene can not be converted to an Ensembl ID: DUPXQ27.3Q28
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL11p13
ERROR:root:The following gene can not be converted to an Ensembl ID: SCKL8
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL8q13
ERROR:root:The following gene can not be converted to an Ensembl ID: HJUMPY
ERROR:root:The following gene can not be converted to an Ensembl ID: DUPXq27.3q28
ERROR:root:The following gene can not be converted to an Ensembl ID: AIS4
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL9p
ERROR:root:The following gene can not be converted to an Ensembl ID: DUH1
ERROR:root:The following gene can not be converted to an Ensembl ID: DUP8Q22.1
ERROR:root:The following gene can not be converted to an Ensembl ID:

ERROR:root:The following gene can not be converted to an Ensembl ID: EJA1
ERROR:root:The following gene can not be converted to an Ensembl ID: HTGH
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL16p12.1
ERROR:root:The following gene can not be converted to an Ensembl ID: CMD1BB
ERROR:root:The following gene can not be converted to an Ensembl ID: DA10
ERROR:root:The following gene can not be converted to an Ensembl ID: CUP2q35
ERROR:root:The following gene can not be converted to an Ensembl ID: KIAA1756
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL16p12.1p11.2
ERROR:root:The following gene can not be converted to an Ensembl ID: NMLFS
ERROR:root:The following gene can not be converted to an Ensembl ID: DUPXQ28
ERROR:root:The following gene can not be converted to an Ensembl ID: PEE4
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL19p13.13
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL15Q

ERROR:root:The following gene can not be converted to an Ensembl ID: FGQTL6
ERROR:root:The following gene can not be converted to an Ensembl ID: TIIAC
ERROR:root:The following gene can not be converted to an Ensembl ID: HSAN
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL1p36
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL11P15P14
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL3Q13.31
ERROR:root:The following gene can not be converted to an Ensembl ID: PDE11A2
ERROR:root:The following gene can not be converted to an Ensembl ID: CDG1A
ERROR:root:The following gene can not be converted to an Ensembl ID: CTRCT28
ERROR:root:The following gene can not be converted to an Ensembl ID: DUP2Q31.1
ERROR:root:The following gene can not be converted to an Ensembl ID: DUP16P11.2
ERROR:root:The following gene can not be converted to an Ensembl ID: DEL3q29
ERROR:root:The following gene can not be converted to an Ensembl ID: GNA

Unnamed: 0,GENE,DISEASE,MIM_NUMBER,SOURCE_CODE,LINGKAGE_INFO,GENE_ensembl,GENE_mapping_status
0,A2M,Alpha-2-macroglobulin deficiency,614036,C,1,ENSG00000175899,mapped to ensembl
0,A2MD,Alpha-2-macroglobulin deficiency,614036,C,1,ENSG00000175899,mapped to ensembl
1,A2M,"Alzheimer disease, susceptibility to",104300,C,3,ENSG00000175899,mapped to ensembl
1,A2MD,"Alzheimer disease, susceptibility to",104300,C,3,ENSG00000175899,mapped to ensembl
2,A4GALT,"Blood group, P1Pk system, p phenotype",111400,P,3,ENSG00000128274,mapped to ensembl


In [202]:
omim_gene_dx_formatted = omim_gene_dx.filter(['GENE_ensembl', 'DISEASE', 'MIM_NUMBER', 'SOURCE_CODE', 'LINGKAGE_INFO'])\
    .groupby(['DISEASE', 'MIM_NUMBER', 'SOURCE_CODE', 'LINGKAGE_INFO'], sort=False)['GENE_ensembl'].apply(list).reset_index()
omim_gene_dx_formatted['GENE_ensembl'] = omim_gene_dx_formatted['GENE_ensembl'].apply(lambda x: list(set([str(i) for i in x]))) #doing this to include null in list
omim_gene_dx_formatted['GENE_ensembl'] = omim_gene_dx_formatted['GENE_ensembl'].str.join(', ')
omim_gene_dx_formatted.head()

Unnamed: 0,DISEASE,MIM_NUMBER,SOURCE_CODE,LINGKAGE_INFO,GENE_ensembl
0,Alpha-2-macroglobulin deficiency,614036,C,1,ENSG00000175899
1,"Alzheimer disease, susceptibility to",104300,C,3,"ENSG00000010704,ENSG00000175899,MVCD3,ENSG0000..."
2,"Blood group, P1Pk system, p phenotype",111400,P,3,ENSG00000128274
3,"Blood group, P1Pk system, P phenotype",111400,P,3,ENSG00000128274
4,NOR polyagglutination syndrome,111400,P,3,ENSG00000128274


# Write 2015 Phenolyzer KG edges to file

In [204]:
hpoa_phen_genes_2015_formatted \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_COMPILED_HPO_PHENOTYPE_GENE', sep='\t', index=False, header=False)

In [187]:
gene_dx_formatted.sort_values('DISEASE').filter(['GENE_ensembl', 'DISEASE', 'DISEASE_ID', 'SCORE', 'SOURCE'])\
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_COMPILED_GENE_DISEASE_SCORE', sep='\t', index=False)

In [178]:
with open(str(PHENOLYZER_2015_OUTPUT_DIR / 'DB_HUMAN_GENE_ID'), 'w') as f:
    f.write('#Format: tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from_nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_date (tab is used as a separator, pound sign - start of a comment)\n')
    human_gene_id_format.sort_values('').to_csv(f, sep='\t', index=False, header=False)
    

In [62]:
omim_gene_dx.filter(['GENE_ensembl', 'DISEASE', 'MIM_NUMBER', 'SOURCE_CODE', 'LINGKAGE_INFO']) \
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_OMIM_GENE_DISEASE', sep='\t', index=False)

In [None]:
transcription.filter(['TF_ensembl', 'TG_ensembl', 'EVIDENCE', 'PUBMED', 'SCORE']) \
    .rename(columns={'TF_ensembl':'TF', 'TG_ensembl':'TG'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_HTRI_TRANSCRIPTION_INTERACTION', sep='\t', index=False)

In [None]:
mentha.filter(['Gene A_ensembl', 'Gene B_ensembl', 'EVIDENCE', 'Score', 'PMID']) \
    .rename(columns={'Gene A_ensembl':'Gene A', 'Gene B_ensembl':'Gene B'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_MENTHA_GENE_GENE_INTERACTION', sep='\t', index=False)

In [None]:
hgnc.filter(['GENE_ensembl', 'GENE_FAMILY_TAG', 'DESCRIPTION']) \
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_HGNC_GENE_FAMILY', sep='\t', index=False)

In [None]:
coba.filter(['GENE_ID_ensembl', 'DISEASE', 'SOURCE', 'SCORE', 'CATEGORY']) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_COBA_NEUROCOMPLEX', sep='\t', index=False, header=False)

In [None]:
biosystem.filter(['BIOSYSTEM_ID', 'GENE_ensembl', 'SCORE', 'NAME']) \
    .rename(columns={'GENE_ensembl': 'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_COMPILED_BIOSYSTEM_SCORE', sep='\t', index=False)

In [None]:
protein_interaction.filter(['PROTEIN_1_ensembl', 'PROTEIN_2_ensembl', 'EVIDENCE', 'SCORE', 'PUBMED_ID']) \
    .rename(columns={'PROTEIN_1_ensembl':'PROTEIN_1', 'PROTEIN_2_ensembl':'PROTEIN_2'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_COMPILED_BINARY_PROTEIN_INTERACTION_SCORE', sep='\t',  index=False)

In [None]:
gene_reviews.filter(['GENE_ensembl', 'DISEASE', 'OMIM_NUMBER']) \
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_COMPILED_GENEREVIEWS', sep='\t', index=False)

In [None]:
genecards.filter(['GENE_ID_ensembl', 'DISEASE', 'UNK', 'SCORE', 'SOURCE']) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_GENECARDS_GENE_DISEASE_SCORE', sep='\t', index=False, header=False)

In [None]:
gad.filter(['GENE_ID_ensembl', 'DISEASE', 'DISEASE_PUBMED', 'SCORE', 'SOURCE']) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_GAD_GENE_DISEASE_SCORE', sep='\t', index=False, header=False)

In [None]:

disgenet.filter(['GENE_ID_ensembl', 'DISEASE', 'DISEASE_ID', 'SCORE', 'SOURCE']) \
    .rename(columns={'GENE_ID_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_DISGENET_GENE_DISEASE_SCORE', sep='\t', index=False, header=False)

In [None]:
orphanet.filter(['GENE_ensembl', 'DISEASE', 'ORPHANET_NUMBER', 'SOURCE_COUNT', 'LINKAGE_INFO']) \
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_ORPHANET_GENE_DISEASE', sep='\t', index=False)

In [None]:
gwas.filter(['GENE_ensembl', 'DISEASE', 'PUBMED_NUMBER', 'RAW_SCORE']) \
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_GWAS_GENE_DISEASE', sep='\t', index=False)

In [None]:
clinvar.filter(['GENE_ensembl', 'DISEASE', 'MIM_NUMBER', 'SOURCE_COUNT']) \
    .rename(columns={'GENE_ensembl':'GENE'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_CLINVAR_GENE_DISEASE', sep='\t', index=False)

In [None]:
# write 2015 KG
hpoa_phen_diseases_2015.filter(['HPO_ID_2019', 'SOURCE', 'HPO_DISEASE_NAME', 'FREQUENCY']) \
    .rename(columns={'HPO_ID_2019':'HPO_ID'}) \
    .to_csv(PHENOLYZER_2015_OUTPUT_DIR / 'DB_HPO_ANNOTATION', sep='\t', index=False)
    

# Convert Noisy Phenotype Frequencies to 2019 HPO

In [None]:
hpo_infant = preprocessor.map_phenotypes(hpo_infant, col_name = 'HPO')
hpo_child = preprocessor.map_phenotypes(hpo_child,col_name = 'HPO')
hpo_adolescent = preprocessor.map_phenotypes(hpo_adolescent, col_name = 'HPO')
hpo_adult = preprocessor.map_phenotypes(hpo_adult, col_name = 'HPO')
hpo_elderly = preprocessor.map_phenotypes(hpo_elderly, col_name = 'HPO')



In [None]:
def rename_cols(df):
    return df.drop(columns=['HPO']).rename(columns={'HPO_ID_2019':'HPO'})

In [None]:
hpo_infant = rename_cols(hpo_infant)
hpo_infant.to_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Infants_normalized.tsv", sep = "\t", index=False)

In [None]:
hpo_child = rename_cols(hpo_child)
hpo_child.to_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Children_normalized.tsv", sep = "\t", index=False)

hpo_adolescent = rename_cols(hpo_adolescent)
hpo_adolescent.to_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Adolescent_normalized.tsv", sep = "\t", index=False)

hpo_adult = rename_cols(hpo_adult)
hpo_adult.to_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Adult_normalized.tsv", sep = "\t", index=False)

hpo_elderly = rename_cols(hpo_elderly)
hpo_elderly.to_csv(config.CLAIMS_PATH / "HPO_Pheno_Counts_Elderly_normalized.tsv", sep = "\t", index=False)

# Convert 2019 Disegenet & HPOA (used for finding non-disease genes in simulation pipeline) to Ensembl 

## Read in data

In [None]:
disegenet_mappings = pd.read_csv(config.DISGENET_PATH /'source'/ "disease_mappings.tsv", delimiter="|")
disegenet_mappings_hpo = disegenet_mappings.query("vocabulary == 'HPO'")

In [None]:
disegenet_pheno_full = pd.read_csv(config.DISGENET_PATH / 'source' /"all_gene_disease_associations.tsv", delimiter="\t")

hpoa_genes_phenotypes = pd.read_csv(config.HPO_PATH / "ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt",
                                        delimiter= "\t",
                                        names = ['DiseaseID', "GeneID", "Gene_Symbol",  "HPO_ID", "HPO_Name"],
                                       skiprows=1, dtype=str)


fp_genes_prob = pd.read_csv(config.FLAGS_PATH / "flags_s3_fp_genes_freqs.txt", \
                            delimiter="\t", names=["GeneSymbol","Count"])

date_to_gene_map = {'01-Dec': 'DEC1', '06-Sep': 'SEPT6', '09-Mar': 'MARCH9',  '09-Sep': 'SEPT9', '11-Mar': 'MARCH11', 
                   '02-Sep': 'SEPT2', '01-Mar': 'MARCH1','10-Sep': 'SEPT10', '05-Mar': 'MARCH5', '07-Mar': 'MARCH7',
                   '11-Sep': 'SEPT11', '02-Mar': 'MARCH2', '07-Sep': 'SEPT7', '12-Sep': 'SEPT12', '04-Mar': 'MARCH4',
                   '15-Sep': 'SEP15', '08-Mar': 'MARCH8', '03-Sep': 'SEPT3', '05-Sep': 'SEPT5', '08-Sep': 'SEPT8',
                   '06-Mar': 'MARCH6', '01-Sep': 'SEPT1','03-Mar': 'MARCH3','14-Sep': 'SEPT14','10-Mar': 'MARCH10'}
fp_genes_prob['GeneSymbol'] = fp_genes_prob['GeneSymbol'].replace(date_to_gene_map)

## perform mapping

In [None]:
fp_genes_prob = preprocessor.map_genes(fp_genes_prob, ['GeneSymbol'])


In [None]:
disegenet_mappings_hpo = disegenet_mappings_hpo.rename(columns={'code': 'HPO_ID'})
disegenet_mappings_hpo = preprocessor.map_phenotypes(disegenet_mappings_hpo)

In [None]:
hpoa_genes_phenotypes = preprocessor.map_genes(hpoa_genes_phenotypes, ['Gene_Symbol'])
hpoa_genes_phenotypes = preprocessor.map_phenotypes(hpoa_genes_phenotypes)

In [None]:
disegenet_pheno_full = preprocessor.map_genes(disegenet_pheno_full, ['geneSymbol'])


## write to file

In [None]:
disegenet_mappings_hpo = disegenet_mappings_hpo.drop(columns=['name', 'HPO_ID', 'vocabulary'])
disegenet_mappings_hpo.to_csv(config.DISGENET_PATH / "hpo_disease_mappings_normalized.tsv", sep = '\t')

In [None]:
disegenet_pheno_full = disegenet_pheno_full.rename(columns={'geneSymbol_ensembl': 'ensembl_ids' })
disegenet_pheno_full.to_csv(config.DISGENET_PATH / "all_gene_disease_associations_normalized.tsv", sep='\t')


In [None]:
hpoa_genes_phenotypes = hpoa_genes_phenotypes.rename(columns={'Gene_Symbol_ensembl': 'ensembl_ids'})
hpoa_genes_phenotypes.to_csv(config.HPO_PATH / "ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes_normalized.txt", sep='\t')

In [None]:
fp_genes_prob = fp_genes_prob.drop(columns=['GeneSymbol']).rename(columns={'GeneSymbol_ensembl':'Ensembl_ID'})
fp_genes_prob.to_csv(config.FLAGS_PATH / "flags_s3_fp_genes_freqs_normalized.txt", sep="\t")

# Convert Orphanet Data used for simulating patients to Ensembl & 2019 HPO

In [35]:
# NOTE: These files are created via the Orphanet Convert to Flat Files script
orphanet_phenotypes_2015 = pd.read_csv(config.ORPHANET_PATH / 'intermediate_files'/ 'orphanet_final_disease_hpo_2015.tsv', sep='\t', dtype=str)
orphanet_gene_metadata_2015 = pd.read_csv(config.ORPHANET_PATH / 'intermediate_files'/ 'orphanet_final_gene_metadata_2015.tsv', sep='\t', dtype=str)
orphanet_genes_2015 = pd.read_csv(config.ORPHANET_PATH / 'intermediate_files' / 'orphanet_final_disease_genes_2015_bugfix.tsv', sep='\t', dtype=str)
orphanet_disease_metadata_2015 = pd.read_csv(config.ORPHANET_PATH / 'intermediate_files' / 'orphanet_final_disease_metadata_2015_bugfix.tsv', sep='\t', dtype=str)

#Make sure all orphanet phenotypes are in HPO 2019 hierarchy
# First, drop rows with 'HP:0500014' (abnormal test result) because it doesn't map to 2019 HPO & isn't very useful
orphanet_phenotypes_2015 = orphanet_phenotypes_2015.loc[orphanet_phenotypes_2015['HPO_ID']  != 'HP:0500014']
orphanet_phenotypes_2015 = preprocessor.map_phenotypes(orphanet_phenotypes_2015)
orphanet_phenotypes_2015 = orphanet_phenotypes_2015.drop(columns=['HPO_ID']).rename(columns={'HPO_ID_2019': 'HPO_ID'})
assert all( p in hpo_2019.nodes() for p in orphanet_phenotypes_2015['HPO_ID'])


In [36]:
# convert all orphanet genes to Ensembl IDs
orphanet_genes_2015 = preprocessor.map_genes(orphanet_genes_2015, ['Gene_Symbol'])
orphanet_genes_2015 = orphanet_genes_2015.drop(columns=['Gene_Symbol']).rename(columns={'Gene_Symbol_ensembl': 'Ensembl_ID'})


ERROR:root:The following gene can not be converted to an Ensembl ID: SLC7A2-IT1


In [37]:
orphanet_gene_metadata_2015 = preprocessor.map_genes(orphanet_gene_metadata_2015, ['Gene_Symbol'])
orphanet_gene_metadata_2015 = orphanet_gene_metadata_2015.drop(columns=['Gene_Symbol']).rename(columns={'Gene_Symbol_ensembl': 'Ensembl_ID'})


ERROR:root:The following gene can not be converted to an Ensembl ID: SLC7A2-IT1


In [39]:
orphanet_genes_2015.loc[orphanet_genes_2015['Ensembl_ID'] == 'ENSG00000262919']

Unnamed: 0,OrphaNumber,Disorder_Name,DG_Assoc_First_Published_Post_2015_01,Ensembl_ID,Gene_Symbol_mapping_status


In [40]:
orphanet_phenotypes_2015.to_csv(config.ORPHANET_PATH / 'orphanet_final_disease_hpo_normalized_2015.tsv', sep='\t', index=False)
orphanet_gene_metadata_2015.to_csv(config.ORPHANET_PATH / 'orphanet_final_gene_metadata_normalized_2015.tsv', sep='\t', index=False)
orphanet_genes_2015.to_csv(config.ORPHANET_PATH / 'orphanet_final_disease_genes_normalized_2015.tsv', sep='\t', index=False)
orphanet_disease_metadata_2015.to_csv(config.ORPHANET_PATH / 'orphanet_final_disease_metadata_normalized_2015.tsv', sep='\t', index=False)
