# HPOA File creator
This notebook shows how to convert a collection of phenopackets into a file that can be use for the phenotype.hpoa resource. The latter provides aggregate frequency data for each source.

For this example we first create a collection of phenopackets and then transform them into a file for the HPOA.

Our example will be [Yogev Y, et al. Limb girdle muscular disease caused by HMGCR mutation and statin myopathy treatable with mevalonolactone. Proc Natl Acad Sci U S A. 2023 Feb 14;120(7):e2217831120. doi: 10.1073/pnas.2217831120. Epub 2023 Feb 6. PMID: 36745799; PMCID: PMC9963716.]

In [1]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
import os, sys
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
pd.set_option('display.max_rows', None)
from collections import defaultdict

from phenopackets import Phenopacket
from google.protobuf.json_format import Parse
import json
import hpotk

from pyphetools.creation import *
from pyphetools.visualization import *

import importlib.metadata
__version__ = importlib.metadata.version("pyphetools")
print(f"Using pyphetools version {__version__}")

Using pyphetools version 0.8.3


In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
PMID="PMID:36745799"
title = "Limb girdle muscular disease caused by HMGCR mutation and statin myopathy treatable with mevalonolactone"
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199", pmid=PMID, pubmed_title=title)
metadata.default_versions_with_hpo(version=hpo_version)

In [3]:
data = "data/LGMDR28.xlsx"
df = pd.read_excel(data)

In [4]:
df.head()

Unnamed: 0,INDIVIDUAL,V:2,V:5,V:8,V:9,V:12,V:13
0,SEX,F,M,M,M,F,M
1,AGE_AT_EXAMINATION,49,58,37,42,51,41
2,AGE_AT_ONSET,31,39,24,33,31,34
3,PROXIMAL_STRENGTH-UPPER_LIMB,0/5,3/5,5/5,5/5,2/5,3/5
4,PROXIMAL_STRENGTH-LOWER_LIMB,0/5,2/5,5/5,4/5,2/5,4/5


In [5]:
dft = df.transpose()
dft.columns = dft.iloc[0]
dft.drop(dft.index[0], inplace=True)
dft['patient_id'] = dft.index  # Set the new column 'patient_id' to be identical to the contents of the index
dft.head() 

INDIVIDUAL,SEX,AGE_AT_EXAMINATION,AGE_AT_ONSET,PROXIMAL_STRENGTH-UPPER_LIMB,PROXIMAL_STRENGTH-LOWER_LIMB,ATROPHY_UPPER_LIMB,ATROPHY_LOWER_LIMB,DEEP_TENDON_REFLEXES,PAIN_ON_EXERTION,AMBULATORY,...,VLDL,FASTING_BLOOD_SUGAR,"ANA,RF,C3,C4ABNORMALITIES","ANTI-SM,ANTIJO-1,ANTI-SSA/B,ANCA,AMA",ANTI-HMGCR_AB,ABNORMAL_BRAIN_IMAGING,MYOPATHIC_CHANGES_IN_EMG,ABNORMAL_NCV,COMORBIDITIES,patient_id
V:2,F,49,31,0/5,0/5,Marked,Marked,Absent,+,-,...,17(9-26),390,-,-,-,-,+,-,Insulindependentdiabetes-onsetatage19,V:2
V:5,M,58,39,3/5,2/5,Marked,Marked,Diminished,+,-,...,25(15-46),123,-,,-,-,+,(+)L4-5radiculopathy,"COPD,Diastolicdysfunction,ICRBBB,Lymphocytosis",V:5
V:8,M,37,24,5/5,5/5,-,-,+,+,+,...,19,127,-,,-,,,,,V:8
V:9,M,42,33,5/5,4/5,-,-,+,+,+,...,22(12-32),111,,,-,-,,,ICRBBB,V:9
V:12,F,51,31,2/5,2/5,Evident,Evident,Diminished,+,-,...,30(11-154),124,-,-,-,-,+,-,Single kidney,V:12


In [6]:
column_mapper_d = {}

In [7]:
result = OptionColumnMapper.autoformat(df=dft, concept_recognizer=hpo_cr, delimiter=",")
print(result)

age_at_examination_d = {'49': 'PLACEHOLDER',
 '58': 'PLACEHOLDER',
 '37': 'PLACEHOLDER',
 '42': 'PLACEHOLDER',
 '51': 'PLACEHOLDER',
 '41': 'PLACEHOLDER'}
age_at_examinationMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=age_at_examination_d)
age_at_examinationMapper.preview_column(df['AGE_AT_EXAMINATION'])
column_mapper_d['AGE_AT_EXAMINATION'] = age_at_examinationMapper

age_at_onset_d = {'31': 'PLACEHOLDER',
 '39': 'PLACEHOLDER',
 '24': 'PLACEHOLDER',
 '33': 'PLACEHOLDER',
 '34': 'PLACEHOLDER'}
age_at_onsetMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=age_at_onset_d)
age_at_onsetMapper.preview_column(df['AGE_AT_ONSET'])
column_mapper_d['AGE_AT_ONSET'] = age_at_onsetMapper

proximal_strength-upper_limb_d = {'0/5': 'PLACEHOLDER',
 '3/5': 'PLACEHOLDER',
 '5/5': 'PLACEHOLDER',
 '2/5': 'PLACEHOLDER'}
proximal_strength-upper_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=proximal_strength-upper_limb_d)
proximal_strength-upper_limbMappe

In [8]:
ageColumn = AgeColumnMapper.by_year(column_name='AGE_AT_EXAMINATION')
ageColumn.preview_column(dft['AGE_AT_EXAMINATION'])

Unnamed: 0,original column contents,age
0,49,P49Y
1,58,P58Y
2,37,P37Y
3,42,P42Y
4,51,P51Y
5,41,P41Y


In [9]:
proximal_strength_upper_limb_d = {
 '3/5': 'Proximal muscle weakness in upper limbs',
 '5/5': 'Proximal muscle weakness in upper limbs',
 '2/5': 'Proximal muscle weakness in upper limbs'}
excluded_d = {'0/5': 'Proximal muscle weakness in upper limbs'}
proximal_strength_upper_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=proximal_strength_upper_limb_d, excluded_d=excluded_d)
proximal_strength_upper_limbMapper.preview_column(dft['PROXIMAL_STRENGTH-UPPER_LIMB'])
column_mapper_d['PROXIMAL_STRENGTH-UPPER_LIMB'] = proximal_strength_upper_limbMapper

In [10]:

proximal_strength_lower_limb_d = {
 '2/5': 'Proximal muscle weakness in lower limbs',
 '5/5': 'Proximal muscle weakness in lower limbs',
 '4/5': 'Proximal muscle weakness in lower limbs'}
excluded = {'0/5': 'Proximal muscle weakness in lower limbs'}
proximal_strength_lower_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=proximal_strength_lower_limb_d, excluded_d=excluded)
proximal_strength_lower_limbMapper.preview_column(dft['PROXIMAL_STRENGTH-LOWER_LIMB'])
column_mapper_d['PROXIMAL_STRENGTH-LOWER_LIMB'] = proximal_strength_lower_limbMapper

In [11]:
atrophy_upper_limb_d = {'Marked': 'Upper limb amyotrophy',
 'Evident': 'Upper limb amyotrophy'}
atrophy_upper_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=atrophy_upper_limb_d)
atrophy_upper_limbMapper.preview_column(dft['ATROPHY_UPPER_LIMB'])
column_mapper_d['ATROPHY_UPPER_LIMB'] = atrophy_upper_limbMapper

In [12]:
atrophy_lower_limb_d = {'Marked': 'Lower limb amyotrophy',
 'Evident': 'Lower limb amyotrophy'}
atrophy_lower_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=atrophy_lower_limb_d)
atrophy_lower_limbMapper.preview_column(dft['ATROPHY_LOWER_LIMB'])
column_mapper_d['ATROPHY_LOWER_LIMB'] = atrophy_lower_limbMapper

In [13]:
deep_tendon_reflexes_d = {'Absent': 'Areflexia',
 'Diminished': 'Hyporeflexia'}
deep_tendon_reflexesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=deep_tendon_reflexes_d)
deep_tendon_reflexesMapper.preview_column(dft['DEEP_TENDON_REFLEXES'])
column_mapper_d['DEEP_TENDON_REFLEXES'] = deep_tendon_reflexesMapper

In [14]:
#Exercise-induced myalgia HP:0003738
pain_on_exertion_d = {"+":"Exercise-induced myalgia"}
pain_on_exertionMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=pain_on_exertion_d)
pain_on_exertionMapper.preview_column(dft['PAIN_ON_EXERTION'])
column_mapper_d['PAIN_ON_EXERTION'] = pain_on_exertionMapper

In [15]:
# Loss of ambulation HP:0002505
ambulatory_d = {}
ambulatoryMapper = SimpleColumnMapper(hpo_id="HP:0002505", hpo_label="Loss of ambulation", observed="+", excluded="-")
ambulatoryMapper.preview_column(dft['AMBULATORY'])
column_mapper_d['AMBULATORY'] = ambulatoryMapper

In [16]:
# dft['MOBILITY_RESTRICTION'])
# Not coding

In [18]:
# Respiratory insufficiency HP:0002093
respiratory_difficulties_d = {'Ventilated_through_tracheostomy': 'Respiratory insufficiency', "+":"Respiratory insufficiency"}
excluded = {"-":"Respiratory insufficiency"}
respiratory_difficultiesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=respiratory_difficulties_d, excluded_d=excluded)
respiratory_difficultiesMapper.preview_column(dft['RESPIRATORY_DIFFICULTIES'])
column_mapper_d['RESPIRATORY_DIFFICULTIES'] = respiratory_difficultiesMapper

In [19]:
dysphagia_d = {}
dysphagiaMapper = SimpleColumnMapper(hpo_id="HP:0002015", hpo_label="Dysphagia", observed="+", excluded="-")
dysphagiaMapper.preview_column(dft['DYSPHAGIA'])
column_mapper_d['DYSPHAGIA'] = dysphagiaMapper

In [21]:
echocardiography_d = {
 'Mild diastolic_dysfunction': 'Left ventricular diastolic dysfunction',
}
echocardiographyMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=echocardiography_d)
echocardiographyMapper.preview_column(dft['ECHOCARDIOGRAPHY'])
column_mapper_d['ECHOCARDIOGRAPHY'] = echocardiographyMapper

In [22]:
cpkMapper = ConstantColumnMapper(hpo_id="HP:0003236", hpo_label='Elevated circulating creatine kinase concentration')
cpkMapper.preview_column(dft['CPK(REFERENCE_20-180_U/L)'])
column_mapper_d['CPK(REFERENCE_20-180_U/L)'] = cpkMapper

In [23]:
troponinMapper = ThresholdedColumnMapper(hpo_id="HP:0410174", 
                                         hpo_label="Increased circulating troponin T concentration", 
                                         threshold=14, 
                                         call_if_above=True)
troponinMapper.preview_column(dft['MAXIMAL TROPONIN T(0-14NG/L)'])
#column_mapper_d['MAXIMAL TROPONIN T(0-14NG/L)'] = troponinMapper

Unnamed: 0,term,status
0,Increased circulating troponin T concentration (HP:0410174),observed
1,Increased circulating troponin T concentration (HP:0410174),observed
2,Increased circulating troponin T concentration (HP:0410174),excluded
3,Increased circulating troponin T concentration (HP:0410174),observed
4,Increased circulating troponin T concentration (HP:0410174),observed
5,Increased circulating troponin T concentration (HP:0410174),excluded


In [27]:
# Not abnormal for any individual
creatinineMapper = ConstantColumnMapper(hpo_id="HP:0012100", hpo_label="Abnormal circulating creatinine concentration", excluded=True)
creatinineMapper.preview_column(dft['CREATININE'])
column_mapper_d['CREATININE'] = creatinineMapper

In [29]:
ast_d = {'34(12-106)': 'Elevated circulating aspartate aminotransferase concentration',
 '54(15-241)': 'Elevated circulating aspartate aminotransferase concentration',
 '277(68-905)': 'Elevated circulating aspartate aminotransferase concentration',
 '43(21-138)': 'Elevated circulating aspartate aminotransferase concentration',
 '98(28-566)': 'Elevated circulating aspartate aminotransferase concentration'}
excluded = {'23(19-29)': 'Elevated circulating aspartate aminotransferase concentration'}
astMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=ast_d, excluded_d=excluded)
astMapper.preview_column(dft['AST(REFERENCE_0-35U/L)'])
column_mapper_d['AST(REFERENCE_0-35U/L)'] = astMapper

Unnamed: 0,terms
0,HP:0031956 (Elevated circulating aspartate aminotransferase concentration/observed)
1,HP:0031956 (Elevated circulating aspartate aminotransferase concentration/observed)
2,HP:0031956 (Elevated circulating aspartate aminotransferase concentration/observed)
3,HP:0031956 (Elevated circulating aspartate aminotransferase concentration/excluded)
4,HP:0031956 (Elevated circulating aspartate aminotransferase concentration/observed)
5,HP:0031956 (Elevated circulating aspartate aminotransferase concentration/observed)


In [33]:
alt_d = {'31(9-113)': 'Elevated circulating alanine aminotransferase concentration',
 '50(10-199)': 'Elevated circulating alanine aminotransferase concentration',
 '322(43-911)': 'Elevated circulating alanine aminotransferase concentration',
 '44(12-173)': 'Elevated circulating alanine aminotransferase concentration',
 '80(21-375)': 'Elevated circulating alanine aminotransferase concentration'}
excluded = {'15(11-25)': 'Elevated circulating alanine aminotransferase concentration'}
altMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=alt_d, excluded_d=excluded)
altMapper.preview_column(dft['ALT(REFERENCE_0-45U/L)'])
column_mapper_d['ALT(REFERENCE_0-45U/L)'] = altMapper

In [21]:





alkaline_phosphatase(reference_30-120u/l)_d = {'151(108-331)': 'PLACEHOLDER',
 '109(78-130)': 'PLACEHOLDER',
 '78(67-88)': 'PLACEHOLDER',
 '89(65-107)': 'PLACEHOLDER',
 '100(72-132)': 'PLACEHOLDER',
 '79(68-94)': 'PLACEHOLDER'}
alkaline_phosphatase(reference_30-120u/l)Mapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=alkaline_phosphatase(reference_30-120u/l)_d)
alkaline_phosphatase(reference_30-120u/l)Mapper.preview_column(df['ALKALINE_PHOSPHATASE(REFERENCE_30-120U/L)'])
column_mapper_d['ALKALINE_PHOSPHATASE(REFERENCE_30-120U/L)'] = alkaline_phosphatase(reference_30-120u/l)Mapper

total_cholesterol_average(recommended<200mg/dl)_d = {'146(127-167)': 'PLACEHOLDER',
 '159(144-182)': 'PLACEHOLDER',
 '128(111-158)': 'PLACEHOLDER',
 '136(79-160)': 'PLACEHOLDER',
 '171(147-211)': 'PLACEHOLDER',
 '128(104-137)': 'PLACEHOLDER'}
total_cholesterol_average(recommended<200mg/dl)Mapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=total_cholesterol_average(recommended<200mg/dl)_d)
total_cholesterol_average(recommended<200mg/dl)Mapper.preview_column(df['TOTAL_CHOLESTEROL_AVERAGE(RECOMMENDED<200MG/DL)'])
column_mapper_d['TOTAL_CHOLESTEROL_AVERAGE(RECOMMENDED<200MG/DL)'] = total_cholesterol_average(recommended<200mg/dl)Mapper

triglycerides(recommended<150mg/dl)_d = {'87(47-129)': 'PLACEHOLDER',
 '123(79-230)': 'PLACEHOLDER',
 '95.5(95-96)': 'PLACEHOLDER',
 '108(58-160)': 'PLACEHOLDER',
 '149(55-270)': 'PLACEHOLDER',
 '167(77-232)': 'PLACEHOLDER'}
triglycerides(recommended<150mg/dl)Mapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=triglycerides(recommended<150mg/dl)_d)
triglycerides(recommended<150mg/dl)Mapper.preview_column(df['TRIGLYCERIDES(RECOMMENDED<150MG/DL)'])
column_mapper_d['TRIGLYCERIDES(RECOMMENDED<150MG/DL)'] = triglycerides(recommended<150mg/dl)Mapper

hdl(recommended>60mg/dl)_d = {'49(31-65)': 'PLACEHOLDER',
 '49(43-57)': 'PLACEHOLDER',
 '38(30-46)': 'PLACEHOLDER',
 '45(31-50)': 'PLACEHOLDER',
 '55(30-70)': 'PLACEHOLDER',
 '41(27-49)': 'PLACEHOLDER'}
hdl(recommended>60mg/dl)Mapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=hdl(recommended>60mg/dl)_d)
hdl(recommended>60mg/dl)Mapper.preview_column(df['HDL(RECOMMENDED>60MG/DL)'])
column_mapper_d['HDL(RECOMMENDED>60MG/DL)'] = hdl(recommended>60mg/dl)Mapper

ldl(recommended<100mg/dl)_d = {'80(68-99)': 'PLACEHOLDER',
 '87(79-104)': 'PLACEHOLDER',
 '77(62-92)': 'PLACEHOLDER',
 '67(28-81)': 'PLACEHOLDER',
 '82.5(50-112)': 'PLACEHOLDER',
 '55(31-71)': 'PLACEHOLDER'}
ldl(recommended<100mg/dl)Mapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=ldl(recommended<100mg/dl)_d)
ldl(recommended<100mg/dl)Mapper.preview_column(df['LDL(RECOMMENDED<100MG/DL)'])
column_mapper_d['LDL(RECOMMENDED<100MG/DL)'] = ldl(recommended<100mg/dl)Mapper

vldl_d = {'17(9-26)': 'PLACEHOLDER',
 '25(15-46)': 'PLACEHOLDER',
 '19': 'PLACEHOLDER',
 '22(12-32)': 'PLACEHOLDER',
 '30(11-154)': 'PLACEHOLDER',
 '33(15-46)': 'PLACEHOLDER'}
vldlMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=vldl_d)
vldlMapper.preview_column(df['VLDL'])
column_mapper_d['VLDL'] = vldlMapper

fasting_blood_sugar_d = {'390': 'PLACEHOLDER',
 '123': 'PLACEHOLDER',
 '127': 'PLACEHOLDER',
 '111': 'PLACEHOLDER',
 '124': 'PLACEHOLDER',
 '155': 'PLACEHOLDER'}
fasting_blood_sugarMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=fasting_blood_sugar_d)
fasting_blood_sugarMapper.preview_column(df['FASTING_BLOOD_SUGAR'])
column_mapper_d['FASTING_BLOOD_SUGAR'] = fasting_blood_sugarMapper

ana,rf,c3,c4abnormalities_d = {'nan': 'PLACEHOLDER'}
ana,rf,c3,c4abnormalitiesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=ana,rf,c3,c4abnormalities_d)
ana,rf,c3,c4abnormalitiesMapper.preview_column(df['ANA,RF,C3,C4ABNORMALITIES'])
column_mapper_d['ANA,RF,C3,C4ABNORMALITIES'] = ana,rf,c3,c4abnormalitiesMapper

anti-sm,antijo-1,anti-ssa/b,anca,ama_d = {'nan': 'PLACEHOLDER'}
anti-sm,antijo-1,anti-ssa/b,anca,amaMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=anti-sm,antijo-1,anti-ssa/b,anca,ama_d)
anti-sm,antijo-1,anti-ssa/b,anca,amaMapper.preview_column(df['ANTI-SM,ANTIJO-1,ANTI-SSA/B,ANCA,AMA'])
column_mapper_d['ANTI-SM,ANTIJO-1,ANTI-SSA/B,ANCA,AMA'] = anti-sm,antijo-1,anti-ssa/b,anca,amaMapper

anti-hmgcr_ab_d = {}
anti-hmgcr_abMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=anti-hmgcr_ab_d)
anti-hmgcr_abMapper.preview_column(df['ANTI-HMGCR_AB'])
column_mapper_d['ANTI-HMGCR_AB'] = anti-hmgcr_abMapper

abnormal_brain_imaging_d = {'nan': 'PLACEHOLDER'}
abnormal_brain_imagingMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=abnormal_brain_imaging_d)
abnormal_brain_imagingMapper.preview_column(df['ABNORMAL_BRAIN_IMAGING'])
column_mapper_d['ABNORMAL_BRAIN_IMAGING'] = abnormal_brain_imagingMapper

myopathic_changes_in_emg_d = {'nan': 'PLACEHOLDER'}
myopathic_changes_in_emgMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=myopathic_changes_in_emg_d)
myopathic_changes_in_emgMapper.preview_column(df['MYOPATHIC_CHANGES_IN_EMG'])
column_mapper_d['MYOPATHIC_CHANGES_IN_EMG'] = myopathic_changes_in_emgMapper

abnormal_ncv_d = {'(+)L4-5radiculopathy': 'PLACEHOLDER',
 'nan': 'PLACEHOLDER'}
abnormal_ncvMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=abnormal_ncv_d)
abnormal_ncvMapper.preview_column(df['ABNORMAL_NCV'])
column_mapper_d['ABNORMAL_NCV'] = abnormal_ncvMapper

comorbidities_d = {'Insulindependentdiabetes-onsetatage19': 'PLACEHOLDER',
 'COPD': 'PLACEHOLDER',
 'Diastolicdysfunction': 'PLACEHOLDER',
 'ICRBBB': 'PLACEHOLDER',
 'Lymphocytosis': 'Lymphocytosis',
 'nan': 'PLACEHOLDER',
 'Single kidney': 'Unilateral renal agenesis'}
comorbiditiesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=comorbidities_d)
comorbiditiesMapper.preview_column(df['COMORBIDITIES'])
column_mapper_d['COMORBIDITIES'] = comorbiditiesMapper

AttributeError: 'NoneType' object has no attribute 'groups'

In [22]:
dft['AGE_AT_EXAMINATION']

V:2            49
V:5            58
V:8            37
V:9            42
V:12           51
V:13           41
Unnamed: 7    NaN
Name: AGE_AT_EXAMINATION, dtype: object