In [1]:
from phenopacket_mapper.data_standards import DataField
from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup, Cardinality

In [2]:
# Closed vocabulary of allowed values for the `interpretation_status` field.
interpretation_status_values = ValueSet(
    name="Interpretation Status Value Set",
    elements=("UNKNOWN_STATUS", "REJECTED", "CANDIDATE", "CONTRIBUTORY", "CAUSATIVE"),
)

# Small demo section with a single required integer field.
example_section = DataSection(
    name="example",
    required=True,
    fields=(
        DataField(name="a_number", required=True, specification=int),
    ),
)

# Gene description: identifier and symbol are required, free text is optional.
gene_descriptor_section = DataSection(
    name="GeneDescriptor",
    fields=(
        DataField(
            name="value_id",
            specification=str,
            required=True,
            description="Official identifier of the gene. REQUIRED.",
        ),
        DataField(
            name="symbol",
            specification=str,
            required=True,
            description="Official gene symbol. REQUIRED.",
        ),
        DataField(
            name="description",
            specification=str,
            required=False,
            description="A free-text description of the gene",
        ),
    ),
)

# The full model: two top-level fields, the demo section, and the "call"
# OrGroup, which currently lists GeneDescriptor as its only alternative.
genomic_interpretation = DataModel(
    name="Phenopacket schema Genomic Interpretation",
    fields=(
        # Only this field sets an explicit cardinality; the others rely on
        # the library's defaults.
        DataField(
            name="subject_or_biosample_id",
            specification=str,
            required=True,
            description="The id of the patient or biosample that is the subject being interpreted. REQUIRED.",
            cardinality=Cardinality.ONE,
        ),
        DataField(
            name="interpretation_status",
            specification=interpretation_status_values,
            required=True,
            description="status of the interpretation. REQUIRED.",
        ),
        example_section,
        OrGroup(
            name="call",
            fields=(gene_descriptor_section,),
        ),
    ),
)

In [3]:
# Nested members are reached by attribute access: first the `example` section,
# then its `a_number` field. The bare expression displays the field's repr.
genomic_interpretation.example.a_number

DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n'))

In [4]:
# Render the model via str() and print it to inspect its fields, sections and
# cardinalities as an indented, multi-line listing.
s = str(genomic_interpretation)

print(s)

DataModel(
	name: Phenopacket schema Genomic Interpretation
	DataField(
		id: subject_or_biosample_id,
		name: subject_or_biosample_id,
		required: True
		specification: ValueSet(elements=(<class 'str'>,), name='', description='')
		cardinality: 1..1
	)
	DataField(
		id: interpretation_status,
		name: interpretation_status,
		required: True
		specification: ValueSet(elements=('UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'), name='Interpretation Status Value Set', description='')
		cardinality: 1..n
	)
	DataSection(
		id: example,
		name: example,
		required: True
		cardinality: 1..n
	DataField(
		id: a_number,
		name: a_number,
		required: True
		specification: ValueSet(elements=(<class 'int'>,), name='', description='')
		cardinality: 1..n
	)
	)
	OrGroup(
		id: call,
		name: call,
		required: False
		cardinality: 0..n
	DataSection(
		id: genedescriptor,
		name: GeneDescriptor,
		required: False
		cardinality: 0..n
	DataField(
		id: value_id,
		name: value_id,
	

In [5]:
from io import StringIO

# Everything up to (but excluding) the <SubjectData> element; identical in
# every sample document. Note the single space between <ODM ...> and
# <ClinicalData ...>.
_ODM_PREFIX = (
    '<?xml version="1.0" encoding="UTF-8" ?> <ODM xmlns="http://www.cdisc.org/ns/odm/v1.3" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:redcap="https://projectredcap.org" xsi:schemaLocation="http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd" ODMVersion="1.3.1" FileOID="000-00-0000" FileType="Snapshot" Description="genAdipositas - ALT Demo" AsOfDateTime="2024-10-14T11:57:18" CreationDateTime="2024-10-14T11:57:18" SourceSystem="REDCap" SourceSystemVersion="14.6.9"> '
    '<ClinicalData StudyOID="Project.GenAdipositasALTDemo" MetaDataVersionOID="Metadata.GenAdipositasALTDemo_2024-10-14_1157">'
)

# Closing tags shared by every sample document.
_ODM_SUFFIX = '</SubjectData></ClinicalData></ODM>'


def _odm_document(subject_key, a_number):
    """Return one single-subject ODM document as a string.

    Args:
        subject_key: value written into the ``SubjectKey`` attribute.
        a_number: value written as the text of the ``<ANumber>`` element.
    """
    subject_open = (
        '<SubjectData SubjectKey="' + str(subject_key) + '" redcap:RecordIdField="record_id">'
    )
    number_element = '<ANumber>' + str(a_number) + '</ANumber>'
    return _ODM_PREFIX + subject_open + number_element + _ODM_SUFFIX


# Two sample documents that differ only in the subject key and the ANumber value.
xml_data = [
    _odm_document(subject_key, a_number)
    for subject_key, a_number in ((101, 123), (102, 124))
]

In [6]:
# Top-level fields are reachable as attributes of the model; the bare
# expression displays the field's repr.
genomic_interpretation.subject_or_biosample_id

DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1))

In [7]:
from phenopacket_mapper.utils.io import load_hierarchical_data, DataReader
from phenopacket_mapper.utils.io.input import load_hierarchical_data_recursive

loading value

In [8]:
# Parse the first sample document; its parsed content is read from `.data` below.
data_reader = DataReader(file=StringIO(xml_data[0]), file_extension="xml")

# Dot-separated location in the XML for each model field we want to load.
field_paths = {
    genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
    genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber",
}

# Load just one field: the "data model" handed to the recursive loader is the
# single `subject_or_biosample_id` DataField rather than the whole model.
tmp = load_hierarchical_data_recursive(
    loaded_data_instance_identifier="value",
    loaded_data_instance=data_reader.data,
    data_model=genomic_interpretation.subject_or_biosample_id,
    mapping=field_paths,
    resources=(),
)

# Separator line, then the loaded value.
print("-" * 80)
print(tmp)

--------------------------------------------------------------------------------
DataFieldValue(id='value:ODM.ClinicalData.SubjectData.SubjectKey', field=DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), value=101)


loading section

In [9]:
# Parse the first sample document again so this cell does not depend on the
# reader created by an earlier cell.
data_reader = DataReader(file=StringIO(xml_data[0]), file_extension="xml")

# Dot-separated location in the XML for each model field we want to load.
field_paths = {
    genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
    genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber",
}

# Load a whole section: the "data model" handed to the recursive loader is the
# `example` DataSection, so its nested `a_number` field is loaded with it.
tmp = load_hierarchical_data_recursive(
    loaded_data_instance_identifier="section",
    loaded_data_instance=data_reader.data,
    data_model=genomic_interpretation.example,
    mapping=field_paths,
    resources=(),
)

# Separator line, then the loaded section instance.
print("-" * 80)
print(tmp)

--------------------------------------------------------------------------------
DataSectionInstance(id='section:example', section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='section:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),))


loading instance

In [10]:
# In-memory text stream over the first sample document.
first_document = StringIO(xml_data[0])

# Dot-separated location in the XML for each model field we want to load.
field_paths = {
    genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
    genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber",
}

# Load one complete instance of the full data model from that document.
data_model_instance = load_hierarchical_data(
    file=first_document,
    data_model=genomic_interpretation,
    file_extension="xml",
    mapping=field_paths,
)
print(data_model_instance)

DataModelInstance(id='PLACEHOLDER_IDENTIFIER', data_model=DataModel(name='Phenopacket schema Genomic Interpretation', fields=(DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), DataField(name='interpretation_status', specification=ValueSet(elements=('UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'), name='Interpretation Status Value Set', description=''), id='interpretation_status', required=True, description='status of the interpretation. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1,



loading dataset

In [11]:
from phenopacket_mapper.utils.io.input import load_hierarchical_dataset

In [12]:
# Load every sample document into one dataset. Each XML string is wrapped in
# its own in-memory text stream; iterating over `xml_data` directly replaces
# the index-based `range(len(...))` loop and yields the same list of streams.
data_set = load_hierarchical_dataset(
    file=[StringIO(xml_document) for xml_document in xml_data],
    data_model=genomic_interpretation,
    file_extension="xml",
    # Dot-separated location in the XML for each model field to load.
    mapping={
        genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
        genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber",
    }
)

