In [50]:
from phenopacket_mapper.data_standards import DataField
from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup
from phenopacket_mapper.utils.io import DataReader
from referencing.jsonschema import specification_with

In [51]:
# Demo data model named after the Phenopacket schema "Genomic Interpretation".
# Every entry uses the same keyword order: name, description, required,
# then specification / fields.
genomic_interpretation = DataModel(
    data_model_name="Phenopacket schema Genomic Interpretation",
    fields=(
        # Free-text identifier of whoever / whatever is being interpreted.
        DataField(
            name="subject_or_biosample_id",
            description="The id of the patient or biosample that is the subject being interpreted. REQUIRED.",
            required=True,
            specification=str,
        ),

        # Closed vocabulary: the value must be one of the listed status codes.
        DataField(
            name="interpretation_status",
            description="status of the interpretation. REQUIRED.",
            required=True,
            specification=ValueSet(
                name="Interpretation Status Value Set",
                elements=["UNKNOWN_STATUS", "REJECTED", "CANDIDATE", "CONTRIBUTORY", "CAUSATIVE"],
            ),
        ),

        # Minimal nested section with a single integer field; used to exercise
        # nested look-ups in this notebook.
        DataSection(
            name="example",
            required=True,
            fields=(
                DataField(
                    name="a_number",
                    required=True,
                    specification=int,
                ),
            ),
        ),

        # "call" group; only the GeneDescriptor alternative is modelled here.
        OrGroup(
            name="call",
            fields=(
                DataSection(
                    name="GeneDescriptor",
                    fields=(
                        DataField(
                            name="value_id",
                            description="Official identifier of the gene. REQUIRED.",
                            required=True,
                            specification=str,
                        ),

                        DataField(
                            name="symbol",
                            description="Official gene symbol. REQUIRED.",
                            required=True,
                            specification=str,
                        ),

                        DataField(
                            name="description",
                            description="A free-text description of the gene",
                            required=False,
                            specification=str,
                        ),
                    ),
                ),
            ),
        ),
    ),
)

In [52]:
# Build the string form of the data model and print it so the nested
# field / section / group layout can be inspected by eye.
s = str(genomic_interpretation)

print(s)

DataModel(
	name: Phenopacket schema Genomic Interpretation
	DataField(
		id: subject_or_biosample_id,
		name: subject_or_biosample_id,
		required: True
		specification: ValueSet(elements=[<class 'str'>], name='', description='')
		cardinality: 1..n
	)
	DataField(
		id: interpretation_status,
		name: interpretation_status,
		required: True
		specification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')
		cardinality: 1..n
	)
	DataSection(
		id: example,
		name: example,
		required: True
		cardinality: 1..n
	DataField(
		id: a_number,
		name: a_number,
		required: True
		specification: ValueSet(elements=[<class 'int'>], name='', description='')
		cardinality: 1..n
	)
	)
	OrGroup(
		id: call,
		name: call,
		required: False
		cardinality: 0..n
	DataSection(
		id: genedescriptor,
		name: GeneDescriptor,
		required: False
		cardinality: 0..n
	DataField(
		id: value_id,
		name: value_id,
		r

In [53]:
from io import StringIO

# Minimal REDCap ODM export used as in-memory test input: one subject
# (SubjectKey="101") carrying a single <ANumber> element. The fragments are
# concatenated without separators, so the document is a single line.
xml_data = "".join([
    '<?xml version="1.0" encoding="UTF-8" ?> <ODM xmlns="http://www.cdisc.org/ns/odm/v1.3" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:redcap="https://projectredcap.org" xsi:schemaLocation="http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd" ODMVersion="1.3.1" FileOID="000-00-0000" FileType="Snapshot" Description="genAdipositas - ALT Demo" AsOfDateTime="2024-10-14T11:57:18" CreationDateTime="2024-10-14T11:57:18" SourceSystem="REDCap" SourceSystemVersion="14.6.9"> ',
    '<ClinicalData StudyOID="Project.GenAdipositasALTDemo" MetaDataVersionOID="Metadata.GenAdipositasALTDemo_2024-10-14_1157">',
    '<SubjectData SubjectKey="101" redcap:RecordIdField="record_id">',
    '<ANumber>123</ANumber>',
    '</SubjectData>',
    '</ClinicalData>',
    '</ODM>',
])

# File-like wrapper so the XML can be handed to code that expects a stream.
buffer = StringIO(xml_data)

In [54]:
from io import IOBase
from pathlib import Path
from typing import Union, List, Literal


def load_hierarchical_data(
        file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]],
        data_model: DataModel,
        file_extension: Union[Literal['csv', 'xlsx', 'json', 'xml'], None] = None,
        **kwargs,
) -> dict:
    """Read a hierarchical file and look up one value per requested field.

    The file is read through ``DataReader`` and its ``data`` attribute is
    treated as a nested mapping. Each extra keyword argument names a field and
    gives a dot-separated path (e.g. ``"ODM.ClinicalData.SubjectData.ANumber"``)
    that is followed key by key through that mapping.

    :param file: source handed unchanged to ``DataReader``
    :param data_model: data model the values are meant for; accepted for
        interface stability but not consulted yet
        (NOTE(review): no instance of the model is built from the values)
    :param file_extension: forwarded to ``DataReader``; ``None`` is forwarded
        as-is
    :param kwargs: ``field_id="dot.separated.path"`` pairs; each value must be
        a ``str``
    :return: dict mapping every keyword name to the value found at its path,
        in the order the keywords were given (previously the function returned
        ``None`` and the looked-up values were only printed)
    :raises KeyError: if a path segment is not present in the parsed data
    """
    data_reader = DataReader(file, file_extension=file_extension)
    xml_dict = data_reader.data

    resolved = {}
    for field_id, path in kwargs.items():
        # Progress output kept textually identical to the earlier
        # f"{k=}: {v=}" form so recorded notebook output does not change.
        print(f"k={field_id!r}: v={path!r}")

        # Walk the nested mapping one path segment at a time.
        node = xml_dict
        for key in path.split('.'):
            try:
                node = node[key]
            except KeyError as err:
                # Report which path and field failed, not just the bare key.
                raise KeyError(
                    f"cannot resolve path {path!r} for field {field_id!r}: "
                    f"key {key!r} not found"
                ) from err

        print(f"retrieved k={field_id!r}: v={node!r}")
        resolved[field_id] = node

    return resolved

In [55]:
# Read the in-memory ODM XML. Every keyword after `file_extension` pairs a
# field name with the dot-separated path of its value in the parsed document.
# NOTE(review): the double underscore in `example__a_number` presumably means
# "field a_number inside section example" - the loader does not interpret it.
data_model_instance = load_hierarchical_data(
    file=buffer,
    data_model=genomic_interpretation,
    file_extension="xml",
    subject_or_biosample_id="ODM.ClinicalData.SubjectData.SubjectKey",
    example__a_number="ODM.ClinicalData.SubjectData.ANumber",
)

k='subject_or_biosample_id': v='ODM.ClinicalData.SubjectData.SubjectKey'
retrieved k='subject_or_biosample_id': v=101
k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber'
retrieved k='example__a_number': v=123
