In [8]:
from phenopacket_mapper.data_standards import DataModelInstance
from phenopacket_mapper.data_standards import DataField
from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup
from phenopacket_mapper.data_standards.data_model import DataSectionInstance
from phenopacket_mapper.utils.io import DataReader

In [9]:
# Hierarchical data model used throughout this notebook. Every entry uses the
# same keyword order (name, specification/fields, required, description) so the
# individual definitions can be compared at a glance.
genomic_interpretation = DataModel(
    data_model_name="Phenopacket schema Genomic Interpretation",
    fields=(
        # --- top-level scalar fields ---------------------------------------
        DataField(
            name="subject_or_biosample_id",
            specification=str,
            required=True,
            description="The id of the patient or biosample that is the subject being interpreted. REQUIRED.",
        ),
        DataField(
            name="interpretation_status",
            # Restricted to a fixed vocabulary of status strings.
            specification=ValueSet(
                name="Interpretation Status Value Set",
                elements=["UNKNOWN_STATUS", "REJECTED", "CANDIDATE", "CONTRIBUTORY", "CAUSATIVE"],
            ),
            required=True,
            description="status of the interpretation. REQUIRED.",
        ),
        # --- nested section with a single integer field --------------------
        DataSection(
            name="example",
            fields=(
                DataField(
                    name="a_number",
                    specification=int,
                    required=True,
                ),
            ),
            required=True,
        ),
        # --- "call" group; GeneDescriptor is the only member defined here --
        # NOTE(review): presumably an OrGroup means "one of its members applies" - confirm.
        OrGroup(
            name="call",
            fields=(
                DataSection(
                    name="GeneDescriptor",
                    fields=(
                        DataField(
                            name="value_id",
                            specification=str,
                            required=True,
                            description="Official identifier of the gene. REQUIRED.",
                        ),
                        DataField(
                            name="symbol",
                            specification=str,
                            required=True,
                            description="Official gene symbol. REQUIRED.",
                        ),
                        DataField(
                            name="description",
                            specification=str,
                            required=False,
                            description="A free-text description of the gene",
                        ),
                    ),
                ),
            ),
        ),
    ),
)

In [10]:
# Nested members are reachable via attribute access: display the `a_number`
# DataField that lives inside the `example` section of the model.
genomic_interpretation.example.a_number

DataField(name='a_number', specification=ValueSet(elements=[<class 'int'>], name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n'))

In [11]:
# Render the complete model as text. print() is used (rather than the bare cell
# value) so the tabs and newlines inside the string are laid out, not escaped.
s = str(genomic_interpretation)

print(s)

DataModel(
	name: Phenopacket schema Genomic Interpretation
	DataField(
		id: subject_or_biosample_id,
		name: subject_or_biosample_id,
		required: True
		specification: ValueSet(elements=[<class 'str'>], name='', description='')
		cardinality: 1..n
	)
	DataField(
		id: interpretation_status,
		name: interpretation_status,
		required: True
		specification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')
		cardinality: 1..n
	)
	DataSection(
		id: example,
		name: example,
		required: True
		cardinality: 1..n
	DataField(
		id: a_number,
		name: a_number,
		required: True
		specification: ValueSet(elements=[<class 'int'>], name='', description='')
		cardinality: 1..n
	)
	)
	OrGroup(
		id: call,
		name: call,
		required: False
		cardinality: 0..n
	DataSection(
		id: genedescriptor,
		name: GeneDescriptor,
		required: False
		cardinality: 0..n
	DataField(
		id: value_id,
		name: value_id,
		r

In [12]:
from io import StringIO

# Minimal REDCap ODM export containing one subject record with a single
# <ANumber> element. The document is assembled element by element; the joined
# result is a single-line XML string.
xml_data = "".join((
    # XML declaration (note the trailing space before the root element)
    '<?xml version="1.0" encoding="UTF-8" ?> ',
    # root element with its namespace and export metadata attributes
    '<ODM xmlns="http://www.cdisc.org/ns/odm/v1.3" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:redcap="https://projectredcap.org" xsi:schemaLocation="http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd" ODMVersion="1.3.1" FileOID="000-00-0000" FileType="Snapshot" Description="genAdipositas - ALT Demo" AsOfDateTime="2024-10-14T11:57:18" CreationDateTime="2024-10-14T11:57:18" SourceSystem="REDCap" SourceSystemVersion="14.6.9"> ',
    '<ClinicalData StudyOID="Project.GenAdipositasALTDemo" MetaDataVersionOID="Metadata.GenAdipositasALTDemo_2024-10-14_1157">',
    '<SubjectData SubjectKey="101" redcap:RecordIdField="record_id">',
    '<ANumber>123</ANumber>',
    '</SubjectData>',
    '</ClinicalData>',
    '</ODM>',
))

# In-memory text stream over the XML so it can stand in for a file object.
buffer = StringIO(xml_data)

In [13]:
import warnings
from phenopacket_mapper.data_standards import DataFieldValue
import math
from io import IOBase
from pathlib import Path
from typing import Union, List, Literal, Dict

from phenopacket_mapper.utils import parsing
    
def _is_missing_value(value) -> bool:
    """Return True when `value` carries no data: None, NaN, or an empty string/container.

    Falsy-but-meaningful values such as ``0`` or ``False`` are NOT treated as missing.
    """
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, (str, bytes, list, tuple, dict, set)):
        return len(value) == 0
    return False


def _lookup_mapping_path(mapping, data_field):
    """Return the dotted key path configured for `data_field`, or None when it is unmapped.

    The field object itself is tried as key first; if it is unhashable or not
    present, the field's `id` string is tried instead.
    """
    if not mapping:
        return None
    try:
        path = mapping.get(data_field)
    except TypeError:
        # An unhashable field can never have been stored as a dict key.
        path = None
    if not path:
        # NOTE(review): ids of fields in different sections may collide; this
        # fallback does not disambiguate them.
        path = mapping.get(getattr(data_field, "id", None))
    return path


def load_hierarchical_data_recursive(
        loaded_data_instance_identifier: Union[int, str],
        loaded_data_instance: Dict,
        data_model: Union[DataModel, DataSection, OrGroup, DataField],
        compliance: Literal['lenient', 'strict'] = 'lenient',
        mapping: Dict[DataField, str] = None,
        resources: List = None,
):
    """Helper method for `load_hierarchical_data`, recurses through hierarchical :class:`DataModel`

    `loaded_data_instance` is expected to be a dictionary as returned by `DataReader.data` when reading a single xml
    or json file.

    What is returned depends on the type of `data_model`:

    * :class:`DataModel`: a tuple with one entry per field of the model (entries may be ``None``)
    * :class:`DataSection`: a :class:`DataSectionInstance` whose `values` is such a tuple
    * :class:`OrGroup`: ``None`` (resolving or-groups is not implemented yet)
    * :class:`DataField`: a :class:`DataFieldValue`, or ``None`` if the field is unmapped or has no value in the data
    * anything else: ``None`` after a warning ('lenient') or a ``ValueError`` ('strict')

    :param loaded_data_instance_identifier: identifier of the loaded data_instance
    :param loaded_data_instance: data loaded in by :class:`DataReader`
    :param data_model: the (part of the) data model to load values for
    :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields
                        that are not in the DataModel. If 'strict', the file must have all fields in the DataModel.
    :param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the
                        data (dot-separated key paths). Keys may be the :class:`DataField` itself or its `id`.
    :param resources: fallback resources handed to `parsing.parse_value` for fields that have no `resources`
                        attribute of their own. If omitted, the `resources` attribute of the :class:`DataModel` is
                        used when it exists, otherwise an empty list.
    :raises ValueError: if `data_model` has an unsupported type and `compliance` is 'strict', or if `compliance` is
                        neither 'lenient' nor 'strict' in that situation
    """
    if isinstance(data_model, DataModel):
        if resources is None:
            # NOTE(review): assumes the DataModel exposes its resources as `.resources` - confirm.
            resources = getattr(data_model, "resources", None)

        # Materialise eagerly: a generator would be single-use and would defer
        # every error and warning until somebody happens to iterate it.
        return tuple(
            load_hierarchical_data_recursive(
                loaded_data_instance_identifier=loaded_data_instance_identifier,
                loaded_data_instance=loaded_data_instance,
                data_model=f,
                compliance=compliance,
                mapping=mapping,
                resources=resources,
            )
            for f in data_model.fields
        )
    elif isinstance(data_model, DataSection):
        data_section: DataSection = data_model

        values = tuple(
            load_hierarchical_data_recursive(
                loaded_data_instance_identifier=loaded_data_instance_identifier,
                loaded_data_instance=loaded_data_instance,
                data_model=f,
                compliance=compliance,
                mapping=mapping,
                resources=resources,
            )
            for f in data_section.fields
        )

        return DataSectionInstance(
            identifier=str(loaded_data_instance_identifier) + ":" + data_section.id,  # TODO: get identifiers of parents
            data_section=data_section,
            values=values,
        )
    elif isinstance(data_model, OrGroup):
        # TODO: resolve or this seems to be very difficult
        # Until then or-groups are skipped and contribute no value.
        return None
    elif isinstance(data_model, DataField):
        data_field = data_model

        keys_str = _lookup_mapping_path(mapping, data_field)
        if not keys_str:
            # Field is not mapped to anything in the data: nothing to load.
            return None

        # `recursive_dict_call` is defined outside this block.
        # NOTE(review): presumably it walks the nested dict along `keys` - confirm.
        keys = keys_str.split('.')
        dict_value = recursive_dict_call(loaded_data_instance, keys)

        if _is_missing_value(dict_value):
            return None

        # Prefer resources attached to the field itself, then the inherited ones.
        field_resources = getattr(data_field, "resources", None)
        if field_resources is None:
            field_resources = resources if resources is not None else []

        value = parsing.parse_value(
            value_str=str(dict_value),
            resources=field_resources,
            compliance=compliance,
        )
        return DataFieldValue(
            row_no=str(loaded_data_instance_identifier) + ":" + keys_str,
            field=data_field,
            value=value,
        )
    else:
        err_msg = f"DataModel {data_model} is not a valid type ({type(data_model)})."
        if compliance == 'strict':
            raise ValueError(err_msg)
        elif compliance == 'lenient':
            warnings.warn(err_msg)
            return None
        else:
            raise ValueError(f"Invalid compliance level: {compliance}")
        

def load_hierarchical_data(
        file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]],
        data_model: DataModel,
        file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None,
        compliance: Literal['lenient', 'strict'] = 'lenient',
        mapping: Dict[DataField, str] = None,
) -> List[DataModelInstance]:
    """Load hierarchical data (e.g. xml or json) into instances of `data_model`

    The file(s) are read with :class:`DataReader`; for every element of the reader's `iterable` one
    :class:`DataModelInstance` is assembled via `load_hierarchical_data_recursive`.

    :param file: file path(s) or buffer(s) to read, passed on to :class:`DataReader`
    :param data_model: the data model the loaded data should be an instance of
    :param file_extension: file type, passed on to :class:`DataReader`
    :param compliance: Compliance level to enforce when reading the file, either 'lenient' or 'strict'
    :param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the
                    data. Must not be empty.
    :return: list with one :class:`DataModelInstance` per loaded data instance, in reading order
    :raises AttributeError: if `mapping` is empty or None
    """
    if not mapping:
        raise AttributeError(f"Parameter 'mapping' must not be empty or None. {mapping=}, {type(mapping)=}")

    if not data_model.is_hierarchical:
        warnings.warn("This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.")

    data_reader = DataReader(file, file_extension=file_extension)

    # assembling data model instances
    data_model_instances = []

    for i, data_instance in enumerate(data_reader.iterable):
        values = load_hierarchical_data_recursive(
            loaded_data_instance_identifier=str(i),
            loaded_data_instance=data_instance,
            data_model=data_model,
            compliance=compliance,
            mapping=mapping
        )
        if hasattr(values, "__next__"):
            # A one-shot iterator could only be consumed once by the instance;
            # store a reusable tuple instead.
            values = tuple(values)

        data_model_instances.append(
            DataModelInstance(
                row_no=i,
                data_model=data_model,
                values=values,
                compliance=compliance,
            )
        )

    # Hand the assembled instances back to the caller.
    return data_model_instances
        

In [14]:
# Load the in-memory XML into instances of `genomic_interpretation`. Each mapped
# field points at a dot-separated key path inside the parsed XML document.
# NOTE(review): the recorded output of this cell is
# `TypeError: unhashable type: 'list'`. Presumably the DataField objects used as
# dict keys cannot be hashed because their ValueSet specification holds a list -
# confirm before relying on this cell.
data_model_instance = load_hierarchical_data(
    file=buffer, 
    data_model=genomic_interpretation, 
    file_extension="xml",
    mapping={
        genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
        genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber",
    }
)

TypeError: unhashable type: 'list'