In [4]:
#!pip install xmlschema
#!pip install lxml

In [12]:
import xmlschema
from xmlschema import limits
import pandas as pd
import os
from lxml import etree
import gzip
import shutil

In [6]:

# Raise xmlschema's element-count ceiling (10 ** 10) ahead of loading the large extract.
limits.MAX_XML_ELEMENTS = 10_000_000_000

In [7]:
def xsd_to_df(schema):
    """
    Parses an XSD file and returns a pandas DataFrame as a tabular representation.

    Every element declaration reachable from the schema's global elements
    produces one row. An element name that is declared under several parents
    therefore appears once per location, each with its own ``search_path``
    (previously such rows overwrote each other because they were keyed by name).

    Parameters
    ----------
    schema : object
        A loaded schema exposing ``elements``, a mapping whose values are the
        global element declarations (e.g. an ``xmlschema.XMLSchema``).

    Returns
    -------
    pandas.DataFrame
        Columns ``element_name``, ``type``, ``parent_element``, ``search_path``
        (list of ancestor element names, root first), ``occurrence`` and
        ``description``. Rows are in depth-first traversal order. The columns
        are present even when the schema declares no elements.
    """
    columns = [
        'element_name',
        'type',
        'parent_element',
        'search_path',
        'occurrence',
        'description',
    ]
    records = []

    def describe(element):
        """Return the stripped text of the element's first documentation entry, or ''."""
        annotation = element.annotation
        if not annotation or not annotation.documentation:
            return ''
        first_doc = annotation.documentation[0]
        # Documentation entries are XML nodes: read their text content. Fall back
        # to str() for objects that do not expose itertext().
        itertext = getattr(first_doc, 'itertext', None)
        text = ''.join(itertext()) if callable(itertext) else str(first_doc)
        return text.strip()

    def child_elements(element):
        """Yield each child declaration once, from the type's content and the element's own."""
        yielded = set()
        for owner in (element.type, element):
            content = getattr(owner, 'content', None)
            iter_elements = getattr(content, 'iter_elements', None)
            # Skip owners without a content model that can enumerate elements.
            if content is None or iter_elements is None:
                continue
            for child in iter_elements():
                # The same declaration may be reachable through both owners.
                if id(child) not in yielded:
                    yielded.add(id(child))
                    yield child

    def traverse_element(element, parent_name=None, ancestors=(), open_types=frozenset()):
        """Record one row for ``element`` and recurse into its children."""
        name = element.local_name
        element_type = element.type
        type_name = element_type.local_name if element_type else "N/A"
        max_occurs = element.max_occurs if element.max_occurs is not None else 'unbounded'

        records.append({
            'element_name': name,
            'type': type_name,
            'parent_element': parent_name,
            # The path is carried down the recursion, not looked up by name,
            # so same-named elements in different branches stay independent.
            'search_path': list(ancestors),
            'occurrence': f"minOccurs={element.min_occurs}, maxOccurs={max_occurs}",
            'description': describe(element),
        })

        # A type that is already being expanded higher up this branch is
        # recursive: keep the row but do not descend again.
        type_key = id(element_type) if hasattr(element_type, 'content') else None
        if type_key is not None:
            if type_key in open_types:
                return
            open_types = open_types | {type_key}

        child_path = ancestors + (name,)
        for child in child_elements(element):
            traverse_element(child, name, child_path, open_types)

    # Start traversal from global elements
    for global_element in schema.elements.values():
        traverse_element(global_element)

    return pd.DataFrame(records, columns=columns)

In [16]:
# Shared NPD drop: data dictionary, XSD and the gzipped initial extract live under it.
workspace_path = os.path.join(
    'scratch',
    'NPD Data Dictionaries & Initial File Share (Nov 2025)',
)
_onboarding_dir = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents')
data_dictionary_path = os.path.join(_onboarding_dir, 'ODS_Data_Dictionary_R45.xlsx')
xsd_path = os.path.join(_onboarding_dir, 'PECOS UGE v6.xsd')
xml_path = os.path.join(workspace_path, 'P.UGE.PECOS.INITIAL.D251115.T0015300.gz')

# Small sample document and its gzipped copy, both directly under scratch/.
test_file_path = os.path.join('scratch', 'PECOS UGE Sample Test file.xml')
compressed_test_file = os.path.join('scratch', 'PECOS_UGE_Sample.gz')

In [9]:
# Load the PECOS UGE schema definition from disk.
schema = xmlschema.XMLSchema(source=xsd_path)

In [10]:
# Tabular view of the XSD, plus a string form of each path list so the
# column can be used as a de-duplication key.
xsd_df = xsd_to_df(schema)
xsd_df['search_path_string'] = xsd_df['search_path'].map(str)

# Data dictionary reduced to the three columns of interest, exact duplicates removed.
data_dict_df = pd.read_excel(data_dictionary_path)
target_cols = ['COLUMN_NAME', 'DATA_TYPE', 'COLUMN_DEF']
dedup_data_dict_df = data_dict_df[target_cols].drop_duplicates()

KeyboardInterrupt: 

In [None]:
# Attach data-dictionary metadata to each XSD element by name, then keep one
# row per (element, type, path) combination before exporting.
xsd_data_dict_df = (
    xsd_df
    .merge(
        dedup_data_dict_df,
        how='left',
        left_on='element_name',
        right_on='COLUMN_NAME',
    )
    .drop_duplicates(subset=['element_name', 'type', 'search_path_string'])
)
xsd_data_dict_df.to_csv('xsd.csv', index=False)

In [None]:
# Parse the sample document straight from its path. Handing lxml the path
# (rather than a text-mode file object) lets the parser read the raw bytes and
# honour the encoding named in the XML declaration instead of the locale default.
tree = etree.parse(test_file_path)


In [None]:
# Echo the parsed tree object as the cell output.
tree

<lxml.etree._ElementTree at 0x110d9dd00>

In [None]:
# Stream the gzipped sample and visit only elements with the given qualified tag.
# NOTE: compressed_test_file must already exist on disk before this cell runs.
in_file = compressed_test_file
tag_name = '{http://pecos.cms.gov}BIRTH_CNTRY_CD'
# Stop after this many matching elements; set to None to stream the whole file.
max_elements = 1
count = 0
with gzip.open(in_file, 'rb') as f:
    print('reading')
    xml_tree = etree.iterparse(f, events=('end',), tag=tag_name, recover=True, huge_tree=True)
    print('read')
    print(xml_tree)
    for event, elem in xml_tree:
        print('looping')
        # Process the element (chunk) here
        print(f"Processing element: {elem.tag}, attributes: {elem.attrib}")

        # Extract data from the element's children/attributes

        # Crucial for memory efficiency: clear the element once processed and
        # drop its already-parsed preceding siblings from the parent.
        # This runs before the limit check so it is not skipped on the last element.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        count += 1
        if max_elements is not None and count >= max_elements:
            break
print(count)

reading
read
<lxml.etree.iterparse object at 0xf86d8f920>
looping
Processing element: {http://pecos.cms.gov}ASCT_CNT, attributes: {}
1


In [None]:

# Gzip the sample XML so it can be read back with gzip.open.
# The destination is compressed_test_file, the path defined in the configuration
# cell; the previous name, compressed_test_uge, was never defined.
with open(test_file_path, 'rb') as f_in, gzip.open(compressed_test_file, 'wb') as f_out:
    # Copy the file contents, compressing as it writes
    shutil.copyfileobj(f_in, f_out)


In [25]:
# NOTE(review): `txt` is not assigned in any cell visible here — presumably it
# came from a since-removed cell holding the raw bytes of the sample XML.
# On a fresh kernel this cell would raise NameError. TODO confirm and restore
# the assignment.
txt

b'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<UGE xmlns="http://pecos.cms.gov" xsi:schemaLocation="http://pecos.cms.gov EDSC_MDM_Uninterrupted_UGE.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n<HDR>\n<ASCT_CNT>5</ASCT_CNT>\n<ENRLMT_CNT>7</ENRLMT_CNT>\n<AFDVT_CNT>0</AFDVT_CNT>\n</HDR>\n<CNTNT>\n<MDCR_PRVDR>\n<PRVDR>\n<PRVDR_INFO>\n<INDVDL_INFO>\n<PECOS_ASCT_CNTL_ID>3678892288</PECOS_ASCT_CNTL_ID>\n<BIRTH_DT>19521217</BIRTH_DT>\n<BIRTH_STATE_CD>OK</BIRTH_STATE_CD>\n<BIRTH_STATE_NAME>OKLAHOMA</BIRTH_STATE_NAME>\n<BIRTH_CNTRY_CD>US</BIRTH_CNTRY_CD>\n<BIRTH_CNTRY_NAME>UNITED STATES</BIRTH_CNTRY_NAME>\n<BIRTH_FRGN_SW>D</BIRTH_FRGN_SW>\n<NAME_LIST>\n<PEC_INDVDL_NAME>\n<NAME_CD>I</NAME_CD>\n<NAME_DESC>INDIVIDUAL NAME</NAME_DESC>\n<FIRST_NAME>JO ANN</FIRST_NAME>\n<LAST_NAME>SCOTT</LAST_NAME>\n<TRMNTN_DT>2015-05-04T11:33:54.000000000</TRMNTN_DT>\n<DATA_STUS_CD>HISTORY</DATA_STUS_CD>\n</PEC_INDVDL_NAME>\n<PEC_INDVDL_NAME>\n<NAME_CD>I</NAME_CD>\n<NAME_DESC>INDIVIDUAL NA