In [None]:
#!pip install xmlschema
#!pip install lxml

In [None]:
import xmlschema
from xmlschema import limits
import pandas as pd
import os
from lxml import etree
import gzip
import shutil

In [None]:

limits.MAX_XML_ELEMENTS = 10 ** 10

In [None]:
def xsd_to_df(schema):
    """
    Parses an XSD file and returns a pandas DataFrame as a tabular representation.
    """
    data = {}

    def traverse_element(element, parent_name=None):
        """Recursive function to traverse schema elements."""
        name = element.local_name
        type_name = element.type.local_name if element.type else "N/A"
        occurrence = f"minOccurs={element.min_occurs}, maxOccurs={element.max_occurs if element.max_occurs is not None else 'unbounded'}"
        search_path=[]
        if parent_name is not None:
            search_path += data[parent_name]['search_path']
            search_path.append(parent_name)
        
        data[name] = {
            'element_name': name,
            'type': type_name,
            'parent_element': parent_name,
            'search_path': search_path,
            'occurrence': occurrence,
            'description': str(element.annotation.documentation[0]).strip() if element.annotation and element.annotation.documentation else ''
        }

        # Check for complex types with child elements
        if hasattr(element.type, 'content'):
            # Iterate over the content's elements
            for child_element in element.type.content.iter_elements():
                traverse_element(child_element, name)
        
        # Handle elements within a choice or sequence directly
        if hasattr(element, 'content') and element.content:
             for child_element in element.content.iter_elements():
                traverse_element(child_element, name)


    # Start traversal from global elements
    for global_element in schema.elements.values():
        traverse_element(global_element)

    df = pd.DataFrame(list(data.values()))

    return df

In [None]:
workspace_path = os.path.join('scratch', 'NPD Data Dictionaries & Initial File Share (Nov 2025)')
data_dictionary_path = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents', 'ODS_Data_Dictionary_R45.xlsx')
xsd_path = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents', 'PECOS UGE v6.xsd')
xml_path = os.path.join(workspace_path, 'P.UGE.PECOS.INITIAL.D251115.T0015300.gz')
test_file_path = os.path.join('scratch', 'PECOS UGE Sample Test file.xml')
compressed_test_file = os.path.join('scratch', 'PECOS_UGE_Sample.gz')

In [None]:
schema = xmlschema.XMLSchema(xsd_path)

In [None]:
xsd_df = xsd_to_df(schema)
data_dict_df = pd.read_excel(data_dictionary_path)
target_cols = ['COLUMN_NAME', 'DATA_TYPE', 'COLUMN_DEF']
dedup_data_dict_df = data_dict_df.drop_duplicates(subset = [c for c in data_dict_df.columns if c in target_cols])[target_cols]
xsd_df['search_path_string'] = xsd_df['search_path'].apply(lambda x: str(x))

In [None]:
xsd_data_dict_df = xsd_df.merge(dedup_data_dict_df, left_on = 'element_name', right_on = 'COLUMN_NAME', how = 'left').drop_duplicates(subset=['element_name', 'type', 'search_path_string'])
xsd_data_dict_df.to_csv('xsd.csv', index=False)

In [None]:
with open(test_file_path, 'r') as fp:
    tree = etree.parse(fp)


In [None]:
tree

In [33]:
in_file = xml_path#compressed_test_file
tag_name = '{http://pecos.cms.gov}PRVDR'
count = 0
with gzip.open(in_file, 'rb') as f:
    print('reading')
    xml_tree = etree.iterparse(f, events=('end',), tag=tag_name, recover=True, huge_tree=True)
    print('read')
    print(xml_tree)
    for event, elem in xml_tree:
        print('looping')
        # Process the element (chunk) here
        print(f"Processing element: {elem.tag}, attributes: {elem.attrib}")
            
        # Extract data from the element's children/attributes
            
        # Crucial for memory efficiency: clear the element and its ancestors once processed
        # to free memory immediately.
        elem.clear()
        count += 1
        while elem.getprevious() is not None:
            del elem.getparent()[0]
print(count)

looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attributes: {}
looping
Processing element: {http://pecos.cms.gov}PRVDR, attribu

KeyboardInterrupt: 

In [None]:

in_file = xml_path#compressed_test_file
tag_name = '{http://pecos.cms.gov}PRVDR'
count = 0
with gzip.open(in_file, 'rb') as f:
    print('reading')
    xml_tree = etree.iterparse(f, events=('end',), tag=tag_name, recover=True, huge_tree=True)
    print('read')
    print(xml_tree)
    for event, elem in xml_tree:
        print('looping')
        # Process the element (chunk) here
        print(f"Processing element: {elem.tag}, attributes: {elem.attrib}")
            
        # Extract data from the element's children/attributes
            
        # Crucial for memory efficiency: clear the element and its ancestors once processed
        # to free memory immediately.
        elem.clear()
        count += 1
        while elem.getprevious() is not None:
            del elem.getparent()[0]
print(count)