In [None]:
#!pip install xmlschema
#!pip install lxml
#!pip install bs4

In [132]:
import xmlschema
from xmlschema import limits
import pandas as pd
import os

In [None]:

limits.MAX_XML_ELEMENTS = 10 ** 10

In [135]:
def xsd_to_df(schema):
    """
    Parses an XSD file and returns a pandas DataFrame as a tabular representation.
    """
    data = {}

    def traverse_element(element, parent_name=None):
        """Recursive function to traverse schema elements."""
        name = element.local_name
        type_name = element.type.local_name if element.type else "N/A"
        occurrence = f"minOccurs={element.min_occurs}, maxOccurs={element.max_occurs if element.max_occurs is not None else 'unbounded'}"
        search_path=[]
        if parent_name is not None:
            search_path += data[parent_name]['search_path']
            search_path.append(parent_name)
        
        data[name] = {
            'element_name': name,
            'type': type_name,
            'parent_element': parent_name,
            'search_path': search_path,
            'occurrence': occurrence,
            'description': str(element.annotation.documentation[0]).strip() if element.annotation and element.annotation.documentation else ''
        }

        # Check for complex types with child elements
        if hasattr(element.type, 'content'):
            # Iterate over the content's elements
            for child_element in element.type.content.iter_elements():
                traverse_element(child_element, name)
        
        # Handle elements within a choice or sequence directly
        if hasattr(element, 'content') and element.content:
             for child_element in element.content.iter_elements():
                traverse_element(child_element, name)


    # Start traversal from global elements
    for global_element in schema.elements.values():
        traverse_element(global_element)

    df = pd.DataFrame(list(data.values()))

    return df

In [139]:
workspace_path = os.path.join('scratch', 'NPD Data Dictionaries & Initial File Share (Nov 2025)')
data_dictionary_path = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents', 'ODS_Data_Dictionary_R45.xlsx')
xsd_path = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents', 'PECOS UGE v6.xsd')
xml_path = os.path.join(workspace_path, 'P.UGE.PECOS.INITIAL.D251115.T0015300.gz')

In [None]:
schema = xmlschema.XMLSchema(xsd_path)

In [151]:
xsd_df = xsd_to_df(schema)
data_dict_df = pd.read_excel(data_dictionary_path)
target_cols = ['COLUMN_NAME', 'DATA_TYPE', 'COLUMN_DEF']
dedup_data_dict_df = data_dict_df.drop_duplicates(subset = [c for c in data_dict_df.columns if c in target_cols])[target_cols]

In [154]:
xsd_df.merge(dedup_data_dict_df, left_on = 'element_name', right_on = 'COLUMN_NAME').drop_duplicates(subset=['element_name', 'type', 'parent_element', 'search_path'])

TypeError: unhashable type: 'list'

In [153]:
xsd_df

Unnamed: 0,element_name,type,parent_element,search_path,occurrence,description
0,UGE,,,[],"minOccurs=1, maxOccurs=1",<Element '{http://www.w3.org/2001/XMLSchema}do...
1,HDR,,UGE,[UGE],"minOccurs=0, maxOccurs=1",<Element '{http://www.w3.org/2001/XMLSchema}do...
2,ASCT_CNT,String10,HDR,"[UGE, HDR]","minOccurs=0, maxOccurs=1",
3,ENRLMT_CNT,String10,HDR,"[UGE, HDR]","minOccurs=0, maxOccurs=1",
4,AFDVT_CNT,String10,HDR,"[UGE, HDR]","minOccurs=0, maxOccurs=1",
...,...,...,...,...,...,...
438,PEC_AFDVT,PEC_AFDVT_TYPE,PEC_AFDVT_LIST,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST]","minOccurs=0, maxOccurs=unbounded",
439,AFDVT_ID,String15,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=1",
440,PEC_AFDVT_ADR,ADR_TYPE,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=unbounded",
441,RSN_CD,String3,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=1",
