In [None]:
#!pip install xmlschema
#!pip install lxml
#!pip install bs4

In [3]:
import xmlschema
from xmlschema import limits
import pandas as pd
import os

In [4]:

limits.MAX_XML_ELEMENTS = 10 ** 10

In [5]:
def xsd_to_df(schema):
    """
    Parses an XSD file and returns a pandas DataFrame as a tabular representation.
    """
    data = {}

    def traverse_element(element, parent_name=None):
        """Recursive function to traverse schema elements."""
        name = element.local_name
        type_name = element.type.local_name if element.type else "N/A"
        occurrence = f"minOccurs={element.min_occurs}, maxOccurs={element.max_occurs if element.max_occurs is not None else 'unbounded'}"
        search_path=[]
        if parent_name is not None:
            search_path += data[parent_name]['search_path']
            search_path.append(parent_name)
        
        data[name] = {
            'element_name': name,
            'type': type_name,
            'parent_element': parent_name,
            'search_path': search_path,
            'occurrence': occurrence,
            'description': str(element.annotation.documentation[0]).strip() if element.annotation and element.annotation.documentation else ''
        }

        # Check for complex types with child elements
        if hasattr(element.type, 'content'):
            # Iterate over the content's elements
            for child_element in element.type.content.iter_elements():
                traverse_element(child_element, name)
        
        # Handle elements within a choice or sequence directly
        if hasattr(element, 'content') and element.content:
             for child_element in element.content.iter_elements():
                traverse_element(child_element, name)


    # Start traversal from global elements
    for global_element in schema.elements.values():
        traverse_element(global_element)

    df = pd.DataFrame(list(data.values()))

    return df

In [6]:
workspace_path = os.path.join('scratch', 'NPD Data Dictionaries & Initial File Share (Nov 2025)')
data_dictionary_path = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents', 'ODS_Data_Dictionary_R45.xlsx')
xsd_path = os.path.join(workspace_path, 'PECOS UGE OnBoarding Documents', 'PECOS UGE v6.xsd')
xml_path = os.path.join(workspace_path, 'P.UGE.PECOS.INITIAL.D251115.T0015300.gz')

In [7]:
schema = xmlschema.XMLSchema(xsd_path)

In [8]:
xsd_df = xsd_to_df(schema)
data_dict_df = pd.read_excel(data_dictionary_path)
target_cols = ['COLUMN_NAME', 'DATA_TYPE', 'COLUMN_DEF']
dedup_data_dict_df = data_dict_df.drop_duplicates(subset = [c for c in data_dict_df.columns if c in target_cols])[target_cols]
xsd_df['search_path_string'] = xsd_df['search_path'].apply(lambda x: str(x))

In [9]:
xsd_df.merge(dedup_data_dict_df, left_on = 'element_name', right_on = 'COLUMN_NAME').drop_duplicates(subset=['element_name', 'type', 'parent_element', 'search_path_string'])

Unnamed: 0,element_name,type,parent_element,search_path,occurrence,description,search_path_string,COLUMN_NAME,DATA_TYPE,COLUMN_DEF
0,ERR_DESC,String100,ERR_LIST,"[UGE, HDR, ERR_LIST]","minOccurs=0, maxOccurs=1",,"['UGE', 'HDR', 'ERR_LIST']",ERR_DESC,VARCHAR2,Error description for the error object_x000D_
5,PECOS_ASCT_CNTL_ID,String10,PEC_NPHYSN_TCHNCN,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, ENRLMTS, CMS_8...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'ENRLM...",PECOS_ASCT_CNTL_ID,CHAR,Unique number that identifies a Pecos Associat...
17,BIRTH_DT,DATE_TYPE,INDVDL_ASCTN_INFO,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, ENRLMTS, CMS_8...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'ENRLM...",BIRTH_DT,DATE,Birth date of the Medicare Exclusionary Data...
21,DEATH_DT,DATE_TYPE,INDVDL_INFO,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PRVDR_INFO, IN...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PRVDR...",DEATH_DT,DATE,Death Date of the individual._x000D_
22,BIRTH_STATE_CD,String2,INDVDL_INFO,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PRVDR_INFO, IN...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PRVDR...",BIRTH_STATE_CD,CHAR,Unique value that identifies a state._x000D_
...,...,...,...,...,...,...,...,...,...,...
663,DME_PROD_SRVC_TYPE_OTHR_TXT,String60,PEC_DME_PROD_AND_SRVC,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, ENRLMTS, CMS_8...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'ENRLM...",DME_PROD_SRVC_TYPE_OTHR_TXT,VARCHAR2,Other type of product or service the supplier ...
664,ACRDTN_DESC,String10,PEC_DME_ACRDTN_PROD,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, ENRLMTS, CMS_8...","minOccurs=1, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'ENRLM...",ACRDTN_DESC,VARCHAR2,Description of the accreditation code._x000D_
665,AFDVT_ID,String15,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PEC_A...",AFDVT_ID,CHAR,Unique number for an Opt-Out Affidavit. Opt Ou...
667,RSN_CD,String3,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PEC_A...",RSN_CD,CHAR,Unique Code of Mailing Address change Reason.U...


In [10]:
xsd_df

Unnamed: 0,element_name,type,parent_element,search_path,occurrence,description,search_path_string
0,UGE,,,[],"minOccurs=1, maxOccurs=1",<Element '{http://www.w3.org/2001/XMLSchema}do...,[]
1,HDR,,UGE,[UGE],"minOccurs=0, maxOccurs=1",<Element '{http://www.w3.org/2001/XMLSchema}do...,['UGE']
2,ASCT_CNT,String10,HDR,"[UGE, HDR]","minOccurs=0, maxOccurs=1",,"['UGE', 'HDR']"
3,ENRLMT_CNT,String10,HDR,"[UGE, HDR]","minOccurs=0, maxOccurs=1",,"['UGE', 'HDR']"
4,AFDVT_CNT,String10,HDR,"[UGE, HDR]","minOccurs=0, maxOccurs=1",,"['UGE', 'HDR']"
...,...,...,...,...,...,...,...
438,PEC_AFDVT,PEC_AFDVT_TYPE,PEC_AFDVT_LIST,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST]","minOccurs=0, maxOccurs=unbounded",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PEC_A..."
439,AFDVT_ID,String15,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PEC_A..."
440,PEC_AFDVT_ADR,ADR_TYPE,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=unbounded",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PEC_A..."
441,RSN_CD,String3,PEC_AFDVT,"[UGE, CNTNT, MDCR_PRVDR, PRVDR, PEC_AFDVT_LIST...","minOccurs=0, maxOccurs=1",,"['UGE', 'CNTNT', 'MDCR_PRVDR', 'PRVDR', 'PEC_A..."
