In [5]:
from lxml import etree

# Define the namespaces
NS = {'': 'http://xml.house.gov/schemas/uslm/1.0'}  # Default namespace

# Function to parse the XML and extract key information
def parse_complex_uslm(file_path):
    tree = etree.parse(file_path)
    root = tree.getroot()

    # Recursive function to traverse and process elements
    def process_element(element, depth=0):
        tag = etree.QName(element).localname  # Strip namespace prefix
        attributes = {k: v for k, v in element.attrib.items()}
        text = element.text.strip() if element.text else None
        indent = "  " * depth

        # Print the element, attributes, and text for debugging
        print(f"{indent}Element: <{tag}>")
        if attributes:
            print(f"{indent}  Attributes: {attributes}")
        if text:
            print(f"{indent}  Text: {text}")

        # Recursively process child elements
        for child in element:
            process_element(child, depth + 1)

    # Start processing from the root
    print("Parsing the XML document...\n")
    process_element(root)

# Example usage
file_path = 'usc48.xml'  # Replace with your file path
parse_complex_uslm(file_path)


Parsing the XML document...

Element: <uscDoc>
  Attributes: {'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://xml.house.gov/schemas/uslm/1.0 USLM-1.0.15.xsd', '{http://www.w3.org/XML/1998/namespace}lang': 'en', 'identifier': '/us/usc/t48'}
  Element: <meta>
    Element: <title>
      Text: Title 48
    Element: <type>
      Text: USCTitle
    Element: <docNumber>
      Text: 48
    Element: <docPublicationName>
      Text: Online@118-136
    Element: <property>
      Attributes: {'role': 'is-positive-law'}
      Text: no
    Element: <publisher>
      Text: OLRC
    Element: <created>
      Text: 2024-10-02T10:00:23
    Element: <creator>
      Text: USCConverter 1.7.2
  Element: <main>
    Element: <title>
      Attributes: {'id': 'idad4e8982-80c6-11ef-8e4a-adf638f8c7c3', 'identifier': '/us/usc/t48'}
      Element: <num>
        Attributes: {'value': '48'}
        Text: Title 48—
      Element: <heading>
        Text: TERRITORIES AND INSULAR POSSESSIONS
      Elem

In [2]:
def clean_text(text):
    """
    Cleans and normalizes text by removing excessive whitespace and unwanted characters.
    """
    if not text:
        return ""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    text = text.strip()
    return text

In [4]:
# src/data_preprocessing.py

import os
import re
import pickle
from lxml import etree
import numpy as np

# Define the namespaces
NS = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}  # Prefix 'uslm' for default namespace

def parse_complex_uslm(file_path):
    """
    Parses a USLM XML file and extracts relevant sections into a list of dictionaries.
    Each dictionary represents a section with its hierarchical context and content.
    """
    try:
        tree = etree.parse(file_path)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError while parsing {file_path}: {e}")
        return []

    root = tree.getroot()

    # Attempt to find the <main> element regardless of the root tag
    main = root.find('.//uslm:main', namespaces=NS)

    if main is None:
        print(f"No <main> element found in file {file_path}. Skipping this file.")
        return []

    documents = []

    # Traverse through <level> elements
    for level in main.findall('.//uslm:level', namespaces=NS):
        # Extract hierarchical identifiers
        num_elem = level.find('.//uslm:num', namespaces=NS)
        level_num = num_elem.text.strip() if num_elem is not None and num_elem.text else "N/A"

        heading_elem = level.find('.//uslm:heading', namespaces=NS)
        heading = clean_text(heading_elem.text) if heading_elem is not None and heading_elem.text else ""

        subheading_elem = level.find('.//uslm:subheading', namespaces=NS)
        subheading = clean_text(subheading_elem.text) if subheading_elem is not None and subheading_elem.text else ""

        # Extract text content
        text_elems = level.findall('.//uslm:text', namespaces=NS)
        content = " ".join([clean_text(text.text) for text in text_elems if text.text])

        # Combine hierarchical context
        full_heading = " > ".join(filter(None, [level_num, heading, subheading]))

        # Create a document entry
        document = {
            'id': f"{os.path.basename(file_path)}_{level_num}",
            'heading': full_heading,
            'content': content
        }
        documents.append(document)

    return documents

documents = parse_complex_uslm('usc01.xml')
documents

[]