# Explore XML Data

In [13]:
from lxml import etree
import duckdb
import glob
import os


In [16]:
# --- Configuration ---
# Input directory, output database file, and destination table.
DATA_DIRECTORY = 'data'
DUCKDB_FILE = 'roadworks_data.duckdb'
TABLE_NAME = 'planned_roadworks_raw'

# Prefix -> namespace URI mapping used by every XPath query below.
NSMAP = dict(d='WebTeam')

# Matches each repeating roadworks record anywhere beneath the root.
ROADWORK_RECORD_XPATH = './/d:HE_PLANNED_WORKS'

# --- Target columns ---
# The order here is the column order of the DuckDB table and of every
# inserted row, so it must stay stable.
TARGET_COLUMNS = (
    # Provenance: which file the record was read from.
    ['source_filename']
    # Attributes read directly off <HE_PLANNED_WORKS>.
    + [
        'NEW_EVENT_NUMBER',
        'SDATE',
        'EDATE',
        'EXPDEL',
        'DESCRIPTION',
        'CLOSURE_TYPE',
        'STATUS',
        'PUBLISHED_DATE',
    ]
    # Values pulled from nested elements (roads may be multi-valued).
    + [
        'centre_easting',
        'centre_northing',
        'road_numbers',
    ]
)

# XPaths for nested data, relative to a <HE_PLANNED_WORKS> element.
# Each list entry is one level of nesting.
COORD_XPATH = '/'.join([
    '.',
    'd:EASTNORTH',
    'd:Report',
    'd:EASTINGNORTHING',
    'd:EASTNORTH_Collection',
    'd:EASTNORTH',
])
ROAD_XPATH = '/'.join([
    '.',
    'd:ROADS',
    'd:Report',
    'd:ROADS',
    'd:ROAD_Collection',
    'd:ROAD',
])

# Number of records the exploration helper prints in detail.
NUM_RECORDS_TO_INSPECT = 3
# ---

In [17]:
def explore_roadworks_xml_updated(file_path):
    """Print a diagnostic walk-through of one roadworks XML file.

    Parses the file leniently, reports the root tag and namespace map, then
    prints the attributes, centre coordinates and road numbers of the first
    NUM_RECORDS_TO_INSPECT <HE_PLANNED_WORKS> records.

    Args:
        file_path: Path to the XML file to inspect.

    Returns:
        None. All results are printed. The function returns early (without
        the closing banner) when the file is missing, no root element could
        be recovered, or no records match ROADWORK_RECORD_XPATH.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Updated): {file_path} ---")

    # Attributes echoed individually, in the order they are reported.
    key_attributes = (
        'NEW_EVENT_NUMBER',
        'SDATE',
        'EDATE',
        'DESCRIPTION',
        'CLOSURE_TYPE',
        'STATUS',
        'PUBLISHED_DATE',
        'EXPDEL',
    )

    try:
        # recover=True lets lxml skip over minor errors in slightly malformed files.
        parser = etree.XMLParser(recover=True, ns_clean=True)
        tree = etree.parse(file_path, parser)
        root = tree.getroot()

        # A recovering parser can hand back a tree with no root when nothing
        # usable was found; report that explicitly instead of failing on root.tag.
        if root is None:
            print("\nError: The parser could not recover a root element from this file.")
            return

        print(f"\n1. Root Element Tag: {root.tag}") # Should be {WebTeam}Report
        print(f"   Root Namespace Map: {root.nsmap}")

        # Namespaced lookup of the repeating record element.
        records = root.xpath(ROADWORK_RECORD_XPATH, namespaces=NSMAP)

        if not records:
            print(f"\nError: Could not find any elements matching XPath '{ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the ROADWORK_RECORD_XPATH and the XML structure.")
            # Show fully-qualified {namespace}tag names to help debug the XPath.
            print("\nFirst few children of the root (with full tags):")
            for i, child in enumerate(root[:5]):
                print(f"  Child {i+1}: {child.tag}")
            return

        print(f"\n2. Found {len(records)} records matching XPath '{ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(records))} records ---")

        for i, record in enumerate(records[:NUM_RECORDS_TO_INSPECT]):
            print(f"\n\n--- Record {i+1} ---")

            # --- Every attribute present on <HE_PLANNED_WORKS> ---
            print(" Attributes of <HE_PLANNED_WORKS>:")
            for key, value in record.attrib.items():
                print(f"    {key}: {value}")

            # --- The specific attributes of interest (None when absent) ---
            print("\n Extracted Key Attributes:")
            for attr_name in key_attributes:
                print(f"- {attr_name}: {record.get(attr_name)}")

            # --- Nested coordinates ---
            # Uses the shared COORD_XPATH so exploration and loading cannot drift apart.
            print("\n Extracting Nested Coordinates:")
            coord_elements = record.xpath(COORD_XPATH, namespaces=NSMAP)

            if coord_elements:
                # Only the first coordinate block is reported.
                coord_element = coord_elements[0]
                print(f"    CENTRE_EASTING: {coord_element.get('CENTRE_EASTING')}")
                print(f"    CENTRE_NORTHING: {coord_element.get('CENTRE_NORTHING')}")
            else:
                print("    Coordinate elements not found.")

            # --- Nested roads ---
            # Uses the shared ROAD_XPATH for the same reason as above.
            print("\nExtracting Nested Roads:")
            road_elements = record.xpath(ROAD_XPATH, namespaces=NSMAP)

            if road_elements:
                road_numbers = [road.get('ROAD_NUMBER') for road in road_elements]
                print(f"    ROAD_NUMBER(s): {road_numbers}") # Might be multiple roads
            else:
                print("    Road elements not found.")

    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        # Deliberately broad: this is an interactive diagnostic, so report and carry on.
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


# Run the exploration against a single sample file; results are printed below.
explore_roadworks_xml_updated("data/nh_roadworks_2025_14_4.xml")

--- Exploring XML File (Updated): data/nh_roadworks_2025_14_4.xml ---

1. Root Element Tag: {WebTeam}Report
   Root Namespace Map: {'xsi': 'http://www.w3.org/2001/XMLSchema-instance', None: 'WebTeam'}

2. Found 1429 records matching XPath './/d:HE_PLANNED_WORKS'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Attributes of <HE_PLANNED_WORKS>:
    NEW_EVENT_NUMBER: 00352573-001
    SDATE: 31-DEC-2023 23:59
    EDATE: 31-MAY-2025 23:59
    EXPDEL: Moderate (10 - 30 mins)
    DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
    CLOSURE_TYPE: Major Schemes
    STATUS: Published
    PUBLISHED_DATE: 2023-12-21T14:45:07

 Extracted Key Attributes:
- NEW_EVENT_NUMBER: 00352573-001
- SDATE: 31-DEC-2023 23:59
- EDATE: 31-MAY-2025 23:59
- DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
- CLOSURE_TYPE: Major Schemes
- STATUS: Published
- PUBLISHED_DATE: 2023-12-21T14:45:07
- EXPDEL: Moderate (10 - 30 mins)



In [24]:
def extract_record_data_as_dict(record_element, source_filename):
    """
    Flatten one <HE_PLANNED_WORKS> lxml element into a plain dictionary.

    Direct attributes are copied for every TARGET_COLUMNS entry that is not a
    derived column; centre coordinates and road numbers are looked up in the
    nested structure via COORD_XPATH and ROAD_XPATH.

    Args:
        record_element: The lxml element for the record.
        source_filename: The name of the file this record came from.

    Returns:
        A dictionary with the extracted values for the record, or None when
        the record has no NEW_EVENT_NUMBER attribute.
    """
    # Columns that are not plain attributes of the record element.
    derived_columns = ('source_filename', 'centre_easting', 'centre_northing', 'road_numbers')

    row = {'source_filename': source_filename}
    for column in TARGET_COLUMNS:
        if column in derived_columns:
            continue
        row[column] = record_element.get(column)

    # Records without an event number are signalled to the caller as None.
    if row.get('NEW_EVENT_NUMBER') is None:
        return None

    # Centre coordinates: only the first matching element is used.
    coord_matches = record_element.xpath(COORD_XPATH, namespaces=NSMAP)
    first_coord = coord_matches[0] if coord_matches else None
    if first_coord is None:
        row['centre_easting'] = None
        row['centre_northing'] = None
    else:
        row['centre_easting'] = first_coord.get('CENTRE_EASTING')
        row['centre_northing'] = first_coord.get('CENTRE_NORTHING')

    # Road numbers: keep non-empty values and join them into one string.
    road_matches = record_element.xpath(ROAD_XPATH, namespaces=NSMAP)
    candidate_numbers = (road.get('ROAD_NUMBER') for road in road_matches)
    present_numbers = [number for number in candidate_numbers if number]
    row['road_numbers'] = '; '.join(present_numbers) if present_numbers else None

    return row

In [25]:
def _extract_records_from_files(xml_files):
    """Parse each XML file and collect one dict per valid roadworks record.

    Args:
        xml_files: Iterable of XML file paths to parse.

    Returns:
        A (records, skipped_count) tuple: the list of extracted record dicts
        and the number of records skipped due to errors or missing IDs.
    """
    # recover=True lets lxml work through slightly malformed files.
    parser = etree.XMLParser(recover=True, ns_clean=True)
    extracted_records = []
    skipped_records = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        print(f"Processing file: {filename}...")
        try:
            root = etree.parse(file_path, parser).getroot()
            # A recovering parser can yield no root at all; skip with a clear message.
            if root is None:
                print(f"  Warning: No root element could be recovered from {filename}. Skipping file.")
                continue

            records = root.xpath(ROADWORK_RECORD_XPATH, namespaces=NSMAP)
            if not records:
                print(f"  Warning: No records found matching XPath in {filename}.")
                continue

            file_record_count = 0
            for record in records:
                try:
                    extracted_dict = extract_record_data_as_dict(record, filename)
                except Exception as e_rec:
                    # One bad record must not abort the rest of the file.
                    event_id = record.get('NEW_EVENT_NUMBER', 'UNKNOWN_ID')
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    skipped_records += 1
                    continue

                if extracted_dict:
                    extracted_records.append(extracted_dict)
                    file_record_count += 1
                else:
                    skipped_records += 1 # Record skipped due to missing ID

            print(f"  Extracted {file_record_count} valid records from {filename}.")

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")

    return extracted_records, skipped_records


def _try_rollback(con):
    """Roll back the open transaction, reporting (not raising) if none is active."""
    try:
        con.rollback()
    except duckdb.TransactionException as e_tx:
        print(f"Rollback failed (likely no active transaction): {e_tx}")


def _load_rows_into_duckdb(rows, db_file, table_name):
    """Replace `table_name` in the DuckDB file with `rows`, as one transaction.

    Args:
        rows: List of row value lists, each ordered like TARGET_COLUMNS.
        db_file: Path of the DuckDB database file to open (created if absent).
        table_name: Table to create or replace. It is interpolated into the
            SQL text unquoted, so it must be a trusted identifier.
    """
    print(f"Connecting to DuckDB database: {db_file}")
    con = duckdb.connect(database=db_file, read_only=False)

    try:
        # Open an explicit transaction so that a failed insert rolls back the
        # CREATE OR REPLACE as well, leaving any previous table untouched.
        con.begin()

        print(f"Creating or replacing table: {table_name}")
        column_defs = [f'"{col}" VARCHAR' for col in TARGET_COLUMNS] # Quote names
        create_table_sql = f"CREATE OR REPLACE TABLE {table_name} ({', '.join(column_defs)})"
        con.execute(create_table_sql)

        print(f"Inserting {len(rows)} records into {table_name}...")

        # One "?" placeholder per target column, derived from TARGET_COLUMNS so
        # it cannot fall out of sync with the table definition above.
        placeholders = ', '.join(['?'] * len(TARGET_COLUMNS))
        insert_sql = f'INSERT INTO {table_name} VALUES ({placeholders})'
        con.executemany(insert_sql, rows)

        con.commit()
        print("Data insertion complete.")

        # Verify insertion.
        count_result = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()
        print(f"Verification: Table '{table_name}' now contains {count_result[0]} rows.")

    except duckdb.Error as e_db: # Catch specific DuckDB errors
        print(f"Database error: {e_db}")
        print("Attempting to rollback transaction.")
        _try_rollback(con)
    except Exception as e:
        print(f"An unexpected error occurred during DB operation: {e}")
        _try_rollback(con)
    finally:
        con.close()
        print("Database connection closed.")


def process_xml_files(data_dir, db_file, table_name):
    """
    Processes all XML files in a directory and loads data into DuckDB
    directly using executemany without Pandas.

    Files are processed in sorted path order. Every valid record from every
    file is collected first; the destination table is then created or
    replaced and filled in a single transaction.

    Args:
        data_dir: Directory searched (non-recursively) for '*.xml' files.
        db_file: Path of the DuckDB database file to write.
        table_name: Name of the table to create or replace.

    Returns:
        None. Progress and errors are printed. The database is not opened
        when no XML files are found or no valid records are extracted.
    """
    # glob returns matches in arbitrary order; sort for a reproducible load order.
    xml_files = sorted(glob.glob(os.path.join(data_dir, '*.xml')))

    if not xml_files:
        print(f"Error: No XML files found in directory: {data_dir}")
        return

    print(f"Found {len(xml_files)} XML files to process in '{data_dir}'.")

    all_records_data_dicts, skipped_records = _extract_records_from_files(xml_files)

    if not all_records_data_dicts:
        print("No valid data extracted from any files. Database will not be updated.")
        if skipped_records > 0:
            print(f"Note: {skipped_records} records were skipped due to errors or missing IDs.")
        return

    print(f"\nTotal valid records extracted across all files: {len(all_records_data_dicts)}")
    if skipped_records > 0:
        print(f"Total records skipped due to errors or missing IDs: {skipped_records}")

    # Convert each dict to a row ordered exactly like TARGET_COLUMNS.
    print("Preparing data for insertion...")
    data_to_insert = [
        [record_dict.get(col_name) for col_name in TARGET_COLUMNS]
        for record_dict in all_records_data_dicts
    ]

    _load_rows_into_duckdb(data_to_insert, db_file, table_name)

In [26]:
# Load every XML file found in DATA_DIRECTORY into TABLE_NAME inside DUCKDB_FILE.
process_xml_files(DATA_DIRECTORY, DUCKDB_FILE, TABLE_NAME)

Found 15 XML files to process in 'data'.
Processing file: ha-roadworks_2011_10_10.xml...
Processing file: ha-roadworks_2012_04_09.xml...
Processing file: ha-roadworks_2013_05_06.xml...
Processing file: ha-roadworks_2014_03_31.xml...
Processing file: ha_roadworks_2015_03_16.xml...
Processing file: he_roadworks_2016_02_29.xml...
Processing file: he_roadworks_2017_06_05.xml...
Processing file: he_roadworks_2018_02_26.xml...
  Extracted 1477 valid records from he_roadworks_2018_02_26.xml.
Processing file: he_roadworks_2019_04_15.xml...
  Extracted 1103 valid records from he_roadworks_2019_04_15.xml.
Processing file: he_roadworks_2020_05_25.xml...
  Extracted 1426 valid records from he_roadworks_2020_05_25.xml.
Processing file: he_roadworks_2021_03_01.xml...
  Extracted 1567 valid records from he_roadworks_2021_03_01.xml.
Processing file: nh_roadworks_2022_3_14.xml...
  Extracted 1621 valid records from nh_roadworks_2022_3_14.xml.
Processing file: nh_roadworks_2023_3_6.xml...
  Extracted 14