# Explore XML Data

In [1]:
from lxml import etree
import duckdb
import glob
import os


In [None]:
# --- Configuration ---
NEW_DATA_DIRECTORY = 'data/new_format'     # data from 2018 onwards
OLD_DATA_DIRECTORY = 'data/old_format' # data from 2017 and earlier

DUCKDB_FILE = 'roadworks_data.duckdb'  # Name for your DuckDB database file
TABLE_NAME = 'planned_roadworks_raw'   # Name for the table within DuckDB

# Define the namespace map
NSMAP = {'d': 'WebTeam'}

# XPath to find the repeating record element
NEW_ROADWORK_RECORD_XPATH = './/d:HE_PLANNED_WORKS'
OLD_ROADWORK_RECORD_XPATH = './/ha_planned_works' # XPath for the old format record

# --- Define Unified Target Columns ---
# Represents the desired final structure in the database
UNIFIED_TARGET_COLUMNS = [
    'source_filename',      # Provenance
    'event_number',         # Unified ID (from NEW_EVENT_NUMBER or reference_number)
    'start_date',           # Unified (from SDATE or start_date)
    'end_date',             # Unified (from EDATE or end_date)
    'expected_delay',       # Unified (from EXPDEL or expected_delay)
    'description',          # Unified
    'closure_type',         # Unified
    'status',               # Unified
    'published_date',       # Unified
    'centre_easting',       # Unified (nested in new, direct in old)
    'centre_northing',      # Unified (nested in new, direct in old)
    'road_numbers',         # Unified (nested in new, 'road' in old - may need joining logic if old has multiple)
    'location',             # Old format specific (or potentially map from new description?)
    'local_authority',      # Old format specific
    'traffic_management',   # Old format specific
    'old_reference_number'  # New format specific (OLD_REFERENCE_NUMBER attribute)
]

# Define XPaths for nested data relative to the NEW format HE_PLANNED_WORKS element
NEW_COORD_XPATH = './d:EASTNORTH/d:Report/d:EASTINGNORTHING/d:EASTNORTH_Collection/d:EASTNORTH'
NEW_ROAD_XPATH = './d:ROADS/d:Report/d:ROADS/d:ROAD_Collection/d:ROAD'

### Find all unique attributes in many XML files

##### For 'new' format (attributes-based)

In [23]:
def find_all_record_attributes_in_directory(directory_path):
    """
    Parses all XML files (new format) in a directory and finds all unique attribute names
    used across all elements matching the NEW_ROADWORK_RECORD_XPATH in any file.
    """
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {directory_path}")
        return None

    print(f"--- Finding All Unique Attributes in Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files to scan.")

    all_attribute_names = set() # Use a set to automatically store unique names across all files
    parser = etree.XMLParser(recover=True, ns_clean=True) # Define parser once

    processed_files = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        # print(f"Scanning file: {filename}...") # Optional: uncomment for more verbose output
        try:
            # Parse the XML file
            tree = etree.parse(file_path, parser)
            root = tree.getroot()

            # Use xpath with the namespace map to find all records in this file
            records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

            if not records:
                # print(f"  Warning: No records found matching XPath in {filename}.") # Optional warning
                continue # Move to the next file if no records found

            # Iterate through ALL found records in the current file
            for record in records:
                # Get the keys (attribute names) from the current record's attributes
                attribute_keys = record.attrib.keys()
                all_attribute_names.update(attribute_keys)

                # Additionally: find attributes in DESCENDANT elements
                # Use iterdescendants() to visit every element below the current record
                for descendant in record.iterdescendants():
                    all_attribute_names.update(descendant.attrib.keys())

            processed_files += 1

        except etree.XMLSyntaxError as e:
            print(f"  Error parsing XML file {filename}: {e}. Skipping file.")
            files_with_errors += 1
        except Exception as e:
            print(f"  An unexpected error occurred scanning file {filename}: {e}. Skipping file.")
            files_with_errors += 1

    print(f"\n--- Scan Complete ---")
    print(f"Successfully scanned {processed_files} files.")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to errors.")

    if not all_attribute_names:
        print("No attributes found in any successfully processed files.")
        return None

    # Sort the results for readability
    sorted_attributes = sorted(list(all_attribute_names))

    print(f"\nFound {len(sorted_attributes)} unique attributes across all scanned files:")
    return sorted_attributes

In [24]:
find_all_record_attributes_in_directory(NEW_DATA_DIRECTORY)

--- Finding All Unique Attributes in Directory: data/new_format ---
Found 8 XML files to scan.

--- Scan Complete ---
Successfully scanned 8 files.

Found 13 unique attributes across all scanned files:


['CENTRE_EASTING',
 'CENTRE_NORTHING',
 'CLOSURE_TYPE',
 'DESCRIPTION',
 'EDATE',
 'EXPDEL',
 'NEW_EVENT_NUMBER',
 'Name',
 'OLD_REFERENCE_NUMBER',
 'PUBLISHED_DATE',
 'ROAD_NUMBER',
 'SDATE',
 'STATUS']

##### For 'old' format (child element-based)

In [25]:
def find_all_record_elements_in_directory(directory_path):
    """
    Parses all XML files (old format) in a directory and finds all
    unique child element tag names used across all elements matching the
    OLD_ROADWORK_RECORD_XPATH in any file.
    """
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {directory_path}")
        return None

    print(f"--- Finding All Unique Child Element Tags in Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files to scan.")

    all_element_tags = set() # Use a set to automatically store unique tag names
    # Use a simpler parser if namespaces are not expected/needed for old format
    parser = etree.XMLParser(recover=True)

    processed_files = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        # print(f"Scanning file: {filename}...") # Optional: uncomment for more verbose output
        try:
            # Parse the XML file
            tree = etree.parse(file_path, parser)
            root = tree.getroot()

            # Check if the root tag matches the expected old format root
            if root.tag != 'ha_planned_roadworks':
                # print(f"  Skipping file {filename}: Root tag '{root.tag}' does not match expected 'ha_planned_roadworks'.")
                continue # Skip files that don't match the old root tag

            # Use xpath to find all records in this file (no namespace needed)
            records = root.xpath(OLD_ROADWORK_RECORD_XPATH) # Use the XPath for the old format

            if not records:
                # print(f"  Warning: No records found matching XPath '{OLD_ROADWORK_RECORD_XPATH}' in {filename}.")
                continue # Move to the next file if no records found

            # Iterate through ALL found records in the current file
            for record in records:
                # Iterate through the child elements of the record
                for child_element in record:
                    # Add the tag name of the child element to the set
                    all_element_tags.add(child_element.tag)

            processed_files += 1

        except etree.XMLSyntaxError as e:
            print(f"  Error parsing XML file {filename}: {e}. Skipping file.")
            files_with_errors += 1
        except Exception as e:
            print(f"  An unexpected error occurred scanning file {filename}: {e}. Skipping file.")
            files_with_errors += 1

    print(f"\n--- Scan Complete ---")
    print(f"Successfully scanned {processed_files} files (matching root tag).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to errors during parsing.")
    skipped_non_matching = len(xml_files) - processed_files - files_with_errors
    if skipped_non_matching > 0:
         print(f"Skipped {skipped_non_matching} files because their root tag did not match 'ha_planned_roadworks'.")


    if not all_element_tags:
        print("No child element tags found in any successfully processed files.")
        return None

    # Sort the results for readability
    sorted_tags = sorted(list(all_element_tags))

    print(f"\nFound {len(sorted_tags)} unique child element tags across all scanned files:")
    return sorted_tags

In [26]:
old_format_elements = find_all_record_elements_in_directory(OLD_DATA_DIRECTORY)
old_format_elements

--- Finding All Unique Child Element Tags in Directory: data/old_format ---
Found 7 XML files to scan.

--- Scan Complete ---
Successfully scanned 7 files (matching root tag).

Found 14 unique child element tags across all scanned files:


['centre_easting',
 'centre_northing',
 'closure_type',
 'description',
 'end_date',
 'expected_delay',
 'local_authority',
 'location',
 'published_date',
 'reference_number',
 'road',
 'start_date',
 'status',
 'traffic_management']

'New' format attributes:
```python
['CENTRE_EASTING',
 'CENTRE_NORTHING',
 'CLOSURE_TYPE',
 'DESCRIPTION',
 'EDATE',
 'EXPDEL',
 'NEW_EVENT_NUMBER',
 'Name',
 'OLD_REFERENCE_NUMBER',
 'PUBLISHED_DATE',
 'ROAD_NUMBER',
 'SDATE',
 'STATUS']
```

'Old' format attributes:
```python
['centre_easting',
 'centre_northing',
 'closure_type',
 'description',
 'end_date',
 'expected_delay',
 'local_authority',
 'location',
 'published_date',
 'reference_number',
 'road',
 'start_date',
 'status',
 'traffic_management']
```

### Explore some records

##### For 'new' format

In [33]:
NUM_RECORDS_TO_INSPECT = 3

In [30]:
def explore_roadworks_xml_new(file_path):
    """Parses and explores the specific structure of the provided roadworks XML."""

    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Updated): {file_path} ---")

    try:
        # Parse the XML file
        # Using recover=True can help skip over minor errors if files are slightly malformed
        parser = etree.XMLParser(recover=True, ns_clean=True)
        tree = etree.parse(file_path, parser)
        root = tree.getroot()

        print(f"\n1. Root Element Tag: {root.tag}") # Should be {WebTeam}Report
        print(f"   Root Namespace Map: {root.nsmap}")

        # Use xpath with the namespace map to find the records
        records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

        if not records:
            print(f"\nError: Could not find any elements matching XPath '{NEW_ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the NEW_ROADWORK_RECORD_XPATH and the XML structure.")
            # Print children with their full {namespace}tag names to help debug
            print("\nFirst few children of the root (with full tags):")
            for i, child in enumerate(root[:5]):
                 print(f"  Child {i+1}: {child.tag}")
            return

        print(f"\n2. Found {len(records)} records matching XPath '{NEW_ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(records))} records ---")

        for i, record in enumerate(records[:NUM_RECORDS_TO_INSPECT]):
            print(f"\n\n--- Record {i+1} ---")

            # --- Accessing Attributes of <HE_PLANNED_WORKS> ---
            print(" Attributes of <HE_PLANNED_WORKS>:")
            record_attrs = record.attrib
            for key, value in record_attrs.items():
                 print(f"    {key}: {value}")

            # Extract specific attributes by name
            event_id = record.get('NEW_EVENT_NUMBER')
            start_date = record.get('SDATE')
            end_date = record.get('EDATE')
            description = record.get('DESCRIPTION')
            closure_type = record.get('CLOSURE_TYPE')
            status = record.get('STATUS')
            published_date = record.get('PUBLISHED_DATE')
            exp_del = record.get('EXPDEL')

            print("\n Extracted Key Attributes:")
            print(f"- NEW_EVENT_NUMBER: {event_id}")
            print(f"- SDATE: {start_date}")
            print(f"- EDATE: {end_date}")
            print(f"- DESCRIPTION: {description}")
            print(f"- CLOSURE_TYPE: {closure_type}")
            print(f"- STATUS: {status}")
            print(f"- PUBLISHED_DATE: {published_date}")
            print(f"- EXPDEL: {exp_del}")


            # --- Accessing Nested Coordinates ---
            print("\n Extracting Nested Coordinates:")
            # Define the precise XPath relative to the current 'record' element
            coord_xpath = './d:EASTNORTH/d:Report/d:EASTINGNORTHING/d:EASTNORTH_Collection/d:EASTNORTH'
            coord_elements = record.xpath(coord_xpath, namespaces=NSMAP)

            if coord_elements:
                # Usually expect only one coordinate block per record
                coord_element = coord_elements[0]
                easting = coord_element.get('CENTRE_EASTING')
                northing = coord_element.get('CENTRE_NORTHING')
                print(f"    CENTRE_EASTING: {easting}")
                print(f"    CENTRE_NORTHING: {northing}")
            else:
                print("    Coordinate elements not found.")

            # --- Accessing Nested Roads ---
            print("\nExtracting Nested Roads:")
            # Define the precise XPath relative to the current 'record' element
            road_xpath = './d:ROADS/d:Report/d:ROADS/d:ROAD_Collection/d:ROAD'
            road_elements = record.xpath(road_xpath, namespaces=NSMAP)

            if road_elements:
                road_numbers = [road.get('ROAD_NUMBER') for road in road_elements]
                print(f"    ROAD_NUMBER(s): {road_numbers}") # Might be multiple roads
            else:
                print("    Road elements not found.")


    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


explore_roadworks_xml_new("data/new_format/nh_roadworks_2025_14_4.xml")

--- Exploring XML File (Updated): data/new_format/nh_roadworks_2025_14_4.xml ---

1. Root Element Tag: {WebTeam}Report
   Root Namespace Map: {'xsi': 'http://www.w3.org/2001/XMLSchema-instance', None: 'WebTeam'}

2. Found 1429 records matching XPath './/d:HE_PLANNED_WORKS'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Attributes of <HE_PLANNED_WORKS>:
    NEW_EVENT_NUMBER: 00352573-001
    SDATE: 31-DEC-2023 23:59
    EDATE: 31-MAY-2025 23:59
    EXPDEL: Moderate (10 - 30 mins)
    DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
    CLOSURE_TYPE: Major Schemes
    STATUS: Published
    PUBLISHED_DATE: 2023-12-21T14:45:07

 Extracted Key Attributes:
- NEW_EVENT_NUMBER: 00352573-001
- SDATE: 31-DEC-2023 23:59
- EDATE: 31-MAY-2025 23:59
- DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
- CLOSURE_TYPE: Major Schemes
- STATUS: Published
- PUBLISHED_DATE: 2023-12-21T14:45:07
- EXPDEL: Moderate (10 -

##### For 'old' format

In [31]:
def explore_roadworks_xml_old(file_path):
    """Parses and explores the structure of an old-format roadworks XML."""

    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Old Format): {file_path} ---")

    try:
        # Use a simpler parser, potentially without namespace handling if not needed
        parser = etree.XMLParser(recover=True)
        tree = etree.parse(file_path, parser)
        root = tree.getroot()

        print(f"\n1. Root Element Tag: {root.tag}") # Should be ha_planned_roadworks

        # Check if the root tag is as expected for the old format
        if root.tag != 'ha_planned_roadworks':
            print(f"  Warning: Root tag '{root.tag}' does not match expected 'ha_planned_roadworks'.")
            # Optionally, still try to find records if the XPath might work
            # return # Or uncomment to stop if root tag is wrong

        # Use xpath to find the records (no namespace typically needed for old format)
        records = root.xpath(OLD_ROADWORK_RECORD_XPATH)

        if not records:
            print(f"\nError: Could not find any elements matching XPath '{OLD_ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the OLD_ROADWORK_RECORD_XPATH and the XML structure.")
            print("\nFirst few children of the root:")
            for i, child in enumerate(root[:5]):
                 print(f"  Child {i+1}: {child.tag}")
            return

        print(f"\n2. Found {len(records)} records matching XPath '{OLD_ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(records))} records ---")

        for i, record in enumerate(records[:NUM_RECORDS_TO_INSPECT]):
            print(f"\n\n--- Record {i+1} ---")
            print(f" Record Element Tag: {record.tag}") # Should be ha_planned_works

            # --- Accessing Child Elements ---
            print(" Child Elements (Tag: Text Content):")
            record_data = {}
            for child in record:
                # Clean up text content (strip whitespace, handle None)
                text_content = (child.text or '').strip()
                print(f"    {child.tag}: {text_content}")
                record_data[child.tag] = text_content # Store for easier access later

            # Extract specific child element text content by tag name
            # Based on the output of find_all_record_elements_in_directory
            ref_num = record_data.get('reference_number')
            start_date = record_data.get('start_date')
            end_date = record_data.get('end_date')
            description = record_data.get('description')
            road = record_data.get('road')
            status = record_data.get('status')
            easting = record_data.get('centre_easting')
            northing = record_data.get('centre_northing')
            delay = record_data.get('expected_delay')
            closure = record_data.get('closure_type')

            print("\n Extracted Key Child Element Values:")
            print(f"- reference_number: {ref_num}")
            print(f"- start_date: {start_date}")
            print(f"- end_date: {end_date}")
            print(f"- description: {description}")
            print(f"- road: {road}")
            print(f"- status: {status}")
            print(f"- centre_easting: {easting}")
            print(f"- centre_northing: {northing}")
            print(f"- expected_delay: {delay}")
            print(f"- closure_type: {closure}")


    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


example_old_file = os.path.join(OLD_DATA_DIRECTORY, 'he_roadworks_2017_06_05')
if os.path.exists(example_old_file):
    explore_roadworks_xml_old(example_old_file)
else:
    # Find the first available XML file in the old data directory if the example doesn't exist
    old_files = glob.glob(os.path.join(OLD_DATA_DIRECTORY, '*.xml'))
    if old_files:
        print(f"Example file '{example_old_file}' not found. Using first available file: {old_files[0]}")
        explore_roadworks_xml_old(old_files[0])
    else:
        print(f"Error: No XML files found in {OLD_DATA_DIRECTORY} to explore.")


Example file 'data/old_format\he_roadworks_2017_06_05' not found. Using first available file: data/old_format\ha-roadworks_2011_10_10.xml
--- Exploring XML File (Old Format): data/old_format\ha-roadworks_2011_10_10.xml ---

1. Root Element Tag: ha_planned_roadworks

2. Found 1425 records matching XPath './/ha_planned_works'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Record Element Tag: ha_planned_works
 Child Elements (Tag: Text Content):
    reference_number: 972963
    road: M1
    local_authority: Leicestershire / Northamptonshire
    location: Catthorpe
    start_date: 2010-07-12T07:00:00
    end_date: 2013-03-23T06:00:00
    expected_delay: Moderate (10 - 30 mins)
    description: Major junction works will include lane closures, contraflow, full closures and 50 MPH speed restrictions on the M1 and M6.
    traffic_management: Other
    closure_type: Planned Works
    centre_easting: 456252
    centre_northing: 278173
    status: Firm
    published_date: 2011-10-09T21:0

### Extract all records from XML

In [None]:
def extract_record_data_as_dict(record_element, source_filename):
    """
    Extracts data from a single <HE_PLANNED_WORKS> lxml element into a dictionary.
    Handles nested structures.
    Args:
        record_element: The lxml element for the record.
        source_filename: The name of the file this record came from.

    Returns:
        A dictionary containing the extracted data for one record,
        or None if essential data (like event number) is missing.
    """
    data = {}
    data['source_filename'] = source_filename

    # Extract direct attributes based on TARGET_COLUMNS (excluding derived ones)
    direct_attrs = [col for col in TARGET_COLUMNS if col not in ['source_filename', 'centre_easting', 'centre_northing', 'road_numbers']]
    for attr in direct_attrs:
        data[attr] = record_element.get(attr)

    # Basic check - skip if no event number
    if data.get('NEW_EVENT_NUMBER') is None:
        # print(f"Warning: Record missing NEW_EVENT_NUMBER in {source_filename}. Skipping.")
        return None # Return None to indicate skipping this record

    # Extract nested coordinates
    coord_elements = record_element.xpath(NEW_COORD_XPATH, namespaces=NSMAP)
    if coord_elements:
        coord_element = coord_elements[0]
        data['centre_easting'] = coord_element.get('CENTRE_EASTING')
        data['centre_northing'] = coord_element.get('CENTRE_NORTHING')
    else:
        data['centre_easting'] = None
        data['centre_northing'] = None

    # Extract nested roads
    road_elements = record_element.xpath(NEW_ROAD_XPATH, namespaces=NSMAP)
    if road_elements:
        road_numbers = [road.get('ROAD_NUMBER') for road in road_elements if road.get('ROAD_NUMBER')]
        data['road_numbers'] = '; '.join(road_numbers) if road_numbers else None
    else:
        data['road_numbers'] = None

    return data

In [None]:
def process_xml_files(data_dir, db_file, table_name):
    """
    Processes all XML files in a directory and loads data into DuckDB
    directly using executemany without Pandas.
    """
    all_records_data_dicts = [] # Keep collecting dictionaries first
    xml_files = glob.glob(os.path.join(data_dir, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {data_dir}")
        return

    print(f"Found {len(xml_files)} XML files to process in '{data_dir}'.")
    parser = etree.XMLParser(recover=True, ns_clean=True)

    total_processed_records = 0
    skipped_records = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        print(f"Processing file: {filename}...")
        try:
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

            if not records:
                print(f"  Warning: No records found matching XPath in {filename}.")
                continue

            file_record_count = 0
            for record in records:
                try:
                    extracted_dict = extract_record_data_as_dict(record, filename)
                    if extracted_dict:
                        all_records_data_dicts.append(extracted_dict)
                        file_record_count += 1
                    else:
                        skipped_records += 1 # Count records skipped due to missing ID
                except Exception as e_rec:
                    event_id = record.get('NEW_EVENT_NUMBER', 'UNKNOWN_ID')
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    skipped_records += 1

            print(f"  Extracted {file_record_count} valid records from {filename}.")
            total_processed_records += file_record_count

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")

    if not all_records_data_dicts:
        print("No valid data extracted from any files. Database will not be updated.")
        if skipped_records > 0:
             print(f"Note: {skipped_records} records were skipped due to errors or missing IDs.")
        return

    print(f"\nTotal valid records extracted across all files: {total_processed_records}")
    if skipped_records > 0:
        print(f"Total records skipped due to errors or missing IDs: {skipped_records}")

    # --- Convert list of dictionaries to list of tuples/lists for insertion ---
    print("Preparing data for insertion...")
    data_to_insert = []
    t = 0 # DEBUG
    for record_dict in all_records_data_dicts:
        row_values = [record_dict.get(col_name) for col_name in TARGET_COLUMNS]
        if t == 0: # DEBUG
            print(f"Row {t}: {row_values}") # DEBUG
        t += 1 # DEBUG
        data_to_insert.append(row_values)

    # --- Load data into DuckDB directly using executemany ---
    print(f"Connecting to DuckDB database: {db_file}")
    con = duckdb.connect(database=db_file, read_only=False)

    try:
        print(f"Creating or replacing table: {table_name}")
        column_defs = [f'"{col}" VARCHAR' for col in TARGET_COLUMNS] # Quote names
        create_table_sql = f"CREATE OR REPLACE TABLE {table_name} ({', '.join(column_defs)})"
        con.execute(create_table_sql)

        print(f"Inserting {len(data_to_insert)} records into {table_name}...")

        # ***** CORRECTED INSERTION METHOD using executemany *****
        # Create the SQL insert statement with placeholders
        placeholders = ', '.join(['?'] * NUM_COLUMNS) # e.g., "?, ?, ?, ..."
        insert_sql = f'INSERT INTO {table_name} VALUES ({placeholders})'

        # Execute the insert statement for all rows in data_to_insert
        con.executemany(insert_sql, data_to_insert)
        # *******************************************************

        con.commit() # Commit the transaction
        print("Data insertion complete.")

        # Verify insertion (optional)
        count_result = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()
        print(f"Verification: Table '{table_name}' now contains {count_result[0]} rows.")

    except duckdb.Error as e_db: # Catch specific DuckDB errors
        print(f"Database error: {e_db}")
        try:
            print("Attempting to rollback transaction.")
            con.rollback()
        except duckdb.TransactionException as e_tx:
            print(f"Rollback failed (likely no active transaction): {e_tx}")
    except Exception as e:
         print(f"An unexpected error occurred during DB operation: {e}")
         try:
            con.rollback()
         except duckdb.TransactionException as e_tx:
            print(f"Rollback failed (likely no active transaction): {e_tx}")
    finally:
        con.close()
        print("Database connection closed.")

In [28]:
process_xml_files(DATA_DIRECTORY, DUCKDB_FILE, TABLE_NAME)

Found 15 XML files to process in 'data'.
Processing file: ha-roadworks_2011_10_10.xml...
Processing file: ha-roadworks_2012_04_09.xml...
Processing file: ha-roadworks_2013_05_06.xml...
Processing file: ha-roadworks_2014_03_31.xml...
Processing file: ha_roadworks_2015_03_16.xml...
Processing file: he_roadworks_2016_02_29.xml...
Processing file: he_roadworks_2017_06_05.xml...
Processing file: he_roadworks_2018_02_26.xml...
  Extracted 1477 valid records from he_roadworks_2018_02_26.xml.
Processing file: he_roadworks_2019_04_15.xml...
  Extracted 1103 valid records from he_roadworks_2019_04_15.xml.
Processing file: he_roadworks_2020_05_25.xml...
  Extracted 1426 valid records from he_roadworks_2020_05_25.xml.
Processing file: he_roadworks_2021_03_01.xml...
  Extracted 1567 valid records from he_roadworks_2021_03_01.xml.
Processing file: nh_roadworks_2022_3_14.xml...
  Extracted 1621 valid records from nh_roadworks_2022_3_14.xml.
Processing file: nh_roadworks_2023_3_6.xml...
  Extracted 14