# Explore XML Data

In [14]:
from lxml import etree
import duckdb
import polars as pl
import glob
import os
from IPython.display import display

In [15]:
# --- Configuration ---
# Input directories; each is scanned for '*.xml' files by the functions below.
NEW_DATA_DIRECTORY = 'data/new_format'     # data from 2018 onwards
OLD_DATA_DIRECTORY = 'data/old_format' # data from 2017 and earlier

DUCKDB_FILE = 'roadworks_data.duckdb'  # Path of the DuckDB database file
# Define separate table names for new and old formats
RAW_NEW_TABLE_NAME = 'raw_new_roadworks'
RAW_OLD_TABLE_NAME = 'raw_old_roadworks'

# Namespace map for XPath queries on new-format files: the prefix 'd' is bound
# to the 'WebTeam' namespace that those files declare as their default namespace.
NSMAP = {'d': 'WebTeam'}

# XPath to find the repeating record element
NEW_ROADWORK_RECORD_XPATH = './/d:HE_PLANNED_WORKS'
OLD_ROADWORK_RECORD_XPATH = './/ha_planned_works' # XPath for the old format record

# --- Define Raw Columns based on exploration ---

# Columns for the 'new' format raw table
# Includes source_filename and values pulled from nested elements
RAW_NEW_COLUMNS = [
    'source_filename',
    # Attributes from HE_PLANNED_WORKS
    'NEW_EVENT_NUMBER',
    'OLD_REFERENCE_NUMBER',
    'SDATE',
    'EDATE',
    'EXPDEL',
    'DESCRIPTION',
    'CLOSURE_TYPE',
    'STATUS',
    'PUBLISHED_DATE',
    # Attributes taken from nested elements (see NEW_COORD_XPATH / NEW_ROAD_XPATH)
    'CENTRE_EASTING',
    'CENTRE_NORTHING',
    'ROAD_NUMBERS' # Potentially multiple road numbers, joined by '; '
]

# Columns for the 'old' format raw table
# Includes source_filename and direct child element tags
RAW_OLD_COLUMNS = [
    'source_filename',
    # Child elements of ha_planned_works
    'reference_number',
    'start_date',
    'end_date',
    'expected_delay',
    'description',
    'closure_type',
    'status',
    'published_date',
    'centre_easting',
    'centre_northing',
    'road',
    'location',
    'local_authority',
    'traffic_management'
]

# Define XPaths for nested data relative to the NEW format HE_PLANNED_WORKS element
NEW_COORD_XPATH = './d:EASTNORTH/d:Report/d:EASTINGNORTHING/d:EASTNORTH_Collection/d:EASTNORTH'
NEW_ROAD_XPATH = './d:ROADS/d:Report/d:ROADS/d:ROAD_Collection/d:ROAD'

### OLD: Extract single XML file

In [None]:
def extract_record_data_as_dict(record_element, source_filename):
    """
    Build a flat dictionary from one <HE_PLANNED_WORKS> lxml element.

    Direct attributes are read for every name in the module-level
    TARGET_COLUMNS except the derived columns; coordinates and road numbers
    are pulled from nested elements via NEW_COORD_XPATH / NEW_ROAD_XPATH.

    Args:
        record_element: The lxml element for the record.
        source_filename: The name of the file this record came from.

    Returns:
        The dictionary for this record, or None when the record has no
        NEW_EVENT_NUMBER attribute.
    """
    # Columns that are filled in below rather than read as direct attributes.
    derived_columns = ('source_filename', 'centre_easting', 'centre_northing', 'road_numbers')

    row = {'source_filename': source_filename}
    row.update(
        (name, record_element.get(name))
        for name in TARGET_COLUMNS
        if name not in derived_columns
    )

    # A record without an event number is skipped by the caller.
    if row.get('NEW_EVENT_NUMBER') is None:
        return None

    # Nested coordinates: only the first matching element is used.
    coords = record_element.xpath(NEW_COORD_XPATH, namespaces=NSMAP)
    if coords:
        easting = coords[0].get('CENTRE_EASTING')
        northing = coords[0].get('CENTRE_NORTHING')
    else:
        easting = northing = None
    row['centre_easting'] = easting
    row['centre_northing'] = northing

    # Nested roads: keep non-empty road numbers and join them into one string.
    roads = record_element.xpath(NEW_ROAD_XPATH, namespaces=NSMAP)
    numbers = [number for number in (road.get('ROAD_NUMBER') for road in roads) if number]
    row['road_numbers'] = '; '.join(numbers) if numbers else None

    return row

In [None]:
def _rollback_quietly(con):
    """Roll back on `con`, printing (not raising) a failed rollback."""
    try:
        con.rollback()
    except duckdb.TransactionException as e_tx:
        print(f"Rollback failed (likely no active transaction): {e_tx}")


def process_xml_files(data_dir, db_file, table_name):
    """
    Load every new-format XML file in a directory into a DuckDB table.

    Each '*.xml' file in `data_dir` is parsed, records are located with
    NEW_ROADWORK_RECORD_XPATH and converted by extract_record_data_as_dict.
    The table is then created (or replaced) with one VARCHAR column per entry
    of the module-level TARGET_COLUMNS and filled with executemany.

    Args:
        data_dir: Directory that is searched for '*.xml' files.
        db_file: Path of the DuckDB database file to open for writing.
        table_name: Name of the table to create or replace.

    Returns:
        None. Progress and errors are reported with print(); parse and
        database errors are caught and reported rather than raised.
    """
    xml_files = glob.glob(os.path.join(data_dir, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {data_dir}")
        return

    print(f"Found {len(xml_files)} XML files to process in '{data_dir}'.")
    parser = etree.XMLParser(recover=True, ns_clean=True)

    all_records_data_dicts = []
    total_processed_records = 0
    skipped_records = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        print(f"Processing file: {filename}...")
        try:
            root = etree.parse(file_path, parser).getroot()
            records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

            if not records:
                print(f"  Warning: No records found matching XPath in {filename}.")
                continue

            file_record_count = 0
            for record in records:
                try:
                    extracted_dict = extract_record_data_as_dict(record, filename)
                except Exception as e_rec:
                    # One bad record must not abort the rest of the file.
                    event_id = record.get('NEW_EVENT_NUMBER', 'UNKNOWN_ID')
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    skipped_records += 1
                    continue

                if extracted_dict:
                    all_records_data_dicts.append(extracted_dict)
                    file_record_count += 1
                else:
                    skipped_records += 1  # Record skipped due to missing ID

            print(f"  Extracted {file_record_count} valid records from {filename}.")
            total_processed_records += file_record_count

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")

    if not all_records_data_dicts:
        print("No valid data extracted from any files. Database will not be updated.")
        if skipped_records > 0:
            print(f"Note: {skipped_records} records were skipped due to errors or missing IDs.")
        return

    print(f"\nTotal valid records extracted across all files: {total_processed_records}")
    if skipped_records > 0:
        print(f"Total records skipped due to errors or missing IDs: {skipped_records}")

    # --- Convert the dictionaries to rows ordered like TARGET_COLUMNS ---
    print("Preparing data for insertion...")
    data_to_insert = [
        [record_dict.get(col_name) for col_name in TARGET_COLUMNS]
        for record_dict in all_records_data_dicts
    ]
    # Preview of the first row (the list is non-empty at this point).
    print(f"Row 0: {data_to_insert[0]}")

    # --- Load data into DuckDB using executemany ---
    print(f"Connecting to DuckDB database: {db_file}")
    con = duckdb.connect(database=db_file, read_only=False)

    try:
        print(f"Creating or replacing table: {table_name}")
        column_defs = [f'"{col}" VARCHAR' for col in TARGET_COLUMNS]  # Quote names
        con.execute(f"CREATE OR REPLACE TABLE {table_name} ({', '.join(column_defs)})")

        print(f"Inserting {len(data_to_insert)} records into {table_name}...")

        # The placeholder count is derived from TARGET_COLUMNS itself so it can
        # never disagree with the table definition or the row width above.
        placeholders = ', '.join(['?'] * len(TARGET_COLUMNS))
        con.executemany(f'INSERT INTO {table_name} VALUES ({placeholders})', data_to_insert)

        con.commit()
        print("Data insertion complete.")

        # Verify insertion
        count_result = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()
        print(f"Verification: Table '{table_name}' now contains {count_result[0]} rows.")

    except duckdb.Error as e_db:
        print(f"Database error: {e_db}")
        print("Attempting to rollback transaction.")
        _rollback_quietly(con)
    except Exception as e:
        print(f"An unexpected error occurred during DB operation: {e}")
        _rollback_quietly(con)
    finally:
        con.close()
        print("Database connection closed.")

### Find all unique attributes in many XML files

##### For 'new' format (attributes-based)

In [16]:
def find_all_record_attributes_in_directory(directory_path):
    """
    Collect the attribute names used by new-format records in a directory.

    Every '*.xml' file in `directory_path` is parsed; for each element matching
    NEW_ROADWORK_RECORD_XPATH the attribute names of the element itself and of
    all of its descendants are gathered.

    Returns:
        A sorted list of unique attribute names, or None when the directory
        holds no XML files or no attributes were found.
    """
    xml_paths = glob.glob(os.path.join(directory_path, '*.xml'))
    if not xml_paths:
        print(f"Error: No XML files found in directory: {directory_path}")
        return None

    print(f"--- Finding All Unique Attributes in Directory: {directory_path} ---")
    print(f"Found {len(xml_paths)} XML files to scan.")

    xml_parser = etree.XMLParser(recover=True, ns_clean=True)  # shared by all files
    seen_names = set()
    scanned_ok = 0  # only files that contained at least one record are counted
    failed = 0

    for xml_path in xml_paths:
        base_name = os.path.basename(xml_path)
        try:
            document_root = etree.parse(xml_path, xml_parser).getroot()
            matched = document_root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)
            if matched:
                for rec in matched:
                    # Attributes on the record element itself...
                    seen_names.update(rec.attrib.keys())
                    # ...and on every element nested below it.
                    for node in rec.iterdescendants():
                        seen_names.update(node.attrib.keys())
                scanned_ok += 1
        except etree.XMLSyntaxError as e:
            print(f"  Error parsing XML file {base_name}: {e}. Skipping file.")
            failed += 1
        except Exception as e:
            print(f"  An unexpected error occurred scanning file {base_name}: {e}. Skipping file.")
            failed += 1

    print("\n--- Scan Complete ---")
    print(f"Successfully scanned {scanned_ok} files.")
    if failed > 0:
        print(f"Skipped {failed} files due to errors.")

    if not seen_names:
        print("No attributes found in any successfully processed files.")
        return None

    ordered_names = sorted(seen_names)
    print(f"\nFound {len(ordered_names)} unique attributes across all scanned files:")
    return ordered_names

In [17]:
# Scan the new-format directory; the returned sorted list is displayed as the cell result.
find_all_record_attributes_in_directory(NEW_DATA_DIRECTORY)

--- Finding All Unique Attributes in Directory: data/new_format ---
Found 8 XML files to scan.

--- Scan Complete ---
Successfully scanned 8 files.

Found 13 unique attributes across all scanned files:


['CENTRE_EASTING',
 'CENTRE_NORTHING',
 'CLOSURE_TYPE',
 'DESCRIPTION',
 'EDATE',
 'EXPDEL',
 'NEW_EVENT_NUMBER',
 'Name',
 'OLD_REFERENCE_NUMBER',
 'PUBLISHED_DATE',
 'ROAD_NUMBER',
 'SDATE',
 'STATUS']

##### For 'old' format (child element-based)

In [18]:
def find_all_record_elements_in_directory(directory_path):
    """
    Collect the child element tag names used by old-format records in a directory.

    Every '*.xml' file in `directory_path` is parsed. Files whose root tag is
    not 'ha_planned_roadworks' are skipped; in the remaining files the tags of
    the direct children of each element matching OLD_ROADWORK_RECORD_XPATH are
    gathered.

    Returns:
        A sorted list of unique child tag names, or None when the directory
        holds no XML files or no tags were found.
    """
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {directory_path}")
        return None

    print(f"--- Finding All Unique Child Element Tags in Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files to scan.")

    all_element_tags = set()
    # No namespace clean-up: old-format records are matched without a namespace.
    parser = etree.XMLParser(recover=True)

    # Each skip reason has its own counter so the summary cannot attribute a
    # file to the wrong cause.
    processed_files = 0
    files_with_errors = 0
    files_with_wrong_root = 0
    files_without_records = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        try:
            root = etree.parse(file_path, parser).getroot()

            if root.tag != 'ha_planned_roadworks':
                files_with_wrong_root += 1
                continue

            records = root.xpath(OLD_ROADWORK_RECORD_XPATH)
            if not records:
                files_without_records += 1
                continue

            for record in records:
                # Comments and processing instructions have a non-string tag;
                # keeping them would make the final sort fail.
                all_element_tags.update(
                    child.tag for child in record if isinstance(child.tag, str)
                )

            processed_files += 1

        except etree.XMLSyntaxError as e:
            print(f"  Error parsing XML file {filename}: {e}. Skipping file.")
            files_with_errors += 1
        except Exception as e:
            print(f"  An unexpected error occurred scanning file {filename}: {e}. Skipping file.")
            files_with_errors += 1

    print("\n--- Scan Complete ---")
    print(f"Successfully scanned {processed_files} files (matching root tag).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to errors during parsing.")
    if files_with_wrong_root > 0:
        print(f"Skipped {files_with_wrong_root} files because their root tag did not match 'ha_planned_roadworks'.")
    if files_without_records > 0:
        print(f"Skipped {files_without_records} files because no records matched '{OLD_ROADWORK_RECORD_XPATH}'.")

    if not all_element_tags:
        print("No child element tags found in any successfully processed files.")
        return None

    sorted_tags = sorted(all_element_tags)

    print(f"\nFound {len(sorted_tags)} unique child element tags across all scanned files:")
    return sorted_tags

In [19]:
# Scan the old-format directory and keep the sorted tag list (None if nothing was found).
old_format_elements = find_all_record_elements_in_directory(OLD_DATA_DIRECTORY)
# Bare name as the last expression so the notebook displays the result.
old_format_elements

--- Finding All Unique Child Element Tags in Directory: data/old_format ---
Found 7 XML files to scan.

--- Scan Complete ---
Successfully scanned 7 files (matching root tag).

Found 14 unique child element tags across all scanned files:


['centre_easting',
 'centre_northing',
 'closure_type',
 'description',
 'end_date',
 'expected_delay',
 'local_authority',
 'location',
 'published_date',
 'reference_number',
 'road',
 'start_date',
 'status',
 'traffic_management']

'New' format attributes:
```python
['CENTRE_EASTING',
 'CENTRE_NORTHING',
 'CLOSURE_TYPE',
 'DESCRIPTION',
 'EDATE',
 'EXPDEL',
 'NEW_EVENT_NUMBER',
 'Name',
 'OLD_REFERENCE_NUMBER',
 'PUBLISHED_DATE',
 'ROAD_NUMBER',
 'SDATE',
 'STATUS']
```

'Old' format child element tags:
```python
['centre_easting',
 'centre_northing',
 'closure_type',
 'description',
 'end_date',
 'expected_delay',
 'local_authority',
 'location',
 'published_date',
 'reference_number',
 'road',
 'start_date',
 'status',
 'traffic_management']
```

### Explore some records

##### For 'new' format

In [20]:
# Maximum number of records per file that the explore_roadworks_xml_* functions print.
NUM_RECORDS_TO_INSPECT = 3

In [21]:
def explore_roadworks_xml_new(file_path):
    """
    Print a structural overview of one new-format roadworks XML file.

    Shows the root tag and namespace map, the number of records matching
    NEW_ROADWORK_RECORD_XPATH and, for the first NUM_RECORDS_TO_INSPECT
    records, all attributes plus the nested coordinates and road numbers.

    Args:
        file_path: Path of the XML file to inspect.

    Returns:
        None. Everything is reported with print(); parse errors are caught
        and printed rather than raised.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Updated): {file_path} ---")

    # Attributes echoed individually after the full attribute dump.
    key_attributes = (
        'NEW_EVENT_NUMBER',
        'SDATE',
        'EDATE',
        'DESCRIPTION',
        'CLOSURE_TYPE',
        'STATUS',
        'PUBLISHED_DATE',
        'EXPDEL',
    )

    try:
        # recover=True lets the parser skip over minor errors in malformed files.
        parser = etree.XMLParser(recover=True, ns_clean=True)
        root = etree.parse(file_path, parser).getroot()

        print(f"\n1. Root Element Tag: {root.tag}")
        print(f"   Root Namespace Map: {root.nsmap}")

        records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

        if not records:
            print(f"\nError: Could not find any elements matching XPath '{NEW_ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the NEW_ROADWORK_RECORD_XPATH and the XML structure.")
            # Full {namespace}tag names of the first children help debug the XPath.
            print("\nFirst few children of the root (with full tags):")
            for position, child in enumerate(root[:5], start=1):
                print(f"  Child {position}: {child.tag}")
            return

        print(f"\n2. Found {len(records)} records matching XPath '{NEW_ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(records))} records ---")

        for position, record in enumerate(records[:NUM_RECORDS_TO_INSPECT], start=1):
            print(f"\n\n--- Record {position} ---")

            # --- Attributes of <HE_PLANNED_WORKS> ---
            print(" Attributes of <HE_PLANNED_WORKS>:")
            for key, value in record.attrib.items():
                print(f"    {key}: {value}")

            print("\n Extracted Key Attributes:")
            for attr_name in key_attributes:
                print(f"- {attr_name}: {record.get(attr_name)}")

            # --- Nested coordinates ---
            # The shared XPath constant keeps this explorer in step with the
            # extraction code instead of repeating the path inline.
            print("\n Extracting Nested Coordinates:")
            coord_elements = record.xpath(NEW_COORD_XPATH, namespaces=NSMAP)
            if coord_elements:
                # Only the first coordinate block of a record is shown.
                coord_element = coord_elements[0]
                print(f"    CENTRE_EASTING: {coord_element.get('CENTRE_EASTING')}")
                print(f"    CENTRE_NORTHING: {coord_element.get('CENTRE_NORTHING')}")
            else:
                print("    Coordinate elements not found.")

            # --- Nested roads ---
            print("\nExtracting Nested Roads:")
            road_elements = record.xpath(NEW_ROAD_XPATH, namespaces=NSMAP)
            if road_elements:
                road_numbers = [road.get('ROAD_NUMBER') for road in road_elements]
                print(f"    ROAD_NUMBER(s): {road_numbers}")  # Might be multiple roads
            else:
                print("    Road elements not found.")

    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


# Explore one sample new-format file.
explore_roadworks_xml_new("data/new_format/nh_roadworks_2025_14_4.xml")

--- Exploring XML File (Updated): data/new_format/nh_roadworks_2025_14_4.xml ---

1. Root Element Tag: {WebTeam}Report
   Root Namespace Map: {'xsi': 'http://www.w3.org/2001/XMLSchema-instance', None: 'WebTeam'}

2. Found 1429 records matching XPath './/d:HE_PLANNED_WORKS'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Attributes of <HE_PLANNED_WORKS>:
    NEW_EVENT_NUMBER: 00352573-001
    SDATE: 31-DEC-2023 23:59
    EDATE: 31-MAY-2025 23:59
    EXPDEL: Moderate (10 - 30 mins)
    DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
    CLOSURE_TYPE: Major Schemes
    STATUS: Published
    PUBLISHED_DATE: 2023-12-21T14:45:07

 Extracted Key Attributes:
- NEW_EVENT_NUMBER: 00352573-001
- SDATE: 31-DEC-2023 23:59
- EDATE: 31-MAY-2025 23:59
- DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
- CLOSURE_TYPE: Major Schemes
- STATUS: Published
- PUBLISHED_DATE: 2023-12-21T14:45:07
- EXPDEL: Moderate (10 -

##### For 'old' format

In [22]:
def explore_roadworks_xml_old(file_path):
    """
    Print a structural overview of one old-format roadworks XML file.

    Shows the root tag, the number of records matching
    OLD_ROADWORK_RECORD_XPATH and, for the first NUM_RECORDS_TO_INSPECT
    records, the tag and stripped text of every child element followed by a
    fixed selection of key values.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Old Format): {file_path} ---")

    # Child tags echoed individually after the full child listing.
    summary_tags = (
        'reference_number',
        'start_date',
        'end_date',
        'description',
        'road',
        'status',
        'centre_easting',
        'centre_northing',
        'expected_delay',
        'closure_type',
    )

    try:
        xml_parser = etree.XMLParser(recover=True)
        document_root = etree.parse(file_path, xml_parser).getroot()

        print(f"\n1. Root Element Tag: {document_root.tag}")

        # An unexpected root is only reported; the record search still runs.
        if document_root.tag != 'ha_planned_roadworks':
            print(f"  Warning: Root tag '{document_root.tag}' does not match expected 'ha_planned_roadworks'.")

        matches = document_root.xpath(OLD_ROADWORK_RECORD_XPATH)

        if not matches:
            print(f"\nError: Could not find any elements matching XPath '{OLD_ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the OLD_ROADWORK_RECORD_XPATH and the XML structure.")
            print("\nFirst few children of the root:")
            for position, node in enumerate(document_root[:5], start=1):
                print(f"  Child {position}: {node.tag}")
            return

        print(f"\n2. Found {len(matches)} records matching XPath '{OLD_ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(matches))} records ---")

        for position, entry in enumerate(matches[:NUM_RECORDS_TO_INSPECT], start=1):
            print(f"\n\n--- Record {position} ---")
            print(f" Record Element Tag: {entry.tag}")

            print(" Child Elements (Tag: Text Content):")
            values_by_tag = {}
            for node in entry:
                # Missing text becomes '' and surrounding whitespace is removed.
                cleaned = (node.text or '').strip()
                print(f"    {node.tag}: {cleaned}")
                values_by_tag[node.tag] = cleaned

            print("\n Extracted Key Child Element Values:")
            for tag in summary_tags:
                print(f"- {tag}: {values_by_tag.get(tag)}")

    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


# Preferred sample file. The '.xml' suffix is needed for the path to be able to
# match a data file: every other lookup in this notebook globs for '*.xml'.
example_old_file = os.path.join(OLD_DATA_DIRECTORY, 'he_roadworks_2017_06_05.xml')
if os.path.exists(example_old_file):
    explore_roadworks_xml_old(example_old_file)
else:
    # Fall back to the first XML file in the old data directory; sorting makes
    # the choice independent of the order glob happens to return.
    old_files = sorted(glob.glob(os.path.join(OLD_DATA_DIRECTORY, '*.xml')))
    if old_files:
        print(f"Example file '{example_old_file}' not found. Using first available file: {old_files[0]}")
        explore_roadworks_xml_old(old_files[0])
    else:
        print(f"Error: No XML files found in {OLD_DATA_DIRECTORY} to explore.")


Example file 'data/old_format\he_roadworks_2017_06_05' not found. Using first available file: data/old_format\ha-roadworks_2011_10_10.xml
--- Exploring XML File (Old Format): data/old_format\ha-roadworks_2011_10_10.xml ---

1. Root Element Tag: ha_planned_roadworks

2. Found 1425 records matching XPath './/ha_planned_works'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Record Element Tag: ha_planned_works
 Child Elements (Tag: Text Content):
    reference_number: 972963
    road: M1
    local_authority: Leicestershire / Northamptonshire
    location: Catthorpe
    start_date: 2010-07-12T07:00:00
    end_date: 2013-03-23T06:00:00
    expected_delay: Moderate (10 - 30 mins)
    description: Major junction works will include lane closures, contraflow, full closures and 50 MPH speed restrictions on the M1 and M6.
    traffic_management: Other
    closure_type: Planned Works
    centre_easting: 456252
    centre_northing: 278173
    status: Firm
    published_date: 2011-10-09T21:0

### Define XML-record extraction functions

In [23]:
def extract_record_new_format(record_element, source_filename):
    """
    Turn one 'new' format <HE_PLANNED_WORKS> element into a raw row dictionary.

    The dictionary has a key for every entry of RAW_NEW_COLUMNS (None where no
    value was found). Returns None when the element has no NEW_EVENT_NUMBER
    attribute.
    """
    # Attributes read straight from the record element, in column order.
    direct_attributes = (
        'NEW_EVENT_NUMBER',
        'OLD_REFERENCE_NUMBER',
        'SDATE',
        'EDATE',
        'EXPDEL',
        'DESCRIPTION',
        'CLOSURE_TYPE',
        'STATUS',
        'PUBLISHED_DATE',
    )

    row = dict.fromkeys(RAW_NEW_COLUMNS)  # every column starts as None
    row['source_filename'] = source_filename
    for attribute in direct_attributes:
        row[attribute] = record_element.get(attribute)

    # The event number is the essential identifier; without it the record is dropped.
    if row.get('NEW_EVENT_NUMBER') is None:
        return None

    # Nested coordinates: only the first matching element is used.
    coords = record_element.xpath(NEW_COORD_XPATH, namespaces=NSMAP)
    if coords:
        first_coord = coords[0]
        row['CENTRE_EASTING'] = first_coord.get('CENTRE_EASTING')
        row['CENTRE_NORTHING'] = first_coord.get('CENTRE_NORTHING')

    # Nested roads: non-empty road numbers are joined into a single string.
    roads = record_element.xpath(NEW_ROAD_XPATH, namespaces=NSMAP)
    if roads:
        numbers = [number for number in (road.get('ROAD_NUMBER') for road in roads) if number]
        row['ROAD_NUMBERS'] = '; '.join(numbers) if numbers else None

    return row

def extract_record_old_format(record_element, source_filename):
    """
    Turn one 'old' format <ha_planned_works> element into a raw row dictionary.

    Each column of RAW_OLD_COLUMNS other than 'source_filename' is filled with
    the stripped text of the child element of the same name, or None when that
    child is absent or has no text. Returns None when 'reference_number' ends
    up as None.
    """
    row = dict.fromkeys(RAW_OLD_COLUMNS)  # every column starts as None
    row['source_filename'] = source_filename

    for tag in RAW_OLD_COLUMNS:
        if tag == 'source_filename':
            continue
        node = record_element.find(tag)
        if node is None or not node.text:
            row[tag] = None
        else:
            row[tag] = node.text.strip()

    # The reference number is the essential identifier; without it the record is dropped.
    if row.get('reference_number') is None:
        return None

    return row

### Generic directory processor

In [24]:
# OLD METHOD: Returns a list (memory inefficient for large datasets)
def process_directory(directory_path, record_xpath, extraction_func, nsmap=None):
    """
    Processes all XML files in a directory using a specific XPath and extraction function.

    Files are visited in sorted path order so repeated runs produce the same
    record order. Files that fail to parse and records whose extraction raises
    are reported with print() and skipped; they never abort the scan. Records
    for which extraction_func returns a falsy value are counted as skipped.

    Args:
        directory_path (str): Path to the directory containing XML files.
        record_xpath (str): XPath expression to find record elements.
        extraction_func (callable): Function to call for each record element found.
                                    It should accept (record_element, source_filename)
                                    and return a dictionary or None.
        nsmap (dict, optional): Namespace map for XPath evaluation. Defaults to None.

    Returns:
        list: A list of dictionaries, where each dictionary represents a processed record.
              Empty when the directory contains no '*.xml' files.
    """
    all_records_data_dicts = []
    # glob order is filesystem-dependent; sort for a reproducible load order.
    xml_files = sorted(glob.glob(os.path.join(directory_path, '*.xml')))

    if not xml_files:
        print(f"Warning: No XML files found in directory: {directory_path}")
        return []

    # Created only once there is something to parse. recover=True asks lxml to
    # keep going through broken markup rather than rejecting the whole file.
    parser = etree.XMLParser(recover=True, ns_clean=True)

    print(f"\n--- Processing Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files.")

    total_processed_records = 0
    total_skipped_records = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        try:
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            # Find records using the provided XPath and namespace map
            records = root.xpath(record_xpath, namespaces=nsmap)

            if not records:
                continue

            file_record_count = 0
            file_skipped_count = 0
            for record in records:
                try:
                    extracted_dict = extraction_func(record, filename)
                    if extracted_dict:
                        all_records_data_dicts.append(extracted_dict)
                        file_record_count += 1
                    else:
                        file_skipped_count += 1 # Count records skipped by extraction func
                except Exception as e_rec:
                    # Best-effort lookup of an identifier purely for the log line.
                    event_id = "UNKNOWN_ID"
                    try:
                        if nsmap: # Likely new format: ID is an attribute
                            event_id = record.get('NEW_EVENT_NUMBER', event_id)
                        else: # Likely old format: ID is a child element
                            ref_num_el = record.find('reference_number')
                            if ref_num_el is not None and ref_num_el.text:
                                event_id = ref_num_el.text.strip()
                    except Exception:
                        # The ID is only cosmetic here; keep "UNKNOWN_ID" but do not
                        # swallow KeyboardInterrupt/SystemExit like a bare except would.
                        pass
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    file_skipped_count += 1

            total_processed_records += file_record_count
            total_skipped_records += file_skipped_count

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
            files_with_errors += 1
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")
            files_with_errors += 1

    print(f"--- Directory Scan Complete: {directory_path} ---")
    print(f"Successfully extracted {total_processed_records} records.")
    if total_skipped_records > 0:
        print(f"Skipped {total_skipped_records} records (missing ID or processing error).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to parsing/file errors.")

    return all_records_data_dicts

In [None]:
#  UPDATED GENERATOR FUNCTION: Yields records one by one instead of returning a list
def process_directory(directory_path, record_xpath, extraction_func, nsmap=None):
    """
    Processes all XML files in a directory using a specific XPath and extraction function,
    yielding each processed record as a dictionary.

    Being a generator, nothing is read until iteration starts, and the closing
    summary is printed only once the generator has been consumed to the end.
    Files are visited in sorted path order so repeated runs yield records in the
    same order. Files that fail to parse and records whose extraction raises are
    reported with print() and skipped; records for which extraction_func returns
    a falsy value are counted as skipped.

    Args:
        directory_path (str): Path to the directory containing XML files.
        record_xpath (str): XPath expression to find record elements.
        extraction_func (callable): Function to call for each record element found.
                                    It should accept (record_element, source_filename)
                                    and return a dictionary or None.
        nsmap (dict, optional): Namespace map for XPath evaluation. Defaults to None.

    Yields:
        dict: A dictionary representing a processed record, if valid.
    """
    # glob order is filesystem-dependent; sort for a reproducible load order.
    xml_files = sorted(glob.glob(os.path.join(directory_path, '*.xml')))

    if not xml_files:
        print(f"Warning: No XML files found in directory: {directory_path}")
        return # Nothing to yield

    # Created only once there is something to parse. recover=True asks lxml to
    # keep going through broken markup rather than rejecting the whole file.
    parser = etree.XMLParser(recover=True, ns_clean=True)

    print(f"\n--- Processing Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files.")

    total_yielded_records = 0
    total_skipped_records = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        try:
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            records = root.xpath(record_xpath, namespaces=nsmap)

            if not records:
                continue

            file_yielded_count = 0
            file_skipped_count = 0
            for record in records:
                try:
                    extracted_dict = extraction_func(record, filename)
                    if extracted_dict:
                        yield extracted_dict
                        file_yielded_count += 1
                    else:
                        file_skipped_count += 1
                except Exception as e_rec:
                    # Best-effort lookup of an identifier purely for the log line.
                    event_id = "UNKNOWN_ID"
                    try:
                        if nsmap: # Likely new format: ID is an attribute
                            event_id = record.get('NEW_EVENT_NUMBER', event_id)
                        else: # Likely old format: ID is a child element
                            ref_num_el = record.find('reference_number')
                            if ref_num_el is not None and ref_num_el.text:
                                event_id = ref_num_el.text.strip()
                    except Exception:
                        # The ID is only cosmetic here; keep "UNKNOWN_ID" but do not
                        # swallow KeyboardInterrupt/SystemExit like a bare except would.
                        pass
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    file_skipped_count += 1

            total_yielded_records += file_yielded_count
            total_skipped_records += file_skipped_count

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
            files_with_errors += 1
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")
            files_with_errors += 1

    print(f"--- Directory Scan Complete: {directory_path} ---")
    print(f"Successfully yielded {total_yielded_records} records.")
    if total_skipped_records > 0:
        print(f"Skipped {total_skipped_records} records (missing ID or processing error).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to parsing/file errors.")

### Process data in batches

In [None]:
def load_data_in_batches(con, table_name, target_columns, data_iterator, batch_size=1000):
    """
    Loads data from an iterator into a DuckDB table in batches.

    The INSERT statement names every target column explicitly, so values are
    bound by column name rather than by the table's physical column order.
    A batch that fails to insert is reported, counted and dropped; loading then
    continues with the next batch.

    Args:
        con: Active DuckDB connection object.
        table_name (str): Name of the target table.
        target_columns (list): Column names to populate; each record dictionary
                               is read with .get() using these names, so missing
                               keys are inserted as NULL.
        data_iterator (iterator): An iterator yielding dictionaries of data.
        batch_size (int): Number of records to insert per batch.

    Returns:
        int: Number of records successfully inserted.
    """
    def quote_identifier(name):
        # Double any embedded quote so the name stays a single SQL identifier.
        return '"' + str(name).replace('"', '""') + '"'

    column_list = ', '.join(quote_identifier(col_name) for col_name in target_columns)
    placeholders = ', '.join(['?'] * len(target_columns))
    insert_sql = f'INSERT INTO {quote_identifier(table_name)} ({column_list}) VALUES ({placeholders})'

    total_inserted = 0
    total_failed = 0

    def flush(rows, label):
        # Insert one batch; on failure report it and keep going with the next one.
        nonlocal total_inserted, total_failed
        try:
            con.executemany(insert_sql, rows)
        except duckdb.Error as e:
            total_failed += len(rows)
            print(f"  Error inserting {label}: {e}")
        else:
            total_inserted += len(rows)
            print(f"  Inserted {label} of {len(rows)}. Total inserted: {total_inserted}")

    print(f"Starting batch insertion into '{table_name}' (batch size: {batch_size})...")

    batch_data = []
    for record_dict in data_iterator:
        # Convert dict to a row in the declared column order
        batch_data.append([record_dict.get(col_name) for col_name in target_columns])

        if len(batch_data) >= batch_size:
            flush(batch_data, "batch")
            batch_data = [] # Fresh list: the flushed one is never mutated afterwards

    # Insert any remaining records in the last, possibly short, batch
    if batch_data:
        flush(batch_data, "final batch")

    print(f"Batch insertion complete. Total records inserted: {total_inserted}")
    if total_failed:
        print(f"Warning: {total_failed} records were NOT inserted because their batch failed.")

    return total_inserted

In [28]:
# --- Main Data Processing and Loading (Batch Mode) ---
# Rebuilds both raw tables from the XML directories inside ONE explicit
# transaction, so either both tables are replaced or the database is left as
# it was.

print(f"Connecting to DuckDB database: {DUCKDB_FILE}")

con = None # Initialize connection variable
transaction_open = False   # True only between begin() and a successful commit()
pipeline_succeeded = False
try:
    con = duckdb.connect(database=DUCKDB_FILE, read_only=False)

    # Without an explicit BEGIN the connection auto-commits every statement,
    # leaving the commit()/rollback() calls below with nothing to act on.
    con.begin()
    transaction_open = True

    # --- Create/Replace RAW NEW Table Structure ---
    print(f"Creating or replacing table: {RAW_NEW_TABLE_NAME}")
    # Quote column names; every raw column is stored as text
    new_column_defs = [f'"{col}" VARCHAR' for col in RAW_NEW_COLUMNS]
    create_new_table_sql = f'CREATE OR REPLACE TABLE "{RAW_NEW_TABLE_NAME}" ({", ".join(new_column_defs)})'
    con.execute(create_new_table_sql)
    print(f"Table '{RAW_NEW_TABLE_NAME}' created/replaced successfully.")

    # --- Process and Load New Format Raw Data ---
    print("\nProcessing NEW format data...")
    new_data_iterator = process_directory(
        directory_path=NEW_DATA_DIRECTORY,
        record_xpath=NEW_ROADWORK_RECORD_XPATH,
        extraction_func=extract_record_new_format,
        nsmap=NSMAP
    )
    # Load into the raw new table using the specific columns
    # NOTE(review): load_data_in_batches prints and swallows batch errors; confirm
    # how DuckDB treats the open transaction after a failed statement.
    load_data_in_batches(con, RAW_NEW_TABLE_NAME, RAW_NEW_COLUMNS, new_data_iterator)

    # --- Create/Replace RAW OLD Table Structure ---
    print(f"\nCreating or replacing table: {RAW_OLD_TABLE_NAME}")
    # Quote column names; every raw column is stored as text
    old_column_defs = [f'"{col}" VARCHAR' for col in RAW_OLD_COLUMNS]
    create_old_table_sql = f'CREATE OR REPLACE TABLE "{RAW_OLD_TABLE_NAME}" ({", ".join(old_column_defs)})'
    con.execute(create_old_table_sql)
    print(f"Table '{RAW_OLD_TABLE_NAME}' created/replaced successfully.")

    # --- Process and Load Old Format Raw Data ---
    print("\nProcessing OLD format data...")
    old_data_iterator = process_directory(
        directory_path=OLD_DATA_DIRECTORY,
        record_xpath=OLD_ROADWORK_RECORD_XPATH,
        extraction_func=extract_record_old_format,
        nsmap=None # No namespace needed for old format XPath
    )
    # Load into the raw old table using the specific columns
    load_data_in_batches(con, RAW_OLD_TABLE_NAME, RAW_OLD_COLUMNS, old_data_iterator)

    # --- Finalize ---
    print("\nCommitting transaction...")
    con.commit()
    transaction_open = False
    print("Transaction committed.")

    # Verify final counts
    count_new = con.execute(f'SELECT COUNT(*) FROM "{RAW_NEW_TABLE_NAME}"').fetchone()
    count_old = con.execute(f'SELECT COUNT(*) FROM "{RAW_OLD_TABLE_NAME}"').fetchone()
    print(f"\nVerification: Table '{RAW_NEW_TABLE_NAME}' now contains {count_new[0]} rows.")
    print(f"Verification: Table '{RAW_OLD_TABLE_NAME}' now contains {count_old[0]} rows.")

    pipeline_succeeded = True

except Exception as e:
    # One handler for both cases; only the headline differs.
    if isinstance(e, duckdb.Error):
        print(f"\nDatabase error occurred: {e}")
    else:
        print(f"\nAn unexpected error occurred: {e}")
    # Roll back only if a transaction is really open; ROLLBACK without one
    # would itself raise.
    if con and transaction_open:
        try:
            print("Attempting to rollback transaction.")
            con.rollback()
        except duckdb.Error as e_tx:
            print(f"Rollback failed: {e_tx}")
finally:
    if con:
        con.close()
        print("Database connection closed.")

if pipeline_succeeded:
    print("\n--- Raw Data Pipeline Complete ---")
else:
    print("\n--- Raw Data Pipeline Failed ---")

Connecting to DuckDB database: roadworks_data.duckdb
Creating or replacing table: raw_new_roadworks
Table 'raw_new_roadworks' created/replaced successfully.

Processing NEW format data...
Starting batch insertion into 'raw_new_roadworks' (batch size: 1000)...

--- Processing Directory: data/new_format ---
Found 8 XML files.
  Inserted batch of 1000. Total inserted: 1000
  Inserted batch of 1000. Total inserted: 2000
  Inserted batch of 1000. Total inserted: 3000
  Inserted batch of 1000. Total inserted: 4000
  Inserted batch of 1000. Total inserted: 5000
  Inserted batch of 1000. Total inserted: 6000
  Inserted batch of 1000. Total inserted: 7000
  Inserted batch of 1000. Total inserted: 8000
  Inserted batch of 1000. Total inserted: 9000
  Inserted batch of 1000. Total inserted: 10000
  Inserted batch of 1000. Total inserted: 11000
--- Directory Scan Complete: data/new_format ---
Successfully yielded 11353 records.
  Inserted final batch of 353. Total inserted: 11353
Batch insertion c

## Analyze data quality

In [42]:
# --- Basic Quality Checks Setup ---

# Start from "no connection" so `con` is always defined, even if the connect
# attempt further down in this cell fails.
con = None

def run_query(connection, sql_query):
    """Run *sql_query* on *connection* and hand back the result as a Polars DataFrame.

    Never raises: when the connection is missing or the query fails, a message
    is printed and None is returned instead.
    """
    if not connection:
        print("Error: Database connection is not established.")
        return None

    result_frame = None
    try:
        relation = connection.sql(sql_query)
        result_frame = relation.pl()
    except duckdb.Error as db_error:
        # Failure reported by DuckDB itself: echo the offending query.
        print(f"Error running query:\n{sql_query}\nError: {db_error}")
    except Exception as other_error:
        # Anything else (e.g. the conversion to Polars).
        print(f"An unexpected error occurred: {other_error}")
    return result_frame

# Open the shared read-only connection (`con`) used by the quality checks.
try:
    print(f"Connecting to {DUCKDB_FILE} for quality checks...")
    con = duckdb.connect(database=DUCKDB_FILE, read_only=True)
    print("Connection successful.")
except duckdb.Error as connect_error:
    print(f"Error connecting to database: {connect_error}")
    con = None  # Leave `con` as None so later cells can detect the failure
except Exception as connect_error:
    print(f"An unexpected error occurred during connection: {connect_error}")
    con = None

# SQL string literals (already quoted) that count as "no value" placeholders.
PLACEHOLDERS = [
    "''",
    "'none'",
    "'n/a'",
    "'null'",
    "'unknown'",
]
# Disabled draft of a reusable filter over the placeholders above:
#PLACEHOLDER_FILTER = " OR ".join([f'lower("{col}") = {p}' for p in PLACEHOLDERS])

# Raw table name -> list of its column names, for iterating over both tables.
TABLES_INFO = {
    RAW_NEW_TABLE_NAME: RAW_NEW_COLUMNS,
    RAW_OLD_TABLE_NAME: RAW_OLD_COLUMNS,
}

Connecting to roadworks_data.duckdb for quality checks...
Connection successful.


In [43]:
new_table = RAW_NEW_TABLE_NAME
old_table = RAW_OLD_TABLE_NAME


def _inspect_raw_table(connection, table_name):
    """Print the schema, the row count and a five-row sample of one raw table.

    Every query goes through run_query(), which returns None on failure, so
    each step degrades to a printed message instead of raising. Inspection of
    the table stops early when its schema cannot be retrieved.
    """
    try:
        # --- Schema ---
        print(f"\nSchema for table '{table_name}':")
        schema_df = run_query(connection, f'DESCRIBE "{table_name}"')
        if schema_df is None or schema_df.is_empty():
            # No schema means the table is missing or the query failed; the
            # remaining steps would only repeat the same error.
            print(f"Could not retrieve schema for table '{table_name}'. It might not exist or there was a query error.")
            print(f"Skipping further inspection for '{table_name}' due to previous error: "
                  f"Table '{table_name}' not found or query failed.")
            return
        print(f"Table '{table_name}' found.")
        display(schema_df)

        # --- Row count ---
        count_df = run_query(connection, f'SELECT COUNT(*) as count FROM "{table_name}"')
        if count_df is not None and not count_df.is_empty():
            row_count = count_df[0, "count"]
            print(f"\nTotal rows in '{table_name}': {row_count}")
        else:
            print(f"Could not count rows for table '{table_name}'.")
            row_count = 0 # Assume 0 if count fails

        # --- Sample rows (only if the table has rows) ---
        if row_count > 0:
            print(f"\nFirst 5 rows from '{table_name}':")
            sample_df = run_query(connection, f'SELECT * FROM "{table_name}" LIMIT 5')
            if sample_df is None:
                print("Could not fetch sample rows.")
            elif sample_df.is_empty():
                print("Table has rows, but could not fetch sample (LIMIT 5 returned empty).")
            else:
                display(sample_df)
        elif row_count == 0:
            print("\nTable appears to be empty.")

    except Exception as e: # Catch any unexpected errors during inspection
        print(f"An unexpected error occurred while inspecting '{table_name}': {e}")


print(f"--- Inspecting DuckDB Database: {DUCKDB_FILE} ---")

if not os.path.exists(DUCKDB_FILE):
    print(f"Error: Database file '{DUCKDB_FILE}' not found.")
elif not con: # `con` is the read-only connection opened by the setup cell
    print("Error: Cannot inspect database. Connection 'con' not established.")
else:
    print(f"--- Inspecting Table: {new_table} ---")
    _inspect_raw_table(con, new_table)

    print(f"\n--- Inspecting Table: {old_table} ---")
    _inspect_raw_table(con, old_table)

    # `con` is deliberately left open: it is the shared connection for the
    # quality checks that follow.

print("\n--- Inspection Complete ---")

--- Inspecting DuckDB Database: roadworks_data.duckdb ---
--- Inspecting Table: raw_new_roadworks ---

Schema for table 'raw_new_roadworks':
Table 'raw_new_roadworks' found.


column_name,column_type,null,key,default,extra
str,str,str,str,str,str
"""source_filename""","""VARCHAR""","""YES""",,,
"""NEW_EVENT_NUMBER""","""VARCHAR""","""YES""",,,
"""OLD_REFERENCE_NUMBER""","""VARCHAR""","""YES""",,,
"""SDATE""","""VARCHAR""","""YES""",,,
"""EDATE""","""VARCHAR""","""YES""",,,
…,…,…,…,…,…
"""STATUS""","""VARCHAR""","""YES""",,,
"""PUBLISHED_DATE""","""VARCHAR""","""YES""",,,
"""CENTRE_EASTING""","""VARCHAR""","""YES""",,,
"""CENTRE_NORTHING""","""VARCHAR""","""YES""",,,



Total rows in 'raw_new_roadworks': 11353

First 5 rows from 'raw_new_roadworks':


source_filename,NEW_EVENT_NUMBER,OLD_REFERENCE_NUMBER,SDATE,EDATE,EXPDEL,DESCRIPTION,CLOSURE_TYPE,STATUS,PUBLISHED_DATE,CENTRE_EASTING,CENTRE_NORTHING,ROAD_NUMBERS
str,str,str,str,str,str,str,str,str,str,str,str,str
"""he_roadworks_2018_02_26.xml""","""00026976-005""",,"""26-FEB-2018 21:00""","""28-FEB-2018 06:00""","""Slight (less than 10 mins)""","""A3 northbound Sheet Link entry…","""Area Renewals""","""Published""","""2018-02-22T16:49:17""","""475209""","""124975""","""A3"""
"""he_roadworks_2018_02_26.xml""","""00004020-008""","""4188720""","""08-JAN-2018 20:00""","""10-MAR-2018 06:00""","""Moderate (10 - 30 mins)""","""A14 Westbound Jct 58 to Jct 57…","""Area Schemes""","""Published""","""2018-02-22T10:13:27""","""614569""","""241115""","""A14"""
"""he_roadworks_2018_02_26.xml""","""00001459-026""","""4215713""","""31-JUL-2017 14:47""","""01-APR-2018 06:00""","""Slight (less than 10 mins)""","""M1 northbound and southbound T…","""Major Schemes""","""Published""","""2018-02-15T14:38:05""","""445124""","""364308""","""M1"""
"""he_roadworks_2018_02_26.xml""","""00027883-003""",,"""12-FEB-2018 20:00""","""17-MAR-2018 06:00""","""Moderate (10 - 30 mins)""","""A259, east and westbound betwe…","""Area Schemes""","""Published""","""2018-02-21T10:36:47""","""596442""","""123787""","""A259"""
"""he_roadworks_2018_02_26.xml""","""00026799-002""",,"""10-FEB-2018 22:00""","""22-MAR-2018 06:00""","""Slight (less than 10 mins)""","""A3 northbound Compton to Denni…","""Regional Technology Works""","""Published""","""2018-02-22T14:08:43""","""498261""","""150727""","""A3"""



--- Inspecting Table: raw_old_roadworks ---

Schema for table 'raw_old_roadworks':
Table 'raw_old_roadworks' found.


column_name,column_type,null,key,default,extra
str,str,str,str,str,str
"""source_filename""","""VARCHAR""","""YES""",,,
"""reference_number""","""VARCHAR""","""YES""",,,
"""start_date""","""VARCHAR""","""YES""",,,
"""end_date""","""VARCHAR""","""YES""",,,
"""expected_delay""","""VARCHAR""","""YES""",,,
…,…,…,…,…,…
"""centre_northing""","""VARCHAR""","""YES""",,,
"""road""","""VARCHAR""","""YES""",,,
"""location""","""VARCHAR""","""YES""",,,
"""local_authority""","""VARCHAR""","""YES""",,,



Total rows in 'raw_old_roadworks': 12068

First 5 rows from 'raw_old_roadworks':


source_filename,reference_number,start_date,end_date,expected_delay,description,closure_type,status,published_date,centre_easting,centre_northing,road,location,local_authority,traffic_management
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""ha-roadworks_2011_10_10.xml""","""972963""","""2010-07-12T07:00:00""","""2013-03-23T06:00:00""","""Moderate (10 - 30 mins)""","""Major junction works will incl…","""Planned Works""","""Firm""","""2011-10-09T21:08:32""","""456252""","""278173""","""M1""","""Catthorpe""","""Leicestershire / Northamptonsh…","""Other"""
"""ha-roadworks_2011_10_10.xml""","""978905""","""2011-04-01T22:00:00""","""2011-12-31T05:00:00""","""Moderate (10 - 30 mins)""","""Contraflow with speed restrict…","""Planned Works""","""Firm""","""2010-04-23T10:18:30""","""499082""","""235992""","""M1""","""Jct 13 to Jct 12""","""Bedfordshire / Buckinghamshire""","""Contraflow"""
"""ha-roadworks_2011_10_10.xml""","""998294""","""2009-09-24T06:00:00""","""2013-09-24T05:00:00""","""Slight (less than 10 mins)""","""Lane 1 closure and 24/7 Hardsh…","""Planned Works""","""Firm""","""2010-06-19T05:03:50""","""465924""","""260154""","""M1""","""Approach to Junction 16 (21011…","""Northamptonshire""","""Lane Closure"""
"""ha-roadworks_2011_10_10.xml""","""1172899""","""2011-10-10T22:00:00""","""2011-12-03T06:00:00""","""Slight (less than 10 mins)""","""Lane closures during the day w…","""Planned Works""","""Firm""","""2011-09-28T15:40:36""","""446842""","""324130""","""M1""","""Junction 23a (220116)""","""Leicestershire""","""Carriageway Closure"""
"""ha-roadworks_2011_10_10.xml""","""1306529""","""2010-08-04T00:00:00""","""2012-07-05T00:00:00""","""No Delay""","""24hrs, lane 1 closure, northbo…","""Planned Works""","""Firm""","""2011-08-22T16:47:52""","""511897""","""202047""","""M1""","""Jct 6 Exit Slip""","""Hertfordshire""","""Lane Closure"""



--- Inspection Complete ---


### Basic checks
1. Count NULLs & empty string placeholders
1. Check string length range of each column (e.g.: Is NEW_EVENT_NUMBER fixed length?)
1. Examine categorical values (e.g. STATUS, EXPDEL)
1. Check identifier uniqueness across tables

In [None]:
con_inspect = None
try:
    # Connect in read-only mode
    con_inspect = duckdb.connect(database=DUCKDB_FILE, read_only=True)

    # Analyze data...


except duckdb.Error as e:
    print(f"Could not connect to database '{DUCKDB_FILE}': {e}")
finally:
    # Release the connection whether or not the analysis above succeeded;
    # con_inspect is still None when the connect call itself failed.
    if con_inspect is not None:
        con_inspect.close()

### Convert data types
1. Numeric conversion (coordinates, reference number, NEW_EVENT_NUMBER?)
1. Convert dates

### Check converted data types for plausibility
1. Date ranges
1. Coordinate ranges (Correct locations in the UK?)
1. Did numeric conversions succeed?