# Explore XML Data

In [1]:
from lxml import etree
import duckdb
import polars as pl
import glob
import os
from IPython.display import display

In [2]:
# --- Configuration ---
NEW_DATA_DIRECTORY = 'data/new_format'     # data from 2018 onwards
OLD_DATA_DIRECTORY = 'data/old_format' # data from 2017 and earlier

DUCKDB_FILE = 'roadworks_data.duckdb'  # Name for your DuckDB database file
# Define separate table names for new and old formats
RAW_NEW_TABLE_NAME = 'raw_new_roadworks'
RAW_OLD_TABLE_NAME = 'raw_old_roadworks'

# Define the namespace map
NSMAP = {'d': 'WebTeam'}

# XPath to find the repeating record element
NEW_ROADWORK_RECORD_XPATH = './/d:HE_PLANNED_WORKS'
OLD_ROADWORK_RECORD_XPATH = './/ha_planned_works' # XPath for the old format record

# --- Define Raw Columns based on exploration ---

# Columns for the 'new' format raw table
# Includes source_filename and handles nested elements
RAW_NEW_COLUMNS = [
    'source_filename',
    # Attributes from HE_PLANNED_WORKS
    'NEW_EVENT_NUMBER',
    'OLD_REFERENCE_NUMBER',
    'SDATE',
    'EDATE',
    'EXPDEL',
    'DESCRIPTION',
    'CLOSURE_TYPE',
    'STATUS',
    'PUBLISHED_DATE',
    # Nested attributes (will be extracted)
    'CENTRE_EASTING',
    'CENTRE_NORTHING',
    'ROAD_NUMBERS' # Potentially multiple, joined by ';'
]

# Columns for the 'old' format raw table
# Includes source_filename and direct child element tags
RAW_OLD_COLUMNS = [
    'source_filename',
    # Child elements of ha_planned_works
    'reference_number',
    'start_date',
    'end_date',
    'expected_delay',
    'description',
    'closure_type',
    'status',
    'published_date',
    'centre_easting',
    'centre_northing',
    'road',
    'location',
    'local_authority',
    'traffic_management'
]

# Define XPaths for nested data relative to the NEW format HE_PLANNED_WORKS element
NEW_COORD_XPATH = './d:EASTNORTH/d:Report/d:EASTINGNORTHING/d:EASTNORTH_Collection/d:EASTNORTH'
NEW_ROAD_XPATH = './d:ROADS/d:Report/d:ROADS/d:ROAD_Collection/d:ROAD'

### Find all unique attributes in many XML files

##### For 'new' format (attributes-based)

In [3]:
def find_all_record_attributes_in_directory(directory_path):
    """
    Parses all XML files (new format) in a directory and finds all unique attribute names
    used across all elements matching the NEW_ROADWORK_RECORD_XPATH in any file.
    """
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {directory_path}")
        return None

    print(f"--- Finding All Unique Attributes in Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files to scan.")

    all_attribute_names = set() # Use a set to automatically store unique names across all files
    parser = etree.XMLParser(recover=True, ns_clean=True) # Define parser once

    processed_files = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        # print(f"Scanning file: {filename}...") # Uncomment for more verbose output
        try:
            # Parse the XML file
            tree = etree.parse(file_path, parser)
            root = tree.getroot()

            # Use xpath with the namespace map to find all records in this file
            records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

            if not records:
                # print(f"  Warning: No records found matching XPath in {filename}.")
                continue # Move to the next file if no records found

            # Iterate through ALL found records in the current file
            for record in records:
                # Get the keys (attribute names) from the current record's attributes
                attribute_keys = record.attrib.keys()
                all_attribute_names.update(attribute_keys)

                # Additionally: find attributes in DESCENDANT elements
                # Use iterdescendants() to visit every element below the current record
                for descendant in record.iterdescendants():
                    all_attribute_names.update(descendant.attrib.keys())

            processed_files += 1

        except etree.XMLSyntaxError as e:
            print(f"  Error parsing XML file {filename}: {e}. Skipping file.")
            files_with_errors += 1
        except Exception as e:
            print(f"  An unexpected error occurred scanning file {filename}: {e}. Skipping file.")
            files_with_errors += 1

    print(f"\n--- Scan Complete ---")
    print(f"Successfully scanned {processed_files} files.")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to errors.")

    if not all_attribute_names:
        print("No attributes found in any successfully processed files.")
        return None

    # Sort the results for readability
    sorted_attributes = sorted(list(all_attribute_names))

    print(f"\nFound {len(sorted_attributes)} unique attributes across all scanned files:")
    return sorted_attributes

In [4]:
find_all_record_attributes_in_directory(NEW_DATA_DIRECTORY)

--- Finding All Unique Attributes in Directory: data/new_format ---
Found 8 XML files to scan.

--- Scan Complete ---
Successfully scanned 8 files.

Found 13 unique attributes across all scanned files:


['CENTRE_EASTING',
 'CENTRE_NORTHING',
 'CLOSURE_TYPE',
 'DESCRIPTION',
 'EDATE',
 'EXPDEL',
 'NEW_EVENT_NUMBER',
 'Name',
 'OLD_REFERENCE_NUMBER',
 'PUBLISHED_DATE',
 'ROAD_NUMBER',
 'SDATE',
 'STATUS']

##### For 'old' format (child element-based)

In [5]:
def find_all_record_elements_in_directory(directory_path):
    """
    Parses all XML files (old format) in a directory and finds all
    unique child element tag names used across all elements matching the
    OLD_ROADWORK_RECORD_XPATH in any file.
    """
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))

    if not xml_files:
        print(f"Error: No XML files found in directory: {directory_path}")
        return None

    print(f"--- Finding All Unique Child Element Tags in Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files to scan.")

    all_element_tags = set() # Use a set to automatically store unique tag names
    # Use a simpler parser if namespaces are not expected/needed for old format
    parser = etree.XMLParser(recover=True)

    processed_files = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        # print(f"Scanning file: {filename}...")
        try:
            # Parse the XML file
            tree = etree.parse(file_path, parser)
            root = tree.getroot()

            # Check if the root tag matches the expected old format root
            if root.tag != 'ha_planned_roadworks':
                # print(f"  Skipping file {filename}: Root tag '{root.tag}' does not match expected 'ha_planned_roadworks'.")
                continue # Skip files that don't match the old root tag

            # Use xpath to find all records in this file (no namespace needed)
            records = root.xpath(OLD_ROADWORK_RECORD_XPATH) # Use the XPath for the old format

            if not records:
                # print(f"  Warning: No records found matching XPath '{OLD_ROADWORK_RECORD_XPATH}' in {filename}.")
                continue # Move to the next file if no records found

            # Iterate through ALL found records in the current file
            for record in records:
                # Iterate through the child elements of the record
                for child_element in record:
                    # Add the tag name of the child element to the set
                    all_element_tags.add(child_element.tag)

            processed_files += 1

        except etree.XMLSyntaxError as e:
            print(f"  Error parsing XML file {filename}: {e}. Skipping file.")
            files_with_errors += 1
        except Exception as e:
            print(f"  An unexpected error occurred scanning file {filename}: {e}. Skipping file.")
            files_with_errors += 1

    print(f"\n--- Scan Complete ---")
    print(f"Successfully scanned {processed_files} files (matching root tag).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to errors during parsing.")
    skipped_non_matching = len(xml_files) - processed_files - files_with_errors
    if skipped_non_matching > 0:
         print(f"Skipped {skipped_non_matching} files because their root tag did not match 'ha_planned_roadworks'.")


    if not all_element_tags:
        print("No child element tags found in any successfully processed files.")
        return None

    # Sort the results for readability
    sorted_tags = sorted(list(all_element_tags))

    print(f"\nFound {len(sorted_tags)} unique child element tags across all scanned files:")
    return sorted_tags

In [6]:
old_format_elements = find_all_record_elements_in_directory(OLD_DATA_DIRECTORY)
old_format_elements

--- Finding All Unique Child Element Tags in Directory: data/old_format ---
Found 7 XML files to scan.

--- Scan Complete ---
Successfully scanned 7 files (matching root tag).

Found 14 unique child element tags across all scanned files:


['centre_easting',
 'centre_northing',
 'closure_type',
 'description',
 'end_date',
 'expected_delay',
 'local_authority',
 'location',
 'published_date',
 'reference_number',
 'road',
 'start_date',
 'status',
 'traffic_management']

'New' format attributes:
```python
['CENTRE_EASTING',
 'CENTRE_NORTHING',
 'CLOSURE_TYPE',
 'DESCRIPTION',
 'EDATE',
 'EXPDEL',
 'NEW_EVENT_NUMBER',
 'Name',
 'OLD_REFERENCE_NUMBER',
 'PUBLISHED_DATE',
 'ROAD_NUMBER',
 'SDATE',
 'STATUS']
```

'Old' format attributes:
```python
['centre_easting',
 'centre_northing',
 'closure_type',
 'description',
 'end_date',
 'expected_delay',
 'local_authority',
 'location',
 'published_date',
 'reference_number',
 'road',
 'start_date',
 'status',
 'traffic_management']
```

### Explore some records

##### For 'new' format

In [7]:
NUM_RECORDS_TO_INSPECT = 3

In [8]:
def explore_roadworks_xml_new(file_path):
    """Parses and explores the specific structure of the provided roadworks XML."""

    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Updated): {file_path} ---")

    try:
        # Parse the XML file
        # Using recover=True can help skip over minor errors if files are slightly malformed
        parser = etree.XMLParser(recover=True, ns_clean=True)
        tree = etree.parse(file_path, parser)
        root = tree.getroot()

        print(f"\n1. Root Element Tag: {root.tag}") # Should be {WebTeam}Report
        print(f"   Root Namespace Map: {root.nsmap}")

        # Use xpath with the namespace map to find the records
        records = root.xpath(NEW_ROADWORK_RECORD_XPATH, namespaces=NSMAP)

        if not records:
            print(f"\nError: Could not find any elements matching XPath '{NEW_ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the NEW_ROADWORK_RECORD_XPATH and the XML structure.")
            # Print children with their full {namespace}tag names to help debug
            print("\nFirst few children of the root (with full tags):")
            for i, child in enumerate(root[:5]):
                 print(f"  Child {i+1}: {child.tag}")
            return

        print(f"\n2. Found {len(records)} records matching XPath '{NEW_ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(records))} records ---")

        for i, record in enumerate(records[:NUM_RECORDS_TO_INSPECT]):
            print(f"\n\n--- Record {i+1} ---")

            # --- Accessing Attributes of <HE_PLANNED_WORKS> ---
            print(" Attributes of <HE_PLANNED_WORKS>:")
            record_attrs = record.attrib
            for key, value in record_attrs.items():
                 print(f"    {key}: {value}")


            # --- Accessing Nested Coordinates ---
            print("\n Extracting Nested Coordinates:")
            # Define the precise XPath relative to the current 'record' element
            coord_xpath = './d:EASTNORTH/d:Report/d:EASTINGNORTHING/d:EASTNORTH_Collection/d:EASTNORTH'
            coord_elements = record.xpath(coord_xpath, namespaces=NSMAP)

            if coord_elements:
                # Usually expect only one coordinate block per record
                coord_element = coord_elements[0]
                easting = coord_element.get('CENTRE_EASTING')
                northing = coord_element.get('CENTRE_NORTHING')
                print(f"    CENTRE_EASTING: {easting}")
                print(f"    CENTRE_NORTHING: {northing}")
            else:
                print("    Coordinate elements not found.")

            # --- Accessing Nested Roads ---
            print("\nExtracting Nested Roads:")
            # Define the precise XPath relative to the current 'record' element
            road_xpath = './d:ROADS/d:Report/d:ROADS/d:ROAD_Collection/d:ROAD'
            road_elements = record.xpath(road_xpath, namespaces=NSMAP)

            if road_elements:
                road_numbers = [road.get('ROAD_NUMBER') for road in road_elements]
                print(f"    ROAD_NUMBER(s): {road_numbers}") # Might be multiple roads
            else:
                print("    Road elements not found.")


    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


explore_roadworks_xml_new("data/new_format/nh_roadworks_2025_14_4.xml")

--- Exploring XML File (Updated): data/new_format/nh_roadworks_2025_14_4.xml ---

1. Root Element Tag: {WebTeam}Report
   Root Namespace Map: {'xsi': 'http://www.w3.org/2001/XMLSchema-instance', None: 'WebTeam'}

2. Found 1429 records matching XPath './/d:HE_PLANNED_WORKS'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Attributes of <HE_PLANNED_WORKS>:
    NEW_EVENT_NUMBER: 00352573-001
    SDATE: 31-DEC-2023 23:59
    EDATE: 31-MAY-2025 23:59
    EXPDEL: Moderate (10 - 30 mins)
    DESCRIPTION: M25 Anticlockwise Jct 11 to Jct 9
Narrow Lanes for Major Improvement Scheme 
    CLOSURE_TYPE: Major Schemes
    STATUS: Published
    PUBLISHED_DATE: 2023-12-21T14:45:07

 Extracting Nested Coordinates:
    CENTRE_EASTING: 507930
    CENTRE_NORTHING: 159334

Extracting Nested Roads:
    ROAD_NUMBER(s): ['M25']


--- Record 2 ---
 Attributes of <HE_PLANNED_WORKS>:
    NEW_EVENT_NUMBER: 00380443-001
    SDATE: 29-MAY-2024 21:00
    EDATE: 29-MAY-2025 23:59
    EXPDEL: Slight (less than 

##### For 'old' format

In [9]:
def explore_roadworks_xml_old(file_path):
    """Parses and explores the structure of an old-format roadworks XML."""

    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    print(f"--- Exploring XML File (Old Format): {file_path} ---")

    try:
        # Use a simpler parser, potentially without namespace handling if not needed
        parser = etree.XMLParser(recover=True)
        tree = etree.parse(file_path, parser)
        root = tree.getroot()

        print(f"\n1. Root Element Tag: {root.tag}") # Should be ha_planned_roadworks

        # Check if the root tag is as expected for the old format
        if root.tag != 'ha_planned_roadworks':
            print(f"  Warning: Root tag '{root.tag}' does not match expected 'ha_planned_roadworks'.")
            # Optionally, still try to find records if the XPath might work
            # return # Or uncomment to stop if root tag is wrong

        # Use xpath to find the records (no namespace typically needed for old format)
        records = root.xpath(OLD_ROADWORK_RECORD_XPATH)

        if not records:
            print(f"\nError: Could not find any elements matching XPath '{OLD_ROADWORK_RECORD_XPATH}'.")
            print("Please double-check the OLD_ROADWORK_RECORD_XPATH and the XML structure.")
            print("\nFirst few children of the root:")
            for i, child in enumerate(root[:5]):
                 print(f"  Child {i+1}: {child.tag}")
            return

        print(f"\n2. Found {len(records)} records matching XPath '{OLD_ROADWORK_RECORD_XPATH}'.")
        print(f"--- Inspecting first {min(NUM_RECORDS_TO_INSPECT, len(records))} records ---")

        for i, record in enumerate(records[:NUM_RECORDS_TO_INSPECT]):
            print(f"\n\n--- Record {i+1} ---")
            print(f" Record Element Tag: {record.tag}") # Should be ha_planned_works

            # --- Accessing Child Elements ---
            print(" Child Elements:")
            record_data = {}
            for child in record:
                # Clean up text content (strip whitespace, handle None)
                text_content = (child.text or '').strip()
                print(f"    {child.tag}: {text_content}")
                record_data[child.tag] = text_content # Store for easier access later


    except etree.XMLSyntaxError as e:
        print(f"\nError parsing XML file: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

    print(f"\n--- End of Exploration for {file_path} ---")


example_old_file = os.path.join(OLD_DATA_DIRECTORY, 'he_roadworks_2017_06_05')
if os.path.exists(example_old_file):
    explore_roadworks_xml_old(example_old_file)
else:
    # Find the first available XML file in the old data directory if the example doesn't exist
    old_files = glob.glob(os.path.join(OLD_DATA_DIRECTORY, '*.xml'))
    if old_files:
        print(f"Example file '{example_old_file}' not found. Using first available file: {old_files[0]}")
        explore_roadworks_xml_old(old_files[0])
    else:
        print(f"Error: No XML files found in {OLD_DATA_DIRECTORY} to explore.")


Example file 'data/old_format\he_roadworks_2017_06_05' not found. Using first available file: data/old_format\ha-roadworks_2011_10_10.xml
--- Exploring XML File (Old Format): data/old_format\ha-roadworks_2011_10_10.xml ---

1. Root Element Tag: ha_planned_roadworks

2. Found 1425 records matching XPath './/ha_planned_works'.
--- Inspecting first 3 records ---


--- Record 1 ---
 Record Element Tag: ha_planned_works
 Child Elements:
    reference_number: 972963
    road: M1
    local_authority: Leicestershire / Northamptonshire
    location: Catthorpe
    start_date: 2010-07-12T07:00:00
    end_date: 2013-03-23T06:00:00
    expected_delay: Moderate (10 - 30 mins)
    description: Major junction works will include lane closures, contraflow, full closures and 50 MPH speed restrictions on the M1 and M6.
    traffic_management: Other
    closure_type: Planned Works
    centre_easting: 456252
    centre_northing: 278173
    status: Firm
    published_date: 2011-10-09T21:08:32


--- Record 2 

### Define XML-record extraction functions

In [10]:
def extract_record_new_format(record_element, source_filename):
    """
    Extracts raw data from a 'new' format <HE_PLANNED_WORKS> element
    into a dictionary matching RAW_NEW_COLUMNS.
    """
    data = {col: None for col in RAW_NEW_COLUMNS} # Initialize with None
    data['source_filename'] = source_filename

    # --- Extract direct attributes ---
    data['NEW_EVENT_NUMBER'] = record_element.get('NEW_EVENT_NUMBER')
    data['OLD_REFERENCE_NUMBER'] = record_element.get('OLD_REFERENCE_NUMBER')
    data['SDATE'] = record_element.get('SDATE')
    data['EDATE'] = record_element.get('EDATE')
    data['EXPDEL'] = record_element.get('EXPDEL')
    data['DESCRIPTION'] = record_element.get('DESCRIPTION')
    data['CLOSURE_TYPE'] = record_element.get('CLOSURE_TYPE')
    data['STATUS'] = record_element.get('STATUS')
    data['PUBLISHED_DATE'] = record_element.get('PUBLISHED_DATE')

    # Basic check - skip if no event number (essential identifier)
    if data.get('NEW_EVENT_NUMBER') is None:
        # print(f"Warning: New format record missing NEW_EVENT_NUMBER in {source_filename}. Skipping.")
        return None

    # --- Extract nested coordinates ---
    coord_elements = record_element.xpath(NEW_COORD_XPATH, namespaces=NSMAP)
    if coord_elements:
        coord_element = coord_elements[0]
        data['CENTRE_EASTING'] = coord_element.get('CENTRE_EASTING')
        data['CENTRE_NORTHING'] = coord_element.get('CENTRE_NORTHING')

    # --- Extract nested roads ---
    road_elements = record_element.xpath(NEW_ROAD_XPATH, namespaces=NSMAP)
    if road_elements:
        road_numbers_list = [road.get('ROAD_NUMBER') for road in road_elements if road.get('ROAD_NUMBER')]
        # Join multiple roads with a separator
        data['ROAD_NUMBERS'] = '; '.join(road_numbers_list) if road_numbers_list else None

    return data

def extract_record_old_format(record_element, source_filename):
    """
    Extracts raw data from an 'old' format <ha_planned_works> element
    into a dictionary matching RAW_OLD_COLUMNS.
    """
    data = {col: None for col in RAW_OLD_COLUMNS} # Initialize with None
    data['source_filename'] = source_filename

    # Helper to get text content safely
    def get_text(tag_name):
        element = record_element.find(tag_name)
        return element.text.strip() if element is not None and element.text else None

    # --- Map child elements to raw columns ---
    # Iterate through expected raw old columns (excluding source_filename)
    for col_name in RAW_OLD_COLUMNS:
        if col_name != 'source_filename':
             data[col_name] = get_text(col_name)

    # Basic check - skip if no reference number (essential identifier)
    if data.get('reference_number') is None:
        # print(f"Warning: Old format record missing reference_number in {source_filename}. Skipping.")
        return None

    return data

### Generic directory processor

In [11]:
# OLD METHOD: Returns a list (memory inefficient for large datasets)
def process_directory(directory_path, record_xpath, extraction_func, nsmap=None):
    """
    Processes all XML files in a directory using a specific XPath and extraction function.

    Args:
        directory_path (str): Path to the directory containing XML files.
        record_xpath (str): XPath expression to find record elements.
        extraction_func (callable): Function to call for each record element found.
                                    It should accept (record_element, source_filename)
                                    and return a dictionary or None.
        nsmap (dict, optional): Namespace map for XPath evaluation. Defaults to None.

    Returns:
        list: A list of dictionaries, where each dictionary represents a processed record.
    """
    all_records_data_dicts = []
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))
    parser = etree.XMLParser(recover=True, ns_clean=True) # Use robust parser

    if not xml_files:
        print(f"Warning: No XML files found in directory: {directory_path}")
        return []

    print(f"\n--- Processing Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files.")

    total_processed_records = 0
    total_skipped_records = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        # print(f"Processing file: {filename}...") # Optional verbose output
        try:
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            # Find records using the provided XPath and namespace map
            records = root.xpath(record_xpath, namespaces=nsmap)

            if not records:
                # print(f"  Warning: No records found matching XPath in {filename}.")
                continue

            file_record_count = 0
            file_skipped_count = 0
            for record in records:
                try:
                    extracted_dict = extraction_func(record, filename)
                    if extracted_dict:
                        all_records_data_dicts.append(extracted_dict)
                        file_record_count += 1
                    else:
                        file_skipped_count += 1 # Count records skipped by extraction func
                except Exception as e_rec:
                    # Try to get an ID for logging, adapt based on potential extraction func errors
                    event_id = "UNKNOWN_ID"
                    try:
                        if nsmap: # Likely new format
                             event_id = record.get('NEW_EVENT_NUMBER', event_id)
                        else: # Likely old format
                             ref_num_el = record.find('reference_number')
                             if ref_num_el is not None and ref_num_el.text:
                                 event_id = ref_num_el.text.strip()
                    except: pass # Ignore errors getting ID for logging
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    file_skipped_count += 1

            # if file_record_count > 0 or file_skipped_count > 0: # Only print if something happened
            #    print(f"  Extracted {file_record_count} valid records from {filename}. Skipped {file_skipped_count}.")

            total_processed_records += file_record_count
            total_skipped_records += file_skipped_count

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
            files_with_errors += 1
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")
            files_with_errors += 1

    print(f"--- Directory Scan Complete: {directory_path} ---")
    print(f"Successfully extracted {total_processed_records} records.")
    if total_skipped_records > 0:
        print(f"Skipped {total_skipped_records} records (missing ID or processing error).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to parsing/file errors.")

    return all_records_data_dicts

In [12]:
#  UPDATED GENERATOR FUNCTION: Yields records one by one instead of returning a list
def process_directory(directory_path, record_xpath, extraction_func, nsmap=None):
    """
    Processes all XML files in a directory using a specific XPath and extraction function,
    yielding each processed record as a dictionary.

    Args:
        directory_path (str): Path to the directory containing XML files.
        record_xpath (str): XPath expression to find record elements.
        extraction_func (callable): Function to call for each record element found.
                                    It should accept (record_element, source_filename)
                                    and return a dictionary or None.
        nsmap (dict, optional): Namespace map for XPath evaluation. Defaults to None.

    Yields:
        dict: A dictionary representing a processed record, if valid.
    """
    xml_files = glob.glob(os.path.join(directory_path, '*.xml'))
    parser = etree.XMLParser(recover=True, ns_clean=True) # Use robust parser

    if not xml_files:
        print(f"Warning: No XML files found in directory: {directory_path}")
        return # Return early if no files

    print(f"\n--- Processing Directory: {directory_path} ---")
    print(f"Found {len(xml_files)} XML files.")

    total_yielded_records = 0
    total_skipped_records = 0
    files_with_errors = 0

    for file_path in xml_files:
        filename = os.path.basename(file_path)
        try:
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            records = root.xpath(record_xpath, namespaces=nsmap)

            if not records:
                continue

            file_yielded_count = 0
            file_skipped_count = 0
            for record in records:
                try:
                    extracted_dict = extraction_func(record, filename)
                    if extracted_dict:
                        yield extracted_dict
                        file_yielded_count += 1
                    else:
                        file_skipped_count += 1
                except Exception as e_rec:
                    event_id = "UNKNOWN_ID"
                    try: # Attempt to get ID for logging
                        if nsmap: event_id = record.get('NEW_EVENT_NUMBER', event_id)
                        else:
                             ref_num_el = record.find('reference_number')
                             if ref_num_el is not None and ref_num_el.text: event_id = ref_num_el.text.strip()
                    except: pass
                    print(f"  Error processing record {event_id} in {filename}: {e_rec}")
                    file_skipped_count += 1

            total_yielded_records += file_yielded_count
            total_skipped_records += file_skipped_count

        except etree.XMLSyntaxError as e_xml:
            print(f"  Error parsing XML file {filename}: {e_xml}. Skipping file.")
            files_with_errors += 1
        except Exception as e_file:
            print(f"  An unexpected error occurred processing file {filename}: {e_file}. Skipping file.")
            files_with_errors += 1

    print(f"--- Directory Scan Complete: {directory_path} ---")
    print(f"Successfully yielded {total_yielded_records} records.") 
    if total_skipped_records > 0:
        print(f"Skipped {total_skipped_records} records (missing ID or processing error).")
    if files_with_errors > 0:
        print(f"Skipped {files_with_errors} files due to parsing/file errors.")

### Process data in batches

In [13]:
def load_data_in_batches(con, table_name, target_columns, data_iterator, batch_size=1000):
    """
    Loads data from an iterator into a DuckDB table in batches.

    Args:
        con: Active DuckDB connection object.
        table_name (str): Name of the target table.
        target_columns (list): List of column names in the target table order.
        data_iterator (iterator): An iterator yielding dictionaries of data.
        batch_size (int): Number of records to insert per batch.
    """
    batch_data = []
    total_inserted = 0
    num_columns = len(target_columns)
    placeholders = ', '.join(['?'] * num_columns)
    insert_sql = f'INSERT INTO "{table_name}" VALUES ({placeholders})'

    print(f"Starting batch insertion into '{table_name}' (batch size: {batch_size})...")

    for record_dict in data_iterator:
        # Convert dict to list/tuple in the correct column order
        row_values = [record_dict.get(col_name) for col_name in target_columns]
        batch_data.append(row_values)

        if len(batch_data) >= batch_size:
            try:
                con.executemany(insert_sql, batch_data)
                total_inserted += len(batch_data)
                print(f"  Inserted batch of {len(batch_data)}. Total inserted: {total_inserted}")
                batch_data = [] # Clear the batch
            except duckdb.Error as e:
                print(f"  Error inserting batch: {e}")
                # Decide how to handle batch errors (e.g., log, skip, stop)
                # For now, just print and continue trying next batch
                batch_data = [] # Clear potentially problematic batch

    # Insert any remaining records in the last batch
    if batch_data:
        try:
            con.executemany(insert_sql, batch_data)
            total_inserted += len(batch_data)
            print(f"  Inserted final batch of {len(batch_data)}. Total inserted: {total_inserted}")
        except duckdb.Error as e:
            print(f"  Error inserting final batch: {e}")

    print(f"Batch insertion complete. Total records inserted: {total_inserted}")

In [14]:
# --- Main Data Processing and Loading (Batch Mode) ---

print(f"Connecting to DuckDB database: {DUCKDB_FILE}")

con = None # Initialize connection variable
try:
    con = duckdb.connect(database=DUCKDB_FILE, read_only=False)

    # --- Create/Replace RAW NEW Table Structure ---
    print(f"Creating or replacing table: {RAW_NEW_TABLE_NAME}")
    # Quote column names
    new_column_defs = [f'"{col}" VARCHAR' for col in RAW_NEW_COLUMNS]
    create_new_table_sql = f'CREATE OR REPLACE TABLE "{RAW_NEW_TABLE_NAME}" ({", ".join(new_column_defs)})'
    con.execute(create_new_table_sql)
    print(f"Table '{RAW_NEW_TABLE_NAME}' created/replaced successfully.")

    # --- Process and Load New Format Raw Data ---
    print("\nProcessing NEW format data...")
    new_data_iterator = process_directory(
        directory_path=NEW_DATA_DIRECTORY,
        record_xpath=NEW_ROADWORK_RECORD_XPATH,
        extraction_func=extract_record_new_format,
        nsmap=NSMAP
    )
    # Load into the raw new table using the specific columns
    load_data_in_batches(con, RAW_NEW_TABLE_NAME, RAW_NEW_COLUMNS, new_data_iterator)

    # --- Create/Replace RAW OLD Table Structure ---
    print(f"\nCreating or replacing table: {RAW_OLD_TABLE_NAME}")
    # Quote column names
    old_column_defs = [f'"{col}" VARCHAR' for col in RAW_OLD_COLUMNS]
    create_old_table_sql = f'CREATE OR REPLACE TABLE "{RAW_OLD_TABLE_NAME}" ({", ".join(old_column_defs)})'
    con.execute(create_old_table_sql)
    print(f"Table '{RAW_OLD_TABLE_NAME}' created/replaced successfully.")

    # --- Process and Load Old Format Raw Data ---
    print("\nProcessing OLD format data...")
    old_data_iterator = process_directory(
        directory_path=OLD_DATA_DIRECTORY,
        record_xpath=OLD_ROADWORK_RECORD_XPATH,
        extraction_func=extract_record_old_format,
        nsmap=None # No namespace needed for old format XPath
    )
    # Load into the raw old table using the specific columns
    load_data_in_batches(con, RAW_OLD_TABLE_NAME, RAW_OLD_COLUMNS, old_data_iterator)

    # --- Finalize ---
    print("\nCommitting transaction...")
    con.commit()
    print("Transaction committed.")

    # Verify final counts
    count_new = con.execute(f'SELECT COUNT(*) FROM "{RAW_NEW_TABLE_NAME}"').fetchone()
    count_old = con.execute(f'SELECT COUNT(*) FROM "{RAW_OLD_TABLE_NAME}"').fetchone()
    print(f"\nVerification: Table '{RAW_NEW_TABLE_NAME}' now contains {count_new[0]} rows.")
    print(f"Verification: Table '{RAW_OLD_TABLE_NAME}' now contains {count_old[0]} rows.")


except duckdb.Error as e_db:
    print(f"\nDatabase error occurred: {e_db}")
    if con:
        try:
            print("Attempting to rollback transaction.")
            con.rollback()
        except duckdb.Error as e_tx: # More specific exception type if available
            print(f"Rollback failed: {e_tx}")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
    if con:
        try:
            con.rollback()
        except duckdb.Error as e_tx:
            print(f"Rollback failed: {e_tx}")
finally:
    if con:
        con.close()
        print("Database connection closed.")

print("\n--- Raw Data Pipeline Complete ---")

Connecting to DuckDB database: roadworks_data.duckdb
Creating or replacing table: raw_new_roadworks
Table 'raw_new_roadworks' created/replaced successfully.

Processing NEW format data...
Starting batch insertion into 'raw_new_roadworks' (batch size: 1000)...

--- Processing Directory: data/new_format ---
Found 8 XML files.
  Inserted batch of 1000. Total inserted: 1000
  Inserted batch of 1000. Total inserted: 2000
  Inserted batch of 1000. Total inserted: 3000
  Inserted batch of 1000. Total inserted: 4000
  Inserted batch of 1000. Total inserted: 5000
  Inserted batch of 1000. Total inserted: 6000
  Inserted batch of 1000. Total inserted: 7000
  Inserted batch of 1000. Total inserted: 8000
  Inserted batch of 1000. Total inserted: 9000
  Inserted batch of 1000. Total inserted: 10000
  Inserted batch of 1000. Total inserted: 11000
--- Directory Scan Complete: data/new_format ---
Successfully yielded 11353 records.
  Inserted final batch of 353. Total inserted: 11353
Batch insertion c

## Analyze data quality

In [17]:
# --- Basic Quality Checks Setup ---

con = None

def run_query(connection, sql_query):
    """Helper function to run a query and return a Polars DataFrame."""
    if not connection:
        print("Error: Database connection is not established.")
        return None
    try:
        # print(f"Running query:\n{sql_query}") # Optional: print query being run
        return connection.sql(sql_query).pl()
    except duckdb.Error as e:
        print(f"Error running query:\n{sql_query}\nError: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

# Establish connection (read-only)
try:
    print(f"Connecting to {DUCKDB_FILE} for quality checks...")
    con = duckdb.connect(database=DUCKDB_FILE, read_only=True)
    print("Connection successful.")
except duckdb.Error as e:
    print(f"Error connecting to database: {e}")
    con = None # Ensure con_check is None if connection failed
except Exception as e:
    print(f"An unexpected error occurred during connection: {e}")
    con = None

# Define common placeholders to check
PLACEHOLDERS = ["''", "'none'", "'n/a'", "'null'", "'unknown'"]
#PLACEHOLDER_FILTER = " OR ".join([f'lower("{col}") = {p}' for p in PLACEHOLDERS])

# Define tables and columns to iterate over
TABLES_INFO = {
    RAW_NEW_TABLE_NAME: RAW_NEW_COLUMNS,
    RAW_OLD_TABLE_NAME: RAW_OLD_COLUMNS
}

Connecting to roadworks_data.duckdb for quality checks...
Connection successful.


In [22]:
pl.Config.set_tbl_rows(50)
pl.Config.set_tbl_cols(50)

new_table = RAW_NEW_TABLE_NAME
old_table = RAW_OLD_TABLE_NAME

print(f"--- Inspecting DuckDB Database: {DUCKDB_FILE} ---")

if not os.path.exists(DUCKDB_FILE):
    print(f"Error: Database file '{DUCKDB_FILE}' not found.")
elif not con: # Check if the connection from the previous cell was successful
     print(f"Error: Cannot inspect database. Connection 'con' not established.")
else:
    # Connection is already established via con in the previous cell

    # --- Inspect NEW Raw Table ---
    print(f"--- Inspecting Table: {new_table} ---")
    try:
        # Describe schema using run_query
        schema_df_new = run_query(con, f'DESCRIBE "{new_table}"')
        if schema_df_new is not None and not schema_df_new.is_empty():
            print(f"`DESCRIBE \"{new_table}\"` returned:")
            display(schema_df_new)
        else:
             # If DESCRIBE fails or returns empty, the table likely doesn't exist or there was an error
             print(f"Could not retrieve schema for table '{new_table}'. It might not exist or there was a query error.")
             # Skip further inspection for this table
             raise duckdb.CatalogException(f"Table '{new_table}' not found or query failed.") # Raise exception to skip next steps

        # Count rows using run_query
        count_df_new = run_query(con, f'SELECT COUNT(*) as count FROM "{new_table}"')
        if count_df_new is not None and not count_df_new.is_empty():
            count_new_val = count_df_new[0, "count"]
            print(f"\nTotal rows in '{new_table}': {count_new_val}")
        else:
            print(f"Could not count rows for table '{new_table}'.")
            count_new_val = 0 # Assume 0 if count fails

        # Display sample rows using run_query (only if table has rows)
        if count_new_val > 0:
            print(f"\nFirst 5 rows from '{new_table}':")
            sample_df_new = run_query(con, f'SELECT * FROM "{new_table}" LIMIT 5')
            if sample_df_new is not None and not sample_df_new.is_empty():
                # print(type(sample_df_new)) # Type is known to be Polars DataFrame
                display(sample_df_new)
            elif sample_df_new is not None and sample_df_new.is_empty():
                 print("Table has rows, but could not fetch sample (LIMIT 5 returned empty).")
            else:
                 print("Could not fetch sample rows.")
        elif count_new_val == 0:
             print("\nTable appears to be empty.")


    except duckdb.CatalogException as e: # Catch specific error if DESCRIBE failed as intended
         print(f"Skipping further inspection for '{new_table}' due to previous error: {e}")
    except Exception as e: # Catch any other unexpected errors during inspection
         print(f"An unexpected error occurred while inspecting '{new_table}': {e}")


    # --- Inspect OLD Raw Table ---
    print(f"\n--- Inspecting Table: {old_table} ---")
    try:
        # Describe schema using run_query
        print(f"\nSchema for table '{old_table}':")
        schema_df_old = run_query(con, f'DESCRIBE "{old_table}"')
        if schema_df_old is not None and not schema_df_old.is_empty():
            print(f'`DESCRIBE "{old_table}"` returned:')
            display(schema_df_old)
        else:
             print(f"Could not retrieve schema for table '{old_table}'. It might not exist or there was a query error.")
             raise duckdb.CatalogException(f"Table '{old_table}' not found or query failed.")

        # Count rows using run_query
        count_df_old = run_query(con, f'SELECT COUNT(*) as count FROM "{old_table}"')
        if count_df_old is not None and not count_df_old.is_empty():
            count_old_val = count_df_old[0, "count"]
            print(f"\nTotal rows in '{old_table}': {count_old_val}")
        else:
            print(f"Could not count rows for table '{old_table}'.")
            count_old_val = 0

        # Display sample rows using run_query (only if table has rows)
        if count_old_val > 0:
            print(f"\nFirst 5 rows from '{old_table}':")
            sample_df_old = run_query(con, f'SELECT * FROM "{old_table}" LIMIT 5')
            if sample_df_old is not None and not sample_df_old.is_empty():
                display(sample_df_old)
            elif sample_df_old is not None and sample_df_old.is_empty():
                 print("Table has rows, but could not fetch sample (LIMIT 5 returned empty).")
            else:
                 print("Could not fetch sample rows.")
        elif count_old_val == 0:
             print("\nTable appears to be empty.")


    except duckdb.CatalogException as e:
         print(f"Skipping further inspection for '{old_table}' due to previous error: {e}")
    except Exception as e:
         print(f"An unexpected error occurred while inspecting '{old_table}': {e}")

    # No need to close con_inspect as we are using the global con_check
    # The con_check connection will be closed later after all checks are done.
    # print("\nInspection connection closed.") # Remove this line

print("\n--- Inspection Complete ---")

--- Inspecting DuckDB Database: roadworks_data.duckdb ---
--- Inspecting Table: raw_new_roadworks ---
`DESCRIBE "raw_new_roadworks"` returned:


column_name,column_type,null,key,default,extra
str,str,str,str,str,str
"""source_filename""","""VARCHAR""","""YES""",,,
"""NEW_EVENT_NUMBER""","""VARCHAR""","""YES""",,,
"""OLD_REFERENCE_NUMBER""","""VARCHAR""","""YES""",,,
"""SDATE""","""VARCHAR""","""YES""",,,
"""EDATE""","""VARCHAR""","""YES""",,,
"""EXPDEL""","""VARCHAR""","""YES""",,,
"""DESCRIPTION""","""VARCHAR""","""YES""",,,
"""CLOSURE_TYPE""","""VARCHAR""","""YES""",,,
"""STATUS""","""VARCHAR""","""YES""",,,
"""PUBLISHED_DATE""","""VARCHAR""","""YES""",,,



Total rows in 'raw_new_roadworks': 11353

First 5 rows from 'raw_new_roadworks':


source_filename,NEW_EVENT_NUMBER,OLD_REFERENCE_NUMBER,SDATE,EDATE,EXPDEL,DESCRIPTION,CLOSURE_TYPE,STATUS,PUBLISHED_DATE,CENTRE_EASTING,CENTRE_NORTHING,ROAD_NUMBERS
str,str,str,str,str,str,str,str,str,str,str,str,str
"""he_roadworks_2018_02_26.xml""","""00026976-005""",,"""26-FEB-2018 21:00""","""28-FEB-2018 06:00""","""Slight (less than 10 mins)""","""A3 northbound Sheet Link entry…","""Area Renewals""","""Published""","""2018-02-22T16:49:17""","""475209""","""124975""","""A3"""
"""he_roadworks_2018_02_26.xml""","""00004020-008""","""4188720""","""08-JAN-2018 20:00""","""10-MAR-2018 06:00""","""Moderate (10 - 30 mins)""","""A14 Westbound Jct 58 to Jct 57…","""Area Schemes""","""Published""","""2018-02-22T10:13:27""","""614569""","""241115""","""A14"""
"""he_roadworks_2018_02_26.xml""","""00001459-026""","""4215713""","""31-JUL-2017 14:47""","""01-APR-2018 06:00""","""Slight (less than 10 mins)""","""M1 northbound and southbound T…","""Major Schemes""","""Published""","""2018-02-15T14:38:05""","""445124""","""364308""","""M1"""
"""he_roadworks_2018_02_26.xml""","""00027883-003""",,"""12-FEB-2018 20:00""","""17-MAR-2018 06:00""","""Moderate (10 - 30 mins)""","""A259, east and westbound betwe…","""Area Schemes""","""Published""","""2018-02-21T10:36:47""","""596442""","""123787""","""A259"""
"""he_roadworks_2018_02_26.xml""","""00026799-002""",,"""10-FEB-2018 22:00""","""22-MAR-2018 06:00""","""Slight (less than 10 mins)""","""A3 northbound Compton to Denni…","""Regional Technology Works""","""Published""","""2018-02-22T14:08:43""","""498261""","""150727""","""A3"""



--- Inspecting Table: raw_old_roadworks ---

Schema for table 'raw_old_roadworks':
`DESCRIBE "raw_old_roadworks"` returned:


column_name,column_type,null,key,default,extra
str,str,str,str,str,str
"""source_filename""","""VARCHAR""","""YES""",,,
"""reference_number""","""VARCHAR""","""YES""",,,
"""start_date""","""VARCHAR""","""YES""",,,
"""end_date""","""VARCHAR""","""YES""",,,
"""expected_delay""","""VARCHAR""","""YES""",,,
"""description""","""VARCHAR""","""YES""",,,
"""closure_type""","""VARCHAR""","""YES""",,,
"""status""","""VARCHAR""","""YES""",,,
"""published_date""","""VARCHAR""","""YES""",,,
"""centre_easting""","""VARCHAR""","""YES""",,,



Total rows in 'raw_old_roadworks': 12068

First 5 rows from 'raw_old_roadworks':


source_filename,reference_number,start_date,end_date,expected_delay,description,closure_type,status,published_date,centre_easting,centre_northing,road,location,local_authority,traffic_management
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""ha-roadworks_2011_10_10.xml""","""972963""","""2010-07-12T07:00:00""","""2013-03-23T06:00:00""","""Moderate (10 - 30 mins)""","""Major junction works will incl…","""Planned Works""","""Firm""","""2011-10-09T21:08:32""","""456252""","""278173""","""M1""","""Catthorpe""","""Leicestershire / Northamptonsh…","""Other"""
"""ha-roadworks_2011_10_10.xml""","""978905""","""2011-04-01T22:00:00""","""2011-12-31T05:00:00""","""Moderate (10 - 30 mins)""","""Contraflow with speed restrict…","""Planned Works""","""Firm""","""2010-04-23T10:18:30""","""499082""","""235992""","""M1""","""Jct 13 to Jct 12""","""Bedfordshire / Buckinghamshire""","""Contraflow"""
"""ha-roadworks_2011_10_10.xml""","""998294""","""2009-09-24T06:00:00""","""2013-09-24T05:00:00""","""Slight (less than 10 mins)""","""Lane 1 closure and 24/7 Hardsh…","""Planned Works""","""Firm""","""2010-06-19T05:03:50""","""465924""","""260154""","""M1""","""Approach to Junction 16 (21011…","""Northamptonshire""","""Lane Closure"""
"""ha-roadworks_2011_10_10.xml""","""1172899""","""2011-10-10T22:00:00""","""2011-12-03T06:00:00""","""Slight (less than 10 mins)""","""Lane closures during the day w…","""Planned Works""","""Firm""","""2011-09-28T15:40:36""","""446842""","""324130""","""M1""","""Junction 23a (220116)""","""Leicestershire""","""Carriageway Closure"""
"""ha-roadworks_2011_10_10.xml""","""1306529""","""2010-08-04T00:00:00""","""2012-07-05T00:00:00""","""No Delay""","""24hrs, lane 1 closure, northbo…","""Planned Works""","""Firm""","""2011-08-22T16:47:52""","""511897""","""202047""","""M1""","""Jct 6 Exit Slip""","""Hertfordshire""","""Lane Closure"""



--- Inspection Complete ---


### Basic checks
1. Count NULLs & empty string placeholders
1. Check string length range of each column (e.g.: Is NEW_EVENT_NUMBER fixed length?)
1. Examine categorical values (e.g. STATUS, EXPDEL)
1. Check identifyer uniqueness across tables

#### NULL & Placeholder check

In [None]:
pl.Config.set_tbl_rows(50)
pl.Config.set_tbl_cols(50)

# Ensure connection 'con' from the previous cell is available and valid
if con is None:
    print("Error: Database connection 'con' is not established. Please run the connection cell first.")

print("--- Running Basic Data Quality Checks ---")

# Define common placeholders (lowercase for case-insensitive comparison)
PLACEHOLDERS_LOWER = ["", "none", "n/a", "null", "unknown"]
# Create SQL list string like "('', 'none', 'n/a', 'null', 'unknown')"
PLACEHOLDERS_SQL_LIST = f"({', '.join([f'{pl!r}' for pl in PLACEHOLDERS_LOWER])})"


# --- Check 1: NULL and Placeholder Counts ---
print("--- Running Check 1: NULL and Placeholder Counts ---")
for table_name, columns in TABLES_INFO.items():
    print(f"\n--- Analyzing Table for NULLs/Placeholders: {table_name} ---")

    count_df = run_query(con, f'SELECT COUNT(*) as total_rows FROM "{table_name}"')
    total_rows = 0
    if count_df is not None and not count_df.is_empty():
        total_rows = count_df[0, "total_rows"]
    print(f"Total Rows: {total_rows}")

    if total_rows == 0:
        print("Table is empty. Skipping NULL/Placeholder checks for this table.")
        continue

    print("\n1. NULL and Placeholder Counts (Summary):")
    null_placeholder_results = []
    for col in columns:
        null_query = f'SELECT COUNT(*) as null_count FROM "{table_name}" WHERE "{col}" IS NULL'
        null_df = run_query(con, null_query)
        null_count = null_df[0, "null_count"] if null_df is not None and not null_df.is_empty() else 'Error'

        placeholder_query = f'''
            SELECT COUNT(*) as placeholder_count
            FROM "{table_name}"
            WHERE lower(trim("{col}")) IN {PLACEHOLDERS_SQL_LIST}
                AND "{col}" IS NOT NULL
        '''
        placeholder_df = run_query(con, placeholder_query)
        placeholder_count = placeholder_df[0, "placeholder_count"] if placeholder_df is not None and not placeholder_df.is_empty() else 'Error'

        if null_count != 'Error' and placeholder_count != 'Error':
                null_perc = f"({(null_count / total_rows * 100):.2f}%)" if total_rows > 0 else ""
                placeholder_perc = f"({(placeholder_count / total_rows * 100):.2f}%)" if total_rows > 0 else ""
                null_placeholder_results.append({
                    "Column": col,
                    "Null Count": null_count,
                    "Null %": null_perc,
                    "Placeholder Count": placeholder_count,
                    "Placeholder %": placeholder_perc
                })
        else:
                null_placeholder_results.append({
                    "Column": col,
                    "Null Count": null_count,
                    "Null %": "N/A",
                    "Placeholder Count": placeholder_count,
                    "Placeholder %": "N/A"
                })

    if null_placeholder_results:
            display(pl.DataFrame(null_placeholder_results))
    else:
            print("  Could not retrieve NULL/Placeholder counts summary.")
            
    print("\n  Examples of Records with Placeholders (Limit 5 per placeholder type per column):")
    placeholders_found_overall_for_table = False
    for col in columns:
        id_col_name = 'NEW_EVENT_NUMBER' if table_name == RAW_NEW_TABLE_NAME else 'reference_number'
        
        if id_col_name not in columns or 'source_filename' not in columns:
            # print(f"    Skipping detailed placeholder check for column '{col}' in table '{table_name}': Identifier or source_filename not in columns.")
            continue

        placeholders_found_for_this_col_overall = False
        for placeholder_value in PLACEHOLDERS_LOWER:
            sql_placeholder_value = placeholder_value.replace("'", "''")
            
            if placeholder_value == "": 
                placeholder_condition = f"trim(\"{col}\") = ''"
            else:
                placeholder_condition = f"lower(trim(\"{col}\")) = '{sql_placeholder_value}'"

            details_query = f'''
                SELECT "{id_col_name}" AS identifier, "source_filename", "{col}" AS problematic_value
                FROM "{table_name}"
                WHERE {placeholder_condition} AND "{col}" IS NOT NULL
                LIMIT 5
            '''
            details_df = run_query(con, details_query)

            if details_df is not None and not details_df.is_empty():
                if not placeholders_found_for_this_col_overall:
                    print(f"\n    --- Column: '{col}' ---")
                    placeholders_found_for_this_col_overall = True
                    placeholders_found_overall_for_table = True
                
                display_placeholder_name = f"'{placeholder_value}'" if placeholder_value != "" else "(empty string)"
                print(f"      Records with placeholder {display_placeholder_name}:")
                display(details_df)
    
    if not placeholders_found_overall_for_table and total_rows > 0 :
        print(f"    No specific placeholder examples to show for any column in {table_name}.")
print("--- Check 1: NULL and Placeholder Counts Complete ---")

--- Running Check 1: NULL and Placeholder Counts ---

--- Analyzing Table for NULLs/Placeholders: raw_new_roadworks ---
Total Rows: 11353

1. NULL and Placeholder Counts (Summary):


Column,Null Count,Null %,Placeholder Count,Placeholder %
str,i64,str,i64,str
"""source_filename""",0,"""(0.00%)""",0,"""(0.00%)"""
"""NEW_EVENT_NUMBER""",0,"""(0.00%)""",0,"""(0.00%)"""
"""OLD_REFERENCE_NUMBER""",10692,"""(94.18%)""",0,"""(0.00%)"""
"""SDATE""",0,"""(0.00%)""",0,"""(0.00%)"""
"""EDATE""",0,"""(0.00%)""",0,"""(0.00%)"""
"""EXPDEL""",0,"""(0.00%)""",0,"""(0.00%)"""
"""DESCRIPTION""",0,"""(0.00%)""",0,"""(0.00%)"""
"""CLOSURE_TYPE""",0,"""(0.00%)""",0,"""(0.00%)"""
"""STATUS""",0,"""(0.00%)""",0,"""(0.00%)"""
"""PUBLISHED_DATE""",0,"""(0.00%)""",0,"""(0.00%)"""



  Examples of Records with Placeholders (Limit 5 per placeholder type per column):
    No specific placeholder examples to show for any column in raw_new_roadworks.

--- Analyzing Table for NULLs/Placeholders: raw_old_roadworks ---
Total Rows: 12068

1. NULL and Placeholder Counts (Summary):


Column,Null Count,Null %,Placeholder Count,Placeholder %
str,i64,str,i64,str
"""source_filename""",0,"""(0.00%)""",0,"""(0.00%)"""
"""reference_number""",0,"""(0.00%)""",0,"""(0.00%)"""
"""start_date""",0,"""(0.00%)""",0,"""(0.00%)"""
"""end_date""",0,"""(0.00%)""",0,"""(0.00%)"""
"""expected_delay""",0,"""(0.00%)""",0,"""(0.00%)"""
"""description""",0,"""(0.00%)""",0,"""(0.00%)"""
"""closure_type""",0,"""(0.00%)""",0,"""(0.00%)"""
"""status""",0,"""(0.00%)""",0,"""(0.00%)"""
"""published_date""",0,"""(0.00%)""",0,"""(0.00%)"""
"""centre_easting""",0,"""(0.00%)""",0,"""(0.00%)"""



  Examples of Records with Placeholders (Limit 5 per placeholder type per column):

    --- Column: 'traffic_management' ---
      Records with placeholder 'none':


identifier,source_filename,problematic_value
str,str,str
"""1853252""","""ha-roadworks_2011_10_10.xml""","""None"""
"""1837978""","""ha-roadworks_2011_10_10.xml""","""None"""
"""1862058""","""ha-roadworks_2011_10_10.xml""","""None"""
"""1848669""","""ha-roadworks_2011_10_10.xml""","""None"""
"""564571""","""ha-roadworks_2011_10_10.xml""","""None"""


--- Check 1: NULL and Placeholder Counts Complete ---


#### String length analysis

In [None]:
# --- Check 2: String Length Analysis ---
print("--- Running Check 2: String Length Analysis ---")
for table_name, columns in TABLES_INFO.items():
    print(f"\n--- Analyzing Table for String Lengths: {table_name} ---")
    
    count_df = run_query(con, f'SELECT COUNT(*) as total_rows FROM "{table_name}"')
    total_rows = 0
    if count_df is not None and not count_df.is_empty():
        total_rows = count_df[0, "total_rows"]
    # print(f"Total Rows: {total_rows}") # Optional context

    if total_rows == 0:
        print("Table is empty. Skipping string length checks for this table.")
        continue

    print("\n2. String Length Analysis:")
    length_results = []
    for col in columns:
        length_stats_query = f'''
            SELECT MIN(LENGTH("{col}")) as min_len,
                    MAX(LENGTH("{col}")) as max_len,
                    AVG(LENGTH("{col}")) as avg_len,
                    STDDEV_POP(LENGTH("{col}")) as stddev_len
            FROM "{table_name}"
            WHERE "{col}" IS NOT NULL AND trim("{col}") != ''
        '''
        length_df = run_query(con, length_stats_query)

        min_len, max_len, avg_len, stddev_len = "Error", "Error", "Error", "Error"
        if length_df is not None and not length_df.is_empty():
            min_len = length_df[0, "min_len"]
            max_len = length_df[0, "max_len"]
            avg_len_val = length_df[0, "avg_len"]
            stddev_len_val = length_df[0, "stddev_len"]
            
            avg_len = f"{avg_len_val:.2f}" if avg_len_val is not None else "N/A"
            stddev_len = f"{stddev_len_val:.2f}" if stddev_len_val is not None else "N/A"

        length_results.append({
            "Column": col,
            "Min Length": min_len,
            "Max Length": max_len,
            "Avg Length": avg_len,
            "StdDev Length": stddev_len
        })
        
        if min_len != "Error" and min_len is not None:
            shortest_strings_query = f'''
                SELECT DISTINCT "{col}" as val
                FROM "{table_name}"
                WHERE "{col}" IS NOT NULL AND LENGTH("{col}") = {min_len}
                LIMIT 3
            '''
            shortest_df = run_query(con, shortest_strings_query)
            if shortest_df is not None and not shortest_df.is_empty():
                print(f"  Shortest string example for '{col}' (length {min_len}): {shortest_df['val'].to_list()}")

        if max_len != "Error" and max_len is not None:
            longest_strings_query = f'''
                SELECT DISTINCT "{col}" as val
                FROM "{table_name}"
                WHERE "{col}" IS NOT NULL AND LENGTH("{col}") = {max_len}
                LIMIT 3
            '''
            longest_df = run_query(con, longest_strings_query)
            if longest_df is not None and not longest_df.is_empty():
                print(f"  Longest string example for '{col}' (length {max_len}): {longest_df['val'].to_list()}")

    if length_results:
        print("\n  String Length Statistics Summary:")
        display(pl.DataFrame(length_results))
    else:
        print("  Could not retrieve string length statistics.")
print("--- Check 2: String Length Analysis Complete ---")

--- Running Check 2: String Length Analysis ---

--- Analyzing Table for String Lengths: raw_new_roadworks ---

2. String Length Analysis:
  Shortest string example for 'source_filename' (length 25): ['nh_roadworks_2023_3_6.xml']
  Longest string example for 'source_filename' (length 27): ['he_roadworks_2021_03_01.xml', 'he_roadworks_2018_02_26.xml', 'he_roadworks_2019_04_15.xml']
  Shortest string example for 'NEW_EVENT_NUMBER' (length 12): ['00004020-008', '00026799-002', '00028129-003']
  Longest string example for 'NEW_EVENT_NUMBER' (length 12): ['00028402-003', '00005530-003', '00031914-001']
  Shortest string example for 'OLD_REFERENCE_NUMBER' (length 4): ['7383', '5662', '6275']
  Longest string example for 'OLD_REFERENCE_NUMBER' (length 8): ['12018198']
  Shortest string example for 'SDATE' (length 17): ['03-MAR-2018 22:30', '21-FEB-2018 21:00', '16-FEB-2018 09:00']
  Longest string example for 'SDATE' (length 17): ['03-MAR-2018 22:30', '21-FEB-2018 21:00', '16-FEB-2018 09:00']

Column,Min Length,Max Length,Avg Length,StdDev Length
str,i64,i64,str,str
"""source_filename""",25,27,"""26.37""","""0.69"""
"""NEW_EVENT_NUMBER""",12,12,"""12.00""","""0.00"""
"""OLD_REFERENCE_NUMBER""",4,8,"""6.09""","""0.96"""
"""SDATE""",17,17,"""17.00""","""0.00"""
"""EDATE""",17,17,"""17.00""","""0.00"""
"""EXPDEL""",23,26,"""25.24""","""1.30"""
"""DESCRIPTION""",14,384,"""101.89""","""39.12"""
"""CLOSURE_TYPE""",7,38,"""19.52""","""6.00"""
"""STATUS""",6,9,"""9.00""","""0.05"""
"""PUBLISHED_DATE""",19,19,"""19.00""","""0.00"""



--- Analyzing Table for String Lengths: raw_old_roadworks ---

2. String Length Analysis:
  Shortest string example for 'source_filename' (length 27): ['ha-roadworks_2013_05_06.xml', 'ha_roadworks_2015_03_16.xml', 'he_roadworks_2016_02_29.xml']
  Longest string example for 'source_filename' (length 27): ['ha-roadworks_2014_03_31.xml', 'he_roadworks_2016_02_29.xml', 'ha_roadworks_2015_03_16.xml']
  Shortest string example for 'reference_number' (length 6): ['981222', '999850', '765300']
  Longest string example for 'reference_number' (length 7): ['1306529', '1683376', '1705838']
  Shortest string example for 'start_date' (length 19): ['2011-08-09T22:00:00', '2011-10-18T09:30:00', '2011-10-14T08:00:00']
  Longest string example for 'start_date' (length 19): ['2010-07-12T07:00:00', '2011-09-26T22:45:00', '2011-10-14T21:00:00']
  Shortest string example for 'end_date' (length 19): ['2011-11-11T06:00:00', '2011-10-12T06:00:00', '2011-10-22T05:30:00']
  Longest string example for 'end_date'

Column,Min Length,Max Length,Avg Length,StdDev Length
str,i64,i64,str,str
"""source_filename""",27,27,"""27.00""","""0.00"""
"""reference_number""",6,7,"""6.99""","""0.07"""
"""start_date""",19,19,"""19.00""","""0.00"""
"""end_date""",19,19,"""19.00""","""0.00"""
"""expected_delay""",8,26,"""22.39""","""6.66"""
"""description""",3,1988,"""80.12""","""56.20"""
"""closure_type""",13,15,"""13.14""","""0.51"""
"""status""",4,11,"""4.70""","""2.11"""
"""published_date""",19,19,"""19.00""","""0.00"""
"""centre_easting""",1,6,"""5.99""","""0.19"""


--- Check 2: String Length Analysis Complete ---


#### Categorical variable check

In [35]:
# --- Check 3: Categorical Value Counts ---
print("--- Running Check 3: Categorical Value Counts ---")
if 'con' not in globals() or con is None:
    print("Error: Database connection 'con' is not established. Please run the setup cell first.")
elif 'TABLES_INFO' not in globals():
    print("Error: TABLES_INFO is not defined. Please run the setup cell first.")
else:
    for table_name, columns in TABLES_INFO.items():
        print(f"\n--- Analyzing Table for Categorical Values: {table_name} ---")
        
        count_df = run_query(con, f'SELECT COUNT(*) as total_rows FROM "{table_name}"')
        total_rows = 0
        if count_df is not None and not count_df.is_empty():
            total_rows = count_df[0, "total_rows"]
        # print(f"Total Rows: {total_rows}") # Optional context

        if total_rows == 0:
            print("Table is empty. Skipping categorical value checks for this table.")
            continue

        print("\n3. Categorical Value Counts:")
        if table_name == RAW_NEW_TABLE_NAME:
            categorical_cols = ['STATUS', 'EXPDEL', 'CLOSURE_TYPE']
        elif table_name == RAW_OLD_TABLE_NAME:
            categorical_cols = ['status', 'expected_delay', 'closure_type', 'local_authority', 'traffic_management']
        else:
            categorical_cols = []

        if not categorical_cols:
            print("  No categorical columns defined for this table.")
            continue

        for col in categorical_cols:
            if col in columns:
                print(f"\n  Distinct values for '{col}':")
                distinct_query = f'''
                    SELECT "{col}", COUNT(*) as count
                    FROM "{table_name}"
                    GROUP BY "{col}"
                    ORDER BY count DESC
                    LIMIT 50
                '''
                distinct_df = run_query(con, distinct_query)
                if distinct_df is not None and not distinct_df.is_empty():
                    display(distinct_df)
                elif distinct_df is not None and distinct_df.is_empty():
                        print(f"    No distinct values found for '{col}' (column might be all NULL).")
                else:
                    print(f"    Could not retrieve distinct values for '{col}'.")
            else:
                    print(f"  Configured categorical column '{col}' not found in table columns for {table_name}.")
    print("--- Check 3: Categorical Value Counts Complete ---")

--- Running Check 3: Categorical Value Counts ---

--- Analyzing Table for Categorical Values: raw_new_roadworks ---

3. Categorical Value Counts:

  Distinct values for 'STATUS':


STATUS,count
str,i64
"""Published""",11350
"""Shared""",3



  Distinct values for 'EXPDEL':


EXPDEL,count
str,i64
"""Slight (less than 10 mins)""",8417
"""Moderate (10 - 30 mins)""",2877
"""Severe (more than 30 mins)""",59



  Distinct values for 'CLOSURE_TYPE':


CLOSURE_TYPE,count
str,i64
"""Programmed Routine Works""",3675
"""Area Schemes""",1850
"""Major Schemes""",1242
"""Area Renewals""",956
"""Emergency Routine Works""",768
"""Ad-hoc Routine Works""",666
"""Regional Technology Works""",368
"""Diversion/Alternate Route""",336
"""Ad-hoc Street/Road Works""",302
"""Programmed Street/Road Works""",292



--- Analyzing Table for Categorical Values: raw_old_roadworks ---

3. Categorical Value Counts:

  Distinct values for 'status':


status,count
str,i64
"""Firm""",10853
"""Provisional""",1215



  Distinct values for 'expected_delay':


expected_delay,count
str,i64
"""Slight (less than 10 mins)""",7894
"""No Delay""",2078
"""Moderate (10 - 30 mins)""",2049
"""Severe (more than 30 mins)""",47



  Distinct values for 'closure_type':


closure_type,count
str,i64
"""Planned Works""",11228
"""Emergency Works""",840



  Distinct values for 'local_authority':


local_authority,count
str,i64
"""Hampshire""",705
"""Kent""",635
"""Surrey""",512
"""Essex""",447
"""Warwickshire""",417
"""Hertfordshire""",347
"""Humberside""",330
"""Oxfordshire""",326
"""Cheshire""",313
"""Avon""",306



  Distinct values for 'traffic_management':


traffic_management,count
str,i64
"""Lane Closure""",7093
"""Carriageway Closure""",2662
"""Traffic Signals""",549
"""Mobile Lane Closure""",500
"""Lane Closure with Switching""",327
"""None""",290
"""Other""",234
"""Width Restriction""",130
"""Convoy Working""",91
"""Contraflow""",73


--- Check 3: Categorical Value Counts Complete ---


#### ID uniqueness

In [43]:
# --- Check 4: Identifier Uniqueness and Overlap ---
print("--- Running Check 4: Identifier Uniqueness and Overlap ---")
if 'con' not in globals() or con is None:
    print("Error: Database connection 'con' is not established. Please run the setup cell first.")
elif 'TABLES_INFO' not in globals() or 'RAW_NEW_TABLE_NAME' not in globals() or 'RAW_OLD_TABLE_NAME' not in globals():
    print("Error: Key table name variables (TABLES_INFO, RAW_NEW_TABLE_NAME, RAW_OLD_TABLE_NAME) are not defined. Please run the setup cell first.")
else:
    print("\n4.a Identifier Uniqueness Checks (within each table):")

    # Check New Format Identifier
    new_id_col = 'NEW_EVENT_NUMBER'
    if RAW_NEW_TABLE_NAME in TABLES_INFO and new_id_col in TABLES_INFO[RAW_NEW_TABLE_NAME]:
        print(f"\n  Checking for duplicate '{new_id_col}' in '{RAW_NEW_TABLE_NAME}':")
        dupe_new_query = f'''
            SELECT "{new_id_col}", COUNT(*) as count
            FROM "{RAW_NEW_TABLE_NAME}"
            WHERE "{new_id_col}" IS NOT NULL AND trim("{new_id_col}") != ''
            GROUP BY "{new_id_col}"
            HAVING COUNT(*) > 1
            ORDER BY count DESC
        '''
        dupe_new_df = run_query(con, dupe_new_query)
        if dupe_new_df is not None and not dupe_new_df.is_empty():
            print(f"  WARNING: Found {dupe_new_df.height} duplicate '{new_id_col}' values. Sample duplicates:")
            display(dupe_new_df.head(5))
        elif dupe_new_df is not None and dupe_new_df.is_empty():
            print(f"  OK: '{new_id_col}' values are unique (excluding NULLs and empty strings).")
        else:
            print(f"  Could not perform duplicate check for '{new_id_col}'.")
    else:
        print(f"  Skipping uniqueness check for '{new_id_col}': table or column not defined in TABLES_INFO.")
    
    # Check Old Format Identifier
    old_id_col = 'reference_number'
    if RAW_OLD_TABLE_NAME in TABLES_INFO and old_id_col in TABLES_INFO[RAW_OLD_TABLE_NAME]:
        print(f"\n  Checking for duplicate '{old_id_col}' in '{RAW_OLD_TABLE_NAME}':")
        dupe_old_query = f'''
            SELECT "{old_id_col}", COUNT(*) as count
            FROM "{RAW_OLD_TABLE_NAME}"
            WHERE "{old_id_col}" IS NOT NULL AND trim("{old_id_col}") != ''
            GROUP BY "{old_id_col}"
            HAVING COUNT(*) > 1
            ORDER BY count DESC
        '''
        dupe_old_df = run_query(con, dupe_old_query)
        if dupe_old_df is not None and not dupe_old_df.is_empty():
            print(f"  WARNING: Found {dupe_old_df.height} duplicate '{old_id_col}' values. Sample duplicates:")
            display(dupe_old_df.head(5))
        elif dupe_old_df is not None and dupe_old_df.is_empty():
            print(f"  OK: '{old_id_col}' values are unique (excluding NULLs and empty strings).")
        else:
            print(f"  Could not perform duplicate check for '{old_id_col}'.")
    else:
            print(f"  Skipping uniqueness check for '{old_id_col}': table or column not defined in TABLES_INFO.")

    print("\n\n4.b Identifier Overlap Checks (between tables):")

    # ====== Inspecting duplicate example ======
    # duplicate_examples_query = f'''
    #     SELECT *
    #     FROM "{RAW_OLD_TABLE_NAME}"
    #     WHERE "reference_number" = '1479020'
    # '''
    # details_df = run_query(con, duplicate_examples_query)
    # print(f"\n  Example of duplicate '1479020' value:")
    # display(details_df)
    # ====== Inspecting duplicate example FINISHED ======
    
    # Check overlap: NEW_EVENT_NUMBER (new) vs reference_number (old)
    new_event_col = 'NEW_EVENT_NUMBER'
    old_ref_col = 'reference_number'
    print(f"\n  Checking overlap between '{RAW_NEW_TABLE_NAME}'.'{new_event_col}' and '{RAW_OLD_TABLE_NAME}'.'{old_ref_col}':")
    if (RAW_NEW_TABLE_NAME in TABLES_INFO and new_event_col in TABLES_INFO[RAW_NEW_TABLE_NAME] and
        RAW_OLD_TABLE_NAME in TABLES_INFO and old_ref_col in TABLES_INFO[RAW_OLD_TABLE_NAME]):
        
        overlap_query_1 = f'''
            SELECT COUNT(DISTINCT t1."{new_event_col}") as overlapping_count
            FROM "{RAW_NEW_TABLE_NAME}" t1
            INNER JOIN "{RAW_OLD_TABLE_NAME}" t2 ON trim(t1."{new_event_col}") = trim(t2."{old_ref_col}")
            WHERE t1."{new_event_col}" IS NOT NULL AND trim(t1."{new_event_col}") != ''
              AND t2."{old_ref_col}" IS NOT NULL AND trim(t2."{old_ref_col}") != '';
        '''
        overlap_df_1 = run_query(con, overlap_query_1)
        if overlap_df_1 is not None and not overlap_df_1.is_empty():
            overlap_count_1 = overlap_df_1[0, "overlapping_count"]
            print(f"  Found {overlap_count_1} distinct '{new_event_col}' values from '{RAW_NEW_TABLE_NAME}' that also exist as '{old_ref_col}' in '{RAW_OLD_TABLE_NAME}'.")
            if overlap_count_1 > 0:
                examples_query_1 = f'''
                    SELECT DISTINCT t1."{new_event_col}" AS overlapping_value, 
                           t1."source_filename" AS new_table_filename,
                           t2."source_filename" AS old_table_filename
                    FROM "{RAW_NEW_TABLE_NAME}" t1
                    INNER JOIN "{RAW_OLD_TABLE_NAME}" t2 ON trim(t1."{new_event_col}") = trim(t2."{old_ref_col}")
                    WHERE t1."{new_event_col}" IS NOT NULL AND trim(t1."{new_event_col}") != ''
                      AND t2."{old_ref_col}" IS NOT NULL AND trim(t2."{old_ref_col}") != ''
                    LIMIT 5;
                '''
                examples_df_1 = run_query(con, examples_query_1)
                if examples_df_1 is not None and not examples_df_1.is_empty():
                    print("    Example overlapping values:")
                    display(examples_df_1)
        else:
            print(f"  Could not perform overlap check between '{new_event_col}' and '{old_ref_col}'.")
    else:
        print(f"  Skipping overlap check: one or both columns/tables ('{new_event_col}', '{old_ref_col}') not defined in TABLES_INFO.")


    # Check overlap: OLD_REFERENCE_NUMBER (new) vs reference_number (old)
    new_old_ref_col = 'OLD_REFERENCE_NUMBER' # This is from the new table
    # old_ref_col is already defined as 'reference_number' for the old table
    print(f"\n  Checking overlap between '{RAW_NEW_TABLE_NAME}'.'{new_old_ref_col}' and '{RAW_OLD_TABLE_NAME}'.'{old_ref_col}':")
    if (RAW_NEW_TABLE_NAME in TABLES_INFO and new_old_ref_col in TABLES_INFO[RAW_NEW_TABLE_NAME] and
        RAW_OLD_TABLE_NAME in TABLES_INFO and old_ref_col in TABLES_INFO[RAW_OLD_TABLE_NAME]):

        overlap_query_2 = f'''
            SELECT COUNT(DISTINCT t1."{new_old_ref_col}") as overlapping_count
            FROM "{RAW_NEW_TABLE_NAME}" t1
            INNER JOIN "{RAW_OLD_TABLE_NAME}" t2 ON trim(t1."{new_old_ref_col}") = trim(t2."{old_ref_col}")
            WHERE t1."{new_old_ref_col}" IS NOT NULL AND trim(t1."{new_old_ref_col}") != ''
              AND t2."{old_ref_col}" IS NOT NULL AND trim(t2."{old_ref_col}") != '';
        '''
        overlap_df_2 = run_query(con, overlap_query_2)
        if overlap_df_2 is not None and not overlap_df_2.is_empty():
            overlap_count_2 = overlap_df_2[0, "overlapping_count"]
            print(f"  Found {overlap_count_2} distinct '{new_old_ref_col}' values from '{RAW_NEW_TABLE_NAME}' that also exist as '{old_ref_col}' in '{RAW_OLD_TABLE_NAME}'.")
            if overlap_count_2 > 0:
                examples_query_2 = f'''
                    SELECT DISTINCT t1."{new_old_ref_col}" AS overlapping_value,
                           t1."source_filename" AS new_table_filename,
                           t2."source_filename" AS old_table_filename
                    FROM "{RAW_NEW_TABLE_NAME}" t1
                    INNER JOIN "{RAW_OLD_TABLE_NAME}" t2 ON trim(t1."{new_old_ref_col}") = trim(t2."{old_ref_col}")
                    WHERE t1."{new_old_ref_col}" IS NOT NULL AND trim(t1."{new_old_ref_col}") != ''
                      AND t2."{old_ref_col}" IS NOT NULL AND trim(t2."{old_ref_col}") != ''
                    LIMIT 5;
                '''
                examples_df_2 = run_query(con, examples_query_2)
                if examples_df_2 is not None and not examples_df_2.is_empty():
                    print("    Example overlapping values:")
                    display(examples_df_2)
        else:
            print(f"  Could not perform overlap check between '{new_old_ref_col}' and '{old_ref_col}'.")
    else:
        print(f"  Skipping overlap check: one or both columns/tables ('{new_old_ref_col}', '{old_ref_col}') not defined in TABLES_INFO.")

    print("\n--- Check 4: Identifier Uniqueness and Overlap Complete ---")

--- Running Check 4: Identifier Uniqueness and Overlap ---

4.a Identifier Uniqueness Checks (within each table):

  Checking for duplicate 'NEW_EVENT_NUMBER' in 'raw_new_roadworks':


NEW_EVENT_NUMBER,count
str,i64
"""00044016-002""",4
"""00076857-001""",4
"""00253822-003""",3
"""00070856-003""",3
"""00146456-002""",3



  Checking for duplicate 'reference_number' in 'raw_old_roadworks':


reference_number,count
str,i64
"""1479020""",6
"""1512545""",5
"""2311381""",4
"""783303""",4
"""213110""",4




4.b Identifier Overlap Checks (between tables):

  Checking overlap between 'raw_new_roadworks'.'NEW_EVENT_NUMBER' and 'raw_old_roadworks'.'reference_number':
  Found 0 distinct 'NEW_EVENT_NUMBER' values from 'raw_new_roadworks' that also exist as 'reference_number' in 'raw_old_roadworks'.

  Checking overlap between 'raw_new_roadworks'.'OLD_REFERENCE_NUMBER' and 'raw_old_roadworks'.'reference_number':
  Found 74 distinct 'OLD_REFERENCE_NUMBER' values from 'raw_new_roadworks' that also exist as 'reference_number' in 'raw_old_roadworks'.
    Example overlapping values:


overlapping_value,new_table_filename,old_table_filename
str,str,str
"""3987855""","""he_roadworks_2018_02_26.xml""","""he_roadworks_2017_06_05.xml"""
"""3883906""","""he_roadworks_2018_02_26.xml""","""he_roadworks_2017_06_05.xml"""
"""4088992""","""he_roadworks_2018_02_26.xml""","""he_roadworks_2017_06_05.xml"""
"""3892155""","""he_roadworks_2018_02_26.xml""","""he_roadworks_2017_06_05.xml"""
"""2294207""","""he_roadworks_2018_02_26.xml""","""he_roadworks_2017_06_05.xml"""



--- Check 4: Identifier Uniqueness and Overlap Complete ---


### Convert data types
1. Numeric conversion (coordinates, reference number, NEW_EVENT_NUMBER?)
1. Convert dates

### Check converted data types for plausibility
1. Date ranges
1. Coordinate ranges (Correct locations in the UK?)
1. Did numeric conversions succeed?