# Unify data (old & new format)

In [None]:
import duckdb
import glob
import os
from IPython.display import display

# --- Configuration ---
# Path of the DuckDB database file opened by duckdb.connect() below.
DUCKDB_FILE = 'roadworks_data.duckdb'
# Source table read by the "new format" INSERT ... SELECT.
RAW_NEW_TABLE_NAME = 'raw_new_roadworks'
# Source table read by the "old format" INSERT ... SELECT.
RAW_OLD_TABLE_NAME = 'raw_old_roadworks'
# Destination table that both formats are loaded into; re-created via
# CREATE OR REPLACE TABLE each time the unify cell runs.
UNIFIED_TABLE_NAME = 'uk_roadworks'

# --- Helper function to run queries (optional, or use con.sql().pl() directly) ---
def run_query_df(connection, sql_query):
    """Execute ``sql_query`` on ``connection`` and return the result as a Polars DataFrame.

    This is a best-effort notebook helper: problems are printed instead of
    raised so that a failing query does not abort the cell.

    Args:
        connection: An open DuckDB connection. A falsy value (e.g. ``None``)
            is treated as "not connected".
        sql_query: The SQL text to run.

    Returns:
        The query result converted with ``.pl()``, or ``None`` when there is
        no connection or the query fails for any reason.
    """
    frame = None
    if connection:
        try:
            # Build the relation first, then materialise it as Polars.
            relation = connection.sql(sql_query)
            frame = relation.pl()
        except duckdb.Error as e:
            # DuckDB-specific failures: echo the offending SQL for debugging.
            print(f"Error running query:\n{sql_query}\nError: {e}")
        except Exception as e:
            # Anything else (e.g. a conversion problem) is reported generically.
            print(f"An unexpected error occurred: {e}")
    else:
        print("Error: Database connection is not established.")
    return frame

# --- Connect to DuckDB ---
# Reset `con` first so a re-run drops any earlier handle before reconnecting,
# and so `con` stays None (rather than undefined) if the connection fails.
con = None
try:
    con = duckdb.connect(database=DUCKDB_FILE, read_only=False)
except Exception as e:
    # Report the failure instead of raising; later cells see con is None.
    print(f"Failed to connect to DuckDB: {e}")
else:
    print(f"Successfully connected to {DUCKDB_FILE}")

## 1. Define and create unified table

In [None]:
# --- 1. Define and Create Unified Table ---
# Consider primary keys, constraints, and exact data types based on your analysis.
# event_id: NEW_EVENT_NUMBER for new, reference_number for old.
# legacy_reference_id: OLD_REFERENCE_NUMBER from new data.
# Using the _NUMERIC and _DT columns created in data_exploration.ipynb.
#
# The whole rebuild (CREATE OR REPLACE + both INSERTs) runs inside one explicit
# transaction. Without it every statement autocommits, so a failure in either
# INSERT would leave the unified table replaced but empty or half-populated.

# The connection cell reports failures without raising, so fail here with a
# clear message instead of an AttributeError on None.
if con is None:
    raise RuntimeError(
        "No DuckDB connection available; the connection cell must succeed first."
    )

create_unified_table_sql = f"""
CREATE OR REPLACE TABLE "{UNIFIED_TABLE_NAME}" (
    event_id VARCHAR,                        -- NEW_EVENT_NUMBER (new) or reference_number (old)
    legacy_reference_id VARCHAR,             -- OLD_REFERENCE_NUMBER (new)
    start_datetime TIMESTAMP,                -- SDATE_DT (new) or start_date_dt (old)
    end_datetime TIMESTAMP,                  -- EDATE_DT (new) or end_date_dt (old)
    published_datetime TIMESTAMP,            -- PUBLISHED_DATE_DT (new) or published_date_dt (old)
    expected_delay VARCHAR,                  -- EXPDEL (new) or expected_delay (old)
    description VARCHAR,                     -- DESCRIPTION (new) or description (old)
    closure_type VARCHAR,                    -- CLOSURE_TYPE (new) or closure_type (old)
    status VARCHAR,                          -- STATUS (new) or status (old)
    road_names VARCHAR,                      -- ROAD_NUMBERS (new) or road (old)
    easting_osgb INTEGER,                    -- CENTRE_EASTING_NUMERIC (new) or centre_easting_numeric (old)
    northing_osgb INTEGER,                   -- CENTRE_NORTHING_NUMERIC (new) or centre_northing_numeric (old)
    longitude_wgs84 DOUBLE,
    latitude_wgs84 DOUBLE,
    location_detail VARCHAR,                 -- location (old only)
    local_authority VARCHAR,                 -- local_authority (old only)
    traffic_management_type VARCHAR,         -- traffic_management (old only)
    source_filename VARCHAR,
    data_source_format VARCHAR              -- 'new_xml' or 'old_xml'
);
"""

# Explicit target column list shared by both INSERTs. Naming the targets means
# the load no longer depends on the SELECT list happening to match the table's
# physical column order; a mismatch in column count fails loudly instead.
unified_columns_sql = """(
    event_id,
    legacy_reference_id,
    start_datetime,
    end_datetime,
    published_datetime,
    expected_delay,
    description,
    closure_type,
    status,
    road_names,
    easting_osgb,
    northing_osgb,
    longitude_wgs84,
    latitude_wgs84,
    location_detail,
    local_authority,
    traffic_management_type,
    source_filename,
    data_source_format
)"""

# --- 2. Populate Unified Table from New Format Data ---
# Ensure you use the columns with converted data types (e.g., _NUMERIC, _DT)
insert_from_new_sql = f"""
INSERT INTO "{UNIFIED_TABLE_NAME}" {unified_columns_sql}
SELECT
    "NEW_EVENT_NUMBER" AS event_id,
    "OLD_REFERENCE_NUMBER" AS legacy_reference_id, -- Use OLD_REFERENCE_NUMBER_NUMERIC if it's always numeric and preferred
    "SDATE_DT" AS start_datetime,
    "EDATE_DT" AS end_datetime,
    "PUBLISHED_DATE_DT" AS published_datetime,
    "EXPDEL" AS expected_delay,
    "DESCRIPTION" AS description,
    "CLOSURE_TYPE" AS closure_type,
    "STATUS" AS status,
    "ROAD_NUMBERS" AS road_names,
    "CENTRE_EASTING_NUMERIC" AS easting_osgb,
    "CENTRE_NORTHING_NUMERIC" AS northing_osgb,
    longitude_wgs84,
    latitude_wgs84,
    NULL AS location_detail,        -- Not present in new format
    NULL AS local_authority,        -- Not present in new format
    NULL AS traffic_management_type,-- Not present in new format
    source_filename,
    'new_xml' AS data_source_format
FROM "{RAW_NEW_TABLE_NAME}";
"""

# --- 3. Populate Unified Table from Old Format Data ---
insert_from_old_sql = f"""
INSERT INTO "{UNIFIED_TABLE_NAME}" {unified_columns_sql}
SELECT
    "reference_number" AS event_id, -- Use reference_number_numeric if preferred and always populated
    NULL AS legacy_reference_id,    -- Not applicable or directly present in old format structure
    "start_date_dt" AS start_datetime,
    "end_date_dt" AS end_datetime,
    "published_date_dt" AS published_datetime,
    "expected_delay" AS expected_delay,
    "description" AS description,
    "closure_type" AS closure_type,
    "status" AS status,
    "road" AS road_names,
    "centre_easting_numeric" AS easting_osgb,
    "centre_northing_numeric" AS northing_osgb,
    longitude_wgs84,
    latitude_wgs84,
    "location" AS location_detail,
    "local_authority" AS local_authority,
    "traffic_management" AS traffic_management_type,
    source_filename,
    'old_xml' AS data_source_format
FROM "{RAW_OLD_TABLE_NAME}";
"""

# Run DDL + both loads as one unit of work.
con.begin()
try:
    con.execute(create_unified_table_sql)
    print(f"Table '{UNIFIED_TABLE_NAME}' created/re-created successfully.")

    con.execute(insert_from_new_sql)
    print(f"Data inserted from '{RAW_NEW_TABLE_NAME}' into '{UNIFIED_TABLE_NAME}'.")

    con.execute(insert_from_old_sql)
    print(f"Data inserted from '{RAW_OLD_TABLE_NAME}' into '{UNIFIED_TABLE_NAME}'.")

    con.commit()
except BaseException:
    # BaseException so an interrupted cell (KeyboardInterrupt) also rolls back
    # and does not leave the connection stuck inside an open transaction.
    con.rollback()
    raise
print("Changes committed.")

# --- 4. Verification (Example) ---
# Runs after the commit: these are read-only checks and must not be able to
# undo a successful load.
print(f"\n--- Verifying {UNIFIED_TABLE_NAME} ---")
total_rows_unified = con.execute(f'SELECT COUNT(*) FROM "{UNIFIED_TABLE_NAME}"').fetchone()[0]
print(f"Total rows in '{UNIFIED_TABLE_NAME}': {total_rows_unified}")

print(f"\nSample of 5 rows from '{UNIFIED_TABLE_NAME}':")
sample_unified_df = run_query_df(con, f'SELECT * FROM "{UNIFIED_TABLE_NAME}" LIMIT 5')
if sample_unified_df is not None:
    # print() rather than display() so the output also works outside Jupyter.
    print(sample_unified_df)

# Further verification:
# - Check counts per data_source_format
# - Examine distinct values for standardized columns (expected_delay, status, closure_type)
# - Check for NULLs in key columns