In [0]:
import re
from pyspark.sql import SparkSession
from datetime import datetime, timedelta

# Assuming detect_key_columns is defined elsewhere or imported
# Placeholder function if not available:
def detect_key_columns(spark, full_table_name):
    """Placeholder key-column detector; replace with real metadata inspection.

    Prints a warning on every call so the stub is never mistaken for a real
    implementation. ``spark`` is accepted for signature compatibility but is
    not used by this stub.

    Args:
        spark: Spark session (unused here).
        full_table_name: Table name to look up.

    Returns:
        ``["some_mill_id_col"]`` (an example value) when the name contains
        ``"mill_"``; otherwise an empty list, meaning no keys were detected.
    """
    print(f"Warning: Using placeholder detect_key_columns for {full_table_name}. Implement actual logic.")
    # Example-only rule: any name containing "mill_" gets a dummy key column.
    looks_like_mill = "mill_" in full_table_name
    return ["some_mill_id_col"] if looks_like_mill else []


# Watermark used when no better starting point can be derived from the data.
_DEFAULT_WATERMARK_VALUE = "2008-01-01"

# Source-system defaults keyed by destination-table prefix, checked in order.
# Only the fields listed for a prefix are inferred; anything else is left as
# the caller passed it.
_SOURCE_DEFAULTS_BY_PREFIX = (
    (('mill_',), {
        'src_server_name': 'oracle_mill',
        'src_server_id': 107,
        'src_schema': 'V500',
        'item_tag': 'mill',
    }),
    (('cds_',), {
        'src_server_name': 'BH2VMDWRL1',
        'src_server_id': 104,
        'src_database': 'BH_DATAWAREHOUSE',
        'src_schema': 'dbo',
        'item_tag': 'dwh_cds',
    }),
    (('msds', 'mat_', 'slam_', 'pi_'), {
        'src_server_name': 'BH2VMDWRL1',
        'src_server_id': 104,
        'src_database': 'BH_DATAWAREHOUSE',
        'src_schema': 'dbo',
        'item_tag': 'dwh',
    }),
    (('iqemo_',), {
        'src_server_name': 'BH2VMDWRL1',
        'src_server_id': 104,
        'src_database': 'iqemo.iqemo',
        'src_schema': 'dbo',
        'item_tag': 'iqemo',
    }),
    (('pacs_',), {
        'src_server_name': 'BH2VMDWRL1',
        'src_server_id': 104,
        'src_database': 'IMAGING_Sectra.SDWH_PACS_Datamodel',
        'src_schema': '[Interface 1.25]',
        'item_tag': 'pacs_incr_updt',
    }),
    (('nnu_',), {
        'src_server_name': 'BH2VMDWRL1',
        'src_server_id': 104,
        'src_database': 'BadgerNetReporting',
        'src_schema': 'bnf_dbsync',
        'item_tag': 'dwh',
    }),
    (('path_',), {
        'src_server_name': 'BH2VMDWRL1',
        'src_server_id': 104,
        'src_database': 'PATHOLOGY.BH_PathDataWarehouse',
        'src_schema': 'dbo',
        'item_tag': 'path_incr_updt',
    }),
)

# Prefixes that are stripped (without upper-casing) to derive the source table.
_STRIPPED_SRC_PREFIXES = ('iqemo_', 'pacs_', 'nnu_', 'path_')


def _sql_literal(value):
    """Render *value* as a Spark SQL string literal, or NULL when it is None."""
    if value is None:
        return "NULL"
    # Backslash first, then the quote, so the escapes themselves are not re-escaped.
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _infer_source_defaults(dst_table, provided):
    """Return a copy of *provided* with unset source fields filled from the prefix table."""
    resolved = dict(provided)
    for prefixes, defaults in _SOURCE_DEFAULTS_BY_PREFIX:
        if dst_table.startswith(prefixes):
            for field, default in defaults.items():
                # Caller-supplied values always win over inferred defaults.
                resolved[field] = resolved[field] or default
            break
    # Fallback tag when no prefix matched and the caller gave none.
    resolved['item_tag'] = resolved['item_tag'] or 'generic'
    return resolved


def _derive_src_table(dst_table):
    """Derive the source table name from an already lower-cased destination name."""
    if dst_table.startswith('mill_'):
        return dst_table[len('mill_'):].upper()
    for prefix in _STRIPPED_SRC_PREFIXES:
        if dst_table.startswith(prefix):
            return dst_table[len(prefix):]
    return dst_table.upper()


def _watermark_candidates(src_server_name, src_database):
    """Return lower-case candidate watermark column names for the given source."""
    database = str(src_database).lower() if src_database else ''
    if src_server_name == 'oracle_mill':
        return ["updt_dt_tm", "update_dt_tm"]
    if src_server_name == 'BH2VMDWRL1' and 'datawarehouse' in database:
        return ["record_updated_dt", "record_update_dt"]
    if 'iqemo' in database:
        return ["dateupdated", "lastupdate"]
    if 'pacs' in database:
        return ["reportmodifieddateutc", "examinationfoldermodifydate", "reportdate"]
    if 'badger' in database:
        return ["lastupdate", "recordtimestamp"]
    # Unknown source: try every known pattern.
    return [
        "updt_dt_tm", "update_dt_tm", "record_updated_dt", "record_update_dt",
        "dateupdated", "lastupdate", "reportdate", "reportmodifieddateutc",
        "examinationfoldermodifydate", "recordtimestamp"
    ]


def _key_column_insert_sql(watermark_id, dst_table, key_column):
    """Build the INSERT for one key column; *watermark_id* is spliced in verbatim."""
    return f"""
            INSERT INTO 6_mgmt.incr_updt.table_key_columns(
                watermark_id, dst_table_name, key_column_name, active_ind
            ) VALUES (
                {watermark_id}, {_sql_literal(dst_table)}, {_sql_literal(key_column)}, 1
            )
        """


def add_table_to_pipeline(
    table_name,
    src_server_name=None,
    src_server_id=None,
    src_database=None,
    src_schema=None,
    src_table=None,
    dst_catalog="4_prod",
    dst_schema="raw",
    dst_table=None,
    key_columns=None,
    watermark_column=None,
    watermark_value=None,
    item_tag=None,
    comment=None,
    active_ind=1,
    query_timeout="00:05:00",
    dry_run=False,
):
    """
    Add a new table to the ETL pipeline by registering it in the watermark management system.
    Works even if the destination table doesn't exist yet.

    If item_tag includes 'wt_updt', validation/auto-detection for watermark_column
    and key_columns is skipped, allowing them to be NULL (for whole table updates).

    Args:
        table_name: Table name; lower-cased and used as dst_table when dst_table is None.
        src_server_name, src_server_id, src_database, src_schema: Source location.
            When src_server_name, src_server_id or item_tag is None, unset fields
            are inferred from the destination-table prefix.
        src_table: Source table name; derived from dst_table when None.
        dst_catalog, dst_schema, dst_table: Destination location.
        key_columns: Key column name, or list/tuple/set of names. Auto-detected
            via detect_key_columns for existing tables when None.
        watermark_column: Incremental-load column. Auto-detected for existing
            tables when None.
        watermark_value: Starting watermark. Defaults to "2008-01-01", or to
            seven days before MAX(watermark_column) for existing tables.
        item_tag: Pipeline tag; inferred from the prefix or 'generic' when None.
        comment: Optional free-text comment stored with the watermark row.
        active_ind: Value written to the watermark row's active_ind.
        query_timeout: Value written to the watermark row's query_timeout.
        dry_run: When True, return the generated SQL instead of executing the
            INSERTs. The existence and duplicate checks still query Spark.

    Returns:
        dict with a "status" of "success", "dry_run", "warning" (table already
        registered with active_ind = 1) or "error", plus a "message" and/or the
        generated queries and resolved configuration, depending on status.
    """
    # `date` is required for the isinstance check below; `datetime` is a
    # subclass of `date`, so one check covers both DATE and TIMESTAMP columns.
    from datetime import date, timedelta
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Standardize destination table name
    dst_table = (table_name if dst_table is None else dst_table).lower()

    # Fill in missing source details and item_tag from the table prefix.
    if src_server_name is None or src_server_id is None or item_tag is None:
        inferred = _infer_source_defaults(dst_table, {
            'src_server_name': src_server_name,
            'src_server_id': src_server_id,
            'src_database': src_database,
            'src_schema': src_schema,
            'item_tag': item_tag,
        })
        src_server_name = inferred['src_server_name']
        src_server_id = inferred['src_server_id']
        src_database = inferred['src_database']
        src_schema = inferred['src_schema']
        item_tag = inferred['item_tag']

    # Without this guard the INSERT would contain the text 'None' / None.
    missing = [
        name
        for name, value in (("src_server_name", src_server_name), ("src_server_id", src_server_id))
        if value is None
    ]
    if missing:
        return {
            "status": "error",
            "message": f"Could not infer {' and '.join(missing)} for {dst_table}. Please specify explicitly."
        }

    # Derive source table name if not specified
    if src_table is None:
        src_table = _derive_src_table(dst_table)

    # Fully qualified destination table
    full_dst_table = f"{dst_catalog}.{dst_schema}.{dst_table}"

    # Escaped literals reused by every statement below.
    dst_catalog_sql = _sql_literal(dst_catalog)
    dst_schema_sql = _sql_literal(dst_schema)
    dst_table_sql = _sql_literal(dst_table)
    src_server_name_sql = _sql_literal(src_server_name)
    src_table_sql = _sql_literal(src_table)

    # Check if table exists in destination
    table_exists = spark.sql(
        f"SHOW TABLES FROM {dst_catalog}.{dst_schema} LIKE {dst_table_sql}"
    ).count() > 0

    # Check if the item_tag indicates a whole table update (case-insensitive)
    is_wt_updt = item_tag is not None and "wt_updt" in item_tag.lower()

    # A new incremental table cannot be inspected, so keys must be supplied.
    # Whole-table updates need neither keys nor a watermark.
    if not table_exists and not is_wt_updt:
        if watermark_value is None:
            watermark_value = _DEFAULT_WATERMARK_VALUE
        if key_columns is None:
            return {
                "status": "error",
                "message": f"Table {full_dst_table} does not exist yet. Must specify key_columns explicitly (or use 'wt_updt' in item_tag)."
            }

    # Check if table is already in the watermark table
    existing_wm = spark.sql(f"""
        SELECT watermark_id
        FROM 6_mgmt.incr_updt.watermark
        WHERE dst_catalog = {dst_catalog_sql}
          AND dst_schema = {dst_schema_sql}
          AND dst_table = {dst_table_sql}
          AND active_ind = 1
    """).collect()

    if existing_wm:
        existing_id = existing_wm[0]["watermark_id"]
        return {
            "status": "warning",
            "message": f"Table {dst_table} already exists in the watermark table with ID {existing_id}.",
            "watermark_id": existing_id
        }

    # For tables that exist, perform column validation and auto-detection (unless wt_updt)
    if table_exists and not is_wt_updt:
        columns_df = spark.sql(f"DESCRIBE TABLE {full_dst_table}")
        all_columns = {row["col_name"].lower(): row["data_type"] for row in columns_df.collect()}

        if watermark_column is None:
            candidates = _watermark_candidates(src_server_name, src_database)
            detected_wm_col = next((col for col in candidates if col in all_columns), None)
            if detected_wm_col is None:
                return {
                    "status": "error",
                    "message": f"Could not auto-detect a watermark column for {dst_table}. Please specify one explicitly or use 'wt_updt' in item_tag."
                }
            # Detected names are stored upper-cased.
            watermark_column = detected_wm_col.upper()
        elif watermark_column.lower() not in all_columns:
            return {
                "status": "error",
                "message": f"Specified watermark column '{watermark_column}' does not exist in table {dst_table}."
            }

        # Auto-detect key columns if not provided
        if key_columns is None:
            try:
                detected_keys = detect_key_columns(spark, full_dst_table)
            except Exception as e:
                return {
                    "status": "error",
                    "message": f"Error during key column detection for {dst_table}: {e}. Please specify them explicitly or use 'wt_updt' in item_tag."
                }
            if not detected_keys:  # None or empty list
                return {
                    "status": "error",
                    "message": f"Could not auto-detect key columns for {dst_table}. Please specify them explicitly or use 'wt_updt' in item_tag."
                }
            key_columns = detected_keys

        # Start seven days before the newest loaded row so late-arriving
        # updates are re-read; fall back to the default on any problem.
        if watermark_value is None and watermark_column:
            try:
                max_ts_result = spark.sql(
                    f"SELECT MAX(`{watermark_column}`) as max_ts FROM {full_dst_table}"
                ).collect()
                max_ts = max_ts_result[0]["max_ts"] if max_ts_result else None
                if max_ts is None:
                    # Table is empty or the column is entirely NULL.
                    watermark_value = _DEFAULT_WATERMARK_VALUE
                elif isinstance(max_ts, date):
                    watermark_value = (max_ts - timedelta(days=7)).strftime("%Y-%m-%d")
                else:
                    print(f"Warning: Max value for {watermark_column} is not a date/datetime type ({type(max_ts)}). Setting default watermark.")
                    watermark_value = _DEFAULT_WATERMARK_VALUE
            except Exception as e:
                print(f"Warning: Could not determine max watermark value for {watermark_column} in {full_dst_table}: {e}. Using default.")
                watermark_value = _DEFAULT_WATERMARK_VALUE

    # Normalise key_columns to a list. Tuples and sets are expanded rather
    # than being wrapped as a single bogus column name.
    if key_columns is not None and not isinstance(key_columns, list):
        if isinstance(key_columns, (tuple, set, frozenset)):
            key_columns = list(key_columns)
        else:
            key_columns = [key_columns]

    watermark_query = f"""
    INSERT INTO 6_mgmt.incr_updt.watermark(
        src_server_name, src_server_id, src_database, src_schema, src_table,
        dst_catalog, dst_schema, dst_table,
        watermark_column, watermark_value, item_tag, comment, active_ind, query_timeout
    ) VALUES (
        {src_server_name_sql}, {src_server_id}, {_sql_literal(src_database)}, {_sql_literal(src_schema)}, {src_table_sql},
        {dst_catalog_sql}, {dst_schema_sql}, {dst_table_sql},
        {_sql_literal(watermark_column)}, {_sql_literal(watermark_value)},
        {_sql_literal(item_tag)}, {_sql_literal(comment)}, {active_ind}, {_sql_literal(query_timeout)}
    )
    """

    # Preview queries carry a '#watermark_id#' placeholder because the real
    # id is only known after the watermark row has been inserted.
    key_col_queries = [
        _key_column_insert_sql("#watermark_id#", dst_table, key_column)
        for key_column in (key_columns or [])
    ]

    configuration = {
        "src_server_name": src_server_name,
        "src_server_id": src_server_id,
        "src_database": src_database,
        "src_schema": src_schema,
        "src_table": src_table,
        "dst_catalog": dst_catalog,
        "dst_schema": dst_schema,
        "dst_table": dst_table,
        "key_columns": key_columns,  # None or list
        "watermark_column": watermark_column,  # None or string
        "watermark_value": watermark_value,  # None or string
        "item_tag": item_tag,
    }

    if dry_run:
        return {
            "status": "dry_run",
            "watermark_query": watermark_query,
            "key_column_queries": key_col_queries if key_columns else ["No key columns specified or needed (wt_updt)."],
            "configuration": {**configuration, "table_exists": table_exists, "is_wt_updt": is_wt_updt}
        }

    try:
        spark.sql(watermark_query)

        # Read back the id of the row just inserted. The extra source filters
        # narrow the match beyond the destination triple.
        watermark_id_result = spark.sql(f"""
            SELECT MAX(watermark_id) as watermark_id
            FROM 6_mgmt.incr_updt.watermark
            WHERE dst_catalog = {dst_catalog_sql}
              AND dst_schema = {dst_schema_sql}
              AND dst_table = {dst_table_sql}
              AND active_ind = {active_ind}
              AND src_server_name = {src_server_name_sql}
              AND src_table = {src_table_sql}
        """).collect()

        if not watermark_id_result or watermark_id_result[0]["watermark_id"] is None:
            raise ValueError("Could not retrieve watermark_id after insertion.")
        watermark_id = watermark_id_result[0]["watermark_id"]

        for key_column in (key_columns or []):
            spark.sql(_key_column_insert_sql(watermark_id, dst_table, key_column))

        return {
            "status": "success",
            "watermark_id": watermark_id,
            "message": f"Successfully added {dst_table} to the ETL pipeline with watermark_id {watermark_id}."
                       f"{' (Whole Table Update)' if is_wt_updt else ''}",
            "configuration": {**configuration, "is_wt_updt": is_wt_updt}
        }

    except Exception as e:
        # NOTE: no rollback is attempted. If a key-column insert fails, the
        # watermark row inserted above stays in place.
        return {
            "status": "error",
            "message": f"Error adding table to pipeline: {str(e)}",
            "watermark_query": watermark_query,  # Include query for debugging
            "key_column_queries": key_col_queries if key_columns else []  # Include queries for debugging
        }

In [0]:
# Preview registration of the chief-complaint lookup as a whole-table update
# ("wt_updt" in item_tag, so no key or watermark columns are supplied).
# dry_run=True returns the generated queries instead of running the INSERTs.
run_result = add_table_to_pipeline(
    "CDS_ECD_REF_CHIEF_COMPLAINT",
    # Source
    src_server_name="BH2VMDWRL1",
    src_server_id=104,
    src_database="BH_DATAWAREHOUSE",
    src_schema="dbo",
    src_table="LKP_CDS_ECD_REF_CHIEF_COMPLAINT",
    # Destination
    dst_catalog="3_lookup",
    dst_schema="dwh",
    # Load behaviour
    item_tag="dwh_wt_updt",
    key_columns=None,
    watermark_column=None,
    dry_run=True,
)

print("Complete result:", run_result, sep="\n")

In [0]:
# Preview registration of the investigation lookup as a whole-table update
# ("wt_updt" in item_tag, so no key or watermark columns are supplied).
# dry_run=True returns the generated queries instead of running the INSERTs.
run_result = add_table_to_pipeline(
    "CDS_ECD_REF_INVESTIGATION",
    # Source
    src_server_name="BH2VMDWRL1",
    src_server_id=104,
    src_database="BH_DATAWAREHOUSE",
    src_schema="dbo",
    src_table="LKP_CDS_ECD_REF_INVESTIGATION",
    # Destination
    dst_catalog="3_lookup",
    dst_schema="dwh",
    # Load behaviour
    item_tag="dwh_wt_updt",
    key_columns=None,
    watermark_column=None,
    dry_run=True,
)

print("Complete result:", run_result, sep="\n")