In [0]:
def add_table_to_pipeline(
    table_name,
    src_server_name=None,
    src_server_id=None,
    src_database=None,
    src_schema=None,
    src_table=None,
    dst_catalog="4_prod",
    dst_schema="raw",
    dst_table=None,
    key_columns=None,
    watermark_column=None,
    watermark_value=None,
    item_tag=None,
    comment=None,
    active_ind=1,
    query_timeout="00:05:00",
    dry_run=False
):
    """
    Register a table in the ETL watermark management tables.

    Inserts one row into ``6_mgmt.incr_updt.watermark`` and one row per key
    column into ``6_mgmt.incr_updt.table_key_columns``. Works even if the
    destination table doesn't exist yet, provided ``key_columns`` and
    ``watermark_column`` are given explicitly.

    Args:
        table_name: Used (lower-cased) as the destination table name when
            ``dst_table`` is not given.
        src_server_name, src_server_id, src_database, src_schema, item_tag:
            Source description. When ``src_server_name``, ``src_server_id``
            or ``item_tag`` is missing, unset values are filled in from the
            destination table's name prefix (``mill_``, ``cds_``, ...).
        src_table: Source table name; derived from the destination name when
            omitted.
        dst_catalog, dst_schema, dst_table: Destination location.
        key_columns: A column name or a list of column names. Auto-detected
            through ``detect_key_columns`` when omitted and the destination
            table exists.
        watermark_column: Auto-detected from a list of known column names
            when omitted and the destination table exists.
        watermark_value: Defaults to seven days before the current maximum
            of the watermark column, or ``"2008-01-01"`` when that cannot be
            determined.
        comment, active_ind, query_timeout: Stored on the watermark row.
        dry_run: When true, return the SQL that would run instead of
            executing the INSERT statements.

    Returns:
        A dict whose ``"status"`` is ``"success"``, ``"dry_run"``,
        ``"warning"`` (an active watermark row already exists) or
        ``"error"``. Failures are reported through the dict rather than
        raised.
    """
    from datetime import timedelta
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Destination table names are handled lower-case throughout.
    dst_table = (table_name if dst_table is None else dst_table).lower()

    # Fill in unset source details from the destination table's prefix.
    if src_server_name is None or src_server_id is None or item_tag is None:
        defaults = _source_defaults_for(dst_table)
        if defaults is not None:
            src_server_name = src_server_name or defaults["src_server_name"]
            src_server_id = src_server_id or defaults["src_server_id"]
            if defaults["src_database"] is not None:
                src_database = src_database or defaults["src_database"]
            src_schema = src_schema or defaults["src_schema"]
            # An explicitly passed item_tag always wins over the default.
            item_tag = item_tag or defaults["item_tag"]

    # Derive the source table name if not specified.
    if src_table is None:
        if dst_table.startswith('mill_'):
            src_table = dst_table[len('mill_'):].upper()
        else:
            for prefix in _STRIPPED_PREFIXES:
                if dst_table.startswith(prefix):
                    src_table = dst_table[len(prefix):]
                    break
            else:
                src_table = dst_table.upper()

    full_dst_table = f"{dst_catalog}.{dst_schema}.{dst_table}"

    table_exists = spark.sql(
        f"SHOW TABLES FROM {dst_catalog}.{dst_schema} LIKE {_sql_quote(dst_table)}"
    ).count() > 0

    # Nothing can be auto-detected from a table that does not exist yet.
    if not table_exists:
        if watermark_value is None:
            watermark_value = "2008-01-01"
        if key_columns is None:
            return {
                "status": "error",
                "message": f"Table {full_dst_table} does not exist yet. Must specify key_columns explicitly."
            }

    dst_filter = f"""dst_catalog = {_sql_quote(dst_catalog)}
          AND dst_schema = {_sql_quote(dst_schema)}
          AND dst_table = {_sql_quote(dst_table)}"""

    # Refuse to register a table that already has an active watermark row.
    existing_wm = spark.sql(f"""
        SELECT watermark_id
        FROM 6_mgmt.incr_updt.watermark
        WHERE {dst_filter}
          AND active_ind = 1
    """).collect()

    if existing_wm:
        existing_id = existing_wm[0]["watermark_id"]
        return {
            "status": "warning",
            "message": f"Table {dst_table} already exists in the watermark table with ID {existing_id}.",
            "watermark_id": existing_id
        }

    # For tables that exist, validate and auto-detect against real columns.
    if table_exists:
        described = spark.sql(f"DESCRIBE TABLE {full_dst_table}").collect()
        all_columns = {row["col_name"].lower() for row in described}

        if watermark_column is None:
            for candidate in _watermark_candidates(src_server_name, src_database):
                if candidate in all_columns:
                    watermark_column = candidate.upper()
                    break

            if watermark_column is None:
                return {
                    "status": "error",
                    "message": f"Could not auto-detect a watermark column for {dst_table}. Please specify one explicitly."
                }
        elif watermark_column.lower() not in all_columns:
            return {
                "status": "error",
                "message": f"Specified watermark column '{watermark_column}' does not exist in table {dst_table}."
            }

        if key_columns is None:
            key_columns = detect_key_columns(spark, full_dst_table)

            if not key_columns:
                return {
                    "status": "error",
                    "message": f"Could not auto-detect key columns for {dst_table}. Please specify them explicitly."
                }

        if watermark_value is None:
            # Best effort: start one week before the newest loaded row and
            # fall back to the fixed start date if the maximum is missing or
            # cannot be used in date arithmetic.
            try:
                max_ts = spark.sql(
                    f"SELECT MAX(`{watermark_column}`) as max_ts FROM {full_dst_table}"
                ).collect()[0]["max_ts"]
                watermark_value = (max_ts - timedelta(days=7)).strftime("%Y-%m-%d") if max_ts else "2008-01-01"
            except Exception:
                watermark_value = "2008-01-01"

    # Ensure key_columns is a list
    if isinstance(key_columns, str):
        key_columns = [key_columns]

    # Without this check a missing value would be written into the
    # management tables as the literal text 'None'.
    required = (
        ("src_server_name", src_server_name),
        ("src_server_id", src_server_id),
        ("item_tag", item_tag),
        ("watermark_column", watermark_column),
    )
    missing = [name for name, value in required if value is None]
    if not key_columns:
        missing.append("key_columns")
    if missing:
        return {
            "status": "error",
            "message": f"Could not determine {', '.join(missing)} for {dst_table}. Please specify explicitly."
        }

    # Optional values become SQL NULL when empty.
    comment_sql = _sql_quote(comment) if comment else "NULL"
    src_database_sql = _sql_quote(src_database) if src_database else "NULL"
    src_schema_sql = _sql_quote(src_schema) if src_schema else "NULL"

    watermark_query = f"""
    INSERT INTO 6_mgmt.incr_updt.watermark(
        src_server_name, src_server_id, src_database, src_schema, src_table,
        dst_catalog, dst_schema, dst_table,
        watermark_column, watermark_value, item_tag, comment, active_ind, query_timeout
    ) VALUES (
        {_sql_quote(src_server_name)}, {src_server_id}, {src_database_sql}, {src_schema_sql}, {_sql_quote(src_table)},
        {_sql_quote(dst_catalog)}, {_sql_quote(dst_schema)}, {_sql_quote(dst_table)},
        {_sql_quote(watermark_column)}, {_sql_quote(watermark_value)},
        {_sql_quote(item_tag)}, {comment_sql}, {active_ind}, {_sql_quote(query_timeout)}
    )
    """

    def key_column_query(key_column, watermark_id_sql):
        # Single source of truth for the key-column INSERT, used for both
        # the dry-run preview and the real execution.
        return f"""
        INSERT INTO 6_mgmt.incr_updt.table_key_columns(
            watermark_id, dst_table_name, key_column_name, active_ind
        ) VALUES (
            {watermark_id_sql}, {_sql_quote(dst_table)}, {_sql_quote(key_column)}, 1
        )
    """

    configuration = {
        "src_server_name": src_server_name,
        "src_server_id": src_server_id,
        "src_database": src_database,
        "src_schema": src_schema,
        "src_table": src_table,
        "dst_catalog": dst_catalog,
        "dst_schema": dst_schema,
        "dst_table": dst_table,
        "key_columns": key_columns,
        "watermark_column": watermark_column,
        "watermark_value": watermark_value,
        "item_tag": item_tag
    }

    # If dry run, just return the queries. The watermark id is not known
    # until the watermark row exists, so a placeholder stands in for it.
    if dry_run:
        return {
            "status": "dry_run",
            "watermark_query": watermark_query,
            "key_column_queries": [
                key_column_query(key_column, "#watermark_id#")
                for key_column in key_columns
            ],
            "configuration": {**configuration, "table_exists": table_exists}
        }

    # Execute queries. Any failure is reported through the result dict.
    # NOTE(review): the statements are not wrapped in a transaction, so a
    # failure after the first INSERT can leave a watermark row without its
    # key columns - confirm whether cleanup is needed.
    try:
        spark.sql(watermark_query)

        # The new row's id is taken as the highest id for this destination.
        watermark_id = spark.sql(f"""
            SELECT MAX(watermark_id) as watermark_id
            FROM 6_mgmt.incr_updt.watermark
            WHERE {dst_filter}
        """).collect()[0]["watermark_id"]

        for key_column in key_columns:
            spark.sql(key_column_query(key_column, watermark_id))

        return {
            "status": "success",
            "watermark_id": watermark_id,
            "message": f"Successfully added {dst_table} to the ETL pipeline with watermark_id {watermark_id}.",
            "configuration": configuration
        }

    except Exception as e:
        return {
            "status": "error",
            "message": f"Error adding table to pipeline: {str(e)}"
        }


# Source defaults keyed by destination-table prefix; the first match wins.
# A src_database of None means the prefix implies no database default.
_SOURCE_DEFAULTS = (
    (('mill_',), {
        "src_server_name": 'oracle_mill', "src_server_id": 107,
        "src_database": None, "src_schema": 'V500', "item_tag": 'mill',
    }),
    (('cds_',), {
        "src_server_name": 'BH2VMDWRL1', "src_server_id": 104,
        "src_database": 'BH_DATAWAREHOUSE', "src_schema": 'dbo', "item_tag": 'dwh_cds',
    }),
    (('msds', 'mat_', 'slam_', 'pi_'), {
        "src_server_name": 'BH2VMDWRL1', "src_server_id": 104,
        "src_database": 'BH_DATAWAREHOUSE', "src_schema": 'dbo', "item_tag": 'dwh',
    }),
    (('iqemo_',), {
        "src_server_name": 'BH2VMDWRL1', "src_server_id": 104,
        "src_database": 'iqemo.iqemo', "src_schema": 'dbo', "item_tag": 'iqemo',
    }),
    (('pacs_',), {
        "src_server_name": 'BH2VMDWRL1', "src_server_id": 104,
        "src_database": 'IMAGING_Sectra.SDWH_PACS_Datamodel',
        "src_schema": '[Interface 1.25]', "item_tag": 'pacs_incr_updt',
    }),
    (('nnu_',), {
        "src_server_name": 'BH2VMDWRL1', "src_server_id": 104,
        "src_database": 'BadgerNetReporting', "src_schema": 'bnf_dbsync', "item_tag": 'dwh',
    }),
    (('path_',), {
        "src_server_name": 'BH2VMDWRL1', "src_server_id": 104,
        "src_database": 'PATHOLOGY.BH_PathDataWarehouse', "src_schema": 'dbo',
        "item_tag": 'path_incr_updt',
    }),
)

# Prefixes removed from the destination name to obtain the source table name
# without changing its case ('mill_' is handled separately: it is upper-cased).
_STRIPPED_PREFIXES = ('iqemo_', 'pacs_', 'nnu_', 'path_')


def _source_defaults_for(dst_table):
    """Return the source defaults matching dst_table's prefix, or None."""
    for prefixes, defaults in _SOURCE_DEFAULTS:
        if dst_table.startswith(prefixes):
            return defaults
    return None


def _watermark_candidates(src_server_name, src_database):
    """Return watermark column names to try, in priority order, for a source."""
    database = str(src_database).lower()
    if src_server_name == 'oracle_mill':
        return ["updt_dt_tm", "update_dt_tm"]
    if src_server_name == 'BH2VMDWRL1' and 'datawarehouse' in database:
        return ["record_updated_dt", "record_update_dt"]
    if src_database and 'iqemo' in database:
        return ["dateupdated", "lastupdate"]
    if src_database and 'pacs' in database:
        return ["reportmodifieddateutc", "examinationfoldermodifydate", "reportdate"]
    if src_database and 'badger' in database:
        return ["lastupdate", "recordtimestamp"]
    return [
        "updt_dt_tm", "update_dt_tm", "record_updated_dt", "record_update_dt",
        "dateupdated", "lastupdate", "reportdate", "reportmodifieddateutc",
        "examinationfoldermodifydate", "recordtimestamp"
    ]


def _sql_quote(value):
    """Render value as a single-quoted SQL string literal.

    Backslashes and single quotes are backslash-escaped so that a value such
    as a comment containing an apostrophe cannot terminate the literal early.
    This relies on the parser treating backslash as the escape character in
    string literals - TODO confirm for the cluster's parser settings.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"

In [0]:
# Preview the SQL that registering cds_aea_invest would run; dry_run=True
# means the INSERT statements are returned instead of executed.
dry_run_result = add_table_to_pipeline(
    **{
        "table_name": "cds_aea_invest",
        # Source side
        "src_server_name": "BH2VMDWRL1",
        "src_server_id": 104,
        "src_database": "BH_DATAWAREHOUSE",
        "src_schema": "dbo",
        "src_table": "CDS_AEA_INVEST",
        # Incremental-load settings
        "key_columns": ["CDS_AEA_Id", "invest_Num"],
        "watermark_column": "Record_Updated_Dt",
        "item_tag": "dwh_cds",
        "dry_run": True,
    }
)

# Show the whole result dict, including any error message.
print("Complete result:", dry_run_result, sep="\n")