In [0]:
from pyspark.sql import types as T, functions as F, Row
from pyspark.sql.dataframe import DataFrame
from delta.tables import DeltaTable
import re
from pyspark.errors import AnalysisException
from datetime import datetime

In [0]:
def log_audit(
    job_name,
    job_run_id,
    status,
    error_details="",
    start_time=None,
    end_time=None,
    audit_table="mdf2.bronze.audit"
):
    """Append one row describing a job run to the audit table.

    The row is written with a positional ``INSERT INTO ... VALUES`` statement
    executed through the ambient ``spark`` session, so the value order below
    must match the column order of ``audit_table``.

    Args:
        job_name: Name of the job; rendered with ``str()``.
        job_run_id: Identifier of the run; rendered with ``str()``.
        status: Status text for the run; rendered with ``str()``.
        error_details: Free-form error text. May contain quotes or
            backslashes (e.g. exception messages); these are escaped.
        start_time: Value rendered with ``str()`` and passed to
            ``try_cast(... AS TIMESTAMP)``. Anything that does not parse as a
            timestamp (including the default ``None``, which renders as the
            text ``None``) is stored as NULL.
        end_time: Same handling as ``start_time``.
        audit_table: Fully qualified name of the target table. Interpolated
            as an identifier, NOT escaped — pass trusted values only.

    Returns:
        None.
    """

    def _sql_text(value):
        # Escape for a single-quoted Spark SQL string literal. Backslashes are
        # doubled first so the backslash added in front of each quote is not
        # itself doubled afterwards.
        # NOTE(review): relies on Spark's default backslash-escape handling
        # (spark.sql.parser.escapedStringLiterals=false) — confirm the cluster
        # does not override it.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    query = f"""
    INSERT INTO
        {audit_table}
    VALUES
    (
        '{_sql_text(job_name)}',
        '{_sql_text(job_run_id)}',
        '{_sql_text(status)}',
        '{_sql_text(error_details)}',
        try_cast('{_sql_text(start_time)}' AS TIMESTAMP),
        try_cast('{_sql_text(end_time)}' AS TIMESTAMP)
    )
    """
    spark.sql(query)

In [0]:
def create_metadata_tables(catalog, schema):
    """Issue the DDL for the ``file_master_config`` and ``audit`` tables.

    Both tables are created under ``{catalog}.{schema}`` through the ambient
    ``spark`` session, config table first.

    Args:
        catalog: Catalog name interpolated into both table names.
        schema: Schema name interpolated into both table names.

    NOTE(review): the config table is created with CREATE OR REPLACE while the
    audit table uses IF NOT EXISTS, so calling this again recreates
    ``file_master_config`` but leaves an existing ``audit`` table alone —
    confirm that asymmetry is intended.
    """
    config_ddl = f"""
        CREATE OR REPLACE TABLE {catalog}.{schema}.file_master_config (
            RootFolder STRING,
            TargetCatalog STRING,
            TargetSchema STRING,
            TargetTable STRING,
            FileSearchTerm STRING,
            BaseUrl STRING,
            WatermarkColumn STRING,
            LPWatermarkValue TIMESTAMP,
            EnableFlag BOOLEAN,
            ColumnList STRING
        );
    """

    audit_ddl = f"""
        CREATE TABLE IF NOT EXISTS {catalog}.{schema}.audit (
            JobName STRING,
            JobRunID STRING,
            Status STRING,
            ErrorDetails STRING,
            StartTime TIMESTAMP,
            EndTime TIMESTAMP
        );
    """

    # Order matters only in that it mirrors the original: config, then audit.
    for statement in (config_ddl, audit_ddl):
        spark.sql(statement)

In [0]:
def upsert_file_master_config(
    rootFolder: str,
    targetCatalog: str,
    targetSchema: str,
    targetTable: str,
    fileSearchTerm: str,
    baseUrl: str,
    watermarkColumn: str,
    lpWatermarkValue: str,
    enableFlag: str,
    columnList: str,
    catalog: str,
    schema: str,
) -> dict:
    """Insert or update one row of ``{catalog}.{schema}.file_master_config``.

    A single-row DataFrame is built from the arguments and merged into the
    Delta table. Rows are matched on ``RootFolder`` and ``BaseUrl``; a matched
    row is updated only when at least one of the remaining columns differs,
    and an unmatched row is inserted.

    Args:
        rootFolder: Value for ``RootFolder`` (merge key).
        targetCatalog: Value for ``TargetCatalog``.
        targetSchema: Value for ``TargetSchema``.
        targetTable: Value for ``TargetTable``.
        fileSearchTerm: Value for ``FileSearchTerm``.
        baseUrl: Value for ``BaseUrl`` (merge key).
        watermarkColumn: Column name; spaces and ``/ ( ) &`` are removed and
            ``-`` becomes ``_``. ``None`` is stored as the text ``"None"``.
        lpWatermarkValue: Timestamp text in ``%Y-%m-%d %H:%M:%S`` format.
            ``None`` (or an already-parsed datetime) is passed through as is.
        enableFlag: Flag text. ``""``, ``false``, ``0``, ``no``, ``n``, ``f``
            and ``off`` (case-insensitive, surrounding whitespace ignored)
            mean False; any other string means True. Non-string values are
            converted with ``bool()``.
        columnList: Value for ``ColumnList``.
        catalog: Catalog that holds the config table.
        schema: Schema that holds the config table.

    Returns:
        The result of ``get_record_stats(catalog, schema, "file_master_config")``.

    Raises:
        ValueError: If ``lpWatermarkValue`` is a string in another format.
    """
    # Deliberately NOT named `schema`: reusing that name overwrote the schema
    # argument, which is still needed for the table name and the stats call.
    row_schema = T.StructType(
        [
            T.StructField("RootFolder", T.StringType(), True),
            T.StructField("TargetCatalog", T.StringType(), True),
            T.StructField("TargetSchema", T.StringType(), True),
            T.StructField("TargetTable", T.StringType(), True),
            T.StructField("FileSearchTerm", T.StringType(), True),
            T.StructField("BaseUrl", T.StringType(), True),
            T.StructField("WatermarkColumn", T.StringType(), True),
            T.StructField("LPWatermarkValue", T.TimestampType(), True),
            T.StructField("EnableFlag", T.BooleanType(), True),
            T.StructField("ColumnList", T.StringType(), True),
        ]
    )

    if watermarkColumn is not None:
        watermarkColumn = re.sub(r"[ /()&]", "", watermarkColumn).replace("-", "_")
    else:
        # NOTE(review): stores the text "None" rather than NULL. Kept as-is
        # because readers of this table may compare against that text —
        # confirm before changing.
        watermarkColumn = str(None)

    # Only text needs parsing; the column is nullable, so None is legitimate.
    if isinstance(lpWatermarkValue, str):
        lpWatermarkValue = datetime.strptime(lpWatermarkValue, "%Y-%m-%d %H:%M:%S")

    # bool("false") is True, so text flags are interpreted explicitly. Only
    # clearly negative spellings map to False, so strings that were previously
    # treated as enabled stay enabled.
    if isinstance(enableFlag, str):
        is_enabled = enableFlag.strip().lower() not in {
            "", "false", "0", "no", "n", "f", "off",
        }
    else:
        is_enabled = bool(enableFlag)

    data = [
        Row(
            rootFolder,
            targetCatalog,
            targetSchema,
            targetTable,
            fileSearchTerm,
            baseUrl,
            watermarkColumn,
            lpWatermarkValue,
            is_enabled,
            columnList,
        ),
    ]

    df = spark.createDataFrame(data, row_schema)

    keyColumns = [
        "RootFolder",
        "BaseUrl",
    ]

    updateColumns = [
        "TargetCatalog",
        "TargetSchema",
        "TargetTable",
        "FileSearchTerm",
        "WatermarkColumn",
        "LPWatermarkValue",
        "EnableFlag",
        "ColumnList",
    ]

    valuesColumns = {
        f"target.{col}": f"source.{col}" for col in keyColumns + updateColumns
    }

    setColumns = {f"target.{col}": f"source.{col}" for col in updateColumns}

    match_condition = " and ".join(
        f"target.{c} = source.{c}" for c in keyColumns
    )

    # `!=` evaluates to NULL when either side is NULL, which would skip the
    # update for any column currently holding NULL. `<=>` is null-safe
    # equality, so NOT (a <=> b) is true exactly when the values differ.
    changed_condition = " or ".join(
        f"not (target.`{c}` <=> source.`{c}`)" for c in updateColumns
    )

    delta_table = DeltaTable.forName(
        spark, f"{catalog}.{schema}.file_master_config"
    )

    (
        delta_table.alias("target")
        .merge(df.alias("source"), match_condition)
        .whenMatchedUpdate(changed_condition, set=setColumns)
        .whenNotMatchedInsert(values=valuesColumns)
        .execute()
    )

    return get_record_stats(catalog, schema, "file_master_config")