In [1]:
# Synapse SQL endpoint and database that the metadata queries run against.
SERVER = "syn-dp-dev-swn-001.sql.azuresynapse.net"
DATABASE = "Synapse_Data_Warehouse"

# Both storage locations live in the same abfss container, so the root is spelled once.
ADLS_ROOT = "abfss://fs-adl-dp-dev-swn-001@sadpdevswn002.dfs.core.windows.net"
TEMP_FOLDER = f"{ADLS_ROOT}/temp"

# Output prefix for the Parquet extracts. The trailing slash is intentional:
# dataset names are appended directly to this value.
TARGET_FOLDER = f"{ADLS_ROOT}/enterprisemetrics/parquet/"

# Misspelled legacy name, kept as an alias because existing cells still reference it.
TARGET_FODER = TARGET_FOLDER

In [2]:
from shared_utils.process_spark_base import ProcessSparkBase
# Shared helper bound to the server and database configured above; later cells
# use its read_dwh() method to run SQL and get a DataFrame back.
# NOTE(review): TEMP_FOLDER is presumably the staging area the helper uses when
# reading from Synapse — confirm against shared_utils.process_spark_base.
utils = ProcessSparkBase(SERVER, DATABASE, TEMP_FOLDER)

In [3]:
# Schema allow-list shared by every schema-scoped catalog query below.
# It is defined once so the queries cannot drift apart when the scope changes.
# The predicate expects the sys.schemas row to be aliased as `s`.
# NOTE(review): the literals mix upper and mixed case ('DBO', '%Cadence%'), so
# matching relies on a case-insensitive database collation — confirm.
SCHEMA_FILTER = """(
            s.name IN ('DBO', 'CONSUMPTION_POWERBI')
            OR s.name LIKE '%FINANCE%'
            OR s.name LIKE '%PRIMA%'
            OR s.name LIKE '%COGNOS%'
            OR s.name LIKE '%STARTUP%'
            OR s.name LIKE '%ENTERPRISEMETRIC%'
            OR s.name LIKE '%Cadence%'
        )"""

# User tables, views and stored procedures (sys.objects types U / V / P) in the
# in-scope schemas, with their creation and last-modified timestamps.
QUERY_OBJECTS = f"""
    SELECT
        o.object_id,
        s.name AS schema_name,
        o.name AS object_name,
        CASE o.type
            WHEN 'U' THEN 'Table'
            WHEN 'V' THEN 'View'
            WHEN 'P' THEN 'Stored Procedure'
        END AS object_type,
        o.create_date,
        o.modify_date
    FROM sys.objects o
    JOIN sys.schemas s ON o.schema_id = s.schema_id
    WHERE o.type IN ('U', 'V', 'P')
        AND o.is_ms_shipped = 0
        AND {SCHEMA_FILTER}
"""

# Object-to-object references recorded in sys.sql_expression_dependencies.
# Rows missing either object id are dropped; this query is not schema-filtered.
QUERY_DEPENDENCIES = """
    SELECT
        d.referencing_id AS referencing_object_id,
        d.referenced_id AS referenced_object_id,
        d.referenced_schema_name,
        d.referenced_entity_name
    FROM sys.sql_expression_dependencies d
    WHERE d.referencing_id IS NOT NULL
        AND d.referenced_id IS NOT NULL
"""

# SQL definition text from sys.sql_modules for objects in the in-scope schemas.
# Unlike QUERY_OBJECTS there is no filter on o.type here.
QUERY_DEFINITIONS = f"""
    SELECT
        m.object_id,
        o.name AS object_name,
        s.name AS schema_name,
        m.definition
    FROM sys.sql_modules m
    JOIN sys.objects o ON m.object_id = o.object_id
    JOIN sys.schemas s ON o.schema_id = s.schema_id
    WHERE o.is_ms_shipped = 0
        AND {SCHEMA_FILTER}
"""

# Distinct command text (first 4000 characters) of completed requests submitted
# in the last 7 days whose text begins with SELECT / INSERT / UPDATE / MERGE
# followed by a space.
QUERY_LOGS = """
    SELECT DISTINCT
        SUBSTRING(r.command, 1, 4000) AS command_text
    FROM sys.dm_pdw_exec_requests r
    WHERE r.submit_time >= DATEADD(day, -7, GETDATE())
        AND r.status = 'Completed'
        AND (
            r.command LIKE 'SELECT %'    -- Shows READ dependencies
            OR r.command LIKE 'INSERT %' -- Shows WRITE dependencies
            OR r.command LIKE 'UPDATE %' -- Shows READ + WRITE
            OR r.command LIKE 'MERGE %'  -- Shows READ + WRITE (upsert)
        )
"""

# One row per column of every user table in the in-scope schemas: type name,
# length / precision / scale, nullability and column_id.
QUERY_TABLE_COLUMNS = f"""
    SELECT
        o.object_id,
        s.name AS schema_name,
        o.name AS table_name,
        c.name AS column_name,
        t.name AS data_type,
        c.max_length,
        c.precision,
        c.scale,
        c.is_nullable,
        c.column_id
    FROM sys.tables o
    JOIN sys.schemas s ON o.schema_id = s.schema_id
    JOIN sys.columns c ON o.object_id = c.object_id
    JOIN sys.types t ON c.user_type_id = t.user_type_id
    WHERE o.is_ms_shipped = 0
        AND {SCHEMA_FILTER}
"""

In [4]:
def extract_to_parquet(query: str, output_filename: str) -> None:
    """Run ``query`` through ``utils.read_dwh`` and store the result as Parquet.

    The result is coalesced to a single partition and written with
    ``mode("overwrite")`` to ``TARGET_FODER`` + ``output_filename``, so anything
    already stored at that path is replaced.

    Args:
        query: SQL text handed to ``utils.read_dwh``.
        output_filename: Name of the dataset location under ``TARGET_FODER``.

    Raises:
        ValueError: If ``output_filename`` is empty, blank, or only slashes. The
            write would otherwise target ``TARGET_FODER`` itself in overwrite
            mode and replace every dataset stored beneath it.
    """
    dataset_name = (output_filename or "").strip("/")
    # Fail before the warehouse query runs: nothing is read or written for a bad name.
    if not dataset_name.strip():
        raise ValueError(
            "output_filename must name a dataset under the target folder; "
            f"got {output_filename!r}"
        )

    # Join with exactly one slash so the result does not depend on whether the
    # configured prefix carries a trailing slash.
    output_path = f"{TARGET_FODER.rstrip('/')}/{dataset_name}"

    df = utils.read_dwh(query)
    # coalesce(1) collapses the result to one partition before writing.
    df.coalesce(1).write.mode("overwrite").parquet(output_path)

In [5]:
# Export each metadata query to its own dataset, in the order listed.
for _query, _dataset in (
    (QUERY_OBJECTS, "objects"),
    (QUERY_DEPENDENCIES, "dependencies"),
    (QUERY_DEFINITIONS, "definitions"),
    (QUERY_LOGS, "query_logs"),
    (QUERY_TABLE_COLUMNS, "table_columns"),
):
    extract_to_parquet(_query, _dataset)