In [0]:
%sql
USE CATALOG aws;

CREATE SCHEMA IF NOT EXISTS gold;

-- Dimensions / Facts as Delta (managed)
CREATE TABLE IF NOT EXISTS aws.gold.dim_workflow
USING DELTA
AS SELECT * FROM aws.silver.workflow WHERE 1=0;

CREATE TABLE IF NOT EXISTS aws.gold.fact_taskresults
USING DELTA
AS SELECT * FROM aws.silver.wftaskresults WHERE 1=0;

CREATE TABLE IF NOT EXISTS aws.gold.fact_documents
USING DELTA
AS SELECT * FROM aws.silver.wfdoc WHERE 1=0;

CREATE TABLE IF NOT EXISTS aws.gold.fact_workflow_daily
USING DELTA
AS SELECT
  CAST(NULL AS DATE) AS day,
  CAST(NULL AS STRING) AS workflow_type,
  CAST(NULL AS STRING) AS workflow_state,
  CAST(NULL AS BIGINT) AS workflows_modified,
  CAST(NULL AS BIGINT) AS workflows_completed
WHERE 1=0;


num_affected_rows,num_inserted_rows


In [0]:
%sql
MERGE INTO aws.gold.dim_workflow t
USING (
  SELECT
    workflow_id,
    workflow_version,
    outcome,
    completed_date,
    modified,
    status_page_url,
    special_state,
    workflow_state,
    workflow_type,
    workflow_name,
    coordinator_login,
    release_option,
    release_date,
    min_authors,
    min_reviewers,
    min_approvers,
    comment,
    target_completion_date,
    planned_effective_date,
    halt_on_preprocessing_finished,

    -- Derived fields
    CASE
      WHEN workflow_state IN ('Completed') THEN 'Completed'
      WHEN workflow_state IN ('Canceled', 'Cancelled') THEN 'Canceled'
      ELSE 'In Progress'
    END AS workflow_state_group,

    CASE WHEN completed_date IS NOT NULL THEN true ELSE false END AS is_completed,
    CAST(modified AS DATE) AS modified_date

  FROM aws.silver.workflow
) s
ON t.workflow_id = s.workflow_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
10,0,0,10


In [0]:
%sql
MERGE INTO aws.gold.fact_taskresults t
USING (
  SELECT
    workflow_id,
    identity_login,
    role,
    outcome,
    ts,
    signature_type,
    CAST(ts AS DATE) AS task_date
  FROM aws.silver.wftaskresults
) s
ON  t.workflow_id = s.workflow_id
AND t.identity_login = s.identity_login
AND t.role = s.role
AND t.outcome = s.outcome
AND t.ts = s.ts
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
15,0,0,15


In [0]:
%sql
MERGE INTO aws.gold.fact_documents t
USING (
  SELECT
    d.*,
    CASE WHEN s.workflow_id IS NOT NULL THEN true ELSE false END AS has_superseded_version
  FROM aws.silver.wfdoc d
  LEFT JOIN aws.silver.wfsupersededdoc s
    ON d.workflow_id = s.workflow_id
   AND d.doc_number = s.doc_number
   AND d.doc_part = s.doc_part
   AND d.doc_version = s.doc_version
   AND d.doc_type = s.doc_type
) s
ON  t.workflow_id = s.workflow_id
AND t.doc_number  = s.doc_number
AND t.doc_part    = s.doc_part
AND t.doc_version = s.doc_version
AND t.doc_type    = s.doc_type
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
23,0,0,23


In [0]:
%sql
MERGE INTO aws.gold.fact_workflow_daily t
USING (
  SELECT
    CAST(modified AS DATE) AS day,
    workflow_type,
    workflow_state,
    COUNT(*) AS workflows_modified,
    SUM(CASE WHEN completed_date IS NOT NULL THEN 1 ELSE 0 END) AS workflows_completed
  FROM aws.silver.workflow
  WHERE modified IS NOT NULL
  GROUP BY CAST(modified AS DATE), workflow_type, workflow_state
) s
ON t.day = s.day AND t.workflow_type = s.workflow_type AND t.workflow_state = s.workflow_state
WHEN MATCHED THEN UPDATE SET
  t.workflows_modified = s.workflows_modified,
  t.workflows_completed = s.workflows_completed
WHEN NOT MATCHED THEN INSERT *;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
9,0,0,9


In [0]:
%sql
SELECT COUNT(*) FROM aws.gold.dim_workflow;

SELECT day, SUM(workflows_modified) AS modified_total
FROM aws.gold.fact_workflow_daily
GROUP BY day
ORDER BY day DESC;

SELECT COUNT(*) FROM aws.gold.fact_taskresults;
SELECT COUNT(*) FROM aws.gold.fact_documents;


count(1)
23


In [0]:
%sql
-- 1) Row counts in each silver table
SELECT 'workflow' AS tbl, COUNT(*) AS rows FROM aws.silver.workflow
UNION ALL SELECT 'wfdoc', COUNT(*) FROM aws.silver.wfdoc
UNION ALL SELECT 'wfsupersededdoc', COUNT(*) FROM aws.silver.wfsupersededdoc
UNION ALL SELECT 'wfroles', COUNT(*) FROM aws.silver.wfroles
UNION ALL SELECT 'wftaskresults', COUNT(*) FROM aws.silver.wftaskresults
ORDER BY tbl;


tbl,rows
wfdoc,23
wfroles,55
wfsupersededdoc,14
wftaskresults,15
workflow,10


In [0]:
%sql
CREATE TABLE IF NOT EXISTS aws.gold.fact_roles
USING DELTA
AS SELECT * FROM aws.silver.wfroles WHERE 1=0;

MERGE INTO aws.gold.fact_roles t
USING (
  SELECT
    workflow_id,
    role,
    user_login,
    user_name,
    user_email
  FROM aws.silver.wfroles
) s
ON  t.workflow_id = s.workflow_id
AND t.role        = s.role
AND t.user_login  = s.user_login
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
55,0,0,55


In [0]:
%sql
CREATE TABLE IF NOT EXISTS aws.gold.fact_superseded_docs
USING DELTA
AS SELECT * FROM aws.silver.wfsupersededdoc WHERE 1=0;

MERGE INTO aws.gold.fact_superseded_docs t
USING aws.silver.wfsupersededdoc s
ON  t.workflow_id      = s.workflow_id
AND t.doc_number       = s.doc_number
AND t.doc_part         = s.doc_part
AND t.doc_version      = s.doc_version
AND t.doc_type         = s.doc_type
AND t.prev_doc_number  = s.prev_doc_number
AND t.prev_doc_part    = s.prev_doc_part
AND t.prev_doc_version = s.prev_doc_version
AND t.prev_doc_type    = s.prev_doc_type
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
14,0,0,14


In [0]:
(
  spark.table("aws.silver.workflow")
    .coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("s3://databricks-aryaman/xml-etl/exports/workflow/")
)


In [0]:
%sql
CREATE DATABASE workflowdb;


In [0]:
%sql
CREATE SCHEMA IF NOT EXISTS diadoc;


In [0]:
jdbc_host = "database-1.cla80g0eyp93.us-east-2.rds.amazonaws.com"
jdbc_port = 5432
jdbc_db   = "workflowdb"     # âœ… must be workflowdb
jdbc_user = "postgres"
jdbc_pwd  = "#Aryaman2004"            # (see security note below)

jdbc_url = f"jdbc:postgresql://{jdbc_host}:{jdbc_port}/{jdbc_db}?sslmode=require"

tables = [
  ("aws.silver.workflow",        "diadoc.workflow"),
  ("aws.silver.wfdoc",           "diadoc.wfdoc"),
  ("aws.silver.wfsupersededdoc", "diadoc.wfsupersededdoc"),
  ("aws.silver.wfroles",         "diadoc.wfroles"),
  ("aws.silver.wftaskresults",   "diadoc.wftaskresults"),
]

for src, tgt in tables:
    df = spark.table(src)
    (df.write
      .format("jdbc")
      .option("url", jdbc_url)
      .option("dbtable", tgt)
      .option("user", jdbc_user)
      .option("password", jdbc_pwd)
      .option("driver", "org.postgresql.Driver")
      .mode("overwrite")
      .save()
    )

print("Done exporting silver tables to RDS.")


Done exporting silver tables to RDS.
