In [0]:
%sql
CREATE OR REPLACE TEMPORARY FUNCTION to_col_json_str(
  col_name    STRING,
  data_type   STRING,
  sensitive   STRING,
  description STRING
)
RETURNS STRING
/*
  Serialises one column's metadata as a JSON object string built from the
  keys name, values, dataType, sensitive and description.
  `values` is always passed as NULL.
  TO_JSON over a struct supersedes an earlier hand-assembled CONCAT version
  of this function.
*/
RETURN TO_JSON(
  NAMED_STRUCT(
    'name',        col_name,
    'values',      NULL,
    'dataType',    data_type,
    'sensitive',   sensitive,
    'description', description
  )
)

In [0]:
from pyspark.sql import functions as F

# Catalog/schema whose columns are described, and the dataset id used by the
# API request further down.
CATALOG = "4_prod"
SCHEMA = "pacs"
DATASET_ID = "1491"

# Column-level tags from the information schema.
col_tags = spark.table("system.information_schema.column_tags")

# Split the tags into the two governance tags of interest; the tag name is
# compared case-insensitively.
tag_name_lower = F.lower(F.col("tag_name"))
ig_risk = col_tags.where(tag_name_lower == "ig_risk")
ig_severity = col_tags.where(tag_name_lower == "ig_severity")

# Columns that together identify a single tagged column.
tag_keys = ["catalog_name", "schema_name", "table_name", "column_name"]

risk_value = F.col("r.tag_value")
severity_value = F.col("s.tag_value")

# Sensitivity rule, evaluated top to bottom:
#   1. ADC_UPDT columns are never sensitive;
#   2. a missing risk or severity value leaves the flag NULL;
#   3. risk >= 3 or severity >= 2 marks the column sensitive;
#   4. anything else is not sensitive.
sensitive_flag = (
    F.when(F.upper(F.col("r.column_name")) == "ADC_UPDT", F.lit(False))
    .when(risk_value.isNull() | severity_value.isNull(), F.lit(None))
    .when(risk_value.cast("int") >= 3, F.lit(True))
    .when(severity_value.cast("int") >= 2, F.lit(True))
    .otherwise(F.lit(False))
)

# Pair each column's risk tag with its severity tag; only columns carrying
# both tags survive the inner join.
ig_sensitive = (
    ig_risk.alias("r")
    .join(
        ig_severity.alias("s"),
        on=[F.col(f"r.{key}") == F.col(f"s.{key}") for key in tag_keys],
        how="inner",
    )
    .select(
        *[F.col(f"r.{key}") for key in tag_keys],
        risk_value.alias("ig_risk"),
        severity_value.alias("ig_severity"),
        sensitive_flag.alias("sensitive"),
    )
)

# Column metadata from the information schema.
columns = spark.table("system.information_schema.columns")

# information_schema.columns and the tag-derived frame name their key columns
# differently, so the pairs are spelled out explicitly.
column_to_tag_keys = [
    ("table_catalog", "catalog_name"),
    ("table_schema", "schema_name"),
    ("table_name", "table_name"),
    ("column_name", "column_name"),
]
sensitivity_join = [
    F.col(f"c.{column_key}") == F.col(f"s.{tag_key}")
    for column_key, tag_key in column_to_tag_keys
]

# Keep only columns of the configured catalog and schema.
in_target_schema = (
    (F.col("c.table_catalog") == CATALOG)
    & (F.col("c.table_schema") == SCHEMA)
    # Uncomment the next line to filter table_name with 'omop_%'
    # & (F.col("c.table_name").like("omop_%"))
)

# Left join so that columns without sensitivity tags are kept, with NULL
# ig_risk / ig_severity / sensitive.
col_df = (
    columns.alias("c")
    .join(ig_sensitive.alias("s"), on=sensitivity_join, how="left")
    .where(in_target_schema)
    .select(
        "c.table_catalog",
        "c.table_schema",
        "c.table_name",
        "c.column_name",
        "c.ordinal_position",
        F.col("c.data_type").alias("dataType"),
        "s.ig_risk",
        "s.ig_severity",
        "s.sensitive",
        F.col("c.comment").alias("column_description"),
    )
)

# Serialise each column's metadata with the to_col_json_str SQL function
# registered in the %sql cell.
#
# NULL sensitivity/description values are passed through as real NULLs.
# The function is built on TO_JSON, which never returns NULL for a non-NULL
# struct and writes a NULL field as JSON null (or omits it, depending on the
# ignoreNullFields setting). Substituting the string literal 'null' instead
# would be serialised as the JSON *string* "null" and would later parse back
# as the text "null" rather than a missing value.
col_df = col_df.withColumn(
    "json_str",
    F.expr("to_col_json_str(column_name, dataType, sensitive, column_description)"),
)

# Preview the per-column metadata; nothing is written to a table here.
display(col_df)

In [0]:
from pyspark.sql.functions import from_json

# DDL schema used to parse json_str back into a struct; every field,
# including `sensitive`, is read as STRING.
json_schema = "name STRING, values STRING, dataType STRING, sensitive STRING, description STRING"
# Not sure if sensitive is BOOLEAN or STRING

# Attach the parsed struct next to the raw JSON string and preview the result.
parsed_json = from_json(col_df["json_str"], json_schema)
col_df = col_df.withColumn("json_struct", parsed_json)
display(col_df)

In [0]:
# Number of rows whose serialised JSON string is NULL.
col_df.where("json_str IS NULL").count()

In [0]:
from pyspark.sql import functions as F

# One row per table, with its column metadata collected into an array.
#
# collect_set gives no ordering guarantee, so each struct is collected together
# with its ordinal_position and the array is sorted on that key before the
# position is dropped again. This keeps collect_set's de-duplication while
# making the column order deterministic and equal to the table's column order.
tab_df = (
    col_df.groupBy("table_name")
    .agg(
        F.sort_array(
            F.collect_set(F.struct("ordinal_position", "json_struct"))
        ).alias("ordered_columns"),
    )
    .select(
        "table_name",
        F.transform(
            "ordered_columns", lambda entry: entry["json_struct"]
        ).alias("columns"),
        # Typed placeholder: an untyped NULL literal would give the column the
        # void type instead of STRING.
        F.lit(None).cast("string").alias("description"),
    )
)

display(tab_df)

In [0]:

from pyspark.sql import functions as F

# Table-level comments for the configured catalog and schema.
in_target_schema = (F.col("table_catalog") == CATALOG) & (F.col("table_schema") == SCHEMA)
table_comments = (
    spark.table("system.information_schema.tables")
    .where(in_target_schema)
    # To restrict to OMOP tables, chain: .where(F.col("table_name").like("omop_%"))
    .select("table_name", F.col("comment").alias("table_comment"))
)

# Replace the placeholder description with the table comment. The left join
# keeps tables that have no matching comment row (description stays NULL).
tab_df = (
    tab_df.alias("t")
    .join(
        table_comments.alias("c"),
        on=F.col("t.table_name") == F.col("c.table_name"),
        how="left",
    )
    .select(
        "t.table_name",
        "t.columns",
        F.col("c.table_comment").alias("description"),
    )
)

display(tab_df)

In [0]:
import json
import requests

# Integrations endpoint for this dataset on the dev host.
# NOTE(review): api_path is not used by the request below, which targets a
# different host (api.healthdatagateway.org) - confirm which endpoint is intended.
api_path = f"https://api.dev.hdruk.cloud/api/v1/integrations/datasets/{DATASET_ID}"

# Application credentials are read from the "adc_store" secret scope.
headers = {
    "x-application-id": dbutils.secrets.get(scope="adc_store", key="hdruk_app_id"),
    "x-client-id": dbutils.secrets.get(scope="adc_store", key="hdruk_client_id"),
    "Content-Type": "application/json"
}

# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the notebook indefinitely.
response = requests.get(
    f"https://api.healthdatagateway.org/api/v1/datasets/{DATASET_ID}",
    headers=headers,
    timeout=30,
)
print(response)