In [0]:
%sql
-- Serialises the metadata of one column into a JSON object string with the
-- keys name, values, dataType, sensitive and description.
--   col_name, data_type, description: copied into the JSON as strings.
--   sensitive: declared STRING, so it is emitted as a JSON string (for example
--              "true"), not as a JSON boolean.
--   values: always passed as null.
-- NOTE(review): TO_JSON is believed to omit null struct fields by default, so
-- null arguments (and the values key) may be absent from the output - confirm.
CREATE OR REPLACE TEMPORARY FUNCTION to_col_json_str(
  col_name STRING, data_type STRING, sensitive STRING, description STRING
)
RETURNS STRING
/*
Earlier hand-concatenated implementation, kept for reference only. It is
replaced by the TO_JSON(named_struct(...)) body below.
RETURN CONCAT(
  '{', '''name'':''', col_name, ''',',
  '''values'':null,',
  '''dataType'':''', data_type, ''',',
  '''sensitive'':', sensitive, ',',
  '''description'':''', description, '''','}')*/

RETURN TO_JSON(named_struct(
  'name', col_name,
  'values', null,
  'dataType', data_type,
  'sensitive', sensitive,
  'description', description))

In [0]:
from pyspark.sql import functions as F

# --- Notebook configuration -------------------------------------------------
# Catalog and schema whose tables/columns are read from
# system.information_schema to build the structural metadata.
CATALOG = "4_prod"
SCHEMA = "pacs"
# Schema version looked up in the TRASER validation response.
HDRUK_SCHEMA_VERSION = "3.0.0"
# Gateway dataset id; interpolated into the API URLs and the metadata template.
DATASET_ID = "1491"
# Values substituted for the ###DATASET_TITLE### / ###DATASET_ABSTRACT###
# placeholders of the metadata template.
DATASET_TITLE = "Barts Imaging Dataset"
DATASET_ABSTRACT = "Barts Health NHS Trust Imaging Metadata and Report Dataset"

# Column-level tags as exposed by system.information_schema.
col_tags = spark.table("system.information_schema.column_tags")

# One DataFrame per tag of interest; tag names are matched case-insensitively.
tag_name_lower = F.lower(F.col("tag_name"))
ig_risk = col_tags.filter(tag_name_lower == "ig_risk")
ig_severity = col_tags.filter(tag_name_lower == "ig_severity")

# Columns that together identify a single tagged column.
tag_key_cols = ["catalog_name", "schema_name", "table_name", "column_name"]
risk_value = F.col("r.tag_value")
severity_value = F.col("s.tag_value")

# Sensitivity rule, evaluated top to bottom:
#   1. ADC_UPDT columns are never sensitive.
#   2. A missing risk or severity value gives NULL (unknown).
#   3. risk >= 3 or severity >= 2 gives True.
#   4. Anything else gives False.
sensitive_flag = (
    F.when(F.upper(F.col("r.column_name")) == "ADC_UPDT", F.lit(False))
    .when(risk_value.isNull() | severity_value.isNull(), F.lit(None))
    .when(risk_value.cast("int") >= 3, F.lit(True))
    .when(severity_value.cast("int") >= 2, F.lit(True))
    .otherwise(F.lit(False))
)

# Inner join: only columns carrying BOTH tags appear in ig_sensitive.
ig_sensitive = ig_risk.alias("r").join(
    ig_severity.alias("s"),
    on=[F.col(f"r.{key}") == F.col(f"s.{key}") for key in tag_key_cols],
    how="inner",
).select(
    *[F.col(f"r.{key}") for key in tag_key_cols],
    risk_value.alias("ig_risk"),
    severity_value.alias("ig_severity"),
    sensitive_flag.alias("sensitive"),
)

# Column catalogue from system.information_schema.
columns = spark.table("system.information_schema.columns")

# How a catalogue column ("c") lines up with its sensitivity row ("s").
column_matches_tag = [
    F.col("c.table_catalog") == F.col("s.catalog_name"),
    F.col("c.table_schema") == F.col("s.schema_name"),
    F.col("c.table_name") == F.col("s.table_name"),
    F.col("c.column_name") == F.col("s.column_name"),
]

# Keep only the configured catalog/schema.
in_target_schema = (
    (F.col("c.table_catalog") == CATALOG) &
    (F.col("c.table_schema") == SCHEMA)
)
# Uncomment the next line to keep only tables whose name is like 'omop_%'
# in_target_schema = in_target_schema & F.col("c.table_name").like("omop_%")

# Left join: columns without both IG tags are kept, with NULL ig_risk,
# ig_severity and sensitive.
col_df = (
    columns.alias("c")
    .join(ig_sensitive.alias("s"), on=column_matches_tag, how="left")
    .where(in_target_schema)
    .select(
        "c.table_catalog",
        "c.table_schema",
        "c.table_name",
        F.col("c.column_name").alias("column_name"),
        "c.ordinal_position",
        F.col("c.data_type").alias("dataType"),
        "s.ig_risk",
        "s.ig_severity",
        "s.sensitive",
        F.col("c.comment").alias("column_description"),
    )
)

# Build the per-column JSON text with the to_col_json_str SQL function.
# A NULL `sensitive` or `column_description` is replaced by the literal text
# 'null' before the call; `sensitive` is passed as its string form otherwise.
# NOTE(review): because the function takes STRING arguments, the resulting JSON
# carries the *string* "null" (not a JSON null) for those fields - confirm this
# is what the downstream schema expects.
json_str_expr = F.expr(
    "to_col_json_str("
    "column_name, "
    "dataType, "
    "coalesce(cast(sensitive AS STRING), 'null'), "
    "coalesce(column_description, 'null'))"
)

col_df = col_df.withColumn("json_str", json_str_expr)

display(col_df)

In [0]:
from pyspark.sql.functions import from_json

# DDL schema used to parse json_str back into a struct. Every field is read as
# STRING.
# Not sure if sensitive is BOOLEAN or STRING
json_schema = "name STRING, values STRING, dataType STRING, sensitive STRING, description STRING"

# Parse each JSON string and attach the result as the json_struct column.
parsed_column_struct = from_json(col_df["json_str"], json_schema)
col_df = col_df.withColumn("json_struct", parsed_column_struct)
display(col_df)

In [0]:
# Number of rows for which json_str is NULL
col_df.where(F.col("json_str").isNull()).count()

In [0]:
from pyspark.sql import functions as F

# One row per table: `name`, `columns` (array of column structs) and a
# placeholder `description` that is filled from the table comments later.
#
# collect_set alone returns elements in a non-deterministic order, which made
# the published column list change from run to run. Each column struct is
# therefore paired with its ordinal_position, the pairs are de-duplicated and
# sorted (sort_array orders structs by their first field), and only the column
# struct is kept. The result follows the table's own column order.
tab_df = (
    col_df.groupBy("table_name")
    .agg(
        F.sort_array(
            F.collect_set(F.struct("ordinal_position", "json_struct"))
        )
        .getField("json_struct")
        .alias("columns"),
    )
    .withColumn("description", F.lit(None))
    .withColumnRenamed("table_name", "name")
)

display(tab_df)

In [0]:
from pyspark.sql import functions as F

# Table-level comments for the configured catalog/schema.
# (Add `& F.col("table_name").like("omop_%")` to the predicate if only the
# omop_ tables are wanted.)
schema_tables = spark.table("system.information_schema.tables")
table_comments = schema_tables.where(
    (F.col("table_catalog") == CATALOG) &
    (F.col("table_schema") == SCHEMA)
).select(
    "table_name",
    F.col("comment").alias("table_comment"),
)

# Replace the placeholder description with the table comment. The left join
# keeps tables that have no matching comment row (description stays NULL).
tab_df = (
    tab_df.alias("t")
    .join(
        table_comments.alias("c"),
        on=F.col("t.name") == F.col("c.table_name"),
        how="left",
    )
    .select(
        "t.name",
        "t.columns",
        F.col("c.table_comment").alias("description"),
    )
)

display(tab_df)

In [0]:
import json
import requests

# Fetch the dataset record currently published on the gateway.
# NOTE(review): `api_path` points at the dev integrations API but is not used by
# the request below, which reads from api.healthdatagateway.org - confirm which
# environment is intended.
api_path = f"https://api.dev.hdruk.cloud/api/v1/integrations/datasets/{DATASET_ID}"
headers = {
    # Credentials come from the Databricks secret scope, never from literals.
    "x-application-id": dbutils.secrets.get(scope="adc_store", key="hdruk_app_id"),
    "x-client-id": dbutils.secrets.get(scope="adc_store", key="hdruk_client_id"),
    "Content-Type": "application/json"
}
response = requests.get(
    f"https://api.healthdatagateway.org/api/v1/datasets/{DATASET_ID}",
    headers=headers,
    # Without a timeout, requests waits forever on an unresponsive server.
    timeout=30,
)
print(response)

In [0]:
# Decode the body of the gateway response into Python objects and show it.
response_body_text = response.text
response_json = json.loads(response_body_text)
response_json

In [0]:
# Version string stored in the first entry of the dataset's `versions` list.
version_nbr_str = response_json["data"]["versions"][0]["metadata"]["metadata"]["required"]["version"]
version_nbrs = [int(part) for part in version_nbr_str.split(".")]

# Bump rule: the third component is incremented until it has reached 12; at
# that point it resets to 0 and the second component is incremented instead.
# The first component is never changed.
major_nbr, minor_nbr, patch_nbr = version_nbrs[0], version_nbrs[1], version_nbrs[2]
if patch_nbr >= 12:
    next_minor_nbr, next_patch_nbr = minor_nbr + 1, 0
else:
    next_minor_nbr, next_patch_nbr = minor_nbr, patch_nbr + 1

new_version_nbr_str = f"{major_nbr}.{next_minor_nbr}.{next_patch_nbr}"
print("old:", version_nbr_str)
print("new:", new_version_nbr_str)

In [0]:
# TODO: Change this template
# JSON text of the metadata document that is validated against the HDRUK schema.
# Tokens of the form ###NAME### are placeholders that later cells fill in with
# str.replace:
#   ###VER_NUM_STR###        new version number
#   ###CURR_DATE_STR###      current date
#   ###DATASET_ID###         DATASET_ID
#   ###DATASET_TITLE###      DATASET_TITLE
#   ###DATASET_ABSTRACT###   DATASET_ABSTRACT
#   ###PATIENT_COUNT_INT###  rounded patient count (unquoted, so the text is
#                            not valid JSON until it has been substituted)
# The "structuralMetadata.tables" entries below are sample values; that list is
# overwritten with the rows of tab_df after the text has been parsed.
# NOTE(review): several literal values (keywords, conformsTo, the observation
# description) may have been carried over from another dataset - confirm.
hdruk_300_str = \
'''
{
    "identifier": "https://web.www.healthdatagateway.org/f948711f-b176-44e4-b57e-5776997a2e75",
    "version": "###VER_NUM_STR###",
    "issued": "2024-11-26T00:00:00.000Z",
    "modified": "###CURR_DATE_STR###T00:00:00.000Z",
    "revisions": [{"url": "https://web.dev.hdruk.cloud//dataset/###DATASET_ID###?version=1.0.0","version": "1.0.0"}],
    "summary": {
        "title": "###DATASET_TITLE###",
        "abstract": "###DATASET_ABSTRACT###",
        "contactPoint": "BartsHealth.ResearchDataRequest@nhs.net",
        "keywords": ["Hospital Inpatient data","Outpatient","Pathology","Radiology","Maternity","Critical Care","Pharmacy"],
        "alternateIdentifiers": null,
        "doiName": null,
        "populationSize": ###PATIENT_COUNT_INT###,
        "dataCustodian": {
            "identifier": "https://ror.org/00b31g692",
            "name": "Barts Health NHS Trust",
            "logo": "https://media.prod.hdruk.cloud/teams/nhs-barts-health.jpg",
            "description": null,
            "contactPoint": "BartsHealth.ResearchDataRequest@nhs.net",
            "memberOf": "Alliance"
        }
    },
    "documentation": {
        "description": "Barts Health NHS Imaging Metadata and Report Dataset",
        "associatedMedia": null,
        "inPipeline": null
    },
    "coverage": {
        "spatial": "United Kingdom,England",
        "followUp": null,
        "pathway": null,
        "typicalAgeRangeMin": 0,
        "typicalAgeRangeMax": 150,
        "datasetCompleteness": null,
        "materialType": [
            "None/not available"
        ]
    },
    "provenance": {
        "origin": {
            "purpose": [
                "Other",
                "Administrative"
            ],
            "source": [
                "EPR"
            ],
            "datasetType": ["Health and disease"],
            "datasetSubType": null,
            "collectionSource": null,
            "imageContrast": null
        },
        "temporal": {
            "distributionReleaseDate": null,
            "startDate": "2008-01-01",
            "endDate": null,
            "timeLag": "Variable",
            "publishingFrequency": "Daily"
        }
    },
    "accessibility": {
        "usage": {
            "dataUseLimitation": ["General research use"],
            "dataUseRequirements": ["Ethics approval required","Project-specific restrictions","User-specific restriction"],
            "resourceCreator": "Barts Health"
        },
        "access": {
            "accessRights": null,
            "accessService":"Barts Health has a secure data environment since 2024. Projects requiring access to data can make an application on the Data Portal (data.bartshealth.nhs.uk).",
            "accessRequestCost": "Cost Recovery Model",
            "deliveryLeadTime": null,
            "jurisdiction": ["GB-ENG"],
            "dataProcessor": null,
            "dataController": "Barts Health",
            "accessServiceCategory": null
        },
        "formatAndStandards": {
            "vocabularyEncodingScheme": [
            ],
            "conformsTo": [
                "OMOP"
            ],
            "language": ["en"],
            "format": ["CSV"]
        }
    },
    "enrichmentAndLinkage": {
        "tools": null,
        "derivedFrom": null,
        "isPartOf": null,
        "linkableDatasets": null,
        "similarToDatasets": null,
        "publicationAboutDataset": null,
        "investigations": null,
        "publicationUsingDataset": null
    },
    "observations": [
        {
            "observedNode": "Persons",
            "measuredValue": ###PATIENT_COUNT_INT###,
            "measuredProperty": "COUNT",
            "observationDate": "###CURR_DATE_STR###",
            "disambiguatingDescription": "Total number of distinct PERSON_ID in the OMOP_PERSON table"
        }
    ],
    "structuralMetadata": {
        "tables": [
            {
                "name": "IAPT.iapt.Rep_Referral",
                "description": "IAPT.iapt.Rep_Referral",
                "columns": [
                    {
                        "name": "Count of number of Non-guided Self Help (Computer) sessions (derived)",
                        "description": "Count of number of Non-guided Self Help (Computer) sessions (derived)",
                        "dataType": "Number",
                        "sensitive": false,
                        "values": null
                    }
                ]
            },
            {
                "name": "IAPT.iapt.Rep_Referral",
                "description": "IAPT.iapt.Rep_Referral",
                "columns": [
                    {
                        "name": "Pseudonymised Service Request Identifier",
                        "description": "A request for the provision of care services to a PATIENT.",
                        "dataType": "String",
                        "sensitive": false,
                        "values": null
                    }
                ]
            }
        ],
        "syntheticDataWebLink": null
    },
    "demographicFrequency": null,
    "omics": null
}
'''

In [0]:

# TODO: Parameterise this
# Distinct PERSON_ID count from the OMOP person table.
patient_count_query = "SELECT COUNT(DISTINCT PERSON_ID) AS patientcount FROM 4_prod.dlt.omop_person"
patientcount = spark.sql(patient_count_query).first()["patientcount"]

# Round to the nearest 100,000 before publishing (Python's round() sends exact
# halves to the even multiple, and counts below 50,000 become 0).
patientcount = round(patientcount, -5)
print(patientcount)

# Fill every ###PATIENT_COUNT_INT### placeholder of the template.
hdruk_300_str = hdruk_300_str.replace("###PATIENT_COUNT_INT###", str(patientcount))

In [0]:
# Insert the bumped version number into the metadata template.
hdruk_300_str = hdruk_300_str.replace("###VER_NUM_STR###", new_version_nbr_str)


In [0]:
# Today's date as a string, taken from Spark's CURRENT_DATE().
current_date_row = spark.sql("SELECT CAST(CURRENT_DATE() AS STRING) AS curr_date").first()
currentdate = current_date_row["curr_date"]
print(currentdate)

# Fill every ###CURR_DATE_STR### placeholder of the template.
hdruk_300_str = hdruk_300_str.replace("###CURR_DATE_STR###", currentdate)

In [0]:
# Echo each configured value, then substitute it for its template placeholder.
dataset_placeholders = (
    ("###DATASET_ID###", DATASET_ID),
    ("###DATASET_TITLE###", DATASET_TITLE),
    ("###DATASET_ABSTRACT###", DATASET_ABSTRACT),
)
for placeholder, value in dataset_placeholders:
    print(value)
    hdruk_300_str = hdruk_300_str.replace(placeholder, value)


In [0]:
# Parse the substituted template text into a Python object. The text is not
# valid JSON while an unquoted placeholder (###PATIENT_COUNT_INT###) remains,
# so the substitution cells have to run first.
hdruk_300_json = json.loads(hdruk_300_str)

In [0]:
# One dict per table row of tab_df.
struct_metadata_dict = tab_df.toPandas().to_dict(orient="records")

# Convert each `columns` value to a plain list where it offers .tolist().
# The previous bare `except:` hid every failure, including KeyboardInterrupt;
# now only the "no .tolist()" case is tolerated, and the record index is still
# printed for it so the row can be inspected.
for i, d in enumerate(struct_metadata_dict):
    table_columns = d["columns"]
    if hasattr(table_columns, "tolist"):
        d["columns"] = table_columns.tolist()
    else:
        print(i)

# Replace the sample tables of the template with the real structural metadata.
hdruk_300_json["structuralMetadata"]["tables"] = struct_metadata_dict

In [0]:
# Display the structural-metadata tables now stored in the document.
hdruk_300_json["structuralMetadata"]["tables"]

In [0]:
import requests

# Fetch the dataset record from the gateway again (same request as the earlier
# fetch cell).
# NOTE(review): `api_path` points at the dev integrations API but is not used by
# the request below, which reads from api.healthdatagateway.org - confirm which
# environment is intended.
api_path = f"https://api.dev.hdruk.cloud/api/v1/integrations/datasets/{DATASET_ID}"
headers = {
    # Credentials come from the Databricks secret scope, never from literals.
    "x-application-id": dbutils.secrets.get(scope="adc_store", key="hdruk_app_id"),
    "x-client-id": dbutils.secrets.get(scope="adc_store", key="hdruk_client_id"),
    "Content-Type": "application/json"
}
response = requests.get(
    f"https://api.healthdatagateway.org/api/v1/datasets/{DATASET_ID}",
    headers=headers,
    # Without a timeout, requests waits forever on an unresponsive server.
    timeout=30,
)
print(response)

In [0]:
import json
# Decode the body of the latest gateway response and show it.
response_body_text = response.text
response_json = json.loads(response_body_text)
response_json

In [0]:
import json
import requests



In [0]:
def is_metadata_schema_correct(metadata_json, version="3.0.0", verbose=1, timeout=30):
    """Ask the TRASER service whether ``metadata_json`` matches an HDRUK schema.

    The document is POSTed to TRASER's ``/find`` endpoint, and the reply is
    searched for the entry named ``"HDRUK"`` with the requested version.

    Args:
        metadata_json: JSON-serialisable metadata document to validate. This is
            the payload that is sent; earlier the function sent the notebook
            global ``hdruk_300_json`` and ignored this argument.
        version: HDRUK schema version to look for in the reply.
        verbose: 0 is silent; >= 1 prints the HTTP response and the matching
            entry; >= 2 additionally dumps the whole reply.
        timeout: Seconds to wait for TRASER before ``requests`` gives up, so
            that an unresponsive service cannot hang the notebook.

    Returns:
        The ``"matches"`` value of the matching TRASER entry.

    Raises:
        ConnectionError: If TRASER answers with a status code other than 200.
        ValueError: If the reply has no ``"HDRUK"`` entry for ``version``.
    """
    headers = {
        "Content-Type": "application/json",
    }

    traser_uri = "https://hdr-gateway-traser-dev-qmnkcg5qjq-ew.a.run.app"
    response = requests.post(
        f"{traser_uri}/find?with_errors=1",
        headers=headers,
        json=metadata_json,
        timeout=timeout,
    )

    if verbose >= 1:
        print("Traser response:", response)

    if int(response.status_code) != 200:
        raise ConnectionError("Unable to receive a response.")

    # Decode the body once and reuse it for both the dump and the search.
    schema_results = response.json()

    if verbose >= 2:
        print(json.dumps(schema_results, indent=6))

    for item in schema_results:
        if item["name"] == "HDRUK" and item["version"] == version:
            if verbose >= 1:
                print(item)
            return item["matches"]
    raise ValueError("Unable to find schema in TRASER.")


In [0]:
import requests

def update_hdruk_metadata(metadata_json, dataset_id, timeout=30):
    """PUT a metadata document to the gateway integrations API for one dataset.

    Args:
        metadata_json: JSON-serialisable metadata document; it is sent wrapped
            as ``{"metadata": metadata_json}``.
        dataset_id: Gateway dataset id used to build the request URL.
        timeout: Seconds to wait for the API before ``requests`` gives up, so
            that an unresponsive server cannot hang the notebook.

    Returns:
        The ``requests`` response object, so the caller can inspect the status
        code and body (previously nothing was returned and failures could only
        be seen in the printed output).
    """
    api_path = f"https://api.healthdatagateway.org/api/v1/integrations/datasets/{dataset_id}"
    headers = {
        # Credentials come from the Databricks secret scope, never from literals.
        "x-application-id": dbutils.secrets.get(scope="adc_store", key="hdruk_app_id"),
        "x-client-id": dbutils.secrets.get(scope="adc_store", key="hdruk_client_id"),
        "Content-Type": "application/json"
    }

    response = requests.put(
        api_path,
        headers=headers,
        json={"metadata": metadata_json},
        timeout=timeout,
    )
    print("Schema update response:", response.status_code)
    return response

# Validate the assembled document against the configured schema version.
# The upload call is deliberately left commented out, so this cell only
# validates and never publishes.
schema_matches = is_metadata_schema_correct(
    hdruk_300_json, version=HDRUK_SCHEMA_VERSION, verbose=1
)
if schema_matches:
    #update_hdruk_metadata(hdruk_300_json, DATASET_ID)
    pass