In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import ( 
    col, from_json, to_timestamp, when, current_timestamp, 
    floor, rand, lit
)


bronze_path = "abfss://bronze@hospitalstorage224.dfs.core.windows.net/patient_flow"
silver_path = "abfss://silver@hospitalstorage224.dfs.core.windows.net/patient_flow"

# We can use either key or string based access to accees the azure storage account (Using key based access)
spark.conf.set( 
               "fs.azure.account.key.hospitalstorage224.dfs.core.windows.net",
  dbutils.secrets.get(scope='hospitalAnalyticsVaultScope', key='storage-connection'))

# Read from bronze
bronze_df = (
    spark.readStream
    .format('delta')
    .load(bronze_path)
)

# Define Schema
schema = StructType([
    StructField("patient_id", StringType()),
    StructField("gender", StringType()),
    StructField("age", IntegerType()),
    StructField("department", StringType()),
    StructField("admission_time", StringType()),
    StructField("discharge_time", StringType()),
    StructField("bed_id", IntegerType()),
    StructField("hospital_id", IntegerType())
])

# Parse it to Dataframe
parsed_df = bronze_df.withColumn("data", from_json(col("raw_json"), schema)).select("data.*")

# Convert type to timestamp
clean_df = parsed_df.withColumn("admission_time", to_timestamp(col("admission_time")))
clean_df = parsed_df.withColumn("discharge_time", to_timestamp(col("discharge_time")))

# Invalid admission times
clean_df = clean_df.withColumn("admission_time", 
    when(col("admission_time").isNull() | (col("admission_time") > current_timestamp()), 
         current_timestamp())
    .otherwise(col("admission_time")))

# Handle invalid ages
clean_df = clean_df.withColumn('age', 
    when((col("age").isNull()) | (col("age") < 0) | (col("age") > 100), 
         floor(rand()*90+1).cast(IntegerType()))
    .otherwise(col("age")))

# Expected columns
expected_cols = ['patient_id', 'gender','age','department','admission_time','discharge_time','bed_id','hospital_id']

for col_name in expected_cols:
    if col_name not in clean_df.columns:
        clean_df = clean_df.withColumn(col_name, lit(None))
        #raise Exception(f"Column {col_name} not found in dataframe")   # raise exception
        
# Write to silver
(
    clean_df.writeStream
    .format('delta')
    .outputMode('append')
    .option('mergeSchema','true')
    .option('checkpointLocation', silver_path + '/_checkpoints')
    .start(silver_path))

In [0]:
df_silver=spark.read.format('delta').load(silver_path)
display(df_silver.limit(10))

patient_id,gender,age,department,admission_time,discharge_time,bed_id,hospital_id
fc4316db-2a00-459e-85e7-94c1c67d3f1f,Female,46,Pediatrics,2025-12-12T06:17:23.777843,2025-12-15T05:17:23.777843Z,44,1
50e355e1-ed9c-4e2c-92c8-601ad10bd474,Other,50,Pediatrics,2025-12-13T16:16:14.201565,2025-12-15T00:16:14.201565Z,404,1
d94adab0-193f-4eb2-a173-42f6f0431dad,Female,14,Oncology,2025-12-13T13:16:20.289204,2025-12-16T10:16:20.289204Z,20,1
7fc72f0a-5397-470b-95a4-571f7ea0f030,Other,69,Surgery,2025-12-14T04:16:40.355214,2025-12-15T02:16:40.355214Z,40,4
eb82cf21-c0f2-4390-9b9b-7a0cd54a5eea,Female,53,ICU,2025-12-13T07:16:11.304313,2025-12-14T18:16:11.304313Z,353,6
4e40bee7-8b44-4b63-ae8d-c777d84cc33a,Male,78,ICU,2025-12-12T16:16:51.882245,2025-12-14T04:16:51.882245Z,157,7
98d8c5c1-87f3-4009-baf8-d259cbbb0f53,Male,59,ICU,2025-12-11T23:17:38.536794,2025-12-13T05:17:38.536794Z,284,1
d74a521b-023e-42d4-876e-abc9115bb4ed,Male,35,,2025-12-14T06:16:34.426526,2025-12-15T07:16:34.426526Z,408,2
cde79e04-41dc-4f1c-948e-44e1ba0a19a1,Male,86,ICU,2025-12-14 17:15:46.011,2025-12-14T16:15:41.133496Z,41,3
5212ffbc-2cf8-4faa-868e-3ee031848a47,Other,86,Emergency,2025-12-12T14:15:43.456302,2025-12-12T20:15:43.456302Z,305,7
