In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import ( 
    col, from_json, to_timestamp, when, current_timestamp, 
    floor, rand, lit
)


bronze_path = "abfss://bronze@hospitalstorage224.dfs.core.windows.net/patient_flow"
silver_path = "abfss://silver@hospitalstorage224.dfs.core.windows.net/patient_flow"

# We can use either key or string based access to accees the azure storage account (Using key based access)
spark.conf.set( 
               "fs.azure.account.key.hospitalstorage224.dfs.core.windows.net",
  dbutils.secrets.get(scope='hospitalAnalyticsVaultScope', key='storage-connection'))

# Read from bronze
bronze_df = (
    spark.readStream
    .format('delta')
    .load(bronze_path)
)

# Define Schema
schema = StructType([
    StructField("patient_id", StringType()),
    StructField("gender", StringType()),
    StructField("age", IntegerType()),
    StructField("department", StringType()),
    StructField("admission_time", StringType()),
    StructField("discharge_time", StringType()),
    StructField("bed_id", IntegerType()),
    StructField("hospital_id", IntegerType())
])

# Parse it to Dataframe
parsed_df = bronze_df.withColumn("data", from_json(col("raw_json"), schema)).select("data.*")

# Convert type to timestamp
clean_df = parsed_df.withColumn("admission_time", to_timestamp(col("admission_time")))
clean_df = parsed_df.withColumn("discharge_time", to_timestamp(col("discharge_time")))

# Invalid admission times
clean_df = clean_df.withColumn("admission_time", 
    when(col("admission_time").isNull() | (col("admission_time") > current_timestamp()), 
         current_timestamp())
    .otherwise(col("admission_time")))

# Handle invalid ages
clean_df = clean_df.withColumn('age', 
    when((col("age").isNull()) | (col("age") < 0) | (col("age") > 100), 
         floor(rand()*90+1).cast(IntegerType()))
    .otherwise(col("age")))

# Expected columns
expected_cols = ['patient_id', 'gender','age','department','admission_time','discharge_time','bed_id','hospital_id']

for col_name in expected_cols:
    if col_name not in clean_df.columns:
        clean_df = clean_df.withColumn(col_name, lit(None))
        #raise Exception(f"Column {col_name} not found in dataframe")   # raise exception
        
# Write to silver
(
    clean_df.writeStream
    .format('delta')
    .outputMode('append')
    .option('mergeSchema','true')
    .option('checkpointLocation', silver_path + '/_checkpoints')
    .start(silver_path))

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7fc0ed3a1970>

In [0]:
df_silver=spark.read.format('delta').load(silver_path)
display(df_silver)

patient_id,gender,age,department,admission_time,discharge_time,bed_id,hospital_id
602d94b8-4a05-46e2-8f66-8a018d040403,Other,27,Maternity,2025-12-12T00:04:03.646134,2025-12-13T13:04:03.646134Z,75,6
963296db-2792-44f9-a829-be3a789dc1ce,Female,7,Emergency,2025-12-12T08:04:04.752311,2025-12-15T02:04:04.752311Z,304,7
f3febdea-83eb-4533-a267-f8642ba6603c,Female,5,Cardiology,2025-12-12T04:04:07.703286,2025-12-13T09:04:07.703286Z,165,5
8ca727d9-77c3-435b-8f4f-86e8cca6acc5,Other,24,Oncology,2025-12-14T14:04:09.643350,2025-12-14T19:04:09.64335Z,146,1
1606b9c2-793f-45a7-8ce9-e7f7cd354c90,Female,11,ICU,2025-12-14T00:04:12.582770,2025-12-14T18:04:12.58277Z,138,1
433a9e2a-f6ea-4b52-9c89-475cdd90d939,Male,47,Pediatrics,2025-12-14T12:04:15.280739,2025-12-17T02:04:15.280739Z,378,6
640d9ddc-3f40-4a39-a346-b021d845838f,Female,60,ICU,2025-12-14T02:04:16.182220,2025-12-14T22:04:16.18222Z,431,5
3fe7837b-e431-49d0-8773-73e4164d540a,Female,22,Cardiology,2025-12-12T03:04:18.488697,2025-12-14T00:04:18.488697Z,70,4
e75a7e80-4d84-471e-812f-5c1776151dad,Other,63,Emergency,2025-12-14T03:04:20.812964,2025-12-13T22:04:20.812964Z,122,7
6244d0ca-63f2-4feb-8d5f-b53bcae51fac,Male,73,Maternity,2025-12-14T08:04:22.016547,2025-12-17T02:04:22.016547Z,176,4
