In [0]:
from pyspark.sql import functions as F

#### **Read data from Bronze**

In [0]:
df_bronze = spark.table("Patient_Risk_Prediction.bronze.healthcare")

#### **Clean and standardize data**

In [0]:
df_silver = (
    df_bronze
    # Trim spaces and standardize name formats
    .withColumn("name", F.initcap(F.trim(F.col("name"))))
    .withColumn("gender", F.upper(F.trim(F.col("gender"))))
    .withColumn("blood_type", F.upper(F.trim(F.col("blood_type"))))
    .withColumn("medical_condition", F.initcap(F.trim(F.col("medical_condition"))))
    .withColumn("doctor", F.initcap(F.trim(F.col("doctor"))))
    .withColumn("hospital", F.initcap(F.trim(F.col("hospital"))))
    .withColumn("insurance_provider", F.initcap(F.trim(F.col("insurance_provider"))))
    .withColumn("medication", F.initcap(F.trim(F.col("medication"))))
    .withColumn("test_results", F.initcap(F.trim(F.col("test_results"))))
    # Convert date strings to DateType
    .withColumn("date_of_admission", F.to_date("date_of_admission", "yyyy-MM-dd"))
    .withColumn("discharge_date", F.to_date("discharge_date", "yyyy-MM-dd"))
    # Handle missing/null values
    .fillna({
        "gender": "UNKNOWN",
        "blood_type": "UNKNOWN",
        "medical_condition": "Not Specified",
        "doctor": "Not Assigned",
        "hospital": "Unknown Hospital",
        "insurance_provider": "Not Provided",
        "admission_type": "Other",
        "medication": "Not Specified",
        "test_results": "Unknown"
    })
    # Cast numeric columns properly
    .withColumn("age", F.col("age").cast("integer"))
    .withColumn("billing_amount", F.col("billing_amount").cast("double"))
    .withColumn("room_number", F.col("room_number").cast("integer"))
    # Add derived columns
    .withColumn("stay_duration_days", F.datediff(F.col("discharge_date"), F.col("date_of_admission")))
)

#### **Write cleaned data to Silver Delta Table**

In [0]:
(
    df_silver.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("Patient_Risk_Prediction.silver.healthcare")
)

print("✅ Silver table 'Patient_Risk_Prediction.silver.healthcare' created successfully.")

✅ Silver table 'Patient_Risk_Prediction.silver.healthcare' created successfully.


In [0]:
%sql
SELECT * FROM Patient_Risk_Prediction.silver.healthcare LIMIT 10;

name,age,gender,blood_type,medical_condition,date_of_admission,doctor,hospital,insurance_provider,billing_amount,room_number,admission_type,discharge_date,medication,test_results,stay_duration_days
Bobby Jackson,30,MALE,B-,Cancer,2024-01-31,Matthew Smith,Sons And Miller,Blue Cross,18856.281305978155,328,Urgent,2024-02-02,Paracetamol,Normal,2
Leslie Terry,62,MALE,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327286577885,265,Emergency,2019-08-26,Ibuprofen,Inconclusive,6
Danny Smith,76,FEMALE,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook Plc,Aetna,27955.096078842456,205,Emergency,2022-10-07,Aspirin,Normal,15
Andrew Watts,28,FEMALE,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers And Vang,",Medicare,37909.78240987528,450,Elective,2020-12-18,Ibuprofen,Abnormal,30
Adrienne Bell,43,FEMALE,AB+,Cancer,2022-09-19,Kathleen Hanna,White-white,Aetna,14238.317813937623,458,Urgent,2022-10-09,Penicillin,Abnormal,20
Emily Johnson,36,MALE,A+,Asthma,2023-12-20,Taylor Newton,Nunez-humphrey,Unitedhealthcare,48145.11095104189,389,Urgent,2023-12-24,Ibuprofen,Normal,4
Edward Edwards,21,FEMALE,AB-,Diabetes,2020-11-03,Kelly Olson,Group Middleton,Medicare,19580.87234486093,389,Emergency,2020-11-15,Paracetamol,Inconclusive,12
Christina Martinez,20,FEMALE,A+,Cancer,2021-12-28,Suzanne Thomas,"Powell Robinson And Valdez,",Cigna,45820.46272159459,277,Emergency,2022-01-07,Paracetamol,Inconclusive,10
Jasmine Aguilar,82,MALE,AB+,Asthma,2020-07-01,Daniel Ferguson,Sons Rich And,Cigna,50119.222791548505,316,Elective,2020-07-14,Aspirin,Abnormal,13
Christopher Berg,58,FEMALE,AB-,Cancer,2021-05-23,Heather Day,Padilla-walker,Unitedhealthcare,19784.63106221073,249,Elective,2021-06-22,Paracetamol,Inconclusive,30
