In [0]:
# Catalog and schema names shared by the cells of this notebook.
catalog_name = "capstone_aimie_dbk"
schema_name = "medisure"

# Volume paths under /Volumes/<catalog>/<schema>/.
# Only input_path is read by the cells visible here; schem_path and bronze_path
# are defined but not referenced in them, and those cells create only the
# "inputs" volume.
# NOTE(review): "schem" may be a truncated "schema" - confirm against the actual
# volume name before renaming.
input_path = f"/Volumes/{catalog_name}/{schema_name}/inputs"
schem_path = f"/Volumes/{catalog_name}/{schema_name}/schem"
bronze_path = f"/Volumes/{catalog_name}/{schema_name}/schem/bronze"

#imports
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, DateType, ArrayType, BooleanType

In [0]:
# Make the configured catalog the session's current catalog, so that any name a
# later cell leaves without a catalog prefix resolves inside it. An unqualified
# CREATE SCHEMA would otherwise land in the session's default catalog, which
# need not be the catalog that holds the inputs volume.
spark.sql(f"USE CATALOG {catalog_name}")

# Create the schema and its inputs volume with fully qualified names so both
# statements target the same catalog regardless of session state.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.inputs")

In [0]:
# Bronze layer: explicit read schemas, so column names and types are declared
# here instead of being inferred from the files.

# Schema applied to claims_batch.csv. Every column is nullable. The date-like
# columns (ClaimDate, ServiceDate, IngestTimestamp) are declared as strings, so
# they are loaded as raw text; only Amount is numeric.
claims_batch_schema = (
    StructType()
    .add("ClaimID", StringType(), True)
    .add("MemberID", StringType(), True)
    .add("ProviderID", StringType(), True)
    .add("ClaimDate", StringType(), True)
    .add("ServiceDate", StringType(), True)
    .add("Amount", DoubleType(), True)
    .add("Status", StringType(), True)
    .add("ICD10Codes", StringType(), True)
    .add("CPTCodes", StringType(), True)
    .add("ClaimType", StringType(), True)
    .add("SubmissionChannel", StringType(), True)
    .add("Notes", StringType(), True)
    .add("IngestTimestamp", StringType(), True)
)

# Schema applied to claims_stream.json. Every column is nullable; ClaimDate and
# EventTimestamp are declared as strings, so they are loaded as raw text.
claims_stream_schema = (
    StructType()
    .add("ClaimID", StringType(), True)
    .add("MemberID", StringType(), True)
    .add("ProviderID", StringType(), True)
    .add("ClaimDate", StringType(), True)
    .add("Amount", DoubleType(), True)
    .add("Status", StringType(), True)
    .add("ICD10Codes", StringType(), True)
    .add("CPTCodes", StringType(), True)
    .add("EventTimestamp", StringType(), True)
)

# Schema applied to diagnosis_ref.csv: two nullable string columns.
diagnosis_schema = (
    StructType()
    .add("Code", StringType(), True)
    .add("Description", StringType(), True)
)

# Schema applied to members.csv. Every column is nullable; DOB, EffectiveDate
# and LastUpdated are declared as strings, so they are loaded as raw text.
# NOTE(review): IsActive is a DoubleType here but a BooleanType in
# providers_schema - presumably the CSV carries a numeric flag; confirm.
members_schema = (
    StructType()
    .add("MemberID", StringType(), True)
    .add("Name", StringType(), True)
    .add("DOB", StringType(), True)
    .add("Gender", StringType(), True)
    .add("Region", StringType(), True)
    .add("PlanType", StringType(), True)
    .add("EffectiveDate", StringType(), True)
    .add("Email", StringType(), True)
    .add("IsActive", DoubleType(), True)
    .add("LastUpdated", StringType(), True)
)

# Element type of the providers "Locations" array: one postal location.
_provider_location = (
    StructType()
    .add("Address", StringType(), True)
    .add("City", StringType(), True)
    .add("State", StringType(), True)
)

# Schema applied to providers.json. Unlike the other bronze schemas it is
# nested (an array of strings and an array of location structs) and uses typed
# BooleanType / DateType columns instead of plain strings.
providers_schema = (
    StructType()
    .add("ProviderID", StringType(), True)
    .add("Name", StringType(), True)
    .add("Specialties", ArrayType(StringType()), True)
    .add("Locations", ArrayType(_provider_location), True)
    .add("IsActive", BooleanType(), True)
    .add("TIN", StringType(), True)
    .add("LastVerified", DateType(), True)
)

# Load each raw source file from the inputs volume with its explicit schema.
# The CSV files are read with a header row; the JSON files use reader defaults.

# NOTE(review): claims_batch_schema declares no TimestampType column, so the
# timestampFormat option appears to have nothing to act on - confirm before
# removing it.
bronze_claims_batch_df = spark.read.csv(
    f"{input_path}/claims_batch.csv",
    schema=claims_batch_schema,
    header="true",
    timestampFormat="yyyy-MM-dd[ HH:mm:ss]",
)

bronze_claims_stream_df = spark.read.json(
    f"{input_path}/claims_stream.json",
    schema=claims_stream_schema,
)

bronze_diagnosis_df = spark.read.csv(
    f"{input_path}/diagnosis_ref.csv",
    schema=diagnosis_schema,
    header="true",
)

bronze_members_df = spark.read.csv(
    f"{input_path}/members.csv",
    schema=members_schema,
    header="true",
)

bronze_providers_df = spark.read.json(
    f"{input_path}/providers.json",
    schema=providers_schema,
)

# Persist every bronze DataFrame as a Delta table, replacing any previous load.
# Table names are built from catalog_name / schema_name rather than a
# hard-coded schema, so the tables land in the same catalog and schema as the
# inputs volume regardless of the session's current catalog, and follow the
# configuration if either name changes.
_bronze_frames = {
    "bronze_claims_batch": bronze_claims_batch_df,
    "bronze_claims_stream": bronze_claims_stream_df,
    "bronze_diagnosis_ref": bronze_diagnosis_df,
    "bronze_members": bronze_members_df,
    "bronze_providers": bronze_providers_df,
}

for _table_name, _frame in _bronze_frames.items():
    (_frame.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(f"{catalog_name}.{schema_name}.{_table_name}"))