In [0]:
# Variables used throughout the notebook: Unity Catalog coordinates
catalog_name = "capstone_aimie_dbk"
schema_name = "medisure"

# Storage paths, all rooted at /Volumes/<catalog>/<schema>
volume_root = "/".join(["/Volumes", catalog_name, schema_name])
input_path = volume_root + "/inputs"
schem_path = volume_root + "/schem"
bronze_path = schem_path + "/bronze"

#imports
from pyspark.sql.functions import col, when, udf, lit
from pyspark.sql.types import IntegerType


In [0]:
# Load the four silver tables that feed the gold enrichment step.
# Every table lives under the same <catalog>.<schema> namespace.
silver_namespace = f"{catalog_name}.{schema_name}"

combined_claims_df = spark.table(f"{silver_namespace}.silver_claims_transform")
silver_diagnosis_df = spark.table(f"{silver_namespace}.silver_diagnosis_ref")
silver_members_df = spark.table(f"{silver_namespace}.silver_members")
silver_providers_df = spark.table(f"{silver_namespace}.silver_providers")

In [0]:
# Define a UDF for fraud scoring
@udf(returnType=IntegerType())
def fraud_score(amount, status):
    """Score a claim's fraud risk as an integer percentage.

    The UDF is declared with an IntegerType return type, which cannot carry a
    fractional score: a Python float returned from an integer-typed UDF is not
    stored as 0.5. Every branch therefore returns a plain int.

    Args:
        amount: Claim amount; None when the source value is missing.
        status: Claim status string; None when the source value is missing.

    Returns:
        100 for a "Pending" claim above 10000,
        50 for an "Approved" claim above 5000,
        0 in every other case, including when either input is None.
    """
    # NOTE(review): rows scored before this change hold 1 (not 100) for the
    # highest tier — confirm whether existing data needs to be rescored.
    if amount is None or status is None:
        return 0
    if amount > 10000 and status == "Pending":
        return 100
    if amount > 5000 and status == "Approved":
        return 50
    return 0

# Join datasets
# Members and providers are inner-joined on their key columns, so claims
# without a matching member or provider are dropped. The diagnosis reference is
# left-joined, so claims without a matching code are kept.
claims_with_members_df = combined_claims_df.join(silver_members_df, "MemberID", "inner")
claims_with_providers_df = claims_with_members_df.join(silver_providers_df, "ProviderID", "inner")
claims_with_diagnosis_df = claims_with_providers_df.join(
    silver_diagnosis_df, col("ICD10Codes") == col("Code"), "left"
)

# Score every claim from its amount and status.
scored_claims_df = claims_with_diagnosis_df.withColumn(
    "FraudScore", fraud_score(col("Amount"), col("Status"))
)

# IsValid: "No" for pending claims with a positive score, "Yes" for approved
# claims with a zero score, "Unknown" for everything else.
has_score_and_pending = (col("FraudScore") > 0) & (col("Status") == "Pending")
no_score_and_approved = (col("FraudScore") == 0) & (col("Status") == "Approved")

gold_enriched_claims_df = scored_claims_df.withColumn(
    "IsValid",
    when(has_score_and_pending, "No")
    .when(no_score_and_approved, "Yes")
    .otherwise("Unknown"),
)

# Column definitions shared by the gold table and its history (backup) table.
# Keeping them in one place stops the two schemas from drifting apart, which
# matters because the backup copy inserts rows by column position.
# NOTE(review): the IsValid column computed on the enriched DataFrame is not
# declared here — confirm whether it is meant to be persisted.
gold_claims_ddl_columns = """\
    ClaimID STRING,
    MemberID STRING,
    ProviderID STRING,
    ClaimDate DATE,
    Amount DOUBLE,
    Status STRING,
    ICD10Codes STRING,
    CPTCodes STRING,
    IngestTimestamp TIMESTAMP,
    Source STRING,
    Name STRING,
    DOB DATE,
    Gender STRING,
    Region STRING,
    PlanType STRING,
    EffectiveDate DATE,
    Email STRING,
    IsActive DOUBLE,
    LastUpdated DATE,
    ProviderName STRING,
    Specialties STRING,
    Address STRING,
    City STRING,
    State STRING,
    IsActiveFlag BOOLEAN,
    TIN STRING,
    LastVerified DATE,
    Description STRING,
    FraudScore INT"""

# Fully qualify both tables with the configured catalog and schema so they are
# created in the same place the later cells read from, regardless of the
# session's current catalog.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.gold_enriched_claims (
{gold_claims_ddl_columns}
)
USING DELTA
""")

# The history table has the same columns plus the date the snapshot was taken.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.gold_enriched_claims_histo (
{gold_claims_ddl_columns},
    xBackupDate DATE
)
USING DELTA
""")

In [0]:
from delta.tables import DeltaTable

# Define the Delta table name
delta_table_name = "capstone_aimie_dbk.medisure.gold_enriched_claims_histo"

# Version of the history table used as the comparison baseline.
baseline_version = 1

# Current state of the history table.
df_current = spark.read.format("delta").table(delta_table_name)

# Versioning and time travel
# A freshly created table only has version 0, and asking for a version that
# does not exist raises an error. Check the table history first.
available_versions = {
    row["version"]
    for row in DeltaTable.forName(spark, delta_table_name)
    .history()
    .select("version")
    .collect()
}

if baseline_version in available_versions:
    df_version_1 = (
        spark.read.format("delta")
        .option("versionAsOf", baseline_version)
        .table(delta_table_name)
    )
else:
    # No baseline yet: compare against an empty frame with the current schema,
    # so every current row is reported as a difference.
    df_version_1 = df_current.limit(0)

# Data/metadata comparison
# df_previous is the same baseline snapshot; the name is kept for readability.
df_previous = df_version_1
# Rows present now that were not in the baseline (duplicates are preserved).
df_diff = df_current.exceptAll(df_previous)

# Table maintenance on the gold table, run in order:
#   1. OPTIMIZE with Z-ordering on ClaimID (no partitioning is applied here).
#   2. VACUUM keeping 168 hours (7 days) of history.
# NOTE(review): 7 days appears to match Delta's documented default retention,
# so this is not an increased retention period — confirm the intended value.
maintenance_statements = (
    "OPTIMIZE capstone_aimie_dbk.medisure.gold_enriched_claims ZORDER BY (ClaimID)",
    "VACUUM capstone_aimie_dbk.medisure.gold_enriched_claims RETAIN 168 HOURS",
)

for maintenance_sql in maintenance_statements:
    spark.sql(maintenance_sql)

# Implement MERGE for upserts and deduplication
# Columns that identify a claim row in both the source and the target.
merge_keys = ["ClaimID", "MemberID", "ProviderID"]

# A Delta MERGE fails when several source rows match the same target row, so
# keep a single source row per key combination before merging.
# NOTE(review): dropDuplicates keeps an arbitrary row per key — confirm whether
# a specific row (for example the most recent) should win instead.
source_df = gold_enriched_claims_df.dropDuplicates(merge_keys)

target_table = DeltaTable.forName(
    spark, "capstone_aimie_dbk.medisure.gold_enriched_claims"
)

# Builds "t.ClaimID = s.ClaimID and t.MemberID = s.MemberID and ..."
merge_condition = " and ".join(f"t.{key} = s.{key}" for key in merge_keys)

# Matched rows are overwritten with the source values; unmatched rows are inserted.
(
    target_table.alias("t")
    .merge(source_df.alias("s"), merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Snapshot the gold table into the history table so the data can be restored
# later. Every current row is appended with today's date as the last column.
# The insert is positional, so it relies on both tables sharing the same column
# order.
backup_sql = """
INSERT INTO capstone_aimie_dbk.medisure.gold_enriched_claims_histo
SELECT *, current_date() FROM capstone_aimie_dbk.medisure.gold_enriched_claims
"""

spark.sql(backup_sql)

In [0]:
%sql
-- Inspect the current contents of the gold table
SELECT *
FROM capstone_aimie_dbk.medisure.gold_enriched_claims

In [0]:
%sql
-- Inspect the accumulated backup snapshots in the history table
SELECT *
FROM capstone_aimie_dbk.medisure.gold_enriched_claims_histo