In [0]:
# Databricks Notebook: 06_delta_features

from pyspark.sql import functions as F

In [0]:
%sql
-- Snapshot: (re)create gold_claims_fraud_scores_table from every row and column
-- currently returned by gold_claims_fraud_scores. Any existing table of that name is replaced.
CREATE OR REPLACE TABLE workspace.healthcare.gold_claims_fraud_scores_table
AS
SELECT *
FROM workspace.healthcare.gold_claims_fraud_scores;

In [0]:
%sql
-- History: list the recorded operations (one row per table version) for the snapshot table.
DESCRIBE HISTORY
    workspace.healthcare.gold_claims_fraud_scores_table;

In [0]:
%sql
-- Time travel: read up to ten rows of the snapshot table as it existed at version 0.
SELECT *
FROM workspace.healthcare.gold_claims_fraud_scores_table VERSION AS OF 0
LIMIT 10;

In [0]:
%sql
-- Optimization: compact the snapshot table's files and cluster the data by risk_bucket.
-- NOTE(review): Z-ordering usually pays off on high-cardinality filter columns; risk_bucket
-- looks like a low-cardinality label — confirm it is the column downstream queries filter on.
OPTIMIZE workspace.healthcare.gold_claims_fraud_scores_table
    ZORDER BY (risk_bucket);

In [0]:
%sql
-- Vacuum: delete data files that the table no longer references and that are older than
-- the 168-hour (7-day) retention window. Versions older than that window may no longer
-- be readable through time travel afterwards.
VACUUM workspace.healthcare.gold_claims_fraud_scores_table
    RETAIN 168 HOURS;

In [0]:
# Pipeline summary: row count of the deduplicated silver claims table.
claims_count_query = """
    SELECT COUNT(*) AS cnt
    FROM workspace.healthcare.silver_claims_dedup
"""
# A global COUNT(*) always yields exactly one row, so index 0 is safe.
claims_count_rows = spark.sql(claims_count_query).collect()
total_claims = claims_count_rows[0]["cnt"]

print(f"✅ Total claims ingested (deduped): {total_claims}")

In [0]:
# Pipeline summary: row count of gold_fraud_alerts, reported as the number of high-risk alerts.
alerts_count_query = """
    SELECT COUNT(*) AS cnt
    FROM workspace.healthcare.gold_fraud_alerts
"""
# A global COUNT(*) always yields exactly one row, so index 0 is safe.
alerts_count_rows = spark.sql(alerts_count_query).collect()
high_risk_alerts = alerts_count_rows[0]["cnt"]

print(f"🚨 High-risk alerts generated: {high_risk_alerts}")

In [0]:
# Distribution of scored claims per risk bucket, largest bucket first.
risk_dist_query = """
    SELECT risk_bucket, COUNT(*) AS cnt
    FROM workspace.healthcare.gold_claims_fraud_scores
    GROUP BY risk_bucket
    ORDER BY cnt DESC
"""
risk_dist = spark.sql(risk_dist_query)

# Render the (lazy) DataFrame with the notebook's rich table widget.
display(risk_dist)


In [0]:
# Preview up to ten fraud alerts with their identifying columns, score and bucket.
# The query has no ORDER BY, so which ten rows come back is arbitrary and may
# differ between runs.
sample_alerts_query = """
    SELECT ClaimID, MemberID, ProviderID, Amount, Diagnosis_Code, fraud_score, risk_bucket
    FROM workspace.healthcare.gold_fraud_alerts
    LIMIT 10
"""
sample_alerts = spark.sql(sample_alerts_query)

display(sample_alerts)

In [0]:
# Data-quality check: count AND percentage of enriched claims whose MemberID is NULL.
# (The previous version announced a percentage but only ever computed the raw count.)
# NOTE(review): this reads from the `default` schema while the other summary cells use
# `workspace.healthcare` — confirm the table location.
member_id_stats = spark.sql("""
    SELECT
        COUNT(*) AS total_cnt,
        COUNT(*) - COUNT(MemberID) AS cnt
    FROM default.silver_claims_enriched_v1
""").collect()[0]

# COUNT(MemberID) skips NULLs, so total minus that is the number of missing MemberIDs.
missing_members = member_id_stats["cnt"]
enriched_claims = member_id_stats["total_cnt"]
# Guard the ratio so an empty table reports 0% instead of raising ZeroDivisionError.
missing_members_pct = (
    100.0 * missing_members / enriched_claims if enriched_claims else 0.0
)

print(f"⚠️ Claims missing MemberID: {missing_members}")
print(f"⚠️ Share of claims missing MemberID: {missing_members_pct:.2f}% of {enriched_claims}")

In [0]:
# Data-quality check: row count of the silver_invalid_claims table.
# NOTE(review): the query applies no `is_valid = false` filter itself; presumably the
# table already holds only claims that failed foreign-key validation — confirm upstream.
# NOTE(review): reads from the `default` schema rather than `workspace.healthcare` — confirm.
invalid_count_query = """
    SELECT COUNT(*) AS cnt
    FROM default.silver_invalid_claims
"""
# A global COUNT(*) always yields exactly one row, so index 0 is safe.
invalid_count_rows = spark.sql(invalid_count_query).collect()
invalid_claims = invalid_count_rows[0]["cnt"]

print(f"❌ Invalid claims flagged: {invalid_claims}")