In [0]:
from pyspark.sql.functions import sum, count, avg, col

# Gold-layer inputs for the daily BI summary. The customer dimension is
# restricted to rows whose is_current flag is true.
fact_df = spark.read.table("fraud_lakehouse.gold.fact_transactions")
dim_merch = spark.read.table("fraud_lakehouse.gold.dim_merchants")
dim_cust = (
    spark.read.table("fraud_lakehouse.gold.dim_customer")
    .where(col("is_current") == True)
)

# Grain of the summary: one row per date / merchant category / customer
# risk segment / customer home country.
summary_grain = [
    col(key)
    for key in ("f.date_key", "m.merchant_category", "c.risk_segment", "c.home_country")
]

# Measures computed per group. sum/count/avg here are the
# pyspark.sql.functions versions imported at the top of this cell.
summary_measures = [
    sum("f.amount_usd").alias("total_sales"),
    count("f.transaction_id").alias("total_transactions"),
    sum("f.fraud_flag").alias("fraud_count"),
    avg("f.amount_usd").alias("avg_transaction_value"),
]

bi_daily_summary = (
    fact_df.alias("f")
    .join(dim_merch.alias("m"), on=col("f.merchant_fk") == col("m.merchant_sk"), how="inner")
    .join(dim_cust.alias("c"), on=col("f.customer_fk") == col("c.customer_sk"), how="inner")
    .groupBy(*summary_grain)
    .agg(*summary_measures)
    # Share of fraudulent transactions in the group, expressed as a percentage.
    .withColumn("fraud_rate_percentage", col("fraud_count") / col("total_transactions") * 100)
)

# Persist as a Delta table, replacing any previous contents, then show it.
table_name = "fraud_lakehouse.gold.bi_daily_sales_summary"
summary_writer = bi_daily_summary.write.format("delta").mode("overwrite")
summary_writer.saveAsTable(table_name)

print(f"BI Table '{table_name}' created.")
display(spark.read.table(table_name))

In [0]:
# Fraud rate per merchant category, rolled up from the daily summary table.
# The rate is recomputed from the summed counts (not averaged from the
# per-row fraud_rate_percentage), and categories are listed worst-first.
category_fraud_sql = """
    SELECT 
        merchant_category,
        SUM(total_transactions) as txns,
        SUM(fraud_count) as fraud_cases,
        ROUND((SUM(fraud_count) / SUM(total_transactions)) * 100, 2) as fraud_rate_percent
    FROM fraud_lakehouse.gold.bi_daily_sales_summary
    GROUP BY merchant_category
    ORDER BY fraud_rate_percent DESC
"""
category_fraud_result = spark.sql(category_fraud_sql)
category_fraud_result.display()

In [0]:

# Revenue and number of fraud events per customer risk segment, taken from
# the daily summary table and ordered by revenue (largest first).
risk_segment_sql = """
    SELECT 
        risk_segment,
        SUM(total_sales) as revenue,
        SUM(fraud_count) as fraud_events
    FROM fraud_lakehouse.gold.bi_daily_sales_summary
    GROUP BY risk_segment
    ORDER BY revenue DESC
"""
risk_segment_result = spark.sql(risk_segment_sql)
risk_segment_result.display()

In [0]:
from pyspark.sql.functions import count, sum, avg, max, col

# Inputs for the per-customer profile: all fact rows, joined to customer
# dimension rows whose is_current flag is true.
fact = spark.read.table("fraud_lakehouse.gold.fact_transactions")
cust = (
    spark.read.table("fraud_lakehouse.gold.dim_customer")
    .where(col("is_current") == True)
)

# One output row per (customer_id, home_country, risk_segment).
profile_keys = [
    col(key) for key in ("c.customer_id", "c.home_country", "c.risk_segment")
]

# Lifetime measures per customer. sum/count/avg/max are the
# pyspark.sql.functions versions imported at the top of this cell.
profile_measures = [
    sum("f.amount_usd").alias("lifetime_spend"),
    count("f.transaction_id").alias("lifetime_transactions"),
    sum("f.fraud_flag").alias("total_fraud_events"),
    avg("f.amount_usd").alias("avg_spend_per_txn"),
    max("f.date_key").alias("last_seen_date"),
]

customer_360 = (
    fact.alias("f")
    .join(cust.alias("c"), on=col("f.customer_fk") == col("c.customer_sk"), how="inner")
    .groupBy(*profile_keys)
    .agg(*profile_measures)
)

# Persist as a Delta table, replacing any previous contents, then show it.
profile_writer = customer_360.write.format("delta").mode("overwrite")
profile_writer.saveAsTable("fraud_lakehouse.gold.customer_360_profile")

print("Customer 360 Table Created.")
display(spark.read.table("fraud_lakehouse.gold.customer_360_profile"))

In [0]:
# Transaction volume and fraud rate by hour of day (from transaction_ts),
# read straight from the fact table and listed in clock order.
hourly_fraud_sql = """
    SELECT 
        HOUR(transaction_ts) as hour_of_day,
        COUNT(*) as total_txns,
        SUM(fraud_flag) as fraud_cases,
        ROUND((SUM(fraud_flag) / COUNT(*)) * 100, 2) as fraud_rate_percent
    FROM fraud_lakehouse.gold.fact_transactions
    GROUP BY hour_of_day
    ORDER BY hour_of_day ASC
"""
hourly_fraud_result = spark.sql(hourly_fraud_sql)
hourly_fraud_result.display()

In [0]:
# USD volume, fraud count and fraud rate per device type, with the device
# types showing the highest fraud rate listed first.
device_fraud_sql = """
    SELECT 
        device_type,
        SUM(amount_usd) as total_volume,
        SUM(fraud_flag) as fraud_count,
        ROUND((SUM(fraud_flag) / COUNT(*)) * 100, 2) as fraud_rate_percent
    FROM fraud_lakehouse.gold.fact_transactions
    GROUP BY device_type
    ORDER BY fraud_rate_percent DESC
"""
device_fraud_result = spark.sql(device_fraud_sql)
device_fraud_result.display()

In [0]:
# Top 10 (customer country, merchant country) corridors by fraud count,
# considering only transactions where the two countries differ and at least
# one fraud case was recorded.
#
# The customer join is limited to is_current rows so this query uses the same
# customer view as the other customer joins in this notebook (the BI summary,
# the customer 360 profile and the Cross_Border rule); without it the
# cross-border figures here could disagree with those outputs.
# NOTE(review): this assumes fact.customer_fk is meant to resolve against the
# current dimension row, as the rest of the notebook does - confirm against
# the fact-table load logic.
spark.sql("""
    SELECT 
        c.home_country as customer_country,
        m.merchant_country as merchant_country,
        COUNT(f.transaction_id) as total_txns,
        SUM(f.fraud_flag) as fraud_cases
    FROM fraud_lakehouse.gold.fact_transactions f
    JOIN fraud_lakehouse.gold.dim_customer c
        ON f.customer_fk = c.customer_sk AND c.is_current = TRUE
    JOIN fraud_lakehouse.gold.dim_merchants m ON f.merchant_fk = m.merchant_sk
    WHERE c.home_country != m.merchant_country 
    GROUP BY c.home_country, m.merchant_country
    HAVING fraud_cases > 0
    ORDER BY fraud_cases DESC
    LIMIT 10
""").display()

In [0]:
# Compare fraudulent and legitimate transactions: row count, average and
# maximum USD amount per status.
#
# The grouping key is the derived status label itself rather than the raw
# fraud_flag. Grouping by fraud_flag would emit one output row per distinct
# flag value, so NULL (or any value other than 1) would show up as an extra
# 'Legitimate' row instead of being folded into a single one. The CASE
# expression is repeated in GROUP BY (instead of using the `status` alias) so
# it cannot be shadowed by a same-named column on the table.
spark.sql("""
    SELECT 
        CASE WHEN fraud_flag = 1 THEN 'Fraudulent' ELSE 'Legitimate' END as status,
        COUNT(*) as count,
        ROUND(AVG(amount_usd), 2) as avg_transaction_value,
        ROUND(MAX(amount_usd), 2) as max_transaction_value
    FROM fraud_lakehouse.gold.fact_transactions
    GROUP BY CASE WHEN fraud_flag = 1 THEN 'Fraudulent' ELSE 'Legitimate' END
""").display()

In [0]:
# Ten merchants with the highest fraud rate. Merchants are grouped by
# (merchant_name, merchant_category) and only groups with more than 5
# transactions are kept, so tiny samples do not dominate the ranking.
merchant_fraud_sql = """
    SELECT 
        m.merchant_name,
        m.merchant_category,
        COUNT(f.transaction_id) as total_txns,
        SUM(f.fraud_flag) as fraud_events,
        ROUND((SUM(f.fraud_flag) / COUNT(f.transaction_id)) * 100, 2) as merchant_fraud_rate
    FROM fraud_lakehouse.gold.fact_transactions f
    JOIN fraud_lakehouse.gold.dim_merchants m ON f.merchant_fk = m.merchant_sk
    GROUP BY m.merchant_name, m.merchant_category
    HAVING total_txns > 5 
    ORDER BY merchant_fraud_rate DESC
    LIMIT 10
"""
merchant_fraud_result = spark.sql(merchant_fraud_sql)
merchant_fraud_result.display()

In [0]:
# Fraud cases and fraudulent USD value per card type. The WHERE clause keeps
# only rows with fraud_flag = 1, so both aggregates cover fraud rows only.
card_type_fraud_sql = """
    SELECT 
        card_type,
        SUM(fraud_flag) as total_fraud_cases,
        ROUND(SUM(amount_usd), 2) as total_fraud_value
    FROM fraud_lakehouse.gold.fact_transactions
    WHERE fraud_flag = 1
    GROUP BY card_type
    ORDER BY total_fraud_value DESC
"""
card_type_fraud_result = spark.sql(card_type_fraud_sql)
card_type_fraud_result.display()

In [0]:

from pyspark.sql.functions import col, hour, when, concat_ws, lit, array, array_remove

# Rule engine inputs: every fact row joined to its merchant and to the
# customer dimension rows whose is_current flag is true.
fact  = spark.read.table("fraud_lakehouse.gold.fact_transactions")
cust  = spark.read.table("fraud_lakehouse.gold.dim_customer").filter(col("is_current") == True)
merch = spark.read.table("fraud_lakehouse.gold.dim_merchants")

joined_df = (
    fact.alias("f")
    .join(cust.alias("c"),  col("f.customer_fk") == col("c.customer_sk"))
    .join(merch.alias("m"), col("f.merchant_fk") == col("m.merchant_sk"))
)

# Monetary thresholds are evaluated on amount_usd, the column every other
# aggregate in this notebook uses, so a fixed threshold means the same thing
# for every transaction.
# NOTE(review): this assumes f.amount is the un-normalised transaction amount
# and f.amount_usd its USD equivalent - confirm against the fact schema.

# Rule 1: transaction hour is 2, 3 or 4 (between() is inclusive on both ends)
# and the USD amount exceeds 1000.
rule_night_owl = (
    hour(col("f.transaction_ts")).between(2, 4)
    & (col("f.amount_usd") > 1000)
)

# Rule 2: the customer's home country differs from the merchant's country.
rule_cross_border = (
    col("c.home_country") != col("m.merchant_country")
)

# Rule 3: the USD amount exceeds 3000.
rule_high_value = (
    col("f.amount_usd") > 3000
)

flagged_df = (
    joined_df
    # Each rule contributes its label when it fires and NULL otherwise.
    .withColumn("rule_night_owl", when(rule_night_owl, "Night_Owl_>1k").otherwise(lit(None)))
    .withColumn("rule_cross_border", when(rule_cross_border, "Cross_Border").otherwise(lit(None)))
    .withColumn("rule_high_value", when(rule_high_value, "High_Value_>3k").otherwise(lit(None)))
    # concat_ws skips NULL inputs, so the result lists only the rules that
    # fired and is the empty string when none did.
    .withColumn(
        "suspicion_reasons",
        concat_ws(
            ", ",
            col("rule_night_owl"),
            col("rule_cross_border"),
            col("rule_high_value"),
        ),
    )
    # Keep only transactions flagged by at least one rule.
    .filter(col("suspicion_reasons") != "")
    # Output columns are unchanged (f.amount is still the reported amount) so
    # the schema of the saved report table stays the same.
    .select(
        col("f.transaction_id"),
        col("f.transaction_ts"),
        col("f.amount"),
        col("c.customer_id"),
        col("c.home_country"),
        col("m.merchant_name"),
        col("m.merchant_country"),
        col("suspicion_reasons"),
        col("f.fraud_flag"),  # kept so rule hits can be compared with actual fraud
    )
)

# Persist the flagged transactions as a Delta table, replacing any previous
# contents, and preview the first rows.
table_name = "fraud_lakehouse.gold.suspicious_activity_report"
print(f"Running Rule Engine... Saving to {table_name}")

flagged_df.write.format("delta").mode("overwrite").saveAsTable(table_name)

print("Rule Engine Complete. Suspicious transactions tagged.")
display(spark.read.table(table_name).limit(10))

In [0]:

# Precision of each rule combination: how many flagged transactions there
# were and how many of them carry fraud_flag.
#
# The report table is named here instead of reusing the notebook-global
# `table_name`. That global is assigned by more than one cell, so which table
# this query read depended on which cell happened to run last.
report_table = "fraud_lakehouse.gold.suspicious_activity_report"

spark.sql(f"""
    SELECT 
        suspicion_reasons,
        COUNT(*) as flagged_count,
        SUM(fraud_flag) as actual_fraud_found,
        ROUND((SUM(fraud_flag) / COUNT(*)) * 100, 1) as precision_rate
    FROM {report_table}
    GROUP BY suspicion_reasons
    ORDER BY actual_fraud_found DESC
""").display()