In [82]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when, isnull, countDistinct

# Create (or reuse) the Spark session used by this notebook.
spark = SparkSession.builder \
    .appName("Silver Layer - Loan Data EDA") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# Load bronze_loan_daily data
bronze_loan_df = spark.read.parquet("datamart/bronze/bronze_loan_daily")

# Basic info
print(f"Total number of rows: {bronze_loan_df.count()}")
print(f"Total number of columns: {len(bronze_loan_df.columns)}")

# Display schema
print("\nSchema of the dataset:")
bronze_loan_df.printSchema()

# Display sample data
print("\nSample data:")
bronze_loan_df.show(5, truncate=False)

# Number of NULLs per column: count() only counts rows where the when() produced a value,
# i.e. rows where the column is NULL.
print("\nNumber of NULL values per column:")
null_counts = bronze_loan_df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in bronze_loan_df.columns]
)
null_counts.show(truncate=False)

# Number of distinct values per column (one Spark job per column; NULL counts as a value).
print("\nNumber of distinct values per column:")
distinct_counts = {c: bronze_loan_df.select(c).distinct().count() for c in bronze_loan_df.columns}
# The loop variable must not be called `count`: that would rebind the imported
# pyspark.sql.functions.count and break every later cell that calls it.
for col_name, n_distinct in distinct_counts.items():
    print(f"{col_name}: {n_distinct}")

# Check basic loan statistics per customer
print("\nNumber of loans per customer:")
bronze_loan_df.groupBy("Customer_ID").count().orderBy(col("count").desc()).show(12)

# Check number of installments per loan
print("\nNumber of installments per loan:")
bronze_loan_df.groupBy("loan_id").count().orderBy(col("count").desc()).show(12)

# Earliest distinct loan start dates (first 12 in ascending order)
print("\nRange of loan start dates:")
bronze_loan_df.select("loan_start_date").distinct().orderBy("loan_start_date").show(12)

# Earliest distinct snapshot dates (first 12 in ascending order)
print("\nRange of snapshot dates:")
bronze_loan_df.select("snapshot_date").distinct().orderBy("snapshot_date").show(12)


Total number of rows: 137500
Total number of columns: 13

Schema of the dataset:
root
 |-- loan_id: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- loan_start_date: date (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- installment_num: integer (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- due_amt: integer (nullable = true)
 |-- paid_amt: integer (nullable = true)
 |-- overdue_amt: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- bronze_ingest_timestamp: timestamp (nullable = true)
 |-- bronze_source_file: string (nullable = true)


Sample data:
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+--------------------------+------------------+
|loan_id              |Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|bronze_ingest

In [83]:
from pyspark.sql.functions import col

# Rebuild the balance from the other columns and compare it with the stored value.
# Formula: calculated_balance = loan_amt - (installment_num * due_amt - overdue_amt)
scheduled_to_date = col("installment_num") * col("due_amt")
calculated_balance_expr = col("loan_amt") - (scheduled_to_date - col("overdue_amt"))
balance_check = bronze_loan_df.withColumn("calculated_balance", calculated_balance_expr)

# Count how many rows agree / disagree with the stored balance column.
print("Balance consistency check:")
balance_match_counts = (
    balance_check
    .withColumn("balance_matches", col("calculated_balance") == col("balance"))
    .groupBy("balance_matches")
    .count()
)
balance_match_counts.show()

# Show up to five rows where the two values differ (empty when everything matches).
print("\nExamples of mismatched balances:")
balance_mismatches = balance_check.filter(col("calculated_balance") != col("balance"))
balance_mismatches.show(5)


Balance consistency check:
+---------------+------+
|balance_matches| count|
+---------------+------+
|           true|137500|
+---------------+------+


Examples of mismatched balances:
+-------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+-----------------------+------------------+------------------+
|loan_id|Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|bronze_ingest_timestamp|bronze_source_file|calculated_balance|
+-------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+-----------------------+------------------+------------------+
+-------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+-----------------------+------------------+------------------+



In [84]:
from pyspark.sql.functions import col, when

# Derive the overdue amount implied by the schedule and compare it with overdue_amt.
# Formula: expected_overdue = (installment_num * due_amt) - paid_amt
# The formula treats due_amt as constant across installments and paid_amt as cumulative.
schedule_shortfall = col("installment_num") * col("due_amt") - col("paid_amt")
expected_overdue_expr = when(col("installment_num") > 0, schedule_shortfall).otherwise(0)
overdue_check = bronze_loan_df.withColumn("expected_overdue", expected_overdue_expr)

# Tally rows where the derived value equals the stored overdue_amt.
print("Overdue amount consistency check:")
overdue_match_counts = (
    overdue_check
    .withColumn("overdue_matches", col("expected_overdue") == col("overdue_amt"))
    .groupBy("overdue_matches")
    .count()
)
overdue_match_counts.show()

# Sample of rows where the derived and stored overdue amounts disagree.
print("\nExamples of mismatched overdue amounts:")
mismatch_columns = [
    "loan_id", "installment_num", "due_amt", "paid_amt", "expected_overdue", "overdue_amt",
]
overdue_mismatches = overdue_check.filter(col("expected_overdue") != col("overdue_amt"))
overdue_mismatches.select(*mismatch_columns).show(5)


Overdue amount consistency check:
+---------------+------+
|overdue_matches| count|
+---------------+------+
|           true| 26954|
|          false|110546|
+---------------+------+


Examples of mismatched overdue amounts:
+--------------------+---------------+-------+--------+----------------+-----------+
|             loan_id|installment_num|due_amt|paid_amt|expected_overdue|overdue_amt|
+--------------------+---------------+-------+--------+----------------+-----------+
|CUS_0x56ef_2024_0...|              3|   1000|       0|            3000|       1000|
|CUS_0x56ef_2024_0...|              4|   1000|       0|            4000|       2000|
|CUS_0x56ef_2024_0...|              5|   1000|       0|            5000|       3000|
|CUS_0x56ef_2024_0...|              6|   1000|       0|            6000|       4000|
|CUS_0x56ef_2024_0...|              7|   1000|       0|            7000|       5000|
+--------------------+---------------+-------+--------+----------------+-----------+
only show

In [85]:
from pyspark.sql.functions import col, when, datediff, months_between

print("Detailed analysis of overdue amounts:")

# overdue_ratio = overdue_amt / expected_overdue.
# Rows whose expected overdue is not positive get a ratio of 0 instead of a division.
has_expected_overdue = col("expected_overdue") > 0
overdue_ratio_expr = when(
    has_expected_overdue, col("overdue_amt") / col("expected_overdue")
).otherwise(0)
detailed_analysis = overdue_check.withColumn("overdue_ratio", overdue_ratio_expr)

# Summary statistics (count / mean / stddev / min / max) of the ratio.
print("\nRatio of actual to expected overdue amount:")
detailed_analysis.select("overdue_ratio").describe().show()

# Match / mismatch counts broken down by installment number.
print("\nOverdue amount match status by installment number:")
match_by_installment = (
    overdue_check
    .withColumn("overdue_matches", col("expected_overdue") == col("overdue_amt"))
    .groupBy("installment_num", "overdue_matches")
    .count()
    .orderBy("installment_num")
)
match_by_installment.show(20)

# Installment-by-installment view of one hard-coded example loan.
print("\nFull overdue history of one specific loan:")
history_columns = ["installment_num", "due_amt", "paid_amt", "overdue_amt", "expected_overdue"]
example_loan_history = (
    overdue_check
    .filter(col("loan_id") == "CUS_0x56ef_2024_04_01")
    .select(*history_columns)
    .orderBy("installment_num")
)
example_loan_history.show(11)


Detailed analysis of overdue amounts:

Ratio of actual to expected overdue amount:
+-------+------------------+
|summary|     overdue_ratio|
+-------+------------------+
|  count|            137500|
|   mean|0.1316588340548323|
| stddev|0.2718268117145753|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+


Overdue amount match status by installment number:
+---------------+---------------+-----+
|installment_num|overdue_matches|count|
+---------------+---------------+-----+
|              0|           true|12500|
|              1|           true|12500|
|              2|           true|  707|
|              2|          false|11793|
|              3|           true|  278|
|              3|          false|12222|
|              4|          false|12323|
|              4|           true|  177|
|              5|          false|12368|
|              5|           true|  132|
|              6|           true|  132|
|              6|          false|12368|
|  

In [88]:
from pyspark.sql.functions import col, when, lag, sum as sum_col
from pyspark.sql.window import Window

# Per-loan window, rows ordered by installment number.
loan_window = Window.partitionBy("loan_id").orderBy("installment_num")

# --- Column expressions (resolved lazily when attached with withColumn) ---

# 1 when the amount paid this installment covers the amount due, else 0.
payment_made_expr = when(col("paid_amt") >= col("due_amt"), 1).otherwise(0)

# 1 when a real installment (installment_num > 0) was paid less than its due amount, else 0.
underpaid_installment = (col("installment_num") > 0) & (col("paid_amt") < col("due_amt"))
monthly_overdue_expr = when(underpaid_installment, 1).otherwise(0)

# payment_made_flag of the previous installment of the same loan (NULL on the first row).
prev_payment_expr = lag("payment_made_flag", 1).over(loan_window)

# Amount due so far, taken as installment_num * due_amt.
total_due_expr = col("installment_num") * col("due_amt")

# Running total of paid_amt within the loan.
total_paid_expr = sum_col("paid_amt").over(loan_window)

# Share of this installment's due amount that was paid; 1.0 when nothing was due.
payment_ratio_expr = when(col("due_amt") > 0, col("paid_amt") / col("due_amt")).otherwise(1.0)

# Status ladder: the first matching rule wins.
is_overdue_row = col("monthly_overdue_flag") == 1
payment_status_expr = (
    when(col("installment_num") == 0, "LOAN_INITIATED")
    .when(col("monthly_overdue_flag") == 0, "PAID_ON_TIME")
    .when(is_overdue_row & (col("overdue_amt") <= col("due_amt")), "MISSED_PAYMENT")
    .when(is_overdue_row & (col("overdue_amt") <= col("due_amt") * 3), "OVERDUE")
    .when(is_overdue_row, "SEVERELY_OVERDUE")
    .otherwise("UNKNOWN")
)

# --- Build the Silver frame; the order matters because later columns read earlier ones ---
silver_loan_df = (
    bronze_loan_df
    .withColumn("payment_made_flag", payment_made_expr)
    .withColumn("monthly_overdue_flag", monthly_overdue_expr)
    .withColumn("prev_payment_status", prev_payment_expr)
    .withColumn("total_due_amount", total_due_expr)
    .withColumn("total_paid_amount", total_paid_expr)
    .withColumn("payment_ratio", payment_ratio_expr)
    .withColumn("payment_status", payment_status_expr)
)

# Preview the derived columns across loans.
print("Loan payment status in the Silver layer:")
status_preview_columns = [
    "loan_id", "installment_num", "due_amt", "paid_amt", "overdue_amt",
    "monthly_overdue_flag", "payment_status",
]
silver_loan_df.select(*status_preview_columns).orderBy("loan_id", "installment_num").show(11)

# Same view for one hard-coded example loan.
print("\nFull Silver-layer record for a specific loan:")
example_loan_columns = [
    "installment_num", "due_amt", "paid_amt", "overdue_amt",
    "monthly_overdue_flag", "payment_status",
]
example_silver_rows = (
    silver_loan_df
    .filter(col("loan_id") == "CUS_0x56ef_2024_04_01")
    .select(*example_loan_columns)
    .orderBy("installment_num")
)
example_silver_rows.show(11)


Loan payment status in the Silver layer:
+--------------------+---------------+-------+--------+-----------+--------------------+----------------+
|             loan_id|installment_num|due_amt|paid_amt|overdue_amt|monthly_overdue_flag|  payment_status|
+--------------------+---------------+-------+--------+-----------+--------------------+----------------+
|CUS_0x1000_2023_0...|              0|      0|       0|          0|                   0|  LOAN_INITIATED|
|CUS_0x1000_2023_0...|              1|   1000|    1000|          0|                   0|    PAID_ON_TIME|
|CUS_0x1000_2023_0...|              2|   1000|    1000|          0|                   0|    PAID_ON_TIME|
|CUS_0x1000_2023_0...|              3|   1000|       0|       1000|                   1|  MISSED_PAYMENT|
|CUS_0x1000_2023_0...|              4|   1000|    2000|          0|                   0|    PAID_ON_TIME|
|CUS_0x1000_2023_0...|              5|   1000|       0|       1000|                   1|  MISSED_PAYMENT|
|CUS_

In [89]:
from pyspark.sql.functions import col, when, sum as sum_col, datediff, date_add, current_date, lit
from pyspark.sql.window import Window

# Per-loan window, rows ordered by installment number.
loan_window = Window.partitionBy("loan_id").orderBy("installment_num")

# --- consecutive_missed_payments -------------------------------------------------------
# Number of missed installments in a row, ending at the current row; it resets to 0 on any
# row whose monthly_overdue_flag is 0.
#
# A difference of cumulative sums cannot express this reset (it goes negative once two
# non-missed rows follow a miss), so rows are first grouped into streaks:
#   1. _missed_streak_id = running count of non-missed rows. It increases on every
#      non-missed row, so each non-missed row starts a new streak and the missed rows that
#      follow it share its id.
#   2. Inside one (loan_id, _missed_streak_id) group the running sum of monthly_overdue_flag
#      is 0 on the opening non-missed row and then 1, 2, 3, ... on the following misses.
running_rows = loan_window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
missed_streak_id = sum_col(
    when(col("monthly_overdue_flag") == 0, 1).otherwise(0)
).over(running_rows)

streak_window = (
    Window.partitionBy("loan_id", "_missed_streak_id")
    .orderBy("installment_num")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

silver_loan_df = (
    silver_loan_df
    .withColumn("_missed_streak_id", missed_streak_id)
    .withColumn(
        "consecutive_missed_payments",
        sum_col(col("monthly_overdue_flag")).over(streak_window),
    )
    .drop("_missed_streak_id")  # helper column only
)

# --- Date-related calculations ---------------------------------------------------------
silver_loan_df = silver_loan_df.withColumn(
    # Age of the loan in months, approximated as days since loan start / 30
    "loan_age_months",
    datediff(col("snapshot_date"), col("loan_start_date")) / 30
).withColumn(
    # Installments still to go: tenure minus the current installment number
    "remaining_tenure",
    col("tenure") - col("installment_num")
).withColumn(
    # Expected end date of the loan, using the same 30-days-per-installment approximation
    "expected_end_date",
    date_add(col("loan_start_date"), col("tenure") * 30)
).withColumn(
    # Active = installments remain AND the snapshot is not past the expected end date
    "is_active_loan",
    (col("installment_num") < col("tenure")) &
    (col("snapshot_date") <= date_add(col("loan_start_date"), col("tenure") * 30))
)

# --- Business features -----------------------------------------------------------------
silver_loan_df = silver_loan_df.withColumn(
    # Default flag: three or more consecutive missed payments
    "is_default",
    col("consecutive_missed_payments") >= 3
).withColumn(
    # Percentage of the amount due so far that has been paid (0 when nothing is due yet)
    "percent_paid",
    when(col("total_due_amount") > 0,
         col("total_paid_amount") / col("total_due_amount") * 100).otherwise(0)
)

# Display full Silver layer result (sample)
print("Enhanced Silver-layer loan records:")
silver_loan_df.select(
    "loan_id", "installment_num", "payment_status",
    "consecutive_missed_payments", "is_default",
    "loan_age_months", "remaining_tenure", "is_active_loan"
).orderBy("loan_id", "installment_num").show(11)

# Check detailed records for a sample loan
print("\nDetailed Silver-layer record for a sample loan:")
silver_loan_df.filter(col("loan_id") == "CUS_0x56ef_2024_04_01") \
    .select(
        "installment_num", "payment_status", "consecutive_missed_payments",
        "is_default", "loan_age_months", "remaining_tenure", "is_active_loan", "percent_paid"
    ) \
    .orderBy("installment_num") \
    .show(11)


Enhanced Silver-layer loan records:
+--------------------+---------------+----------------+---------------------------+----------+------------------+----------------+--------------+
|             loan_id|installment_num|  payment_status|consecutive_missed_payments|is_default|   loan_age_months|remaining_tenure|is_active_loan|
+--------------------+---------------+----------------+---------------------------+----------+------------------+----------------+--------------+
|CUS_0x1000_2023_0...|              0|  LOAN_INITIATED|                          0|     false|               0.0|              10|          true|
|CUS_0x1000_2023_0...|              1|    PAID_ON_TIME|                          0|     false|1.0333333333333334|               9|          true|
|CUS_0x1000_2023_0...|              2|    PAID_ON_TIME|                          0|     false| 2.033333333333333|               8|          true|
|CUS_0x1000_2023_0...|              3|  MISSED_PAYMENT|                          1|     

In [51]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session.
session_builder = (
    SparkSession.builder
    .appName("Silver Layer Validation")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
)
spark = session_builder.getOrCreate()

# Load the Silver-layer loan data from the datamart.
silver_loan_df = spark.read.parquet("datamart/silver/silver_loan_daily")

# Row count and schema.
print(f"行数: {silver_loan_df.count()}")
print("模式:")
silver_loan_df.printSchema()

# First five rows as a sample.
print("\n样本数据:")
silver_loan_df.show(5)

行数: 137500
模式:
root
 |-- loan_id: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- loan_start_date: date (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- installment_num: integer (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- due_amt: integer (nullable = true)
 |-- paid_amt: integer (nullable = true)
 |-- overdue_amt: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- bronze_ingest_timestamp: timestamp (nullable = true)
 |-- bronze_source_file: string (nullable = true)
 |-- payment_made_flag: integer (nullable = true)
 |-- monthly_overdue_flag: integer (nullable = true)
 |-- total_due_amount: integer (nullable = true)
 |-- total_paid_amount: long (nullable = true)
 |-- payment_ratio: double (nullable = true)
 |-- payment_status: string (nullable = true)
 |-- consecutive_missed_payments: long (nullable = true)
 |-- loan_age_months: double (nullable = true)
 |-- remaining_

In [53]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, isnull

# Create (or reuse) the Spark session.
session_builder = (
    SparkSession.builder
    .appName("Silver Layer Validation")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
)
spark = session_builder.getOrCreate()

# Load both the Bronze and the Silver loan tables.
bronze_loan_df = spark.read.parquet("datamart/bronze/bronze_loan_daily")
silver_loan_df = spark.read.parquet("datamart/silver/silver_loan_daily")

# 1. Row-count check: Silver should have exactly as many rows as Bronze.
bronze_count = bronze_loan_df.count()
silver_count = silver_loan_df.count()
print(f"Bronze层记录数: {bronze_count}")
print(f"Silver层记录数: {silver_count}")
print(f"记录数是否匹配: {bronze_count == silver_count}")

# 2. NULL check on the key Silver columns.
print("\n检查Silver层关键字段的NULL值:")
key_columns = ["loan_id", "payment_status", "consecutive_missed_payments", "is_default"]
null_count_exprs = [
    count(when(col(name).isNull() | isnull(name), name)).alias(name)
    for name in key_columns
]
null_counts = silver_loan_df.select(null_count_exprs)
null_counts.show()

# 3. Distribution of the derived fields.
print("\n检查衍生字段的分布:")
# Rows per payment status.
print("支付状态分布:")
status_distribution = silver_loan_df.groupBy("payment_status").count().orderBy("payment_status")
status_distribution.show()

# Share of rows (loan-installment records) flagged as default.
defaulted_rows = silver_loan_df.filter(col("is_default") == True)
default_count = defaulted_rows.count()
print(f"违约贷款记录比例: {default_count / silver_count * 100:.2f}%")

Bronze层记录数: 137500
Silver层记录数: 137500
记录数是否匹配: True

检查Silver层关键字段的NULL值:
+-------+--------------+---------------------------+----------+
|loan_id|payment_status|consecutive_missed_payments|is_default|
+-------+--------------+---------------------------+----------+
|      0|             0|                          0|         0|
+-------+--------------+---------------------------+----------+


检查衍生字段的分布:
支付状态分布:
+----------------+-----+
|  payment_status|count|
+----------------+-----+
|  LOAN_INITIATED|12500|
|  MISSED_PAYMENT| 5030|
|         OVERDUE| 7526|
|    PAID_ON_TIME|96124|
|SEVERELY_OVERDUE|16320|
+----------------+-----+

违约贷款记录比例: 14.26%


In [54]:
# 1. Re-derive payment_status with the same rule ladder and compare with the stored column.
print("\n验证支付状态逻辑:")
is_overdue_row = col("monthly_overdue_flag") == 1
expected_status_expr = (
    when(col("installment_num") == 0, "LOAN_INITIATED")
    .when(col("monthly_overdue_flag") == 0, "PAID_ON_TIME")
    .when(is_overdue_row & (col("overdue_amt") <= col("due_amt")), "MISSED_PAYMENT")
    .when(is_overdue_row & (col("overdue_amt") <= col("due_amt") * 3), "OVERDUE")
    .when(is_overdue_row, "SEVERELY_OVERDUE")
    .otherwise("UNKNOWN")
)
status_validation = silver_loan_df.withColumn("expected_status", expected_status_expr)

# Rows where the stored and the re-derived status agree.
status_agreement = status_validation.filter(col("payment_status") == col("expected_status"))
status_match = status_agreement.count()
print(f"支付状态逻辑正确率: {status_match / silver_count * 100:.2f}%")

# 2. Re-derive is_default (three or more consecutive missed payments) and compare.
print("\n验证违约标志逻辑:")
expected_default_expr = col("consecutive_missed_payments") >= 3
default_validation = silver_loan_df.withColumn("expected_default", expected_default_expr)

# Rows where the stored and the re-derived default flag agree.
default_agreement = default_validation.filter(col("is_default") == col("expected_default"))
default_match = default_agreement.count()
print(f"违约标志逻辑正确率: {default_match / silver_count * 100:.2f}%")


验证支付状态逻辑:
支付状态逻辑正确率: 100.00%

验证违约标志逻辑:
违约标志逻辑正确率: 100.00%


In [80]:
from pyspark.sql.functions import rand

# Pick one loan with a default history and trace its full lifecycle.

print("\n跟踪单个贷款的生命周期:")

# Seed for the random pick, so a re-run over the same data is intended to select the same
# loan instead of a different one each time.
SAMPLE_SEED = 42

# Candidate rows: flagged as default at installment_num == 10; shuffle and keep one loan_id.
loan_history = silver_loan_df.filter(
    (col("is_default") == True) &
    (col("installment_num") == 10)
).orderBy(rand(seed=SAMPLE_SEED)).select("loan_id").limit(1)

# first() returns None on an empty result, which lets us fail with a clear message
# instead of an IndexError from collect()[0].
sample_row = loan_history.first()
if sample_row is None:
    raise ValueError(
        "No loan with is_default == True at installment_num == 10 found in silver_loan_df"
    )
sample_loan_id = sample_row["loan_id"]
print(f"样本贷款ID: {sample_loan_id}")

# Full history of the selected loan, ordered by installment.
loan_lifecycle = silver_loan_df.filter(col("loan_id") == sample_loan_id) \
    .select(
        "installment_num", "due_amt", "paid_amt", "overdue_amt",
        "payment_status", "consecutive_missed_payments",
        "is_default", "percent_paid"
    ) \
    .orderBy("installment_num")

print("贷款生命周期:")
loan_lifecycle.show(12)

# Cross-check consecutive_missed_payments against a plain-Python recomputation.
print("\n验证连续逾期计算:")
MISSED_STATUSES = {"MISSED_PAYMENT", "OVERDUE", "SEVERELY_OVERDUE"}
loan_data = loan_lifecycle.collect()
manual_consecutive = []
current_consecutive = 0

for row in loan_data:
    # A missed-type status extends the streak; any other status resets it.
    if row["payment_status"] in MISSED_STATUSES:
        current_consecutive += 1
    else:
        current_consecutive = 0
    manual_consecutive.append(current_consecutive)

print("手动计算的连续逾期次数:")
print(manual_consecutive)

print("\n系统计算的连续逾期次数:")
system_consecutive = [row["consecutive_missed_payments"] for row in loan_data]
print(system_consecutive)

print(f"连续逾期计算是否匹配: {manual_consecutive == system_consecutive}")


跟踪单个贷款的生命周期:
样本贷款ID: CUS_0x37be_2024_05_01
贷款生命周期:
+---------------+-------+--------+-----------+----------------+---------------------------+----------+------------------+
|installment_num|due_amt|paid_amt|overdue_amt|  payment_status|consecutive_missed_payments|is_default|      percent_paid|
+---------------+-------+--------+-----------+----------------+---------------------------+----------+------------------+
|              0|      0|       0|          0|  LOAN_INITIATED|                          0|     false|               0.0|
|              1|   1000|    1000|          0|    PAID_ON_TIME|                          0|     false|             100.0|
|              2|   1000|       0|       1000|  MISSED_PAYMENT|                          1|     false|              50.0|
|              3|   1000|    2000|          0|    PAID_ON_TIME|                          0|     false|             100.0|
|              4|   1000|       0|       1000|  MISSED_PAYMENT|                          1|   

In [60]:
from pyspark.sql.functions import col, when, abs as spark_abs

# Validate percent_paid by recomputing it from total_paid_amount / total_due_amount.
print("\n验证已付款百分比计算:")
expected_percent_expr = when(
    col("total_due_amount") > 0,
    col("total_paid_amount") / col("total_due_amount") * 100,
).otherwise(0)

# Absolute difference between the stored and the recomputed percentage.
percent_error_expr = spark_abs(col("percent_paid") - col("expected_percent_paid"))

payment_validation = (
    silver_loan_df
    .withColumn("expected_percent_paid", expected_percent_expr)
    .withColumn("percent_error", percent_error_expr)
)

# Floating-point comparison: a row passes when its error is below the tolerance.
error_threshold = 0.001
within_tolerance = payment_validation.filter(col("percent_error") < error_threshold)
payment_match = within_tolerance.count()
print(f"付款百分比计算正确率: {payment_match / silver_count * 100:.2f}%")

# If any row failed, list the five largest discrepancies.
if payment_match < silver_count:
    print("\n不匹配的付款百分比示例:")
    discrepancy_columns = [
        "loan_id", "installment_num", "total_due_amount", "total_paid_amount",
        "percent_paid", "expected_percent_paid", "percent_error",
    ]
    worst_discrepancies = (
        payment_validation
        .filter(col("percent_error") >= error_threshold)
        .select(*discrepancy_columns)
        .orderBy(col("percent_error").desc())
    )
    worst_discrepancies.show(5)


验证已付款百分比计算:
付款百分比计算正确率: 100.00%


In [61]:
# Validate loan_age_months by recomputing it as days since loan start / 30.
# Uses datediff, spark_abs and error_threshold that are already defined in the session.
print("\n验证贷款年龄计算:")
expected_age_expr = datediff(col("snapshot_date"), col("loan_start_date")) / 30

# Absolute difference between the stored and the recomputed age.
age_error_expr = spark_abs(col("loan_age_months") - col("expected_age_months"))

age_validation = (
    silver_loan_df
    .withColumn("expected_age_months", expected_age_expr)
    .withColumn("age_error", age_error_expr)
)

# Floating-point comparison: a row passes when its error is below the tolerance.
ages_within_tolerance = age_validation.filter(col("age_error") < error_threshold)
age_match = ages_within_tolerance.count()
print(f"贷款年龄计算正确率: {age_match / silver_count * 100:.2f}%")


验证贷款年龄计算:
贷款年龄计算正确率: 100.00%


In [63]:
from pyspark.sql.functions import col, when, count, sum, avg, date_format
# Key business metrics.
# Note: `sum` below is the Spark aggregate imported on this cell's import line,
# not Python's builtin.
print("\n关键业务指标:")

# 1. Default rate per snapshot month (rows flagged is_default / all rows of that month).
print("各月贷款违约率:")
snapshot_month = date_format(col("snapshot_date"), "yyyy-MM")
default_indicator = when(col("is_default") == True, 1).otherwise(0)
monthly_default_rate = (
    silver_loan_df
    .withColumn("month_year", snapshot_month)
    .groupBy("month_year")
    .agg(
        count("*").alias("total_loans"),
        sum(default_indicator).alias("default_count"),
    )
    .withColumn("default_rate", col("default_count") / col("total_loans") * 100)
    .orderBy("month_year")
)
monthly_default_rate.show(12)

# 2. Average percent_paid and row count for each payment status.
print("\n各贷款状态的平均支付率:")
paid_share_by_status = (
    silver_loan_df
    .groupBy("payment_status")
    .agg(
        avg("percent_paid").alias("avg_percent_paid"),
        count("*").alias("record_count"),
    )
    .orderBy("payment_status")
)
paid_share_by_status.show()

# 3. Inspect how the status evolves for one loan that was flagged as default at some point.
print("\n检查支付状态进展逻辑:")
defaulted_loans = silver_loan_df.filter(col("is_default") == True).select("loan_id").distinct()

first_defaulted_rows = defaulted_loans.limit(1).collect()
sample_defaulted_id = first_defaulted_rows[0]["loan_id"]
print(f"样本违约贷款ID: {sample_defaulted_id}")

# Status and streak per installment for that loan.
progression_columns = ["installment_num", "payment_status", "consecutive_missed_payments"]
status_progression = (
    silver_loan_df
    .filter(col("loan_id") == sample_defaulted_id)
    .select(*progression_columns)
    .orderBy("installment_num")
)

print("状态变化:")
status_progression.show()


关键业务指标:
各月贷款违约率:
+----------+-----------+-------------+------------------+
|month_year|total_loans|default_count|      default_rate|
+----------+-----------+-------------+------------------+
|   2023-01|        530|            0|               0.0|
|   2023-02|       1031|            0|               0.0|
|   2023-03|       1537|            0|               0.0|
|   2023-04|       2047|            7|0.3419638495359062|
|   2023-05|       2568|           34|  1.32398753894081|
|   2023-06|       3085|          115| 3.727714748784441|
|   2023-07|       3556|          228| 6.411698537682789|
|   2023-08|       4037|          370| 9.165221699281645|
|   2023-09|       4491|          503| 11.20017813404587|
|   2023-10|       4978|          649|13.037364403374848|
|   2023-11|       5469|          796|14.554763210824648|
|   2023-12|       5428|          804|14.812085482682388|
+----------+-----------+-------------+------------------+
only showing top 12 rows


各贷款状态的平均支付率:
+-------------

In [65]:
from pyspark.sql.functions import col, desc

# Walk through a few contrasting loans (on-time, early default, mixed, worst, recovered).
print("不同贷款案例分析:\n")


def _first_loan_id(candidates):
    """Return the ``loan_id`` of the first row of ``candidates``, or ``None`` if it is empty."""
    first_row = candidates.first()
    return first_row["loan_id"] if first_row is not None else None


def show_loan_lifecycle(loan_id, title):
    """Print and return the installment history of one loan.

    Args:
        loan_id: Value matched against the ``loan_id`` column of ``silver_loan_df``.
        title: Heading printed above the table.

    Returns:
        The (lazy) DataFrame with the loan's rows ordered by ``installment_num``.
    """
    print(f"\n{title} - 贷款ID: {loan_id}")

    # Full history of the loan
    loan_history = silver_loan_df.filter(col("loan_id") == loan_id) \
        .select(
            "installment_num", "due_amt", "paid_amt", "overdue_amt",
            "payment_status", "consecutive_missed_payments",
            "is_default", "percent_paid"
        ) \
        .orderBy("installment_num")

    # Show the history, at most 20 rows
    loan_history.show(20)

    return loan_history


# 1. A loan that is PAID_ON_TIME and not in default at installment 10.
# NOTE(review): this only inspects installment 10; it does not prove that every earlier
# installment was paid on time.
print("案例1: 按时还款贷款")
on_time_loan = silver_loan_df.filter(
    (col("installment_num") == 10) &
    (col("is_default") == False) &
    (col("payment_status") == "PAID_ON_TIME")
).select("loan_id").limit(1)

# 2. A loan already flagged as default within the first five installments.
print("案例2: 早期违约贷款")
early_default_loan = silver_loan_df.filter(
    (col("installment_num") <= 5) &
    (col("is_default") == True)
).select("loan_id").limit(1)

# 3. A loan with mixed behaviour: 40-60% of the due amount paid at installment 10.
print("案例3: 混合付款行为贷款")
mixed_payment_loan = silver_loan_df.filter(
    (col("installment_num") == 10) &
    (col("percent_paid").between(40, 60))
).select("loan_id").limit(1)

# Resolve the loan ids. A missing case yields None explicitly, instead of relying on a
# bare `except` that would also hide genuine Spark errors and leave the name undefined
# (or stale from an earlier run).
on_time_id = _first_loan_id(on_time_loan)
if on_time_id is not None:
    print(f"按时还款贷款ID: {on_time_id}")
else:
    print("未找到完全按时还款的贷款")

early_default_id = _first_loan_id(early_default_loan)
if early_default_id is not None:
    print(f"早期违约贷款ID: {early_default_id}")
else:
    print("未找到早期违约的贷款")

mixed_payment_id = _first_loan_id(mixed_payment_loan)
if mixed_payment_id is not None:
    print(f"混合付款行为贷款ID: {mixed_payment_id}")
else:
    print("未找到混合付款行为的贷款")

# Show the lifecycle of every case that was found; histories of missing cases stay None.
on_time_history = None
if on_time_id is not None:
    on_time_history = show_loan_lifecycle(on_time_id, "案例1: 按时还款贷款")

early_default_history = None
if early_default_id is not None:
    early_default_history = show_loan_lifecycle(early_default_id, "案例2: 早期违约贷款")

mixed_payment_history = None
if mixed_payment_id is not None:
    mixed_payment_history = show_loan_lifecycle(mixed_payment_id, "案例3: 混合付款行为贷款")

# Extra case 4: the three loans with the lowest percent_paid at installment 10.
print("\n案例4: 逾期率最高的贷款")
worst_loans = silver_loan_df.filter(col("installment_num") == 10) \
    .select("loan_id", "percent_paid", "consecutive_missed_payments") \
    .orderBy("percent_paid") \
    .limit(3)

worst_loans.show()

# Lifecycle of the worst of them (skipped when there is no row at installment 10).
worst_loan_id = _first_loan_id(worst_loans)
worst_loan_history = None
if worst_loan_id is not None:
    worst_loan_history = show_loan_lifecycle(worst_loan_id, "案例4: 逾期率最高的贷款")

# Extra case 5: a loan with no current missed streak at installment 10 whose cumulative
# paid share is strictly between 0 and 100 percent.
print("\n案例5: 有违约恢复的贷款")
recovery_loans = silver_loan_df.filter(
    (col("installment_num") == 10) &
    (col("consecutive_missed_payments") == 0) &
    (col("percent_paid") < 100) &
    (col("percent_paid") > 0)
).select("loan_id").limit(1)

recovery_loan_id = _first_loan_id(recovery_loans)
recovery_history = None
if recovery_loan_id is not None:
    recovery_history = show_loan_lifecycle(recovery_loan_id, "案例5: 有违约恢复的贷款")
else:
    print("未找到有违约恢复的贷款")

不同贷款案例分析:

案例1: 按时还款贷款
案例2: 早期违约贷款
案例3: 混合付款行为贷款
按时还款贷款ID: CUS_0x1013_2023_12_01
早期违约贷款ID: CUS_0x102e_2024_04_01
混合付款行为贷款ID: CUS_0x1000_2023_05_01

案例1: 按时还款贷款 - 贷款ID: CUS_0x1013_2023_12_01
+---------------+-------+--------+-----------+--------------+---------------------------+----------+------------+
|installment_num|due_amt|paid_amt|overdue_amt|payment_status|consecutive_missed_payments|is_default|percent_paid|
+---------------+-------+--------+-----------+--------------+---------------------------+----------+------------+
|              0|      0|       0|          0|LOAN_INITIATED|                          0|     false|         0.0|
|              1|   1000|    1000|          0|  PAID_ON_TIME|                          0|     false|       100.0|
|              2|   1000|    1000|          0|  PAID_ON_TIME|                          0|     false|       100.0|
|              3|   1000|    1000|          0|  PAID_ON_TIME|                          0|     false|       100.0|
|           

## Process features_attributes (Silver layer)

In [91]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, isnull, countDistinct, regexp_extract

# Create (or reuse) the Spark session
spark = SparkSession.builder \
    .appName("Silver Layer - Customer Attributes") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# Load the bronze_attributes data
bronze_attributes_df = spark.read.parquet("datamart/bronze/bronze_attributes")

# Basic info
print(f"总行数: {bronze_attributes_df.count()}")
print(f"总列数: {len(bronze_attributes_df.columns)}")

# Show the schema
print("\n数据模式:")
bronze_attributes_df.printSchema()

# Show sample rows
print("\n样本数据:")
bronze_attributes_df.show(5, truncate=False)

# Number of NULL values per column.
# when(...) yields a non-null marker only for NULL cells, and count() counts
# non-null values, so each output column holds that column's NULL count.
print("\n列的非空值和空值数量:")
null_counts = bronze_attributes_df.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in bronze_attributes_df.columns]
)
null_counts.show(truncate=False)

# Number of distinct values per column (one Spark job per column).
print("\n唯一值数量:")
distinct_counts = {c: bronze_attributes_df.select(c).distinct().count() for c in bronze_attributes_df.columns}
# The loop variable must not be called `count`: that would rebind the Spark
# `count` function imported above to an int for every later use in the kernel.
for col_name, n_distinct in distinct_counts.items():
    print(f"{col_name}: {n_distinct}")

# Age distribution (Age is still a raw string column at this layer)
print("\n年龄分布:")
bronze_attributes_df.groupBy("Age").count().orderBy("Age").show(20)

# Occupation distribution
print("\nTop 10职业:")
bronze_attributes_df.groupBy("Occupation").count().orderBy(col("count").desc()).show(10)

# Check SSN formatting against the standard XXX-XX-XXXX layout.
# regexp_extract returns "" when the pattern does not match.
print("\n检查SSN格式:")
ssn_check = bronze_attributes_df.withColumn(
    "is_standard_ssn",
    regexp_extract(col("SSN"), "^\\d{3}-\\d{2}-\\d{4}$", 0) != ""
)
ssn_check.groupBy("is_standard_ssn").count().show()

# Show a few non-standard SSN samples
print("\n非标准SSN示例:")
ssn_check.filter(col("is_standard_ssn") == False).select("SSN").distinct().show(10)

总行数: 12500
总列数: 8

数据模式:
root
 |-- Customer_ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- bronze_ingest_timestamp: timestamp (nullable = true)
 |-- bronze_source_file: string (nullable = true)


样本数据:
+-----------+--------------+---+-----------+-------------+-------------+------------------------+-----------------------+
|Customer_ID|Name          |Age|SSN        |Occupation   |snapshot_date|bronze_ingest_timestamp |bronze_source_file     |
+-----------+--------------+---+-----------+-------------+-------------+------------------------+-----------------------+
|CUS_0x1000 |Alistair Barrf|18 |913-74-1218|Lawyer       |2023-05-01   |2025-05-17 16:28:39.9474|features_attributes.csv|
|CUS_0x1009 |Arunah        |26 |063-67-6938|Mechanic     |2025-01-01   |2025-05-17 16:28:39.9474|features_attributes.csv|
|CUS_0x100

In [96]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, trim, isnan, isnull

# Create (or reuse) the Spark session
spark = SparkSession.builder \
    .appName("Silver Layer - Customer Attributes") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# Load the bronze_attributes data
bronze_attributes_df = spark.read.parquet("datamart/bronze/bronze_attributes")

# Step 1: clean the Age column
print("步骤1: 处理年龄数据")

# Inclusive bounds of the age range treated as plausible.
MIN_VALID_AGE = 1
MAX_VALID_AGE = 100

# 1.1 Strip every character except digits and the minus sign (e.g. "40_" -> "40")
attributes_df = bronze_attributes_df.withColumn(
    "Age_cleaned",
    regexp_replace(col("Age"), "[^0-9-]", "")
)

# 1.2 Cast to int only when the cleaned text is a well-formed integer;
#     anything else becomes NULL (when() without otherwise() yields NULL)
attributes_df = attributes_df.withColumn(
    "Age_int",
    when(
        col("Age_cleaned").rlike("^-?\\d+$"),
        col("Age_cleaned").cast("int")
    )
)

# 1.3 Validity flag: always True/False, never NULL.
#     A NULL Age_int makes between() NULL, which falls through to otherwise(False).
attributes_df = attributes_df.withColumn(
    "is_valid_age",
    when(
        col("Age_int").between(MIN_VALID_AGE, MAX_VALID_AGE),
        True
    ).otherwise(False)
)

# 1.4 Final age column: the parsed age when valid, NULL otherwise
attributes_df = attributes_df.withColumn(
    "age_processed",
    when(col("is_valid_age"), col("Age_int"))
)

# Inspect the result
print("\n年龄处理结果示例:")
attributes_df.select(
    "Customer_ID", "Age", "Age_cleaned", "Age_int", "is_valid_age", "age_processed"
).show(20)

# Count valid vs. invalid ages.
# The total is computed once instead of once per print, and because
# is_valid_age is never NULL the invalid count is simply the remainder.
total_count = attributes_df.count()
valid_count = attributes_df.filter(col("is_valid_age")).count()
invalid_count = total_count - valid_count

# Guard against an empty input so the summary does not raise ZeroDivisionError
valid_pct = valid_count / total_count * 100 if total_count else 0.0
invalid_pct = invalid_count / total_count * 100 if total_count else 0.0

print(f"\n有效年龄数量: {valid_count} ({valid_pct:.2f}%)")
print(f"无效年龄数量: {invalid_count} ({invalid_pct:.2f}%)")

# Distribution of the valid ages
print("\n有效年龄分布:")
attributes_df.filter(col("is_valid_age")) \
    .groupBy("age_processed") \
    .count() \
    .orderBy("age_processed") \
    .show(5)

步骤1: 处理年龄数据

年龄处理结果示例:
+-----------+---+-----------+-------+------------+-------------+
|Customer_ID|Age|Age_cleaned|Age_int|is_valid_age|age_processed|
+-----------+---+-----------+-------+------------+-------------+
| CUS_0x1000| 18|         18|     18|        true|           18|
| CUS_0x1009| 26|         26|     26|        true|           26|
| CUS_0x100b| 19|         19|     19|        true|           19|
| CUS_0x1011| 44|         44|     44|        true|           44|
| CUS_0x1013| 44|         44|     44|        true|           44|
| CUS_0x1015| 27|         27|     27|        true|           27|
| CUS_0x1018| 15|         15|     15|        true|           15|
| CUS_0x1026| 52|         52|     52|        true|           52|
| CUS_0x102d| 31|         31|     31|        true|           31|
| CUS_0x102e| 26|         26|     26|        true|           26|
| CUS_0x1032|40_|         40|     40|        true|           40|
| CUS_0x1037| 45|         45|     45|        true|           45|
| 

In [99]:
from pyspark.sql.functions import col, regexp_extract, when

print("步骤2: 处理SSN数据")

# Standard SSN layout: XXX-XX-XXXX
SSN_PATTERN = "^\\d{3}-\\d{2}-\\d{4}$"

# 2.1 Flag whether the SSN matches the standard layout.
# regexp_extract returns "" on no match and NULL for a NULL SSN; wrapping the
# comparison in when/otherwise makes the flag a strict True/False (NULL SSN ->
# False), consistent with is_valid_age, so no row escapes both counts below.
attributes_df = attributes_df.withColumn(
    "is_valid_ssn",
    when(
        regexp_extract(col("SSN"), SSN_PATTERN, 0) != "",
        True
    ).otherwise(False)
)

# Inspect the result
print("\nSSN处理结果示例:")
attributes_df.select(
    "Customer_ID", "SSN", "is_valid_ssn"
).show(10)

# Count valid vs. invalid SSNs. The total is computed once, and because the
# flag is never NULL the invalid count is the remainder.
total_ssn_rows = attributes_df.count()
valid_ssn_count = attributes_df.filter(col("is_valid_ssn")).count()
invalid_ssn_count = total_ssn_rows - valid_ssn_count

# Guard against an empty input so the summary does not raise ZeroDivisionError
valid_ssn_pct = valid_ssn_count / total_ssn_rows * 100 if total_ssn_rows else 0.0
invalid_ssn_pct = invalid_ssn_count / total_ssn_rows * 100 if total_ssn_rows else 0.0

print(f"\n有效SSN数量: {valid_ssn_count} ({valid_ssn_pct:.2f}%)")
print(f"无效SSN数量: {invalid_ssn_count} ({invalid_ssn_pct:.2f}%)")

# Show a few invalid SSN samples
print("\n无效SSN示例:")
attributes_df.filter(~col("is_valid_ssn")).select("SSN").distinct().show(10)

步骤2: 处理SSN数据

SSN处理结果示例:
+-----------+-----------+------------+
|Customer_ID|        SSN|is_valid_ssn|
+-----------+-----------+------------+
| CUS_0x1000|913-74-1218|        true|
| CUS_0x1009|063-67-6938|        true|
| CUS_0x100b|  #F%$D@*&8|       false|
| CUS_0x1011|793-05-8223|        true|
| CUS_0x1013|930-49-9615|        true|
| CUS_0x1015|810-97-7024|        true|
| CUS_0x1018|731-19-8119|        true|
| CUS_0x1026|500-62-9044|        true|
| CUS_0x102d|692-71-7552|        true|
| CUS_0x102e|  #F%$D@*&8|       false|
+-----------+-----------+------------+
only showing top 10 rows


有效SSN数量: 11797 (94.38%)
无效SSN数量: 703 (5.62%)

无效SSN示例:
+---------+
|      SSN|
+---------+
|#F%$D@*&8|
+---------+



In [100]:
from pyspark.sql.functions import col, trim, when, initcap

print("步骤3: 处理职业数据")

# 3.1 Clean the occupation name: trim surrounding whitespace, map missing or
# placeholder values to "Unknown", and normalise the casing of the rest.
# The placeholder and empty checks run on the TRIMMED value so that
# whitespace-padded placeholders or blank strings are also mapped to "Unknown".
occupation_trimmed = trim(col("Occupation"))

attributes_df = attributes_df.withColumn(
    "occupation_cleaned",
    when(
        occupation_trimmed.isNull() |
        (occupation_trimmed == "_______") |
        (occupation_trimmed == ""),
        "Unknown"
    # NOTE: initcap only capitalises the first letter of each whitespace-separated
    # word, so "Media_Manager" becomes "Media_manager" (visible in the cell output).
    ).otherwise(initcap(occupation_trimmed))
)

# Inspect the result
print("\n职业处理结果示例:")
attributes_df.select(
    "Customer_ID", "Occupation", "occupation_cleaned"
).show(15)

# Row count per cleaned occupation
print("\n职业分布:")
attributes_df.groupBy("occupation_cleaned").count().orderBy(col("count").desc()).show()

步骤3: 处理职业数据

职业处理结果示例:
+-----------+-------------+------------------+
|Customer_ID|   Occupation|occupation_cleaned|
+-----------+-------------+------------------+
| CUS_0x1000|       Lawyer|            Lawyer|
| CUS_0x1009|     Mechanic|          Mechanic|
| CUS_0x100b|Media_Manager|     Media_manager|
| CUS_0x1011|       Doctor|            Doctor|
| CUS_0x1013|     Mechanic|          Mechanic|
| CUS_0x1015|   Journalist|        Journalist|
| CUS_0x1018|   Accountant|        Accountant|
| CUS_0x1026|      Manager|           Manager|
| CUS_0x102d| Entrepreneur|      Entrepreneur|
| CUS_0x102e|    Scientist|         Scientist|
| CUS_0x1032|       Lawyer|            Lawyer|
| CUS_0x1037|   Accountant|        Accountant|
| CUS_0x1038|    Architect|         Architect|
| CUS_0x103e|    Scientist|         Scientist|
| CUS_0x1041|      Teacher|           Teacher|
+-----------+-------------+------------------+
only showing top 15 rows


职业分布:
+------------------+-----+
|occupation_cleaned|coun

In [101]:
# `lit` is used below; import it here so the cell does not depend on a name
# left in the kernel by some earlier cell.
from pyspark.sql.functions import current_timestamp, datediff, months_between, col, lit

print("步骤4: 处理日期数据和最终整合")

# 4.1 Compute how old each record is, in months, relative to a fixed reference
# date (fixed rather than "today" so re-runs give the same result).
# NOTE: this is a plain string, not pyspark.sql.functions.current_date.
current_date = "2025-05-01"
attributes_df = attributes_df.withColumn(
    "data_age_months",
    months_between(lit(current_date).cast("date"), col("snapshot_date"))
)

# 4.2 Assemble the final silver-layer table
silver_attributes_df = attributes_df.select(
    # Original fields
    "Customer_ID",
    "Name",
    "snapshot_date",
    # Processed fields
    col("age_processed").alias("age"),
    col("is_valid_age"),
    "SSN",
    col("is_valid_ssn"),
    col("occupation_cleaned").alias("occupation"),
    col("data_age_months"),
    # Metadata fields
    col("bronze_ingest_timestamp"),
    current_timestamp().alias("silver_process_timestamp")
)

# Inspect the final result
# NOTE(review): this prints Name and SSN values into the notebook output --
# clear or redact the output before sharing if the data is real.
print("\nSilver层处理结果示例:")
silver_attributes_df.show(10)

# Show the schema
print("\nSilver层数据模式:")
silver_attributes_df.printSchema()

# Persist the silver-layer data (overwrites any previous run)
silver_attributes_df.write.mode("overwrite").parquet("datamart/silver/silver_attributes")
print("\nSilver层数据已保存到: datamart/silver/silver_attributes")

# Read the saved data back to verify the row count
silver_validate = spark.read.parquet("datamart/silver/silver_attributes")
print(f"\n已保存的记录数: {silver_validate.count()}")

步骤4: 处理日期数据和最终整合

Silver层处理结果示例:
+-----------+----------------+-------------+---+------------+-----------+------------+-------------+---------------+-----------------------+------------------------+
|Customer_ID|            Name|snapshot_date|age|is_valid_age|        SSN|is_valid_ssn|   occupation|data_age_months|bronze_ingest_timestamp|silver_process_timestamp|
+-----------+----------------+-------------+---+------------+-----------+------------+-------------+---------------+-----------------------+------------------------+
| CUS_0x1000|  Alistair Barrf|   2023-05-01| 18|        true|913-74-1218|        true|       Lawyer|           24.0|   2025-05-17 16:28:...|    2025-05-18 09:16:...|
| CUS_0x1009|          Arunah|   2025-01-01| 26|        true|063-67-6938|        true|     Mechanic|            4.0|   2025-05-17 16:28:...|    2025-05-18 09:16:...|
| CUS_0x100b|        Shirboni|   2024-03-01| 19|        true|  #F%$D@*&8|       false|Media_manager|           14.0|   2025-05-17 16:28:.

In [103]:
from pyspark.sql.functions import col, count, when, isnan, isnull, avg, stddev
# Import the Spark aggregates under aliases: importing them as bare `min`/`max`
# would shadow the Python builtins for every later cell in the kernel.
from pyspark.sql.functions import min as spark_min, max as spark_max


def _pct(part, whole):
    """Return `part` as a percentage of `whole`, or NaN when `whole` is 0."""
    return part / whole * 100 if whole else float("nan")


# Column pairs compared between the two layers (bronze name -> silver name)
BRONZE_COMPARE_COLS = ["Customer_ID", "Name", "Age", "SSN", "Occupation", "snapshot_date"]
SILVER_COMPARE_COLS = ["Customer_ID", "Name", "age", "SSN", "occupation", "snapshot_date"]

# Reload the bronze and silver data for comparison
bronze_attributes_df = spark.read.parquet("datamart/bronze/bronze_attributes")
silver_attributes_df = spark.read.parquet("datamart/silver/silver_attributes")

print("=== Silver层属性数据验证 ===")

# 1. Basic row-count check
bronze_count = bronze_attributes_df.count()
silver_count = silver_attributes_df.count()

print(f"Bronze层记录数: {bronze_count}")
print(f"Silver层记录数: {silver_count}")
print(f"记录数是否匹配: {bronze_count == silver_count}")

# 2. Age quality metrics on the silver layer
print("\n年龄数据质量:")
age_stats = silver_attributes_df.agg(
    count("age").alias("非空年龄数"),
    count(when(col("is_valid_age"), True)).alias("有效年龄数"),
    avg("age").alias("平均年龄"),
    spark_min("age").alias("最小年龄"),
    spark_max("age").alias("最大年龄"),
    stddev("age").alias("年龄标准差")
).collect()[0]

print(f"非空年龄数: {age_stats['非空年龄数']} ({_pct(age_stats['非空年龄数'], silver_count):.2f}%)")
print(f"有效年龄数: {age_stats['有效年龄数']} ({_pct(age_stats['有效年龄数'], silver_count):.2f}%)")
print(f"平均年龄: {age_stats['平均年龄']:.2f}")
print(f"年龄范围: {age_stats['最小年龄']} - {age_stats['最大年龄']}")
print(f"年龄标准差: {age_stats['年龄标准差']:.2f}")

# 3. SSN quality metrics
print("\nSSN数据质量:")
ssn_stats = silver_attributes_df.agg(
    count(when(col("is_valid_ssn"), True)).alias("有效SSN数")
).collect()[0]

print(f"有效SSN数: {ssn_stats['有效SSN数']} ({_pct(ssn_stats['有效SSN数'], silver_count):.2f}%)")

# 4. Occupation distribution
print("\n职业数据分布 (Top 10):")
occupation_dist = silver_attributes_df.groupBy("occupation") \
    .count() \
    .orderBy(col("count").desc()) \
    .limit(10)

occupation_dist.show()

# 5. How many occupations ended up as "Unknown"
unknown_count = silver_attributes_df.filter(col("occupation") == "Unknown").count()
print(f"'Unknown'职业数: {unknown_count} ({_pct(unknown_count, silver_count):.2f}%)")

# 6. Data-age statistics
print("\n数据年龄统计:")
age_data_stats = silver_attributes_df.agg(
    avg("data_age_months").alias("平均数据年龄(月)"),
    spark_min("data_age_months").alias("最小数据年龄(月)"),
    spark_max("data_age_months").alias("最大数据年龄(月)")
).collect()[0]

print(f"平均数据年龄: {age_data_stats['平均数据年龄(月)']:.2f} 个月")
print(f"数据年龄范围: {age_data_stats['最小数据年龄(月)']:.2f} - {age_data_stats['最大数据年龄(月)']:.2f} 个月")

# 7. Compare the age distribution before and after processing
print("\n处理前后的年龄分布比较:")
print("Bronze层前10个最常见年龄:")
bronze_attributes_df.groupBy("Age") \
    .count() \
    .orderBy(col("count").desc()) \
    .limit(10) \
    .show()

print("Silver层前10个最常见年龄:")
silver_attributes_df.groupBy("age") \
    .count() \
    .orderBy(col("count").desc()) \
    .limit(10) \
    .show()

# 8. Check that every Customer_ID survived processing.
# The distinct IDs of both layers are collected to the driver and compared as sets.
print("\n客户ID完整性检查:")
bronze_customers = {row["Customer_ID"] for row in bronze_attributes_df.select("Customer_ID").distinct().collect()}
silver_customers = {row["Customer_ID"] for row in silver_attributes_df.select("Customer_ID").distinct().collect()}

missing_customers = bronze_customers - silver_customers
extra_customers = silver_customers - bronze_customers

print(f"Bronze层唯一客户数: {len(bronze_customers)}")
print(f"Silver层唯一客户数: {len(silver_customers)}")
print(f"丢失的客户数: {len(missing_customers)}")
print(f"额外的客户数: {len(extra_customers)}")

if len(missing_customers) > 0:
    print("示例丢失的客户ID:", list(missing_customers)[:5])

# 9. Compare non-null counts per field.
# count(column) already counts only non-null values, so no when/isNull wrapper is needed.
print("\n非空值比较:")
bronze_non_null = bronze_attributes_df.select(
    [count(col(c)).alias(c) for c in BRONZE_COMPARE_COLS]
).collect()[0]

silver_non_null = silver_attributes_df.select(
    [count(col(c)).alias(c) for c in SILVER_COMPARE_COLS]
).collect()[0]

for bronze_col, silver_col in zip(BRONZE_COMPARE_COLS, SILVER_COMPARE_COLS):
    bronze_non_null_count = bronze_non_null[bronze_col]
    silver_non_null_count = silver_non_null[silver_col]
    # _pct avoids a ZeroDivisionError when a bronze column is entirely NULL
    print(f"{bronze_col} -> {silver_col}: Bronze非空={bronze_non_null_count}, Silver非空={silver_non_null_count}, " +
          f"比例={_pct(silver_non_null_count, bronze_non_null_count):.2f}%")

print("\n=== 验证完成 ===")

=== Silver层属性数据验证 ===
Bronze层记录数: 12500
Silver层记录数: 12500
记录数是否匹配: True

年龄数据质量:
非空年龄数: 12181 (97.45%)
有效年龄数: 12181 (97.45%)
平均年龄: 33.62
年龄范围: 14 - 56
年龄标准差: 10.77

SSN数据质量:
有效SSN数: 11797 (94.38%)

职业数据分布 (Top 10):
+-------------+-----+
|   occupation|count|
+-------------+-----+
|      Unknown|  880|
|       Lawyer|  828|
|    Architect|  795|
|     Engineer|  793|
|   Accountant|  791|
|    Scientist|  789|
|      Teacher|  782|
|    Developer|  780|
|Media_manager|  780|
|     Mechanic|  780|
+-------------+-----+

'Unknown'职业数: 880 (7.04%)

数据年龄统计:
平均数据年龄: 16.00 个月
数据年龄范围: 4.00 - 28.00 个月

处理前后的年龄分布比较:
Bronze层前10个最常见年龄:
+---+-----+
|Age|count|
+---+-----+
| 26|  364|
| 35|  361|
| 32|  360|
| 39|  360|
| 38|  345|
| 28|  343|
| 44|  343|
| 20|  342|
| 41|  340|
| 25|  340|
+---+-----+

Silver层前10个最常见年龄:
+---+-----+
|age|count|
+---+-----+
| 32|  385|
| 39|  385|
| 26|  379|
| 35|  371|
| 28|  363|
| 38|  362|
| 29|  359|
| 20|  358|
| 36|  357|
| 44|  356|
+---+-----+


客户ID完整性检查:
