In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) the Spark session used by the validation cells.
spark = (
    SparkSession.builder
    .appName("Bronze Layer Validation")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Bronze-layer tables to inspect; each is read as Parquet from datamart/bronze/<name>.
bronze_tables = [
    "bronze_loan_daily",
    "bronze_financials",
    "bronze_attributes",
    "bronze_clickstream"
]

# Upper bound on how many leading columns get a null-count report, to keep
# the output readable for wide tables.
MAX_NULL_CHECK_COLUMNS = 20

# Inspect every Bronze table: row/column counts, schema, a 5-row sample and
# null statistics for the leading columns.
for table in bronze_tables:
    try:
        # Build the table path and load the Parquet dataset.
        table_path = f"datamart/bronze/{table}"
        df = spark.read.parquet(table_path)

        # Basic table information.
        print(f"\n============= {table} =============")
        # Count once and reuse: every df.count() call launches a Spark job.
        row_count = df.count()
        print(f"行数: {row_count}")
        print(f"列数: {len(df.columns)}")
        print("\n模式:")
        df.printSchema()
        print("\n样本数据:")
        df.show(5, truncate=False)

        # Null statistics for the leading columns, computed in a single
        # aggregation instead of one filter+count job per column.
        checked_columns = df.columns[:MAX_NULL_CHECK_COLUMNS]
        if checked_columns:
            null_exprs = [
                # count() ignores nulls, so this counts the rows where the
                # column IS null. Positional aliases sidestep any odd
                # characters in the real column names.
                F.count(F.when(df[col_name].isNull(), 1)).alias(f"null_count_{idx}")
                for idx, col_name in enumerate(checked_columns)
            ]
            # A global aggregation always yields exactly one row.
            null_row = df.agg(*null_exprs).first()
            null_counts = dict(zip(checked_columns, null_row))
        else:
            null_counts = {}

        # The label reports the number of columns actually checked.
        print(f"\n前{len(checked_columns)}列的空值数量:")
        for col_name, null_count in null_counts.items():
            # Guard against empty tables so they are not misreported as unreadable.
            null_pct = null_count / row_count * 100 if row_count else 0.0
            print(f"{col_name}: {null_count} ({null_pct:.2f}%)")

    except Exception as e:
        # Best-effort validation: report the failure and move on to the next table.
        print(f"无法读取表 {table}: {e}")

print("\nBronze层验证完成!")


行数: 137500
列数: 13

模式:
root
 |-- loan_id: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- loan_start_date: date (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- installment_num: integer (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- due_amt: integer (nullable = true)
 |-- paid_amt: integer (nullable = true)
 |-- overdue_amt: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- bronze_ingest_timestamp: timestamp (nullable = true)
 |-- bronze_source_file: string (nullable = true)


样本数据:
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+--------------------------+------------------+
|loan_id              |Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|bronze_ingest_timestamp   |bronze_source_file|
+---------------------+-------

In [6]:
# Silver-layer tables to inspect; each is read as Parquet from datamart/silver/<name>.
silver_tables = [
    "silver_loan",
    "silver_financials",
    "silver_attributes",
    "silver_clickstream_detailed",
    "silver_clickstream_aggregated",
]

# Walk through every Silver table and print its size, schema and a small sample.
for table in silver_tables:
    try:
        # Load the Parquet dataset for this table.
        table_path = f"datamart/silver/{table}"
        df = spark.read.parquet(table_path)

        # Header first, so a failure in a later step is still attributed to this table.
        print(f"\n============= {table} =============")

        row_total = df.count()
        print(f"行数: {row_total}")

        column_total = len(df.columns)
        print(f"列数: {column_total}")

        print("\n模式:")
        df.printSchema()

        print("\n样本数据:")
        df.show(n=5, truncate=False)

    except Exception as e:
        # Best-effort: report the problem and continue with the next table.
        print(f"无法读取表 {table}: {e}")

print("\nSilver层验证完成!")


行数: 137500
列数: 27

模式:
root
 |-- loan_id: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- loan_start_date: date (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- installment_num: integer (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- due_amt: integer (nullable = true)
 |-- paid_amt: integer (nullable = true)
 |-- overdue_amt: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- bronze_ingest_timestamp: timestamp (nullable = true)
 |-- bronze_source_file: string (nullable = true)
 |-- payment_made_flag: integer (nullable = true)
 |-- monthly_overdue_flag: integer (nullable = true)
 |-- total_due_amount: integer (nullable = true)
 |-- total_paid_amount: long (nullable = true)
 |-- payment_ratio: double (nullable = true)
 |-- payment_status: string (nullable = true)
 |-- consecutive_missed_payments: long (nullable = true)
 |-- loan_age_months: double (nullable = true)
 |-- r