In [2]:
from pyspark.sql import SparkSession

import os

# Directory that holds the raw CSV input files.
DATA_DIR = "/app/data"

# Create the Spark session with explicit driver/executor memory, Arrow-based
# pandas conversion enabled, and Kryo configured as the serializer.
spark = (
    SparkSession.builder
    .appName("Test")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Simple smoke test: just report the Spark version.
print("Spark version:", spark.version)

# List the input files. os.listdir() returns entries in arbitrary order, so
# sort them to keep the printed listing stable from one run to the next.
print("Data files:")
try:
    for file_name in sorted(os.listdir(DATA_DIR)):
        print(f" - {file_name}")
except OSError as e:
    # Missing, unreadable, or non-directory path: report it and keep going.
    print(f"Error accessing data directory: {e}")

Spark version: 3.5.0
Data files:
 - features_attributes.csv
 - features_financials.csv
 - feature_clickstream.csv
 - lms_loan_daily.csv


In [4]:
from pyspark.sql import SparkSession

# Obtain the Spark session used for the Bronze-layer checks.
# NOTE(review): getOrCreate() hands back an already-running session if the
# kernel has one, in which case the app name and driver memory requested here
# may not take effect -- confirm whether that matters for this notebook.
spark = (
    SparkSession.builder
    .appName("Bronze Layer Validation")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Bronze tables to inspect; each one is read from datamart/bronze/<table>.
bronze_tables = [
    "bronze_loan_daily",
    "bronze_financials",
    "bronze_attributes",
    "bronze_clickstream",
]

for table in bronze_tables:
    path = f"datamart/bronze/{table}"
    try:
        df = spark.read.parquet(path)

        # Report the row count, schema and first five rows of the table.
        print(f"\n=== {table} ===")
        row_count = df.count()
        print(f"Row count: {row_count}")
        print("Schema:")
        df.printSchema()
        print("Sample data:")
        df.show(5, truncate=False)
    except Exception as e:
        # Best-effort validation: report the failure and move on to the next table.
        print(f"Error reading {table}: {e}")


=== bronze_loan_daily ===
Row count: 137500
Schema:
root
 |-- loan_id: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- loan_start_date: date (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- installment_num: integer (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- due_amt: integer (nullable = true)
 |-- paid_amt: integer (nullable = true)
 |-- overdue_amt: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- bronze_ingest_timestamp: timestamp (nullable = true)
 |-- bronze_source_file: string (nullable = true)

Sample data:
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+--------------------------+------------------+
|loan_id              |Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|bronze_ingest_timestamp   |bronze_source_f