In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, isnull, countDistinct

# 创建Spark会话
spark = SparkSession.builder \
    .appName("Financial Data Analysis") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# 加载bronze_financials数据
bronze_financials_df = spark.read.parquet("datamart/bronze/bronze_financials")

# 基本信息
print(f"总行数: {bronze_financials_df.count()}")
print(f"总列数: {len(bronze_financials_df.columns)}")

# 显示模式
print("\n数据模式:")
bronze_financials_df.printSchema()

# 显示样本数据
print("\n样本数据:")
bronze_financials_df.show(5, truncate=False)

# 计算每列的非空值和空值数量
print("\n列的非空值和空值数量:")
null_counts = bronze_financials_df.select([
    count(when(col(c).isNull() | isnull(c), c)).alias(c) 
    for c in bronze_financials_df.columns
])
null_counts.show(truncate=False)

# 检查数值型字段中的异常符号
from pyspark.sql.functions import regexp_extract

print("\n数值字段中的非数字字符:")
numeric_fields = ["Annual_Income", "Monthly_Inhand_Salary", "Outstanding_Debt", 
                 "Credit_Utilization_Ratio", "Total_EMI_per_month", "Amount_invested_monthly"]

for field in numeric_fields:
    non_numeric = bronze_financials_df.withColumn(
        "has_non_numeric", 
        regexp_extract(col(field), "[^0-9\\.]", 0) != ""
    ).filter(col("has_non_numeric") == True)
    
    count_non_numeric = non_numeric.count()
    print(f"{field}: {count_non_numeric} 行含非数字字符 ({count_non_numeric/bronze_financials_df.count()*100:.2f}%)")
    
    if count_non_numeric > 0:
        print("示例:")
        non_numeric.select(field).distinct().show(5, truncate=False)

# 检查文本格式字段
print("\n文本格式字段分析:")
text_fields = ["Credit_History_Age", "Payment_of_Min_Amount", "Credit_Mix", "Payment_Behaviour"]

for field in text_fields:
    distinct_values = bronze_financials_df.select(field).distinct()
    count_distinct = distinct_values.count()
    print(f"{field}: {count_distinct} 个不同值")
    
    print("示例值:")
    distinct_values.show(5, truncate=False)

总行数: 12500
总列数: 24

数据模式:
root
 |-- Customer_ID: string (nullable = true)
 |-- Annual_Income: string (nullable = true)
 |-- Monthly_Inhand_Salary: double (nullable = true)
 |-- Num_Bank_Accounts: integer (nullable = true)
 |-- Num_Credit_Card: integer (nullable = true)
 |-- Interest_Rate: integer (nullable = true)
 |-- Num_of_Loan: string (nullable = true)
 |-- Type_of_Loan: string (nullable = true)
 |-- Delay_from_due_date: integer (nullable = true)
 |-- Num_of_Delayed_Payment: string (nullable = true)
 |-- Changed_Credit_Limit: string (nullable = true)
 |-- Num_Credit_Inquiries: double (nullable = true)
 |-- Credit_Mix: string (nullable = true)
 |-- Outstanding_Debt: string (nullable = true)
 |-- Credit_Utilization_Ratio: double (nullable = true)
 |-- Credit_History_Age: string (nullable = true)
 |-- Payment_of_Min_Amount: string (nullable = true)
 |-- Total_EMI_per_month: double (nullable = true)
 |-- Amount_invested_monthly: string (nullable = true)
 |-- Payment_Behaviour: string (

In [3]:
from pyspark.sql.functions import (
    col, when, regexp_replace, trim, 
    split, size, array_contains, lit,
    expr, substring, length
)

# 1. 数值字段清理
print("步骤1: 清理数值字段")

# 加载bronze_financials数据
bronze_financials_df = spark.read.parquet("datamart/bronze/bronze_financials")

# 1.1 清理Annual_Income
financials_df = bronze_financials_df.withColumn(
    "annual_income_cleaned", 
    regexp_replace(col("Annual_Income"), "[^0-9\\.]", "")
).withColumn(
    "annual_income", 
    when(
        col("annual_income_cleaned").rlike("^\\d+(\\.\\d+)?$"),
        col("annual_income_cleaned").cast("double")
    ).otherwise(None)
).withColumn(
    "is_valid_annual_income",
    col("annual_income").isNotNull()
)

# 1.2 清理Outstanding_Debt
financials_df = financials_df.withColumn(
    "outstanding_debt_cleaned", 
    regexp_replace(col("Outstanding_Debt"), "[^0-9\\.]", "")
).withColumn(
    "outstanding_debt", 
    when(
        col("outstanding_debt_cleaned").rlike("^\\d+(\\.\\d+)?$"),
        col("outstanding_debt_cleaned").cast("double")
    ).otherwise(None)
).withColumn(
    "is_valid_outstanding_debt",
    col("outstanding_debt").isNotNull()
)

# 1.3 清理Amount_invested_monthly
financials_df = financials_df.withColumn(
    "amount_invested_monthly_cleaned", 
    regexp_replace(col("Amount_invested_monthly"), "[^0-9\\.]", "")
).withColumn(
    "amount_invested_monthly", 
    when(
        col("amount_invested_monthly_cleaned").rlike("^\\d+(\\.\\d+)?$"),
        col("amount_invested_monthly_cleaned").cast("double")
    ).otherwise(None)
).withColumn(
    "is_valid_amount_invested",
    col("amount_invested_monthly").isNotNull()
)

# 查看处理结果
print("\n数值字段清理结果:")
financials_df.select(
    "Customer_ID",
    "Annual_Income", "annual_income", "is_valid_annual_income",
    "Outstanding_Debt", "outstanding_debt", "is_valid_outstanding_debt",
    "Amount_invested_monthly", "amount_invested_monthly", "is_valid_amount_invested"
).show(5, truncate=False)

# 统计有效数据比例
valid_annual_income = financials_df.filter(col("is_valid_annual_income") == True).count()
valid_outstanding_debt = financials_df.filter(col("is_valid_outstanding_debt") == True).count()
valid_amount_invested = financials_df.filter(col("is_valid_amount_invested") == True).count()

total_rows = financials_df.count()
print(f"\n有效数据比例:")
print(f"Annual Income: {valid_annual_income} ({valid_annual_income/total_rows*100:.2f}%)")
print(f"Outstanding Debt: {valid_outstanding_debt} ({valid_outstanding_debt/total_rows*100:.2f}%)")
print(f"Amount Invested Monthly: {valid_amount_invested} ({valid_amount_invested/total_rows*100:.2f}%)")

步骤1: 清理数值字段

数值字段清理结果:
+-----------+------------------+------------------+----------------------+----------------+----------------+-------------------------+-----------------------+-----------------------+------------------------+
|Customer_ID|Annual_Income     |annual_income     |is_valid_annual_income|Outstanding_Debt|outstanding_debt|is_valid_outstanding_debt|Amount_invested_monthly|amount_invested_monthly|is_valid_amount_invested|
+-----------+------------------+------------------+----------------------+----------------+----------------+-------------------------+-----------------------+-----------------------+------------------------+
|CUS_0x1000 |30625.94          |30625.94          |true                  |1562.91         |1562.91         |true                     |77.31427572208112      |77.31427572208112      |true                    |
|CUS_0x1009 |52312.68          |52312.68          |true                  |202.68          |202.68          |true                     |58.66019164

In [9]:
from pyspark.sql.functions import (
    col, when, regexp_extract, split, 
    element_at, concat, lit, lower, upper
)

# 2. 文本格式金融指标处理
print("步骤2: 处理文本格式金融指标")

# 2.1 解析Credit_History_Age
print("\n2.1 解析Credit_History_Age")
financials_df = financials_df.withColumn(
    "credit_history_years",
    when(
        col("Credit_History_Age").rlike("^\\d+ Years?.*"),
        regexp_extract(col("Credit_History_Age"), "^(\\d+) Years?.*", 1).cast("int")
    ).otherwise(0)
).withColumn(
    "credit_history_months",
    when(
        col("Credit_History_Age").rlike(".*and (\\d+) Months?.*"),
        regexp_extract(col("Credit_History_Age"), ".*and (\\d+) Months?.*", 1).cast("int")
    ).otherwise(0)
).withColumn(
    "credit_history_total_months",
    col("credit_history_years") * 12 + col("credit_history_months")
)

# 查看结果
print("Credit_History_Age解析结果:")
financials_df.select(
    "Customer_ID", 
    "Credit_History_Age", 
    "credit_history_years", 
    "credit_history_months",
    "credit_history_total_months"
).show(10, truncate=False)

# 2.2 标准化Payment_of_Min_Amount
print("\n2.2 标准化Payment_of_Min_Amount")
financials_df = financials_df.withColumn(
    "payment_of_min_amount_standardized",
    when(col("Payment_of_Min_Amount") == "Yes", "YES")
    .when(col("Payment_of_Min_Amount") == "No", "NO")
    .when(col("Payment_of_Min_Amount") == "NM", "NOT_SPECIFIED")
    .otherwise("UNKNOWN")
).withColumn(
    "is_min_payment_made",
    when(col("Payment_of_Min_Amount") == "Yes", True)
    .when(col("Payment_of_Min_Amount") == "No", False)
    .otherwise(None)  # NULL for NM and unknown values
)

# 查看结果
print("Payment_of_Min_Amount标准化结果:")
financials_df.select(
    "Customer_ID", 
    "Payment_of_Min_Amount", 
    "payment_of_min_amount_standardized",
    "is_min_payment_made"
).groupBy("Payment_of_Min_Amount", "payment_of_min_amount_standardized", "is_min_payment_made") \
 .count() \
 .show(truncate=False)

# 2.3 处理Credit_Mix
print("\n2.3 处理Credit_Mix")
financials_df = financials_df.withColumn(
    "credit_mix_standardized",
    when(col("Credit_Mix") == "_", "UNKNOWN")
    .when(col("Credit_Mix") == "Bad", "BAD")
    .when(col("Credit_Mix") == "Good", "GOOD")
    .when(col("Credit_Mix") == "Standard", "STANDARD")
    .otherwise(upper(col("Credit_Mix")))  # 处理其他可能的值，统一为大写
)

# 查看结果
print("Credit_Mix标准化结果:")
financials_df.select(
    "Customer_ID", 
    "Credit_Mix", 
    "credit_mix_standardized"
).groupBy("Credit_Mix", "credit_mix_standardized") \
 .count() \
 .show(truncate=False)

# 2.4 分析Payment_Behaviour
print("\n2.4 分析Payment_Behaviour")
# 先查看所有不同的值
payment_behavior_types = financials_df.select("Payment_Behaviour").distinct()
print("Payment_Behaviour所有不同值:")
payment_behavior_types.show(truncate=False)

# 创建标准化版本和提取支付行为特征
financials_df = financials_df.withColumn(
    "payment_behavior_standardized",
    upper(col("Payment_Behaviour"))
).withColumn(
    "payment_behavior_spent_level",
    when(col("Payment_Behaviour").contains("High_spent"), "HIGH")
    .when(col("Payment_Behaviour").contains("Low_spent"), "LOW")
    .otherwise("UNKNOWN")
).withColumn(
    "payment_behavior_value_level",
    when(col("Payment_Behaviour").contains("Large_value"), "LARGE")
    .when(col("Payment_Behaviour").contains("Medium_value"), "MEDIUM")
    .when(col("Payment_Behaviour").contains("Small_value"), "SMALL")
    .otherwise("UNKNOWN")
)

# 查看结果
print("Payment_Behaviour分析结果:")
financials_df.select(
    "Customer_ID", 
    "Payment_Behaviour", 
    "payment_behavior_spent_level",
    "payment_behavior_value_level"
).groupBy("Payment_Behaviour", "payment_behavior_spent_level", "payment_behavior_value_level") \
 .count() \
 .orderBy("count", ascending=False) \
 .show(truncate=False)

步骤2: 处理文本格式金融指标

2.1 解析Credit_History_Age
Credit_History_Age解析结果:
+-----------+----------------------+--------------------+---------------------+---------------------------+
|Customer_ID|Credit_History_Age    |credit_history_years|credit_history_months|credit_history_total_months|
+-----------+----------------------+--------------------+---------------------+---------------------------+
|CUS_0x1000 |10 Years and 9 Months |10                  |9                    |129                        |
|CUS_0x1009 |31 Years and 0 Months |31                  |0                    |372                        |
|CUS_0x100b |15 Years and 10 Months|15                  |10                   |190                        |
|CUS_0x1011 |15 Years and 10 Months|15                  |10                   |190                        |
|CUS_0x1013 |17 Years and 10 Months|17                  |10                   |214                        |
|CUS_0x1015 |21 Years and 5 Months |21                  |5            

In [16]:
from pyspark.sql.functions import (
    col, when, regexp_replace, split, 
    size, array_contains, lit, explode
)

# 3. 贷款相关字段处理
print("步骤3: 处理贷款相关字段")

# 3.1 处理Num_of_Loan - 转换为整数
print("\n3.1 处理Num_of_Loan")
financials_df = financials_df.withColumn(
    "num_of_loan_cleaned",
    regexp_replace(col("Num_of_Loan"), "[^0-9]", "")  # 移除非数字字符
).withColumn(
    "num_of_loans",
    when(
        col("num_of_loan_cleaned").rlike("^\\d+$"),
        col("num_of_loan_cleaned").cast("int")
    ).otherwise(0)  # 默认值为0
)

# 检查结果
print("Num_of_Loan处理结果:")
financials_df.select(
    "Customer_ID", 
    "Num_of_Loan", 
    "num_of_loans"
).show(10, truncate=False)

# 3.2 处理Num_of_Delayed_Payment - 转换为整数
print("\n3.2 处理Num_of_Delayed_Payment")
financials_df = financials_df.withColumn(
    "num_delayed_payment_cleaned",
    regexp_replace(col("Num_of_Delayed_Payment"), "[^0-9]", "")  # 移除非数字字符
).withColumn(
    "num_delayed_payments",
    when(
        col("num_delayed_payment_cleaned").rlike("^\\d+$"),
        col("num_delayed_payment_cleaned").cast("int")
    ).otherwise(0)  # 默认值为0
)

# 检查结果
print("Num_of_Delayed_Payment处理结果:")
financials_df.select(
    "Customer_ID", 
    "Num_of_Delayed_Payment", 
    "num_delayed_payments"
).show(10, truncate=False)

# 3.3 处理Type_of_Loan - 解析逗号分隔列表
print("\n3.3 处理Type_of_Loan")

# 首先查看不同类型的贷款
if True:  # 为了避免输出太多，可以控制是否运行
    print("所有贷款类型（前50个）:")
    loan_types = financials_df.select(
        explode(split(col("Type_of_Loan"), ",")).alias("loan_type")
    ).distinct()
    loan_types.show(50, truncate=False)

# 将贷款类型转换为数组
financials_df = financials_df.withColumn(
    "loan_types_array",
    when(
        col("Type_of_Loan").isNotNull(),
        split(col("Type_of_Loan"), ",")
    ).otherwise(array())  # 空数组
).withColumn(
    "loan_types_count",
    when(
        col("loan_types_array").isNotNull(),
        size(col("loan_types_array"))
    ).otherwise(0)
)

# 创建常见贷款类型的标志
loan_type_flags = [
    "Credit-Builder Loan", 
    "Home Equity Loan", 
    "Debt Consolidation Loan", 
    "Personal Loan", 
    "Payday Loan", 
    "Student Loan", 
    "Mortgage Loan", 
    "Auto Loan", 
    "Credit-Builder Loan"
]

# 为每种常见贷款类型创建标志
for loan_type in loan_type_flags:
    clean_name = loan_type.replace("-", "_").replace(" ", "_").lower()
    financials_df = financials_df.withColumn(
        f"has_{clean_name}",
        when(
            array_contains(col("loan_types_array"), loan_type) |
            array_contains(col("loan_types_array"), f" {loan_type}"),  # 处理前导空格
            True
        ).otherwise(False)
    )

# 检查结果
print("Type_of_Loan处理结果:")
display_columns = ["Customer_ID", "Type_of_Loan", "loan_types_count"] + [
    f"has_{loan_type.replace('-', '_').replace(' ', '_').lower()}" 
    for loan_type in loan_type_flags[:20]  # 只显示前5种贷款类型标志
]
financials_df.select(display_columns).show(5, truncate=False)

# 3.4 处理Changed_Credit_Limit - 转换为数值
print("\n3.4 处理Changed_Credit_Limit")
financials_df = financials_df.withColumn(
    "changed_credit_limit_cleaned",
    regexp_replace(col("Changed_Credit_Limit"), "[^0-9\\.-]", "")  # 移除非数字和非小数点字符
).withColumn(
    "changed_credit_limit",
    when(
        col("changed_credit_limit_cleaned").rlike("^-?\\d+(\\.\\d+)?$"),
        col("changed_credit_limit_cleaned").cast("double")
    ).otherwise(None)  # 无效值设为NULL
)

# 检查结果
print("Changed_Credit_Limit处理结果:")
financials_df.select(
    "Customer_ID", 
    "Changed_Credit_Limit", 
    "changed_credit_limit"
).show(10, truncate=False)

步骤3: 处理贷款相关字段

3.1 处理Num_of_Loan
Num_of_Loan处理结果:
+-----------+-----------+------------+
|Customer_ID|Num_of_Loan|num_of_loans|
+-----------+-----------+------------+
|CUS_0x1000 |2          |2           |
|CUS_0x1009 |4          |4           |
|CUS_0x100b |0          |0           |
|CUS_0x1011 |3          |3           |
|CUS_0x1013 |3          |3           |
|CUS_0x1015 |0          |0           |
|CUS_0x1018 |8          |8           |
|CUS_0x1026 |2          |2           |
|CUS_0x102d |1          |1           |
|CUS_0x102e |4          |4           |
+-----------+-----------+------------+
only showing top 10 rows


3.2 处理Num_of_Delayed_Payment
Num_of_Delayed_Payment处理结果:
+-----------+----------------------+--------------------+
|Customer_ID|Num_of_Delayed_Payment|num_delayed_payments|
+-----------+----------------------+--------------------+
|CUS_0x1000 |26                    |26                  |
|CUS_0x1009 |18                    |18                  |
|CUS_0x100b |8                

In [18]:
# 检查确切的列名
print("所有列名:")
for field in financials_df.schema.fields:
    print(field.name)

# 使用确切的列名检查剩余字段
print("\n检查剩余字段的状态:")

# 确定Monthly_Balance的确切列名
monthly_balance_col_name = [field.name for field in financials_df.schema.fields if field.name.lower() == "monthly_balance" or "monthly" in field.name.lower()]
if monthly_balance_col_name:
    print(f"找到Monthly Balance列名: {monthly_balance_col_name[0]}")
    print("\nMonthly Balance字段样本:")
    financials_df.select("Customer_ID", monthly_balance_col_name[0]).show(5, truncate=False)
else:
    print("未找到Monthly Balance列")

# 检查snapshot_date的NULL值
print("\nsnapshot_date非空值检查:")
snapshot_null_count = financials_df.filter(col("snapshot_date").isNull()).count()
snapshot_not_null_count = financials_df.filter(col("snapshot_date").isNotNull()).count()
print(f"NULL值数量: {snapshot_null_count}")
print(f"非NULL值数量: {snapshot_not_null_count}")

# 查看部分数据记录以确认其他字段的状态
print("\n数据样本:")
financials_df.select(
    "Customer_ID", 
    "Num_Bank_Accounts", 
    "Num_Credit_Card", 
    "Interest_Rate", 
    "Delay_from_due_date",
    "snapshot_date"
).show(5, truncate=False)

所有列名:
Customer_ID
annual_income
Monthly_Inhand_Salary
Num_Bank_Accounts
Num_Credit_Card
Interest_Rate
num_of_loan
Type_of_Loan
Delay_from_due_date
num_of_delayed_payment
changed_credit_limit
Num_Credit_Inquiries
Credit_Mix
outstanding_debt
Credit_Utilization_Ratio
Credit_History_Age
Payment_of_Min_Amount
Total_EMI_per_month
amount_invested_monthly
Payment_Behaviour
monthly_balance
snapshot_date
bronze_ingest_timestamp
bronze_source_file
annual_income_cleaned
is_valid_annual_income
outstanding_debt_cleaned
is_valid_outstanding_debt
amount_invested_monthly_cleaned
is_valid_amount_invested
credit_history_years
credit_history_months
credit_history_total_months
min_payment_made
credit_mix_standardized
loan_types_array
num_loan_types
has_personal_loan
has_student_loan
has_mortgage_loan
has_auto_loan
has_home_equity_loan
has_debt_consolidation_loan
has_credit_builder_loan
has_payday_loan
changed_credit_limit_cleaned
debt_to_income_ratio
monthly_disposable_income
credit_utilization_category
pr

In [19]:
from pyspark.sql.functions import (
    col, when, lit, current_timestamp, 
    to_date, coalesce, regexp_replace
)

# 最后处理步骤
print("执行最终处理步骤")

# 1. 处理monthly_balance字段
print("\n1. 处理monthly_balance字段")
financials_df = financials_df.withColumn(
    "monthly_balance_cleaned", 
    regexp_replace(col("monthly_balance"), "[^0-9\\.-]", "")  # 移除非数字字符
).withColumn(
    "monthly_balance_processed", 
    when(
        col("monthly_balance_cleaned").rlike("^-?\\d+(\\.\\d+)?$"),
        col("monthly_balance_cleaned").cast("double")
    ).otherwise(None)  # 无效值设为NULL
)

# 查看处理结果
print("monthly_balance处理结果:")
financials_df.select(
    "Customer_ID", 
    "monthly_balance", 
    "monthly_balance_processed"
).show(5, truncate=False)

# 2. 处理snapshot_date为NULL的问题
print("\n2. 处理snapshot_date为NULL的问题")
# 由于所有snapshot_date都是NULL，我们可以使用一个默认日期
default_date = "2023-01-01"  # 使用一个合理的默认日期
financials_df = financials_df.withColumn(
    "silver_snapshot_date",
    coalesce(col("snapshot_date"), to_date(lit(default_date)))
)

# 查看处理结果
print("snapshot_date处理结果:")
financials_df.select(
    "Customer_ID", 
    "snapshot_date", 
    "silver_snapshot_date"
).show(5, truncate=False)

# 3. 创建最终的Silver层表，包含所有处理过的字段
print("\n3. 创建最终的Silver层财务表")
silver_financials_df = financials_df.select(
    # 客户ID和日期
    "Customer_ID",
    col("silver_snapshot_date").alias("snapshot_date"),
    
    # 收入和薪资
    col("annual_income").alias("annual_income"),
    "Monthly_Inhand_Salary",
    
    # 银行和信用卡
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    
    # 贷款信息
    "Interest_Rate",
    col("num_of_loans").alias("num_of_loans"),
    col("loan_types_count").alias("loan_types_count"),
    
    # 常见贷款类型标志
    "has_personal_loan",
    "has_student_loan", 
    "has_mortgage_loan",
    "has_auto_loan",
    "has_home_equity_loan",
    "has_debt_consolidation_loan", 
    "has_credit_builder_loan",
    "has_payday_loan",
    
    # 延迟付款
    "Delay_from_due_date",
    col("num_delayed_payments").alias("num_delayed_payments"),
    
    # 信用相关
    col("changed_credit_limit").alias("changed_credit_limit"),
    "Num_Credit_Inquiries",
    col("credit_mix_standardized").alias("credit_mix"),
    col("outstanding_debt").alias("outstanding_debt"),
    "Credit_Utilization_Ratio",
    col("credit_history_years").alias("credit_history_years"),
    col("credit_history_months").alias("credit_history_months"),
    col("credit_history_total_months").alias("credit_history_total_months"),
    
    # 支付行为
    col("is_min_payment_made").alias("is_min_payment_made"),
    "Total_EMI_per_month",
    col("amount_invested_monthly").alias("amount_invested_monthly"),
    col("payment_behavior_spent_level").alias("payment_behavior_spent_level"),
    col("payment_behavior_value_level").alias("payment_behavior_value_level"),
    col("monthly_balance_processed").alias("monthly_balance"),
    
    # 元数据
    "bronze_ingest_timestamp",
    current_timestamp().alias("silver_process_timestamp")
)

# 查看最终结果
print("最终Silver层财务表结构:")
silver_financials_df.printSchema()
print("\n示例数据:")
silver_financials_df.show(5, truncate=False)

# 4. 保存Silver层数据
silver_financials_df.write.mode("overwrite").parquet("datamart/silver/silver_financials")
print("\nSilver层财务数据已保存到: datamart/silver/silver_financials")

# 统计记录数
silver_count = silver_financials_df.count()
print(f"保存的记录总数: {silver_count}")

执行最终处理步骤

1. 处理monthly_balance字段
monthly_balance处理结果:
+-----------+------------------+-------------------------+
|Customer_ID|monthly_balance   |monthly_balance_processed|
+-----------+------------------+-------------------------+
|CUS_0x1000 |400.36080052211616|400.36080052211616       |
|CUS_0x1009 |508.01234122645366|508.01234122645366       |
|CUS_0x100b |597.8989834797281 |597.8989834797281        |
|CUS_0x1011 |294.1014665671429 |294.1014665671429        |
|CUS_0x1013 |485.8897083704929 |485.8897083704929        |
+-----------+------------------+-------------------------+
only showing top 5 rows


2. 处理snapshot_date为NULL的问题
snapshot_date处理结果:
+-----------+-------------+--------------------+
|Customer_ID|snapshot_date|silver_snapshot_date|
+-----------+-------------+--------------------+
|CUS_0x1000 |NULL         |2023-01-01          |
|CUS_0x1009 |NULL         |2023-01-01          |
|CUS_0x100b |NULL         |2023-01-01          |
|CUS_0x1011 |NULL         |2023-01-01          |
