# Spark 数据清洗任务

基于AWS Glue脚本的本地Spark实现

## 功能
- 清洗客户基本信息
- 清洗客户行为资产数据
- 生成数据质量报告

## 环境配置

In [24]:
import os
import sys

# Supply fallback install locations for Java and Spark; values that already
# exist in the environment are left untouched.
os.environ.setdefault('JAVA_HOME', "C:\\Program Files\\Java\\jdk-11")
os.environ.setdefault('SPARK_HOME', "C:\\Users\\hy120\\spark\\spark-3.5.7-bin-hadoop3")

# Point every Hadoop-related variable at the local Hadoop install directory.
# Unlike the two variables above, these are always overwritten.
hadoop_root = "C:\\Users\\hy120\\hadoop"
os.environ.update({
    "HADOOP_HOME": hadoop_root,
    "HADOOP_COMMON_HOME": hadoop_root,
    "HADOOP_HDFS_HOME": hadoop_root,
    "HADOOP_MAPRED_HOME": hadoop_root,
    "HADOOP_YARN_HOME": hadoop_root,
    "HADOOP_CONF_DIR": "C:\\Users\\hy120\\hadoop\\etc\\hadoop",
})

# Make both the Spark driver and the Python workers use the interpreter that
# is running this notebook (e.g. the one from the active virtual environment).
for interpreter_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[interpreter_var] = sys.executable

# Echo the effective settings so the run log shows what was used.
print("Environment setup:")
for shown_var in ("JAVA_HOME", "SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON"):
    print(f"  {shown_var}:", os.environ.get(shown_var))

Environment setup:
  JAVA_HOME: C:\Program Files\Java\jdk-11
  SPARK_HOME: C:\Users\hy120\spark\spark-3.5.7-bin-hadoop3
  HADOOP_HOME: C:\Users\hy120\hadoop
  PYSPARK_PYTHON: C:\Users\hy120\Downloads\zhihullm\CASE-customer-group\.venv\Scripts\python.exe


## 1. 初始化 Spark Session

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, to_date, to_timestamp,
    trim, upper, lower, length,
    isnan, isnull, coalesce, lit,
    row_number, sum as spark_sum, year, month
)
from pyspark.sql.window import Window
import warnings
warnings.filterwarnings('ignore')

# Create (or reuse) the local Spark session.
# The two spark.hadoop.* permission settings relax Hadoop permission handling;
# they were added to avoid permission problems when running on Windows.
spark = (
    SparkSession.builder
    .appName("CustomerDataCleansing")
    .master("local[2]")
    .config("spark.driver.memory", "1g")
    .config("spark.executor.memory", "1g")
    # "spark.sql.execution.arrow.enabled" has been deprecated since Spark 3.0;
    # the PySpark-scoped key below is its replacement.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "300s")
    .config("spark.sql.shuffle.partitions", "10")
    .config("spark.hadoop.dfs.permissions.enabled", "false")
    .config("spark.hadoop.fs.permissions.umask-mode", "000")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

print("✓ Spark Session 创建成功")
print(f"  Spark版本: {spark.version}")
# Report the master actually in use: getOrCreate() may hand back an already
# running session whose master differs from the one requested above.
print(f"  Master: {spark.sparkContext.master}")

✓ Spark Session 创建成功
  Spark版本: 3.5.7
  Master: local[2]


## 2. 加载数据

In [26]:
from pathlib import Path

# Locate the project root: the nearest directory named "CASE-customer-group",
# starting at the current working directory and walking up through its parents.
start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if candidate.name == "CASE-customer-group"
    ),
    None,
)
if project_root is None:
    # Previously the search silently ended at the filesystem root, which
    # produced misleading paths; fall back to the working directory and say so.
    project_root = start_dir
    print(f"⚠ 未找到 CASE-customer-group 目录，改用当前工作目录: {project_root}")

# The input CSV files are expected directly under the project root.
customer_base_path = str(project_root / "customer_base.csv")
customer_behavior_path = str(project_root / "customer_behavior_assets.csv")

print(f"数据目录: {project_root}")
print(f"客户基本信息: {customer_base_path}")
print(f"客户行为资产: {customer_behavior_path}")
print()

# Load both tables, taking column names from the header row and letting Spark
# infer the column types.
df_customer_base = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(customer_base_path)

df_customer_behavior = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(customer_behavior_path)

print("✓ 数据加载完成")
print(f"  客户基本信息行数: {df_customer_base.count()}")
print(f"  客户行为资产行数: {df_customer_behavior.count()}")

数据目录: C:\Users\hy120\Downloads\zhihullm\CASE-customer-group
客户基本信息: C:\Users\hy120\Downloads\zhihullm\CASE-customer-group\customer_base.csv
客户行为资产: C:\Users\hy120\Downloads\zhihullm\CASE-customer-group\customer_behavior_assets.csv

✓ 数据加载完成
  客户基本信息行数: 10000
  客户行为资产行数: 120000


## 3. 清洗客户基本信息表

In [27]:
# 3.1 Inspect the raw customer base table: row count, schema and a sample.
separator = "=" * 60
print(separator)
print("客户基本信息 - 原始数据")
print(separator)

raw_base_rows = df_customer_base.count()
print(f"\n行数: {raw_base_rows}")

print("\n数据类型:")
df_customer_base.printSchema()

print("\n前3行:")
df_customer_base.show(3, truncate=False)

客户基本信息 - 原始数据

行数: 10000

数据类型:
root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- occupation_type: string (nullable = true)
 |-- monthly_income: double (nullable = true)
 |-- open_account_date: date (nullable = true)
 |-- lifecycle_stage: string (nullable = true)
 |-- marriage_status: string (nullable = true)
 |-- city_level: string (nullable = true)
 |-- branch_name: string (nullable = true)


前3行:
+--------------------------------+----+---+------+----------+---------------+--------------+-----------------+---------------+---------------+----------+----------------------------+
|customer_id                     |name|age|gender|occupation|occupation_type|monthly_income|open_account_date|lifecycle_stage|marriage_status|city_level|branch_name                 |
+--------------------------------+----+---+------+----------+---------------

In [28]:
# 3.2 Normalise column types and strip surrounding whitespace from the text
# columns of the customer base table.
base_text_columns = [
    "customer_id", "name", "gender", "occupation", "occupation_type",
    "lifecycle_stage", "marriage_status", "city_level", "branch_name",
]

df_customer_base_cleaned = df_customer_base
for text_column in base_text_columns:
    df_customer_base_cleaned = df_customer_base_cleaned.withColumn(
        text_column, trim(col(text_column))
    )

# Numeric and date columns get an explicit target type.
df_customer_base_cleaned = (
    df_customer_base_cleaned
    .withColumn("age", col("age").cast("int"))
    .withColumn("monthly_income", col("monthly_income").cast("double"))
    .withColumn("open_account_date", to_date(col("open_account_date"), "yyyy-MM-dd"))
)

print("✓ 数据类型转换完成")

✓ 数据类型转换完成


In [29]:
# 3.3 Outlier handling: ages outside 18-100 and monthly incomes outside
# 0-1,000,000 are replaced with NULL.
age_out_of_range = (col("age") < 18) | (col("age") > 100)
income_out_of_range = (col("monthly_income") < 0) | (col("monthly_income") > 1000000)

# Count the out-of-range rows BEFORE nulling them. Counting NULLs afterwards
# would also include values that were already missing in the input and
# report them as outliers.
age_outlier_count = df_customer_base_cleaned.filter(age_out_of_range).count()
income_outlier_count = df_customer_base_cleaned.filter(income_out_of_range).count()

df_customer_base_cleaned = df_customer_base_cleaned \
    .withColumn("age",
                when(age_out_of_range, None)
                .otherwise(col("age"))) \
    .withColumn("monthly_income",
                when(income_out_of_range, None)
                .otherwise(col("monthly_income")))

print("✓ 异常值处理完成")
print(f"  年龄异常值: {age_outlier_count}")
print(f"  收入异常值: {income_outlier_count}")

✓ 异常值处理完成
  年龄异常值: 0
  收入异常值: 0


In [30]:
# 3.4 Gender check: add a "gender_flag" column that is "valid" when gender is
# one of the two accepted values and "invalid" otherwise. The gender column
# itself is left unchanged.
gender_is_accepted = col("gender").isin(["男", "女"])
df_customer_base_cleaned = df_customer_base_cleaned.withColumn(
    "gender_flag",
    when(gender_is_accepted, "valid").otherwise("invalid"),
)

invalid_gender = df_customer_base_cleaned.where(col("gender_flag") == "invalid").count()

print("✓ 性别标准化完成")
print(f"  性别异常值: {invalid_gender}")

✓ 性别标准化完成
  性别异常值: 0


In [31]:
# 3.5 Derive the account-opening year and month from open_account_date.
# year() and month() already return NULL for a NULL date, so wrapping them in
# when(isNotNull(...)).otherwise(None) was redundant.
open_account_date = col("open_account_date")
df_customer_base_cleaned = df_customer_base_cleaned \
    .withColumn("open_account_year", year(open_account_date)) \
    .withColumn("open_account_month", month(open_account_date))

print("✓ 日期验证和提取完成")

✓ 日期验证和提取完成


In [32]:
# 3.6 Missing-value statistics for the cleaned customer base table.
print("\n缺失值统计:")

# Count the NULLs of every column in ONE aggregation pass instead of
# launching a separate Spark job per column.
null_totals = df_customer_base_cleaned.agg(*[
    spark_sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_customer_base_cleaned.columns
]).first()

# Keep only columns that actually have missing values. The sum is None when
# the DataFrame has no rows, which the truthiness test also filters out.
null_counts = {
    column_name: total
    for column_name, total in null_totals.asDict().items()
    if total
}

if null_counts:
    # Most affected columns first.
    for column_name, total in sorted(null_counts.items(), key=lambda item: item[1], reverse=True):
        print(f"  {column_name}: {total}")
else:
    print("  无缺失值")


缺失值统计:
  无缺失值


In [33]:
# 3.7 De-duplicate the customer base table on customer_id.
duplicates_before = df_customer_base_cleaned.count()

df_customer_base_cleaned = df_customer_base_cleaned.dropDuplicates(["customer_id"])

# count() is a Spark action that re-runs the whole pipeline, so evaluate it
# once and reuse the result for both the report line and the difference.
rows_after_dedup = df_customer_base_cleaned.count()
duplicates_removed = duplicates_before - rows_after_dedup

print("✓ 去重完成")
print(f"  去重前: {duplicates_before}")
print(f"  去重后: {rows_after_dedup}")
print(f"  移除重复行: {duplicates_removed}")

✓ 去重完成
  去重前: 10000
  去重后: 10000
  移除重复行: 0


## 4. 清洗客户行为资产表

In [34]:
# 4.1 Inspect the raw customer behavior/assets table: row count and a sample.
separator = "=" * 60
print(separator)
print("客户行为资产 - 原始数据")
print(separator)

raw_behavior_rows = df_customer_behavior.count()
print(f"\n行数: {raw_behavior_rows}")

print("\n前3行:")
df_customer_behavior.show(3, truncate=False)

客户行为资产 - 原始数据

行数: 120000

前3行:
+--------------------------------+--------------------------------+------------+---------------+-----------------+------------+-----------------+-----------+------------+--------------+---------+--------------+-------------+--------------------------+---------------------------+------------------------+---------------+-----------------------+-------------------------+-------------------+-------------------+--------------+---------------------+-------------------+
|id                              |customer_id                     |total_assets|deposit_balance|financial_balance|fund_balance|insurance_balance|asset_level|deposit_flag|financial_flag|fund_flag|insurance_flag|product_count|financial_repurchase_count|credit_card_monthly_expense|investment_monthly_count|app_login_count|app_financial_view_time|app_product_compare_count|last_app_login_time|last_contact_time  |contact_result|marketing_cool_period|stat_month         |
+-------------------------------

In [35]:
# 4.2 Type conversion for the behavior/assets table, grouped by target type.
behavior_trimmed_columns = ["id", "customer_id", "contact_result"]
behavior_double_columns = [
    "total_assets", "deposit_balance", "financial_balance", "fund_balance",
    "insurance_balance", "credit_card_monthly_expense",
]
behavior_int_columns = [
    "deposit_flag", "financial_flag", "fund_flag", "insurance_flag",
    "product_count", "financial_repurchase_count", "investment_monthly_count",
    "app_login_count", "app_financial_view_time", "app_product_compare_count",
]
behavior_timestamp_columns = ["last_app_login_time", "last_contact_time"]

# Map each column to the expression that replaces it.
behavior_conversions = {}
for column_name in behavior_trimmed_columns:
    behavior_conversions[column_name] = trim(col(column_name))
for column_name in behavior_double_columns:
    behavior_conversions[column_name] = col(column_name).cast("double")
for column_name in behavior_int_columns:
    behavior_conversions[column_name] = col(column_name).cast("int")
for column_name in behavior_timestamp_columns:
    behavior_conversions[column_name] = to_timestamp(col(column_name), "yyyy-MM-dd HH:mm:ss")
behavior_conversions["marketing_cool_period"] = to_date(col("marketing_cool_period"), "yyyy-MM-dd")
# NOTE(review): the previews of stat_month in this notebook show values such
# as "2024-08-01 00:00:00"; confirm that downstream consumers expect this
# string form rather than a "yyyy-MM" month key.
behavior_conversions["stat_month"] = col("stat_month").cast("string")

# Each replacement only reads its own column, so the order of application
# does not matter; existing columns keep their position in the schema.
df_customer_behavior_cleaned = df_customer_behavior
for column_name, converted in behavior_conversions.items():
    df_customer_behavior_cleaned = df_customer_behavior_cleaned.withColumn(column_name, converted)

print("✓ 数据类型转换完成")

✓ 数据类型转换完成


In [36]:
# 4.3 Asset validation: add two "valid"/"invalid" marker columns.
# total_assets must lie in [0, 100,000,000).
total_assets_in_range = (col("total_assets") >= 0) & (col("total_assets") < 100000000)

# Sum of the four balance components.
# NOTE(review): this check only requires the sum to be > 0; it does not
# compare the sum with total_assets, although the printed label speaks of a
# "mismatch". Confirm the intended rule.
component_balance_sum = (
    col("deposit_balance") + col("financial_balance")
    + col("fund_balance") + col("insurance_balance")
)

df_customer_behavior_cleaned = (
    df_customer_behavior_cleaned
    .withColumn("total_assets_valid",
                when(total_assets_in_range, "valid").otherwise("invalid"))
    .withColumn("assets_balance_check",
                when(component_balance_sum > 0, "valid").otherwise("invalid"))
)

invalid_assets = df_customer_behavior_cleaned.where(col("total_assets_valid") == "invalid").count()
invalid_balance = df_customer_behavior_cleaned.where(col("assets_balance_check") == "invalid").count()

print("✓ 资产数据验证完成")
print(f"  总资产异常值: {invalid_assets}")
print(f"  资产结余不匹配: {invalid_balance}")

✓ 资产数据验证完成
  总资产异常值: 0
  资产结余不匹配: 0


In [37]:
# 4.4 Behavior metrics must be non-negative: negative values are replaced
# with 0, everything else (including NULL) is kept as is.
behavior_cols = [
    "credit_card_monthly_expense",
    "investment_monthly_count",
    "app_login_count",
    "app_financial_view_time",
    "app_product_compare_count",
]

for metric_name in behavior_cols:
    clamped_metric = when(col(metric_name) < 0, 0).otherwise(col(metric_name))
    df_customer_behavior_cleaned = df_customer_behavior_cleaned.withColumn(
        metric_name, clamped_metric
    )

print("✓ 行为数据验证完成")

✓ 行为数据验证完成


In [38]:
# 4.5 Product flags must be 0 or 1; any other value is replaced with NULL.
flag_cols = ["deposit_flag", "financial_flag", "fund_flag", "insurance_flag"]

for flag_name in flag_cols:
    flag_value = col(flag_name)
    checked_flag = when(flag_value.isin([0, 1]), flag_value).otherwise(None)
    df_customer_behavior_cleaned = df_customer_behavior_cleaned.withColumn(
        flag_name, checked_flag
    )

print("✓ 产品标志验证完成")

✓ 产品标志验证完成


In [39]:
# 4.6 Mark whether contact_result is missing.
# A value counts as missing when it is NULL or contains nothing but
# whitespace; previously blank strings were classified as "present".
contact_result_is_missing = (
    col("contact_result").isNull() | (trim(col("contact_result")) == "")
)

df_customer_behavior_cleaned = df_customer_behavior_cleaned \
    .withColumn("contact_result_flag",
                when(contact_result_is_missing, "missing")
                .otherwise("present"))

missing_contact = df_customer_behavior_cleaned.filter(col("contact_result_flag") == "missing").count()

print("✓ 缺失值标记完成")
print(f"  contact_result 缺失值: {missing_contact}")

✓ 缺失值标记完成
  contact_result 缺失值: 24037


In [40]:
# 4.7 De-duplicate on (customer_id, stat_month), keeping the row with the
# most recent last_app_login_time.
duplicates_before = df_customer_behavior_cleaned.count()

# "id" is a secondary sort key so that rows tying on last_app_login_time are
# resolved the same way on every run; without it row_number() picks an
# arbitrary row among the ties.
window_spec = Window.partitionBy("customer_id", "stat_month").orderBy(
    col("last_app_login_time").desc(),
    col("id").asc(),
)
df_customer_behavior_cleaned = df_customer_behavior_cleaned \
    .withColumn("row_num", row_number().over(window_spec)) \
    .filter(col("row_num") == 1) \
    .drop("row_num")

# count() re-runs the window computation, so evaluate it once and reuse it.
rows_after_dedup = df_customer_behavior_cleaned.count()
duplicates_removed = duplicates_before - rows_after_dedup

print("✓ 去重完成")
print(f"  去重前: {duplicates_before}")
print(f"  去重后: {rows_after_dedup}")
print(f"  移除重复行: {duplicates_removed}")

✓ 去重完成
  去重前: 120000
  去重后: 120000
  移除重复行: 0


## 5. 数据质量检查报告

In [41]:
from datetime import datetime

# Collect before/after figures for both tables into one report dictionary.
report_timestamp = datetime.now().isoformat()

# Every count() is a separate Spark job that recomputes the DataFrame, so
# each of the four row counts is evaluated once and reused below.
base_input_rows = df_customer_base.count()
base_output_rows = df_customer_base_cleaned.count()
behavior_input_rows = df_customer_behavior.count()
behavior_output_rows = df_customer_behavior_cleaned.count()

quality_report = {
    "timestamp": report_timestamp,
    "job_name": "CustomerDataCleansing",
    "customer_base": {
        "input_rows": base_input_rows,
        "output_rows": base_output_rows,
        "duplicate_removed": base_input_rows - base_output_rows,
        "age_invalid_count": df_customer_base_cleaned.filter(col("age").isNull()).count(),
        "income_invalid_count": df_customer_base_cleaned.filter(col("monthly_income").isNull()).count(),
        "gender_invalid_count": df_customer_base_cleaned.filter(col("gender_flag") == "invalid").count()
    },
    "customer_behavior": {
        "input_rows": behavior_input_rows,
        "output_rows": behavior_output_rows,
        "duplicate_removed": behavior_input_rows - behavior_output_rows,
        "contact_result_missing": df_customer_behavior_cleaned.filter(col("contact_result_flag") == "missing").count(),
        "assets_invalid_count": df_customer_behavior_cleaned.filter(col("total_assets_valid") == "invalid").count()
    }
}

# Print the report: a header, then one "key: value" line per metric.
print("\n" + "=" * 80)
print("数据质量检查报告")
print("=" * 80)

print(f"\n执行时间: {quality_report['timestamp']}")
print(f"任务名称: {quality_report['job_name']}")

for section_title, section_key in (
    ("【客户基本信息】", "customer_base"),
    ("【客户行为资产】", "customer_behavior"),
):
    print(f"\n{section_title}")
    for key, value in quality_report[section_key].items():
        print(f"  {key}: {value}")

print("\n" + "=" * 80)


数据质量检查报告

执行时间: 2025-12-08T13:18:12.604553
任务名称: CustomerDataCleansing

【客户基本信息】
  input_rows: 10000
  output_rows: 10000
  duplicate_removed: 0
  age_invalid_count: 0
  income_invalid_count: 0
  gender_invalid_count: 0

【客户行为资产】
  input_rows: 120000
  output_rows: 120000
  duplicate_removed: 0
  contact_result_missing: 24037
  assets_invalid_count: 0



## 6. 输出清洗后的数据

In [42]:
# Create the output directory (if needed) and define the two export targets.
output_dir = project_root / "output"
output_dir.mkdir(exist_ok=True)

base_output_file = output_dir / "cleaned_customer_base.csv"
behavior_output_file = output_dir / "cleaned_customer_behavior.csv"
output_path_base = str(base_output_file)
output_path_behavior = str(behavior_output_file)

# One entry per export: (label, DataFrame, target file, bytes per display
# unit, unit name). Both the clean-up and the export loop below use it, which
# replaces two copy-pasted code paths.
export_jobs = [
    ("客户基本信息", df_customer_base_cleaned, base_output_file, 1024, "KB"),
    ("客户行为资产", df_customer_behavior_cleaned, behavior_output_file, 1024 * 1024, "MB"),
]

# Remove files left over from a previous run (avoids permission problems when
# overwriting). Deletion is best-effort: only filesystem errors are caught,
# and they are reported without aborting the export.
for _, _, target_file, _, _ in export_jobs:
    try:
        if target_file.exists():
            target_file.unlink()
            print(f"已删除旧文件: {target_file}")
    except OSError as e:
        print(f"删除旧文件失败: {e}")

# Export through pandas: each DataFrame is collected to the driver and
# written as a single local CSV file.
print("\n✓ 开始导出清洗后的数据...\n")

try:
    for position, (label, frame, target_file, unit_bytes, unit_name) in enumerate(export_jobs):
        if position:
            print()
        print(f"  正在导出{label}...")
        frame.toPandas().to_csv(str(target_file), index=False, encoding='utf-8')
        print(f"  ✓ {label}已输出")
        print(f"    路径: {target_file}")

        # Confirm the file was written and report its size.
        if target_file.exists():
            size = target_file.stat().st_size
            print(f"    文件大小: {size / unit_bytes:.2f} {unit_name}")

    print("\n✓ 所有数据已成功导出！")

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure with a full
    # traceback instead of raising, as the original cell did.
    print(f"✗ 导出失败: {e}")
    import traceback
    traceback.print_exc()


✓ 开始导出清洗后的数据...

  正在导出客户基本信息...
  ✓ 客户基本信息已输出
    路径: C:\Users\hy120\Downloads\zhihullm\CASE-customer-group\output\cleaned_customer_base.csv
    文件大小: 1779.71 KB

  正在导出客户行为资产...
  ✓ 客户行为资产已输出
    路径: C:\Users\hy120\Downloads\zhihullm\CASE-customer-group\output\cleaned_customer_behavior.csv
    文件大小: 28.25 MB

✓ 所有数据已成功导出！


## 7. 清洗结果统计

In [43]:
# Summarise the cleansing result for both tables.
print("\n" + "=" * 80)
print("清洗结果统计")
print("=" * 80)

base_stats = quality_report["customer_base"]
behavior_stats = quality_report["customer_behavior"]

# Retention rate = output rows / input rows, in percent. An empty input table
# yields 0.0 instead of raising ZeroDivisionError.
base_clean_rate = (
    base_stats["output_rows"] / base_stats["input_rows"] * 100
    if base_stats["input_rows"] else 0.0
)
behavior_clean_rate = (
    behavior_stats["output_rows"] / behavior_stats["input_rows"] * 100
    if behavior_stats["input_rows"] else 0.0
)

for table_title, table_stats, retention_rate in (
    ("客户基本信息", base_stats, base_clean_rate),
    ("客户行为资产", behavior_stats, behavior_clean_rate),
):
    print(f"\n{table_title}:")
    print(f"  输入行数: {table_stats['input_rows']}")
    print(f"  输出行数: {table_stats['output_rows']}")
    print(f"  保留率: {retention_rate:.2f}%")
    print(f"  移除重复: {table_stats['duplicate_removed']}")

print("\n" + "=" * 80)
print("✓ 数据清洗任务完成！")
print("=" * 80)


清洗结果统计

客户基本信息:
  输入行数: 10000
  输出行数: 10000
  保留率: 100.00%
  移除重复: 0

客户行为资产:
  输入行数: 120000
  输出行数: 120000
  保留率: 100.00%
  移除重复: 0

✓ 数据清洗任务完成！


## 8. 清洗结果预览

In [44]:
# Show the first five rows of the cleaned customer base table, restricted to
# a handful of key columns.
base_preview_columns = [
    "customer_id", "name", "age", "gender", "monthly_income", "open_account_date",
]
print("\n【清洗后的客户基本信息 - 前5行】")
base_preview = df_customer_base_cleaned.select(*base_preview_columns)
base_preview.show(5, truncate=False)


【清洗后的客户基本信息 - 前5行】
+--------------------------------+------+---+------+--------------+-----------------+
|customer_id                     |name  |age|gender|monthly_income|open_account_date|
+--------------------------------+------+---+------+--------------+-----------------+
|0001406853e04c14b9f29703c9e46674|朱琴  |48 |女    |59552.84      |2019-11-14       |
|0007100c0acf4e80b4810ef490c18310|萧桂芝|34 |女    |17148.94      |2022-06-13       |
|000845c0fb85413c843b0fe2fe130557|李文  |38 |男    |19248.78      |2025-03-29       |
|00120dc0c4064181b8d2191c100dec00|陈颖  |31 |男    |18833.49      |2019-12-18       |
|001c46c3f442444dbc9025efab247fde|曾小红|29 |女    |28559.58      |2017-01-09       |
+--------------------------------+------+---+------+--------------+-----------------+
only showing top 5 rows



In [45]:
# Show the first five rows of the cleaned behavior/assets table, restricted
# to a handful of key columns.
behavior_preview_columns = [
    "customer_id", "total_assets", "deposit_balance",
    "product_count", "app_login_count", "stat_month",
]
print("\n【清洗后的客户行为资产 - 前5行】")
behavior_preview = df_customer_behavior_cleaned.select(*behavior_preview_columns)
behavior_preview.show(5, truncate=False)


【清洗后的客户行为资产 - 前5行】
+--------------------------------+------------+---------------+-------------+---------------+-------------------+
|customer_id                     |total_assets|deposit_balance|product_count|app_login_count|stat_month         |
+--------------------------------+------------+---------------+-------------+---------------+-------------------+
|0001406853e04c14b9f29703c9e46674|3076725.27  |1009485.7      |2            |3              |2024-08-01 00:00:00|
|0001406853e04c14b9f29703c9e46674|3087342.78  |1092487.23     |4            |3              |2024-10-01 00:00:00|
|0001406853e04c14b9f29703c9e46674|3208570.95  |1157064.29     |3            |8              |2024-12-01 00:00:00|
|0001406853e04c14b9f29703c9e46674|3324616.42  |1258589.72     |3            |4              |2025-04-01 00:00:00|
|0007100c0acf4e80b4810ef490c18310|20153.97    |5114.64        |1            |6              |2024-07-01 00:00:00|
+--------------------------------+------------+---------------+-----

In [46]:
# Shut down the Spark session now that all work is done, then confirm it.
stop_message = "\n✓ Spark Session 已停止"
spark.stop()
print(stop_message)


✓ Spark Session 已停止
