In [1]:
from pyspark.sql.functions import col
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session: standalone master, Hive
# support enabled, 2 cores per executor and "spark.cores.max" set to 2.
spark = (
    SparkSession.builder
    .appName("mlflow")
    .master("spark://spark-master:7077")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .enableHiveSupport()
    .getOrCreate()
)

# Load the staging events table and mark it for caching. cache() is lazy and
# returns the same DataFrame, so nothing is computed here; the data is
# materialized by the first action that runs on it.
silver_df = spark.table("default.stg_user_events").cache()

# Last expression of the cell: shows the DataFrame's column/type summary.
silver_df



:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /opt/spark/.ivy2/cache
The jars for the packages stored in: /opt/spark/.ivy2/jars
org.postgresql#postgresql added as a dependency
io.delta#delta-spark_2.12 added as a dependency
io.delta#delta-storage added as a dependency
org.elasticsearch#elasticsearch-spark-30_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-864c920f-cf1e-4c80-97cc-2784ed658503;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.elasticsearch#elasticsearch-spark-30_2.12;8.11.3 in central
	found org.scala-lang#scala-reflect;2.12.8 in central


DataFrame[user_id: string, event_type: string, page: string, purchase_value: double, ts: timestamp]

In [40]:
# List every configuration entry of the running Spark context.
# Values of credential-like keys are masked so that secrets (passwords, tokens,
# access keys) are not written into the saved cell output.
import re

# Same key pattern as the default value of Spark's `spark.redaction.regex`.
SENSITIVE_CONF_KEY = re.compile(r"secret|password|token|access[.]key", re.IGNORECASE)

[
    (conf_key, "*********(redacted)" if SENSITIVE_CONF_KEY.search(conf_key) else conf_value)
    for conf_key, conf_value in spark.sparkContext.getConf().getAll()
]

[('spark.jars.packages',
  'org.postgresql:postgresql:42.7.3,io.delta:delta-spark_2.12:3.3.0,io.delta:delta-storage:3.3.0,org.elasticsearch:elasticsearch-spark-30_2.12:8.11.3,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.7,com.amazonaws:aws-java-sdk-bundle:1.12.262,com.amazonaws:aws-java-sdk-bundle:1.12.262'),
 ('spark.network.timeout', '600s'),
 ('spark.hadoop.javax.jdo.option.ConnectionDriverName',
  'org.postgresql.Driver'),
 ('spark.hadoop.fs.s3a.path.style.access', 'true'),
 ('spark.files',
  'file:///opt/spark/.ivy2/jars/org.postgresql_postgresql-42.7.3.jar,file:///opt/spark/.ivy2/jars/io.delta_delta-spark_2.12-3.3.0.jar,file:///opt/spark/.ivy2/jars/io.delta_delta-storage-3.3.0.jar,file:///opt/spark/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-8.11.3.jar,file:///opt/spark/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.5.7.jar,file:///opt/spark/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.12.262.jar,file:///opt/spark/.ivy2/jars/org.checkerframework_checker-

In [29]:
import pandas as pd
from evidently import Report
from evidently.presets import DataDriftPreset
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Helper: put the collected events in chronological order ---
def sort_events_chronologically(events_pd, time_col="ts"):
    """Return a copy of ``events_pd`` sorted by ``time_col`` with a fresh 0..n-1 index.

    A Spark table scan comes back in no guaranteed row order, so a positional
    "first half / second half" split of the collected frame is only meaningful
    and repeatable after an explicit sort.

    Args:
        events_pd: pandas DataFrame holding the collected events.
        time_col: name of the column to order by.

    Returns:
        A new DataFrame; ``events_pd`` itself is left untouched.

    Raises:
        KeyError: if ``time_col`` is not a column of ``events_pd``.
    """
    # mergesort is stable: rows with equal timestamps keep their relative order.
    return events_pd.sort_values(time_col, kind="mergesort").reset_index(drop=True)


# --- 2. Make sure 'silver_df' exists ---
if 'silver_df' not in locals():
    print("错误：未找到 Spark DataFrame 'silver_df'。")
    print("请先运行之前的单元格来加载数据 (spark.table('default.stg_user_events'))。")
else:
    print(f"成功找到 'silver_df' (包含 {silver_df.count()} 行数据)。")

    # --- 3. Convert the Spark DataFrame to a pandas DataFrame ---
    # Warning: for very large (TB-scale) datasets, sample with .sample() instead.
    # For the ~45k rows used here, .toPandas() is acceptable.
    print("正在将 Spark DataFrame 转换为 Pandas DataFrame...")
    all_data_pd = sort_events_chronologically(silver_df.toPandas())
    print("转换完成。")

    # --- 4. Simulate a "reference" and a "current" dataset ---
    # The frame is time-ordered, so the reference half holds the older events
    # and the current half the newer ones.
    split_point = int(len(all_data_pd) * 0.5)
    reference_data_pd = all_data_pd.iloc[:split_point]
    current_data_pd = all_data_pd.iloc[split_point:]

    print(f"总行数: {len(all_data_pd)}")
    print(f"参考数据集 (前50%): {len(reference_data_pd)} 行")
    print(f"当前数据集 (后50%): {len(current_data_pd)} 行")

成功找到 'silver_df' (包含 45654 行数据)。
正在将 Spark DataFrame 转换为 Pandas DataFrame...
转换完成。
总行数: 45654
参考数据集 (前50%): 22827 行
当前数据集 (后50%): 22827 行


In [31]:
# --- 5. Build the Evidently drift report ---
print("\n正在生成数据漂移报告 (DataDriftPreset)...")

# DataDriftPreset chooses the columns to analyze on its own, e.g. the numeric
# "purchase_value" and the categorical "event_type".
data_drift_report = Report(metrics=[DataDriftPreset()])

# Run the computation: the "current" half is compared against the "reference" half.
my_report = data_drift_report.run(
    reference_data=reference_data_pd,
    current_data=current_data_pd,
)

# --- 6. Save the report as an HTML file ---
report_path = "/home/jovyan/work/data_drift_report.html"
my_report.save_html(report_path)

# A single print with sep="\n" puts each message on its own line.
print(
    "\n--- 成功！---",
    f"数据漂移报告已保存到: {report_path}",
    "\n请在JupyterLab的左侧文件浏览器中，找到 'work/' 目录下的 'data_drift_report.html' 文件，",
    "右键点击它并选择 'Open in New Browser Tab' (在新浏览器标签页中打开) 来查看。",
    sep="\n",
)


正在生成数据漂移报告 (DataDriftPreset)...

--- 成功！---
数据漂移报告已保存到: /home/jovyan/work/data_drift_report.html

请在JupyterLab的左侧文件浏览器中，找到 'work/' 目录下的 'data_drift_report.html' 文件，
右键点击它并选择 'Open in New Browser Tab' (在新浏览器标签页中打开) 来查看。


In [32]:
# Print the baseline run's results as a plain dictionary (one entry per metric).
baseline_drift_results = my_report.dict()
print(baseline_drift_results)

{'metrics': [{'id': '15e89f895b482f9b84ba7274ed18a106', 'metric_id': 'DriftedColumnsCount(drift_share=0.5)', 'value': {'count': 0.0, 'share': 0.0}}, {'id': '6e335cb10cdff7116c9a2779505312b9', 'metric_id': 'ValueDrift(column=purchase_value)', 'value': np.float64(0.01365116851194981)}, {'id': '330639c072ebcb0f60d3d208bb6f9b22', 'metric_id': 'ValueDrift(column=user_id)', 'value': np.float64(0.007919947910332509)}, {'id': '2f3ddb08b4459fb21120259cb2aa8302', 'metric_id': 'ValueDrift(column=event_type)', 'value': np.float64(0.0048321714239055)}, {'id': '5ff6a229a83c51b4cd78218a3971d31c', 'metric_id': 'ValueDrift(column=page)', 'value': np.float64(0.0066110243858271585)}], 'tests': []}


In [8]:
import numpy as np

# Example structure of the dictionary returned by report.dict():
# {'metrics': [{'id': '15e89f895b482f9b84ba7274ed18a106', 'metric_id': 'DriftedColumnsCount(drift_share=0.5)', 'value': {'count': 0.0, 'share': 0.0}}, {'id': '6e335cb10cdff7116c9a2779505312b9', 'metric_id': 'ValueDrift(column=purchase_value)', 'value': np.float64(0.01365116851194981)}, {'id': '330639c072ebcb0f60d3d208bb6f9b22', 'metric_id': 'ValueDrift(column=user_id)', 'value': np.float64(0.007919947910332509)}, {'id': '2f3ddb08b4459fb21120259cb2aa8302', 'metric_id': 'ValueDrift(column=event_type)', 'value': np.float64(0.0048321714239055)}, {'id': '5ff6a229a83c51b4cd78218a3971d31c', 'metric_id': 'ValueDrift(column=page)', 'value': np.float64(0.0066110243858271585)}], 'tests': []}



# --- Helper: inject synthetic drift into a copy of the event data ---
def corrupt_events(events_pd, seed=42, search_fraction=0.3):
    """Return a copy of ``events_pd`` with synthetic numeric and categorical drift.

    * Numeric drift: every non-null 'purchase_value' becomes ``value * 3 + 50``.
    * Categorical drift: ``search_fraction`` of the rows whose 'event_type' is
      'click' are relabelled to the previously unseen value 'search'.

    Args:
        events_pd: pandas DataFrame with 'purchase_value' and 'event_type'
            columns; it is not modified. Row selection is by index label, so
            the index is assumed to be unique.
        seed: seed for the random row selection, so repeated runs corrupt the
            same rows.
        search_fraction: share of 'click' rows to relabel; the row count is
            truncated towards zero.

    Returns:
        A new, corrupted DataFrame.
    """
    corrupted = events_pd.copy()

    # a. Numeric drift. Only non-null values are touched, so missing purchases
    #    stay missing.
    has_value = corrupted['purchase_value'].notnull()
    corrupted.loc[has_value, 'purchase_value'] = corrupted.loc[has_value, 'purchase_value'] * 3 + 50

    # b. Categorical drift, using a locally seeded generator so the result does
    #    not depend on (or disturb) NumPy's global random state.
    click_labels = corrupted.index[corrupted['event_type'] == 'click']
    n_to_replace = int(len(click_labels) * search_fraction)
    if n_to_replace:  # nothing to relabel when there are no (or too few) clicks
        rng = np.random.default_rng(seed)
        labels_to_replace = rng.choice(click_labels.to_numpy(), size=n_to_replace, replace=False)
        corrupted.loc[labels_to_replace, 'event_type'] = 'search'

    return corrupted


# --- 1. Make sure the "gold standard" reference data is available ---
if 'reference_data_pd' not in locals():
    print("错误：未找到 'reference_data_pd'。请先运行上一个单元格。")
else:
    print("参考数据集 (reference_data_pd) 已准备好。")

    # --- 2. Create a "corrupted" current dataset ---
    # Simulates inflation/promotion (purchase values rise sharply) and a site
    # redesign that introduces a new event type the model has never seen.
    print("正在创建“被污染”的当前数据集 (current_data_pd_corrupted)...")
    current_data_pd_corrupted = corrupt_events(all_data_pd.iloc[split_point:])
    print("...已模拟 'purchase_value' 的数值漂移。")
    print("...已模拟 'event_type' 的分类漂移（引入了 'search'）。")


    # --- 3. Run a new drift report ---
    print("\n正在生成“被污染”的数据漂移报告...")

    # The report definition gets its own name; 'corrupted_data_drift_report'
    # is bound only to the run result, which is what gets saved and inspected.
    corrupted_report_definition = Report(metrics=[
        DataDriftPreset(),
    ])

    corrupted_data_drift_report = corrupted_report_definition.run(
        current_data=current_data_pd_corrupted, # <-- the corrupted data
        reference_data=reference_data_pd,      # <-- the original reference data
    )

    # --- 4. Save the new report ---
    corrupted_report_path = "/home/jovyan/work/data_drift_report_CORRUPTED.html"
    corrupted_data_drift_report.save_html(corrupted_report_path)

    print("\n--- 成功！---")
    print(f"“被污染”的数据漂移报告已保存到: {corrupted_report_path}")
    print("\n请在JupyterLab中打开这个新报告 ('data_drift_report_CORRUPTED.html')。")

参考数据集 (reference_data_pd) 已准备好。
正在创建“被污染”的当前数据集 (current_data_pd_corrupted)...
...已模拟 'purchase_value' 的数值漂移。
...已模拟 'event_type' 的分类漂移（引入了 'search'）。

正在生成“被污染”的数据漂移报告...

--- 成功！---
“被污染”的数据漂移报告已保存到: /home/jovyan/work/data_drift_report_CORRUPTED.html

请在JupyterLab中打开这个新报告 ('data_drift_report_CORRUPTED.html')。


In [17]:
# Print the corrupted run's results as a plain dictionary (one entry per metric).
corrupted_drift_results = corrupted_data_drift_report.dict()
print(corrupted_drift_results)

{'metrics': [{'id': '15e89f895b482f9b84ba7274ed18a106', 'metric_id': 'DriftedColumnsCount(drift_share=0.5)', 'value': {'count': 2.0, 'share': 0.5}}, {'id': '6e335cb10cdff7116c9a2779505312b9', 'metric_id': 'ValueDrift(column=purchase_value)', 'value': np.float64(5.554204100091303)}, {'id': '330639c072ebcb0f60d3d208bb6f9b22', 'metric_id': 'ValueDrift(column=user_id)', 'value': np.float64(0.007919947910332509)}, {'id': '2f3ddb08b4459fb21120259cb2aa8302', 'metric_id': 'ValueDrift(column=event_type)', 'value': np.float64(0.17060528922301263)}, {'id': '5ff6a229a83c51b4cd78218a3971d31c', 'metric_id': 'ValueDrift(column=page)', 'value': np.float64(0.0066110243858271585)}], 'tests': []}


In [33]:
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
import json

import re

# Name of the notebook variable that holds the Evidently run result to publish.
# my_report: baseline (normal) run; corrupted_data_drift_report: run with injected drift
reportname='my_report'


def extract_dataset_drift(report_dict, default_drift_share=0.5):
    """Extract dataset-level drift from an Evidently ``.dict()`` payload.

    Looks for the first metric whose ``metric_id`` contains
    'DriftedColumnsCount' and reads the ``share`` of drifted columns from its
    ``value`` dictionary. Drift counts as detected when that share reaches the
    ``drift_share`` threshold written in the metric id, e.g.
    'DriftedColumnsCount(drift_share=0.5)'.

    Args:
        report_dict: dictionary with a 'metrics' list of metric dictionaries.
        default_drift_share: threshold used when the metric id carries none.

    Returns:
        ``(share, detected)`` with ``share`` a float and ``detected`` 0 or 1,
        or ``None`` when no usable DriftedColumnsCount metric is present.
    """
    for metric in report_dict.get("metrics", []):
        metric_id = metric.get("metric_id") or ""
        if "DriftedColumnsCount" not in metric_id:
            continue
        value = metric.get("value")
        if not isinstance(value, dict):
            continue
        share = float(value.get("share", 0.0))
        threshold_match = re.search(r"drift_share=([0-9]*\.?[0-9]+)", metric_id)
        threshold = float(threshold_match.group(1)) if threshold_match else default_drift_share
        return share, int(share >= threshold)
    return None


# --- 1. Make sure the variable named by 'reportname' exists in memory ---
if reportname not in locals():
    print(f"错误：未在内存中找到 '{reportname}' 变量。")
    print("请重新运行上一个Evidently报告单元格。")
else:
    print(f"成功找到 '{reportname}' 变量，正在提取漂移分数...")

    # --- 2. Extract the drift numbers from the Evidently result ---
    drift_summary = None
    try:
        drift_summary = extract_dataset_drift(locals()[reportname].dict())
    except Exception as e:
        print(f"无法从Evidently报告中提取漂移分数: {e}")

    if drift_summary is None:
        # Pushing default zeros here would tell Prometheus "no drift" although
        # nothing was measured, so skip the push entirely.
        print("未能从报告中提取到 DriftedColumnsCount 指标，已跳过推送（避免误报“无漂移”）。")
    else:
        print("Find it")
        drift_score, drift_detected = drift_summary

        # --- 3. Prepare the metrics to push ---
        registry = CollectorRegistry()
        g_drift_score = Gauge(
            'model_data_drift_score',
            'Evidently AI data drift score',
            ['model_name'],
            registry=registry
        )
        g_drift_detected = Gauge(
            'model_data_drift_detected',
            'Whether Evidently AI detected drift (1 = True, 0 = False)',
            ['model_name'],
            registry=registry
        )

        # --- 4. Set the metric values ---
        model_name = "stg_user_events_v1"
        g_drift_score.labels(model_name=model_name).set(drift_score)
        g_drift_detected.labels(model_name=model_name).set(drift_detected)

        # --- 5. Push to the Pushgateway ---
        try:
            # Service name 'pushgateway' and port '9091' on the Docker network
            push_to_gateway('pushgateway:9091', job='evidently_batch_validation', registry=registry)
            print("\n--- 成功！---")
            print("已将漂移指标推送到Prometheus Pushgateway:")
            print(f"model_data_drift_score = {drift_score}")
            print(f"model_data_drift_detected = {drift_detected}")

        except Exception as e:
            print("\n--- 失败！---")
            print(f"无法推送到Prometheus Pushgateway (http://pushgateway:9091): {e}")
            print("请确保Pushgateway容器正在运行，并且网络名称正确。")

成功找到 'my_report' 变量，正在提取漂移分数...
Find it

--- 成功！---
已将漂移指标推送到Prometheus Pushgateway:
model_data_drift_score = 0.0
model_data_drift_detected = 0.0
