# In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import shap

# Load the dataset from the local Excel workbook.
# Alternative input (disabled): pd.read_excel('catalytic ozonation hotmap data.xlsx')
data = pd.read_excel(
    r"C:\Users\Ahui\ml\catalytic ozonation ML\catalytic ozonation data.xlsx"
)

# Column layout assumed by this script: the first 10 columns are the
# features and the 11th column is the regression target.
n_features = 10
feature_names = data.columns[:n_features]
target_name = data.columns[n_features]

# Plain NumPy views of the feature matrix and the target vector.
x = data.iloc[:, :n_features].values
y = data.iloc[:, n_features].values

# Reproducible 80/20 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# NOTE: XGBoost normally does not need scaled features; the scaling step is
# kept only for consistency with earlier code. The scaler is fitted on the
# training split alone and then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Hyper-parameter grid explored for the XGBoost regressor.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Base estimator; the fixed seed keeps runs repeatable.
xgb = XGBRegressor(random_state=0)

# Exhaustive 5-fold cross-validated search over the grid, scored by negative
# MSE and run on all available cores.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Model configured with the best parameter combination found by the search.
best_xgb = grid_search.best_estimator_

# Explain the tuned model with SHAP. The model is tree-based, so
# TreeExplainer is used instead of KernelExplainer.
explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(x_test)

# SHAP summary plot over the test samples.
shap.summary_plot(shap_values, x_test, feature_names=feature_names)

# Global feature importance: mean absolute SHAP value of each feature.
feature_importance = np.abs(shap_values).mean(0)

# argsort is ascending and barh draws position 0 at the bottom, so the most
# important feature already ends up at the top of the chart. The y-axis must
# therefore NOT be inverted (inverting it would push the most important
# feature to the bottom).
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))
plt.barh(bar_positions, feature_importance[sorted_idx], color='b', align='center')
plt.yticks(bar_positions, [feature_names[i] for i in sorted_idx])
plt.xlabel('SHAP Value')
plt.title('Feature Importance')
plt.show()

# Report the selected hyper-parameters and the hold-out test metrics.
y_pred = best_xgb.predict(x_test)
print(f"Best parameters: {grid_search.best_params_}")

test_r2 = r2_score(y_test, y_pred)
print(f"Test R² score: {test_r2:.4f}")

test_mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {test_mse:.4f}")
# -------------------------------------------------
# 1) SHAP interaction heatmap (feature x feature)
# -------------------------------------------------
print("\n正在计算 SHAP 交互值并绘制交互热图...")
shap_interaction = explainer.shap_interaction_values(x_test)

# Average the absolute interaction values over the first axis.
# Assumes a single-output model, i.e. an array of shape
# (n_samples, n_features, n_features) -> (n_features, n_features).
interaction_matrix = np.abs(shap_interaction).mean(axis=0)

# The heatmap is drawn with matplotlib directly: the original call went
# through `sns.heatmap`, but `sns` is not imported by the visible import
# block of this file, so that call raised NameError.
axis_labels = list(feature_names)
tick_positions = range(len(axis_labels))

fig, ax = plt.subplots(figsize=(9, 7))
image = ax.imshow(interaction_matrix, cmap="Reds", aspect="equal")  # square cells
colorbar = fig.colorbar(image, ax=ax)
colorbar.set_label("平均 |SHAP 交互值|")

ax.set_xticks(tick_positions)
ax.set_xticklabels(axis_labels, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(axis_labels)

# Write each cell's value (two decimals) on top of it; switch to white text
# on the darker half of the colour range so the numbers stay readable.
text_switch = (interaction_matrix.min() + interaction_matrix.max()) / 2
for (row, col), cell in np.ndenumerate(interaction_matrix):
    ax.text(
        col,
        row,
        f"{cell:.2f}",
        ha="center",
        va="center",
        color="white" if cell > text_switch else "black",
    )

ax.set_title("SHAP Interaction Heatmap")
fig.tight_layout()
fig.savefig("shap_interaction_heatmap.png", dpi=300)
plt.show()

# -------------------------------------------------
# 2) SHAP summary heatmap (sample x feature)
# -------------------------------------------------
print("\n正在绘制 SHAP 摘要热图...")

# Build an Explanation object from the already computed SHAP values.
# The Explanation is sliced along the sample axis below (exp[:100]), so
# base_values is supplied as one entry per sample instead of the bare
# expected_value, keeping it aligned with `values` after slicing.
# np.ravel(...)[0] accepts expected_value as a scalar or a length-1 array.
base_value = float(np.ravel(explainer.expected_value)[0])
exp = shap.Explanation(
    values=shap_values,
    base_values=np.full(np.shape(shap_values)[0], base_value),
    data=x_test,
    feature_names=list(feature_names)
)

# Plot only the first 100 samples so the figure does not become too large.
shap.plots.heatmap(exp[:100], show=False)
plt.title("SHAP Summary Heatmap")
plt.tight_layout()
plt.savefig("shap_summary_heatmap.png", dpi=300)
plt.show()