In [1]:
# ============================================================
#  XGBoost 分位数回归（带自动搜参） + CV + holdout + SHAP
#  改进版：
#   1) Training 行也填预测列（用真实值）
#   2) 输出不确定性列（q95_closed - q05_closed）
#   3) 对均值模型和分位数模型都做网格搜索，保存所有参数的表现
#   4) GEE 建议用 *_xgb_q50_closed
# ============================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import xgboost as xgb

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# 尝试导入 shap
try:
    import shap
    HAS_SHAP = True
except ImportError:
    HAS_SHAP = False
    print("⚠ 未安装 shap，SHAP 部分将跳过。请先 pip install shap")

# ========== 全局图形样式 ==========
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial']
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.labelsize'] = 22
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20

# ========== 1. 路径与读取 ==========
in_fp = r"E:\FinalData_RS_农村点_乡镇共同所在.csv"
out_dir = r"E:\FinalData_RS_农村点_乡镇共同所在_XGB_AUTO"
os.makedirs(out_dir, exist_ok=True)

df = pd.read_csv(in_fp, na_values=["<Null>", "NULL", "Null"])

# ========== 2. 构造空间特征 ==========
df["Lon2"] = df["Lon"] ** 2
df["Lat2"] = df["Lat"] ** 2
df["LonLat"] = df["Lon"] * df["Lat"]

# 解释变量
feature_cols = [
    "floor height",
    "rate of air conditioner",
    "housing quality",
    "rate of car ownership",
    "rate of motorcycle ownership",
    "rate of tiled wall",
    "rate of red-brick wall",
    "house base area",
    "number of houses",
    "rate of old houses",
    "wealth_index",
    "Lon",
    "Lat",
    "Rural",
    "Lon2",
    "Lat2",
    "LonLat",
]

# 四个因变量
target_cols = ["F", "F_NF", "NF_F", "NF"]

# 检查
for c in feature_cols + target_cols + ["Type"]:
    if c not in df.columns:
        raise ValueError(f"缺少列: {c}")

# 分训练/预测
train_df = df[df["Type"] == "Training"].copy()
pred_df  = df[df["Type"] == "Predicting"].copy()

# 填解释变量缺失
X_all = df[feature_cols]
X_all_filled = X_all.fillna(X_all.mean())
df[feature_cols] = X_all_filled
train_df[feature_cols] = X_all_filled.loc[train_df.index]
pred_df[feature_cols]  = X_all_filled.loc[pred_df.index]

# 是否用GPU
USE_GPU = True  # 没有GPU就改成 False


# ========== 3. 网格搜索工具 ==========
def param_grid_to_list(param_grid: dict):
    """把 {'a':[1,2], 'b':[3,4]} 变成 [{'a':1,'b':3}, ...]"""
    keys = list(param_grid.keys())
    vals = list(param_grid.values())
    combos = []
    for vs in itertools.product(*vals):
        combos.append(dict(zip(keys, vs)))
    return combos


def xgb_cv_score(X, y, params, n_splits=3):
    """给一组XGB参数，做K折CV，返回平均R2、MAE、RMSE"""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    r2_list, mae_list, rmse_list = [], [], []
    for tr_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model = xgb.XGBRegressor(**params)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)

        r2  = r2_score(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = root_mean_squared_error(y_val, y_pred)

        r2_list.append(r2)
        mae_list.append(mae)
        rmse_list.append(rmse)

    return np.mean(r2_list), np.mean(mae_list), np.mean(rmse_list)


# ========== 4. 主循环：4个目标 ==========
for tgt in target_cols:
    print(f"\n================ 处理目标: {tgt} ================")

    # 只用这个目标有值的训练行
    y_train_full = train_df[tgt]
    mask = ~y_train_full.isna()
    X_train = train_df.loc[mask, feature_cols]
    y_train = y_train_full.loc[mask]

    if len(y_train) == 0:
        print(f"{tgt} 没有训练样本，跳过")
        continue

    # ---------- 4.1 均值模型的网格搜索 ----------
    mean_param_grid = {
        "objective": ["reg:squarederror"],
        "eval_metric": ["rmse"],
        "max_depth": [4, 6, 8],
        "eta": [0.03, 0.05, 0.1],
        "subsample": [0.8, 0.9, 1.0],
        "colsample_bytree": [0.8, 0.9, 1.0],
        "n_estimators": [300, 500, 800],
    }
    if USE_GPU:
        # 固定GPU
        mean_fixed = {"tree_method": "gpu_hist", "random_state": 42}
    else:
        mean_fixed = {"tree_method": "hist", "random_state": 42}

    mean_param_list = param_grid_to_list(mean_param_grid)

    mean_search_records = []
    best_mean_score = -1e9
    best_mean_params = None

    print(f"[{tgt}] 开始均值模型网格搜索，共 {len(mean_param_list)} 组参数")
    for p in mean_param_list:
        full_p = {**p, **mean_fixed}
        r2m, maem, rmsem = xgb_cv_score(X_train, y_train, full_p, n_splits=3)
        mean_search_records.append({
            **p,
            **{"R2": r2m, "MAE": maem, "RMSE": rmsem}
        })
        # 这里用R2挑最优
        if r2m > best_mean_score:
            best_mean_score = r2m
            best_mean_params = full_p

    # 保存均值模型的网格搜索结果
    mean_search_df = pd.DataFrame(mean_search_records)
    mean_search_df.to_csv(
        os.path.join(out_dir, f"{tgt}_XGB_mean_param_search.csv"),
        index=False,
        encoding="utf-8-sig"
    )
    print(f"[{tgt}] 均值模型最优参数: {best_mean_params}, CV R2={best_mean_score:.3f}")

    # ---------- 4.2 用最优均值参数重训 ----------
    best_mean_model = xgb.XGBRegressor(**best_mean_params)
    best_mean_model.fit(X_train, y_train)

    # ---------- 4.3 分位数模型的网格搜索 ----------
    # 分位数不要太大网格，不然很慢
    quant_param_grid = {
        "max_depth": [4, 6],
        "eta": [0.03, 0.05],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "n_estimators": [300, 500],
    }
    quant_param_list = param_grid_to_list(quant_param_grid)
    if USE_GPU:
        quant_fixed = {"objective": "reg:quantileerror", "eval_metric": "mae", "tree_method": "gpu_hist", "random_state": 42}
    else:
        quant_fixed = {"objective": "reg:quantileerror", "eval_metric": "mae", "tree_method": "hist", "random_state": 42}

    # 我们对 0.5 分位数做搜索，找到一组好的，再拿同一组参数到0.05/0.95上用
    quant_search_records = []
    best_quant_score = 1e9  # MAE 越小越好
    best_quant_params = None

    print(f"[{tgt}] 开始分位数(0.5)网格搜索，共 {len(quant_param_list)} 组参数")
    for p in quant_param_list:
        full_p = {**p, **quant_fixed, **{"quantile_alpha": 0.5}}
        # 用3折，指标用MAE
        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        mae_list = []
        for tr_idx, val_idx in kf.split(X_train):
            X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
            m = xgb.XGBRegressor(**full_p)
            m.fit(X_tr, y_tr)
            y_pred = m.predict(X_val)
            mae_list.append(mean_absolute_error(y_val, y_pred))
        mean_mae = np.mean(mae_list)
        quant_search_records.append({**p, **{"MAE": mean_mae}})
        if mean_mae < best_quant_score:
            best_quant_score = mean_mae
            best_quant_params = p  # 暂存不包括 fixed 的部分

    # 保存分位数网格搜索结果
    pd.DataFrame(quant_search_records).to_csv(
        os.path.join(out_dir, f"{tgt}_XGB_quantile_param_search.csv"),
        index=False,
        encoding="utf-8-sig"
    )
    print(f"[{tgt}] 分位数模型最优参数(以0.5为目标): {best_quant_params}, MAE={best_quant_score:.4f}")

    # 用这组最优参数去训练 0.05 / 0.5 / 0.95 三个模型
    base_quant_params = {**best_quant_params, **quant_fixed}
    q05_model = xgb.XGBRegressor(**{**base_quant_params, **{"quantile_alpha": 0.05}})
    q50_model = xgb.XGBRegressor(**{**base_quant_params, **{"quantile_alpha": 0.50}})
    q95_model = xgb.XGBRegressor(**{**base_quant_params, **{"quantile_alpha": 0.95}})

    q05_model.fit(X_train, y_train)
    q50_model.fit(X_train, y_train)
    q95_model.fit(X_train, y_train)

    # ---------- 4.4 5折CV（用最优均值模型） ----------
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_metrics = []
    fold_id = 1
    for tr_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        m_cv = xgb.XGBRegressor(**best_mean_params)
        m_cv.fit(X_tr, y_tr)
        y_pred = m_cv.predict(X_val)

        r2  = r2_score(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        rmse = root_mean_squared_error(y_val, y_pred)
        cv_metrics.append((r2, mae, rmse))

        # 图
        plt.figure(figsize=(6, 6))
        plt.scatter(y_val, y_pred, alpha=0.6, color='black')
        plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()],
                 'r--', linewidth=2)
        plt.xlabel("Observed", fontweight='bold', color='black')
        plt.ylabel("Predicted", fontweight='bold', color='black')
        plt.title(f"{tgt} - CV fold {fold_id} (R2={r2:.3f})", fontweight='bold', color='black')
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, f"{tgt}_XGB_CV_fold{fold_id}.png"), dpi=200)
        plt.close()

        fold_id += 1

    pd.DataFrame(cv_metrics, columns=["R2", "MAE", "RMSE"]).to_csv(
        os.path.join(out_dir, f"{tgt}_XGB_cv_metrics.csv"),
        index=False,
        encoding="utf-8-sig"
    )

    # ---------- 4.5 holdout ----------
    X_tr2, X_hold, y_tr2, y_hold = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    hold_model = xgb.XGBRegressor(**best_mean_params)
    hold_model.fit(X_tr2, y_tr2)
    y_hold_pred = hold_model.predict(X_hold)

    r2_hold   = r2_score(y_hold, y_hold_pred)
    mae_hold  = mean_absolute_error(y_hold, y_hold_pred)
    rmse_hold = root_mean_squared_error(y_hold, y_hold_pred)

    pd.DataFrame({"R2": [r2_hold], "MAE": [mae_hold], "RMSE": [rmse_hold]}).to_csv(
        os.path.join(out_dir, f"{tgt}_XGB_holdout_metrics.csv"),
        index=False,
        encoding="utf-8-sig"
    )

    plt.figure(figsize=(6, 6))
    plt.scatter(y_hold, y_hold_pred, alpha=0.6, color='black')
    plt.plot([y_hold.min(), y_hold.max()], [y_hold.min(), y_hold.max()],
             'r--', linewidth=2)
    plt.xlabel("Observed (holdout)", fontweight='bold', color='black')
    plt.ylabel("Predicted (holdout)", fontweight='bold', color='black')
    plt.title(f"{tgt} - holdout (R2={r2_hold:.3f})", fontweight='bold', color='black')
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, f"{tgt}_XGB_holdout.png"), dpi=200)
    plt.close()

    # ---------- 4.6 预测 Predicting 行 ----------
    X_pred = pred_df[feature_cols]
    df.loc[pred_df.index, f"{tgt}_xgb_mean"] = best_mean_model.predict(X_pred)
    df.loc[pred_df.index, f"{tgt}_xgb_q05"]  = q05_model.predict(X_pred)
    df.loc[pred_df.index, f"{tgt}_xgb_q50"]  = q50_model.predict(X_pred)
    df.loc[pred_df.index, f"{tgt}_xgb_q95"]  = q95_model.predict(X_pred)

    # 训练行也填上（你之前说要不空）
    df.loc[train_df.index, f"{tgt}_xgb_mean"] = df.loc[train_df.index, tgt]
    df.loc[train_df.index, f"{tgt}_xgb_q50"]  = df.loc[train_df.index, tgt]
    # q05 / q95 对训练行可以不填

    # ---------- 4.7 特征重要性 ----------
    imp = best_mean_model.feature_importances_
    imp_df = pd.DataFrame({"feature": feature_cols, "importance": imp}).sort_values("importance", ascending=False)
    imp_df.to_csv(
        os.path.join(out_dir, f"{tgt}_XGB_feature_importance.csv"),
        index=False,
        encoding="utf-8-sig"
    )

    plt.figure(figsize=(8, 6))
    plt.barh(imp_df["feature"], imp_df["importance"], color='black')
    plt.gca().invert_yaxis()
    plt.title(f"{tgt} - XGB Feature Importance", fontweight='bold', color='black')
    plt.xlabel("Importance", fontweight='bold', color='black')
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, f"{tgt}_XGB_feature_importance.png"), dpi=200)
    plt.close()

    # ---------- 4.8 SHAP ----------
    if HAS_SHAP:
        try:
            X_sample = X_train.sample(n=min(1000, len(X_train)), random_state=42)
            explainer = shap.TreeExplainer(best_mean_model)
            shap_values = explainer.shap_values(X_sample)

            shap.plots.beeswarm(
                shap.Explanation(values=shap_values,
                                 base_values=explainer.expected_value,
                                 data=X_sample,
                                 feature_names=feature_cols),
                max_display=20,
                show=False
            )
            plt.title(f"{tgt} - SHAP beeswarm", fontweight='bold', color='black', fontsize=22)
            plt.tight_layout()
            plt.savefig(os.path.join(out_dir, f"{tgt}_XGB_SHAP_beeswarm.png"), dpi=200)
            plt.close()

            shap.plots.bar(
                shap.Explanation(values=shap_values,
                                 base_values=explainer.expected_value,
                                 data=X_sample,
                                 feature_names=feature_cols),
                max_display=20,
                show=False
            )
            plt.title(f"{tgt} - SHAP bar", fontweight='bold', color='black', fontsize=22)
            plt.tight_layout()
            plt.savefig(os.path.join(out_dir, f"{tgt}_XGB_SHAP_bar.png"), dpi=200)
            plt.close()

        except Exception as e:
            print(f"SHAP 计算失败（{tgt}）：{e}")

# ========== 5. closure + 不确定性 + 导出 ==========
pred_idx = df[df["Type"] == "Predicting"].index

def closure_on_df(df, idx, tcols, suffix_in, suffix_out):
    mat = df.loc[idx, [f"{t}{suffix_in}" for t in tcols]].values.astype(float)
    mat[mat < 0] = 0
    row_sum = mat.sum(axis=1, keepdims=True)
    row_sum[row_sum == 0] = 1
    mat_norm = mat / row_sum
    for i, t in enumerate(tcols):
        df.loc[idx, f"{t}{suffix_out}"] = mat_norm[:, i]

# 四套都闭合
closure_on_df(df, pred_idx, target_cols, "_xgb_mean", "_xgb_mean_closed")
closure_on_df(df, pred_idx, target_cols, "_xgb_q05", "_xgb_q05_closed")
closure_on_df(df, pred_idx, target_cols, "_xgb_q50", "_xgb_q50_closed")
closure_on_df(df, pred_idx, target_cols, "_xgb_q95", "_xgb_q95_closed")

# 不确定性 = q95_closed - q05_closed
for tgt in target_cols:
    df.loc[pred_idx, f"{tgt}_uncertainty"] = (
        df.loc[pred_idx, f"{tgt}_xgb_q95_closed"] -
        df.loc[pred_idx, f"{tgt}_xgb_q05_closed"]
    )

# 真正写回原字段，给GEE用中位数闭合版
for tgt in target_cols:
    df.loc[pred_idx, tgt] = df.loc[pred_idx, f"{tgt}_xgb_q50_closed"]

# 导出
out_fp = r"E:\FinalData_RS_农村点_乡镇共同所在_withXGB_quantile_auto.csv"
df.to_csv(out_fp, index=False, encoding="utf-8-sig")
print("✅ 已导出：", out_fp)



[F] 开始均值模型网格搜索，共 243 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features()

[F] 均值模型最优参数: {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'max_depth': 4, 'eta': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.8, 'n_estimators': 800, 'tree_method': 'gpu_hist', 'random_state': 42}, CV R2=0.958
[F] 开始分位数(0.5)网格搜索，共 32 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[F] 分位数模型最优参数(以0.5为目标): {'max_depth': 6, 'eta': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8, 'n_estimators': 500}, MAE=0.0198



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_fea


[F_NF] 开始均值模型网格搜索，共 243 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[F_NF] 均值模型最优参数: {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'max_depth': 4, 'eta': 0.1, 'subsample': 0.8, 'colsample_bytree': 1.0, 'n_estimators': 800, 'tree_method': 'gpu_hist', 'random_state': 42}, CV R2=0.955
[F_NF] 开始分位数(0.5)网格搜索，共 32 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[F_NF] 分位数模型最优参数(以0.5为目标): {'max_depth': 6, 'eta': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8, 'n_estimators': 500}, MAE=0.0142



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_fea


[NF_F] 开始均值模型网格搜索，共 243 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[NF_F] 均值模型最优参数: {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'max_depth': 6, 'eta': 0.05, 'subsample': 0.9, 'colsample_bytree': 1.0, 'n_estimators': 500, 'tree_method': 'gpu_hist', 'random_state': 42}, CV R2=0.937
[NF_F] 开始分位数(0.5)网格搜索，共 32 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[NF_F] 分位数模型最优参数(以0.5为目标): {'max_depth': 6, 'eta': 0.05, 'subsample': 1.0, 'colsample_bytree': 1.0, 'n_estimators': 500}, MAE=0.0150



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_fea


[NF] 开始均值模型网格搜索，共 243 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[NF] 均值模型最优参数: {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'max_depth': 4, 'eta': 0.03, 'subsample': 0.8, 'colsample_bytree': 0.8, 'n_estimators': 800, 'tree_method': 'gpu_hist', 'random_state': 42}, CV R2=0.906
[NF] 开始分位数(0.5)网格搜索，共 32 组参数



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

[NF] 分位数模型最优参数(以0.5为目标): {'max_depth': 6, 'eta': 0.05, 'subsample': 1.0, 'colsample_bytree': 0.8, 'n_estimators': 500}, MAE=0.0117



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_fea

✅ 已导出： E:\FinalData_RS_农村点_乡镇共同所在_withXGB_quantile_auto.csv


In [1]:
import pandas as pd
import os

# === 1. 路径设置 ===
in_fp  = r"E:\FinalData_RS_农村点_乡镇共同所在_withXGB_quantile_auto.csv"
out_fp = r"E:\FinalData_RS_农村点_乡镇共同所在_withXGB_quantile_auto_fixed.csv"

# === 2. 读取数据 ===
df = pd.read_csv(in_fp, na_values=["<Null>", "NULL", "Null"])

# === 3. 四个因变量 ===
target_cols = ["F", "F_NF", "NF_F", "NF"]

# === 4. 区分 Training / Predicting ===
train_idx = df[df["Type"] == "Training"].index
pred_idx  = df[df["Type"] == "Predicting"].index

# === 5. Training 行：用真实值补上 *_closed 列，uncertainty = 0 ===
for tgt in target_cols:
    for suf in ["_xgb_q05_closed", "_xgb_q50_closed", "_xgb_q95_closed", "_xgb_mean_closed"]:
        col = f"{tgt}{suf}"
        if col in df.columns:
            df.loc[train_idx, col] = df.loc[train_idx, tgt]
        else:
            df[col] = df[tgt]  # 如果没这个列，就创建一个

    # 不确定性列
    unc_col = f"{tgt}_uncertainty"
    if unc_col not in df.columns:
        df[unc_col] = 0
    df.loc[train_idx, unc_col] = 0

# === 6. 对 Predicting 行做 sanity check（闭合和≈1） ===
sum_check = df.loc[pred_idx, [f"{t}_xgb_q50_closed" for t in target_cols]].sum(axis=1)
print("预测样本的闭合和(前10个)：")
print(sum_check.head(10))

# === 7. 导出新文件 ===
df.to_csv(out_fp, index=False, encoding="utf-8-sig")
print(f"✅ 已修正 Training 行并导出新文件：{out_fp}")


预测样本的闭合和(前10个)：
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
7    1.0
8    1.0
9    1.0
dtype: float64
✅ 已修正 Training 行并导出新文件：E:\FinalData_RS_农村点_乡镇共同所在_withXGB_quantile_auto_fixed.csv
