In [2]:

import os, logging
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

try:
    from catboost import CatBoostRegressor
    HAS_CATBOOST = True
except Exception:
    HAS_CATBOOST = False

import cloudpickle

# -----------------------------
# 0) Config
# -----------------------------
# Reproducibility seed and split / CV sizes used throughout the script.
SEED = 42
TEST_SIZE = 0.2
N_FOLDS_META = 10

# All inputs and outputs live in one project folder under the user's home.
ROOT_DIR = Path.home().joinpath("4차 프로젝트 정리")
DATA_PATH = ROOT_DIR
TRAIN_PATH = ROOT_DIR.joinpath("merged_train.csv")
PRED_PIPE_PATH = ROOT_DIR.joinpath("prediction_pipeline.pkl")

# Optuna-best (from user's previous tuning)
#
# Layout: BEST_PARAMS[<estimator key>][<target index>] -> constructor kwargs.
#   * The outer keys name the base estimator each table belongs to.
#   * The inner integer keys (0..13) are positional target indices: the
#     training loop looks them up with the index `j` produced by enumerating
#     the Y_ columns, so entry 0 applies to the first target column, etc.
#   * Each inner dict is unpacked straight into the estimator constructor
#     (`Estimator(**params)`), so every key must be a valid constructor
#     argument of that estimator.
BEST_PARAMS = {
    # kwargs for CatBoostRegressor (only used when catboost is importable)
    'CatBoostRegressor_tune': {
        0: {"depth": 8, "iterations": 600, "learning_rate": 0.03, "l2_leaf_reg": 1.0, "subsample": 1.0},
        1: {"depth": 10, "iterations": 600, "learning_rate": 0.03, "l2_leaf_reg": 5.0, "subsample": 1.0},
        2: {"depth": 8, "iterations": 300, "learning_rate": 0.05, "l2_leaf_reg": 5.0, "subsample": 0.7},
        3: {"depth": 10, "iterations": 600, "learning_rate": 0.03, "l2_leaf_reg": 5.0, "subsample": 0.7},
        4: {"depth": 10, "iterations": 600, "learning_rate": 0.03, "l2_leaf_reg": 5.0, "subsample": 1.0},
        5: {"depth": 8, "iterations": 900, "learning_rate": 0.05, "l2_leaf_reg": 5.0, "subsample": 0.9},
        6: {"depth": 8, "iterations": 900, "learning_rate": 0.03, "l2_leaf_reg": 5.0, "subsample": 0.9},
        7: {"depth": 8, "iterations": 900, "learning_rate": 0.03, "l2_leaf_reg": 3.0, "subsample": 0.9},
        8: {"depth": 8, "iterations": 900, "learning_rate": 0.03, "l2_leaf_reg": 3.0, "subsample": 0.9},
        9: {"depth": 8, "iterations": 900, "learning_rate": 0.05, "l2_leaf_reg": 1.0, "subsample": 1.0},
        10: {"depth": 10, "iterations": 900, "learning_rate": 0.03, "l2_leaf_reg": 1.0, "subsample": 0.9},
        11: {"depth": 8, "iterations": 900, "learning_rate": 0.05, "l2_leaf_reg": 5.0, "subsample": 0.7},
        12: {"depth": 8, "iterations": 900, "learning_rate": 0.03, "l2_leaf_reg": 5.0, "subsample": 0.9},
        13: {"depth": 8, "iterations": 900, "learning_rate": 0.03, "l2_leaf_reg": 5.0, "subsample": 0.9},
    },
    # kwargs for sklearn's HistGradientBoostingRegressor
    'HistGradientBoostingRegressor_tune': {
        0: {"learning_rate": 0.05, "max_depth": None},
        1: {"learning_rate": 0.03, "max_depth": None},
        2: {"learning_rate": 0.05, "max_depth": 6},
        3: {"learning_rate": 0.1, "max_depth": None},
        4: {"learning_rate": 0.05, "max_depth": None},
        5: {"learning_rate": 0.1, "max_depth": None},
        6: {"learning_rate": 0.1, "max_depth": 8},
        7: {"learning_rate": 0.05, "max_depth": 8},
        8: {"learning_rate": 0.05, "max_depth": None},
        9: {"learning_rate": 0.1, "max_depth": None},
        10: {"learning_rate": 0.05, "max_depth": 8},
        11: {"learning_rate": 0.1, "max_depth": 6},
        12: {"learning_rate": 0.05, "max_depth": 6},
        13: {"learning_rate": 0.1, "max_depth": 6},
    },
    # kwargs for LGBMRegressor
    'LGBMRegressor_tune': {
        0: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 31, "max_depth": -1, "subsample": 0.7, "colsample_bytree": 0.7},
        1: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 31, "max_depth": 8, "subsample": 0.8, "colsample_bytree": 0.7},
        2: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 31, "max_depth": 8, "subsample": 0.8, "colsample_bytree": 0.7},
        3: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 8, "subsample": 1.0, "colsample_bytree": 0.7},
        4: {"n_estimators": 400, "learning_rate": 0.05, "num_leaves": 31, "max_depth": -1, "subsample": 0.7, "colsample_bytree": 0.7},
        5: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 12, "subsample": 0.8, "colsample_bytree": 0.9},
        6: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 31, "max_depth": -1, "subsample": 0.7, "colsample_bytree": 0.7},
        7: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 8, "subsample": 1.0, "colsample_bytree": 0.7},
        8: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 8, "subsample": 1.0, "colsample_bytree": 0.7},
        9: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": -1, "subsample": 0.7, "colsample_bytree": 1.0},
        10: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 8, "subsample": 1.0, "colsample_bytree": 0.7},
        11: {"n_estimators": 800, "learning_rate": 0.03, "num_leaves": 31, "max_depth": 8, "subsample": 0.8, "colsample_bytree": 0.7},
        12: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 8, "subsample": 1.0, "colsample_bytree": 0.7},
        13: {"n_estimators": 400, "learning_rate": 0.03, "num_leaves": 127, "max_depth": 8, "subsample": 1.0, "colsample_bytree": 0.7},
    },
    # kwargs for XGBRegressor
    'XGBRegressor_tune': {
        0: {"n_estimators": 300, "max_depth": 4, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.7},
        1: {"n_estimators": 300, "max_depth": 4, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.7},
        2: {"n_estimators": 300, "max_depth": 4, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.7},
        3: {"n_estimators": 300, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 1.0},
        4: {"n_estimators": 500, "max_depth": 4, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.7},
        5: {"n_estimators": 800, "max_depth": 6, "learning_rate": 0.03, "subsample": 1.0, "colsample_bytree": 1.0},
        6: {"n_estimators": 500, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.7},
        7: {"n_estimators": 300, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.8, "colsample_bytree": 1.0},
        8: {"n_estimators": 300, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.8, "colsample_bytree": 1.0},
        9: {"n_estimators": 500, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.7},
        10: {"n_estimators": 300, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.8, "colsample_bytree": 1.0},
        11: {"n_estimators": 300, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.8, "colsample_bytree": 1.0},
        12: {"n_estimators": 300, "max_depth": 6, "learning_rate": 0.03, "subsample": 0.8, "colsample_bytree": 1.0},
        13: {"n_estimators": 300, "max_depth": 8, "learning_rate": 0.03, "subsample": 0.7, "colsample_bytree": 0.8},
    }
}

# -----------------------------
# 1) Load data & split
# -----------------------------
print("Loading:", TRAIN_PATH)
train = pd.read_csv(TRAIN_PATH)

# Features are the columns prefixed "X_", targets the ones prefixed "Y_".
X_full = train.filter(regex='^X_').copy()
Y_full = train.filter(regex='^Y_').copy()

# Remove the feature columns that were excluded in the original training;
# names that are not present in the frame are simply skipped.
drop_cols = ['X_04','X_23','X_47','X_48','X_10','X_11','X_02']
X_full = X_full.drop(
    columns=[col for col in drop_cols if col in X_full.columns],
    errors='ignore',
)

# Train-time-only pruning: discard rows whose X_33 exceeds 6. The same row
# labels are removed from the targets so X and Y stay aligned.
if 'X_33' in X_full.columns:
    drop_idx = X_full.index[X_full['X_33'].gt(6)]
    if drop_idx.size > 0:
        X_full = X_full.drop(index=drop_idx)
        Y_full = Y_full.drop(index=drop_idx)

# Renumber rows 0..n-1 after pruning.
X_full = X_full.reset_index(drop=True)
Y_full = Y_full.reset_index(drop=True)

# Hold out TEST_SIZE of the rows for evaluation (seeded for reproducibility).
_split = train_test_split(X_full, Y_full, test_size=TEST_SIZE, random_state=SEED)
X_tr_df, X_te_df, Y_tr_df, Y_te_df = _split

# -----------------------------
# 2) Impute + PCA (pure sklearn)
# -----------------------------
def _as_frame(values, like: pd.DataFrame) -> pd.DataFrame:
    """Wrap an imputed array in a DataFrame carrying *like*'s columns and index."""
    return pd.DataFrame(values, columns=like.columns, index=like.index)


# Median imputation: statistics are learned on the training split only and
# then reused unchanged for the held-out split.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_tr_df)
X_tr_imp = _as_frame(imputer.transform(X_tr_df), X_tr_df)
X_te_imp = _as_frame(imputer.transform(X_te_df), X_te_df)

# same PCA grouping as original
# Each entry is (group name, source columns, number of components to keep).
PCA_GROUPS = [
    ("g0", ['X_13', 'X_14', 'X_15', 'X_16', 'X_17', 'X_18'], 5),
    ("g1", ['X_19', 'X_20', 'X_21', 'X_22'], 2),
    ("g2", ['X_34', 'X_35', 'X_36', 'X_37'], 1),
    ("g3", ['X_41', 'X_42', 'X_43', 'X_44', 'X_45'], 1),
    ("g4", ['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56'], 2),
]

def apply_fit_pca(X_fit: pd.DataFrame, X_apply: pd.DataFrame):
    """Fit one PCA per group in ``PCA_GROUPS`` on ``X_fit`` and apply it to both frames.

    For every ``(name, cols, n_comp)`` entry in ``PCA_GROUPS`` the group's
    columns that exist in ``X_fit`` are replaced, in both frames, by the PCA
    component columns ``PCA_<name>_<i>``. Groups with no column present are
    skipped. The inputs are copied and never modified.

    Args:
        X_fit: Frame the PCAs are fitted on (e.g. the training split).
        X_apply: Frame transformed with the PCAs fitted on ``X_fit``. It must
            contain every group column that is present in ``X_fit``.

    Returns:
        A ``(X_fit_transformed, X_apply_transformed, pca_info)`` tuple, where
        ``pca_info`` is a list with one dict per fitted group holding
        ``group_name``, ``group_cols``, ``n_components``, the fitted ``pca``
        object and the generated ``component_cols``.
    """
    pca_info = []
    Xf = X_fit.copy()
    Xa = X_apply.copy()
    for name, cols, n_comp in PCA_GROUPS:
        # Only the group columns that actually exist in the fit frame are used.
        cols = [c for c in cols if c in Xf.columns]
        if not cols:  # group not present
            continue
        # A partially present group can have fewer columns than the configured
        # component count; PCA cannot extract more components than features,
        # so clamp instead of letting the fit raise.
        n_comp = min(n_comp, len(cols))
        pca = PCA(n_components=n_comp, random_state=SEED).fit(Xf[cols])
        Zf = pca.transform(Xf[cols])
        Za = pca.transform(Xa[cols])
        comp_cols = [f"PCA_{name}_{i}" for i in range(n_comp)]
        # Swap the raw group columns for their components, keeping row indices.
        Xf = pd.concat(
            [Xf.drop(columns=cols), pd.DataFrame(Zf, columns=comp_cols, index=Xf.index)],
            axis=1,
        )
        Xa = pd.concat(
            [Xa.drop(columns=cols), pd.DataFrame(Za, columns=comp_cols, index=Xa.index)],
            axis=1,
        )
        pca_info.append({
            "group_name": name,
            "group_cols": cols,
            "n_components": n_comp,
            "pca": pca,
            "component_cols": comp_cols,
        })
    return Xf, Xa, pca_info

# PCAs are fitted on the imputed training split and applied to both splits.
X_tr_final, X_te_final, pca_info = apply_fit_pca(X_tr_imp, X_te_imp)

# Column order of the final training matrix, kept for the saved artifacts.
FINAL_FEATURE_NAMES = list(X_tr_final.columns)

# numpy views
X_tr = X_tr_final.to_numpy()
X_te = X_te_final.to_numpy()
Y_tr = Y_tr_df.to_numpy()
Y_te = Y_te_df.to_numpy()

# Target names, in the same order as the columns of Y_tr / Y_te.
y_cols = Y_full.columns.tolist()

# -----------------------------
# 3) Build per-target StackingRegressor and train
# -----------------------------
# One independent stacking ensemble is trained per target column. The tuned
# hyper-parameters are looked up by the target's positional index `j`.
#
# NOTE: every parameter dict below is built as a *fresh* dict (defaults
# overlaid with the tuned values). Calling `setdefault` directly on the dict
# returned by `BEST_PARAMS[...].get(j, ...)` would write the defaults into the
# shared BEST_PARAMS table as a side effect.
models = {}
cv_meta = KFold(n_splits=N_FOLDS_META, shuffle=True, random_state=SEED)

for j, y_name in enumerate(y_cols):
    est_list = []

    # HistGBR
    hgb_params = {
        "random_state": SEED,
        **BEST_PARAMS['HistGradientBoostingRegressor_tune'].get(j, {}),
    }
    est_list.append(("hgb", HistGradientBoostingRegressor(**hgb_params)))

    # XGB
    xgb_params = {
        "random_state": SEED,
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "n_jobs": -1,
        **BEST_PARAMS['XGBRegressor_tune'].get(j, {}),
    }
    est_list.append(("xgb", XGBRegressor(**xgb_params)))

    # LGBM
    lgb_params = {
        "random_state": SEED,
        "verbose": -1,
        **BEST_PARAMS['LGBMRegressor_tune'].get(j, {}),
    }
    est_list.append(("lgb", LGBMRegressor(**lgb_params)))

    # CatBoost (optional)
    if HAS_CATBOOST:
        cat_params = {
            "random_seed": SEED,
            "verbose": False,
            "allow_writing_files": False,
            "loss_function": "RMSE",
            **BEST_PARAMS['CatBoostRegressor_tune'].get(j, {}),
        }
        est_list.append(("cat", CatBoostRegressor(**cat_params)))

    # Ridge meta-learner trained on the base estimators' cross-validated
    # predictions only (passthrough=False: raw features are not forwarded).
    stack = StackingRegressor(
        estimators=est_list,
        final_estimator=Ridge(random_state=SEED),
        cv=cv_meta,
        passthrough=False,
        n_jobs=-1
    )
    stack.fit(X_tr, Y_tr[:, j])
    models[y_name] = stack

# -----------------------------
# 4) Evaluate (optional print)
# -----------------------------
# Score every per-target model on the held-out split. NRMSE is the RMSE
# divided by the mean absolute true value (or by 1.0 when that mean is zero).
rows = []
for col_idx, target in enumerate(y_cols):
    actual = Y_te[:, col_idx]
    predicted = models[target].predict(X_te)

    mse_val = mean_squared_error(actual, predicted)
    rmse_val = np.sqrt(mse_val)
    mae_val = mean_absolute_error(actual, predicted)
    r2_val = r2_score(actual, predicted)

    mean_abs_actual = np.mean(np.abs(actual))
    scale = 1.0 if mean_abs_actual == 0 else mean_abs_actual

    rows.append([target, r2_val, rmse_val, mse_val, mae_val, rmse_val / scale])

metrics_df = pd.DataFrame(
    rows,
    columns=['target', 'R2', 'RMSE', 'MSE', 'MAE', 'NRMSE'],
)
print("\n=== Test metrics (per target) ===")
print(metrics_df)

# -----------------------------
# 5) Save artifacts with cloudpickle
# -----------------------------
# Everything needed to reproduce the preprocessing and predict at inference
# time is bundled into a single dict and serialized to PRED_PIPE_PATH.
artifacts = {
    'imputer': imputer,
    'pca_info': pca_info,
    # Store the drop list that was actually applied during training rather
    # than a second hard-coded copy, so the two can never drift apart.
    'pre_drop_cols': list(drop_cols),
    'final_feature_names': FINAL_FEATURE_NAMES,
    'target_columns': y_cols,
    'models': models,
}

# Make sure the destination folder exists before writing.
PRED_PIPE_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(PRED_PIPE_PATH, 'wb') as f:
    cloudpickle.dump(artifacts, f)

print(f"Saved pipeline to: {PRED_PIPE_PATH}")


Loading: C:\Users\Taeyoung\4차 프로젝트 정리\병합_train.csv





=== Test metrics (per target) ===
   target        R2      RMSE       MSE       MAE     NRMSE
0    Y_01  0.056885  0.343903  0.118269  0.264795  0.253693
1    Y_02  0.053110  0.375672  0.141130  0.294262  0.355926
2    Y_03  0.041098  0.354065  0.125362  0.277102  0.349172
3    Y_04  0.087038  2.538625  6.444617  2.074254  0.186500
4    Y_05  0.052395  2.474994  6.125593  1.971637  0.079150
5    Y_06  0.177758  1.589225  2.525636  0.712783  0.095619
6    Y_07  0.050976  0.399141  0.159314  0.319735  0.126336
7    Y_08  0.108163  0.625372  0.391091  0.484094  0.023777
8    Y_09  0.099702  0.619394  0.383649  0.479565  0.023540
9    Y_10  0.165629  0.830864  0.690335  0.619023  0.037089
10   Y_11  0.067360  0.794418  0.631101  0.622519  0.032660
11   Y_12  0.103093  0.624721  0.390276  0.483585  0.023807
12   Y_13  0.102088  0.622645  0.387686  0.481791  0.023731
13   Y_14  0.104881  0.621420  0.386163  0.481751  0.023672
Saved pipeline to: C:\Users\Taeyoung\4차 프로젝트 정리\prediction_pipeli