In [9]:
import warnings
warnings.filterwarnings('ignore')

import koreanize_matplotlib

## 앙상블 단계별 비교

In [33]:
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

# ======================
# 0) 경로 & 상수
# ======================
TRAIN_PATH = "Data/train_0822_changed.csv"
TEST_PATH  = "Data/test_0822_changed.csv"
SUB_PATH   = "Data/sample_submission.csv"

KEYS   = ["num_date_time", "건물번호"]
TARGET = "전력소비량(kWh)"
BASE_DROP_COMMON = ["num_date_time", "일시"]  # 공통 시간 문자열
BASE_DROP_TRAIN  = ["건물유형", TARGET]       # 모델1·3 (건물유형 split)에서 drop
BASE_DROP_TEST   = ["건물유형"]

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sub   = pd.read_csv(SUB_PATH)


In [24]:
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred))
    denom = np.where(denom == 0, 1, denom)
    return np.mean(2.0 * np.abs(y_pred - y_true) / denom) * 100

def coerce_numeric_objects(df: pd.DataFrame) -> pd.DataFrame:
    """object/category 컬럼을 안전하게 숫자로 변환 (숫자/기호 혼재 대응)"""
    out = df.copy()
    for c in out.columns:
        if str(out[c].dtype) in ["object", "category"]:
            out[c] = pd.to_numeric(
                out[c].astype(str).str.replace(r"[^\d\.\-eE]", "", regex=True),
                errors="coerce"
            )
    return out

def reorder_like(X_to_fix, X_ref):
    """X_to_fix를 X_ref의 컬럼 순서/구성에 맞춘다. 누락은 NaN으로 채움."""
    return X_to_fix.reindex(columns=X_ref.columns, fill_value=np.nan)

In [34]:
# 결과 수집 
# ======================
# 모델 1: CatBoost (건물유형별)
val_oofs_1, test_preds_1 = [], []
# 모델 2: XGBoost (cluster_id별)
val_oofs_2, test_preds_2 = [], []
# 모델 3: XGBoost (건물유형별)
val_oofs_3, test_preds_3 = [], []

val_scores_1 = {}
val_scores_2 = {}
val_scores_3 = {}

In [36]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# --- 설정 (그대로 사용) ---
params_cb = dict(
    loss_function="RMSE", 
    eval_metric="SMAPE",
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    random_seed=42,
    iterations=2500,
    od_type="Iter",
    od_wait=200,
    verbose=False,
    allow_writing_files=False
)

def cat_features_of(X: pd.DataFrame):
    return [c for c in X.columns if str(X[c].dtype) in ("object", "category")]

def feature_importance_series(model: CatBoostRegressor, pool: Pool, cols):
    imp = model.get_feature_importance(pool, type="PredictionValuesChange")
    return pd.Series(imp, index=cols).sort_values(ascending=True)

def prune_columns_by_bottom_fraction(cols, fi_series, bottom_frac=0.15):
    n = len(cols)
    k = max(1, int(np.ceil(n * bottom_frac)))           # 최소 1개는 제거
    drop_list = list(fi_series.index[:k])               # 중요도 낮은 순으로 k개
    keep_cols = [c for c in cols if c not in drop_list]
    # 과도 제거 방지: 최소 특성 개수 보장 (원하면 임계값 조정)
    min_keep = max(10, int(np.ceil(n * 0.5)))           # 최소 10개 또는 절반
    if len(keep_cols) < min_keep:                       # 너무 많이 지웠으면 상위만 남김
        keep_cols = list(fi_series.index[k:])           # 남은 상위 중요도
    return keep_cols, drop_list

building_types = train["train_group3"].dropna().unique()

for btype in building_types:
    tr_sub = train[train["train_group3"] == btype].copy()
    te_sub = test [test ["train_group3"] == btype].copy()
    if len(tr_sub) == 0:
        continue

    # --- 피처/타깃 분리 ---
    drop_cols_tr = BASE_DROP_COMMON + BASE_DROP_TRAIN
    X_all = tr_sub.drop(columns=[c for c in drop_cols_tr if c in tr_sub.columns], errors="ignore")
    y_all = tr_sub[TARGET].astype(float)

    # --- 고정 검증 분할(동일 분할로 공정 비교) ---
    X_tr, X_val, y_tr, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)
    cat_cols = cat_features_of(X_tr)

    # --- 1) 베이스 학습 ---
    pool_tr  = Pool(X_tr, y_tr, cat_features=cat_cols)
    pool_val = Pool(X_val, y_val, cat_features=cat_cols)
    m_base = CatBoostRegressor(**params_cb)
    m_base.fit(pool_tr, eval_set=pool_val, use_best_model=True)
    pred_base = m_base.predict(pool_val)
    smape_base = smape(y_val.values, pred_base)

    # --- 1차 FI 산출 & 하위 15% 제거 ---
    fi_series = feature_importance_series(m_base, pool_tr, X_tr.columns)
    keep_cols, dropped_cols = prune_columns_by_bottom_fraction(list(X_tr.columns), fi_series, bottom_frac=0.15)

    # --- 2) Pruned 학습(동일 분할) ---
    X_tr2  = X_tr[keep_cols].copy()
    X_val2 = X_val[keep_cols].copy()
    cat_cols2 = [c for c in keep_cols if c in cat_cols]

    pool_tr2  = Pool(X_tr2, y_tr, cat_features=cat_cols2)
    pool_val2 = Pool(X_val2, y_val, cat_features=cat_cols2)

    m_pruned = CatBoostRegressor(**params_cb)
    m_pruned.fit(pool_tr2, eval_set=pool_val2, use_best_model=True)
    pred_pruned = m_pruned.predict(pool_val2)
    smape_pruned = smape(y_val.values, pred_pruned)

    # --- 더 좋은 쪽 선택 ---
    use_pruned = smape_pruned + 1e-12 < smape_base
    best_smape = smape_pruned if use_pruned else smape_base
    val_scores_1[btype] = best_smape
    print(f"[{btype}] Base={smape_base:.4f}, Pruned={smape_pruned:.4f} -> Use {'Pruned' if use_pruned else 'Base'}")

    # --- OOF 저장(선택된 쪽) ---
    val_keys = tr_sub.loc[X_val.index, KEYS].reset_index(drop=True)  # 동일 인덱스 기준
    oof_df = val_keys.copy()
    oof_df["y_true"] = y_val.reset_index(drop=True).values
    oof_df["pred_1"] = (pred_pruned if use_pruned else pred_base)
    val_oofs_1.append(oof_df)

    # --- 최종 재학습: 선택된 피처로 전체 train 사용 + best_iteration 반영 ---
    if use_pruned:
        cols_final = keep_cols
        best_iter  = m_pruned.get_best_iteration() or params_cb["iterations"]
    else:
        cols_final = list(X_tr.columns)
        best_iter  = m_base.get_best_iteration() or params_cb["iterations"]

    # 전체 train에서 같은 컬럼만 사용
    X_full = X_all[cols_final].copy()
    cat_cols_final = [c for c in cols_final if c in cat_cols]  # 카테고리 컬럼 교차
    pool_full = Pool(X_full, y_all, cat_features=cat_cols_final)

    # 동일 파라미터 + best_iter로 재학습
    params_final = params_cb.copy()
    params_final["iterations"] = int(best_iter)
    m_final = CatBoostRegressor(**params_final)
    # 참고: 여기서는 eval_set 없이 고정 iteration 훈련 (use_best_model 불필요)
    m_final.fit(pool_full, verbose=False)

    # --- TEST 예측 ---
    X_test = te_sub.drop(columns=[c for c in BASE_DROP_COMMON + BASE_DROP_TEST if c in te_sub.columns], errors="ignore")
    # 열 순서 맞추기
    X_test = X_test.reindex(columns=cols_final, fill_value=0)
    pool_te = Pool(X_test, cat_features=cat_cols_final)
    te_pred = m_final.predict(pool_te)

    test_preds_1.append(te_sub[KEYS].assign(pred_1=te_pred))


[호텔] Base=5.8292, Pruned=5.8522 -> Use Base
[상용] Base=1.6944, Pruned=1.6775 -> Use Pruned
[병원] Base=2.6731, Pruned=2.6230 -> Use Pruned
[CLUSTER10] Base=7.6803, Pruned=7.7955 -> Use Base
[학교] Base=2.5522, Pruned=2.5531 -> Use Base
[CLUSTER11] Base=5.5509, Pruned=5.5123 -> Use Pruned
[건물기타] Base=4.2859, Pruned=4.1811 -> Use Pruned
[아파트] Base=2.9066, Pruned=2.8773 -> Use Pruned
[연구소] Base=2.9307, Pruned=2.8718 -> Use Pruned
[백화점] Base=5.2557, Pruned=5.2003 -> Use Pruned
[IDC(전화국)] Base=1.2351, Pruned=1.1797 -> Use Pruned
[공공] Base=5.2720, Pruned=5.1687 -> Use Pruned


In [37]:
# ======================
# 모델 2 — XGBoost (train_group별)
# ======================
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 숫자화 유틸
def to_numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        if str(out[c].dtype) in ["object", "category"]:
            out[c] = pd.to_numeric(
                out[c].astype(str).str.replace(r"[^\d\.\-eE]", "", regex=True),
                errors="coerce"
            )
    return out

params_xgb = dict(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",   # GPU면 "gpu_hist"
    random_state=42,
    n_jobs=-1,
    eval_metric="mae"
)

# 중요도 기반 제외 컬럼(선택)
drop_cols = ['T2', 'T_x_rain', 'RH2', '습도(%)', '풍속(m/s)', 'is_rain', 'wind_level', '강수량(mm)', '기온(°C)', 'ess_hours']

GROUP_COL = "train_group3"   # 위에서 만든 그룹키(유형|클러스터)
groups = train[GROUP_COL].dropna().unique()

for gkey in groups:
    tr_sub = train[train[GROUP_COL] == gkey].copy()
    te_sub = test [test [GROUP_COL] == gkey].copy()
    if len(tr_sub) == 0:
        continue

    # 1) Feature/Target
    drop_cols_tr = BASE_DROP_COMMON + ["cluster_id", "건물번호", TARGET] + [GROUP_COL]
    X_all = tr_sub.drop(columns=[c for c in drop_cols_tr if c in tr_sub.columns], errors="ignore")
    y_all = tr_sub[TARGET].astype(float)

    # 숫자화 + 결측 보정
    X_all = to_numeric_df(X_all).fillna(0.0)

    # 2) Split (고정 시드로 공정한 비교)
    X_tr, X_val, y_tr, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=38)

    # 3) 학습 (조기 종료)
    m2 = xgb.XGBRegressor(**params_xgb)
    m2.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    best_iter = getattr(m2, "best_iteration", None)
    # 4) 검증 예측 + 점수
    val_pred = m2.predict(X_val, iteration_range=(0, best_iter+1) if best_iter is not None else None)
    score = smape(y_val.values, val_pred)

    val_scores_2[gkey] = score
    print(f"[{gkey}] Validation SMAPE: {score:.4f} | best_iter={best_iter}")

    # 5) OOF 수집 (키 인덱스 정합성 유지)
    val_keys = tr_sub.loc[X_val.index, KEYS].reset_index(drop=True)
    oof_df = val_keys.copy()
    oof_df["y_true"] = y_val.reset_index(drop=True).values
    oof_df["pred_2"] = val_pred
    val_oofs_2.append(oof_df)

    # 6) TEST 예측
    X_test = te_sub.drop(columns=[c for c in BASE_DROP_COMMON + ["cluster_id", GROUP_COL] if c in te_sub.columns], errors="ignore")
    X_test = to_numeric_df(X_test).fillna(0.0)
    # 학습 컬럼 정합성(순서/누락) 맞추기
    X_test = X_test.reindex(columns=X_tr.columns, fill_value=0.0)

    te_pred = m2.predict(X_test, iteration_range=(0, best_iter+1) if best_iter is not None else None)
    test_preds_2.append(te_sub[KEYS].assign(pred_2=te_pred))


[호텔] Validation SMAPE: 5.5505 | best_iter=None
[상용] Validation SMAPE: 1.7613 | best_iter=None
[병원] Validation SMAPE: 2.5763 | best_iter=None
[CLUSTER10] Validation SMAPE: 5.1974 | best_iter=None
[학교] Validation SMAPE: 2.8666 | best_iter=None
[CLUSTER11] Validation SMAPE: 5.5608 | best_iter=None
[건물기타] Validation SMAPE: 4.1807 | best_iter=None
[아파트] Validation SMAPE: 3.6995 | best_iter=None
[연구소] Validation SMAPE: 3.1672 | best_iter=None
[백화점] Validation SMAPE: 4.2196 | best_iter=None
[IDC(전화국)] Validation SMAPE: 0.7764 | best_iter=None
[공공] Validation SMAPE: 5.0731 | best_iter=None


In [30]:
train["건물유형"].unique()

array(['호텔', '상용', '병원', '학교', '건물기타', '아파트', '연구소', '백화점', 'IDC(전화국)',
       '공공'], dtype=object)

In [31]:
# ======================
# 모델 3 — XGBoost (건물유형별)
# ======================
#피쳐임포턴스 반영
drop_cols = ['T2', 'T_x_rain', 'RH2', '습도(%)', '풍속(m/s)', 'is_rain', 'wind_level', '강수량(mm)', '기온(°C)', 'ess_hours']
building_types = list(train["건물유형"].unique())
for btype in building_types:
    tr_sub = train[train["건물유형"] == btype].copy()
    te_sub = test [test ["건물유형"] == btype].copy()
    if len(tr_sub) == 0:
        continue

    drop_cols_tr = BASE_DROP_COMMON + BASE_DROP_TRAIN + drop_cols
    X_all = tr_sub.drop(columns=[c for c in drop_cols_tr if c in tr_sub.columns], errors="ignore")
    y_all = tr_sub[TARGET].astype(float)

    # XGBoost는 float만 → object 수치 변환
    X_all = coerce_numeric_objects(X_all).fillna(0.0)

    X_tr, X_val, y_tr, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

    m3 = xgb.XGBRegressor(**params_xgb)
    m3.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    val_pred = m3.predict(X_val)

    score = smape(y_val.values, val_pred)
    val_scores_3[btype] = score   # 건물유형 단위 성능 기록
    print(f"[{btype}] Validation SMAPE: {score:.4f}")

    val_keys = tr_sub.loc[X_val.index, KEYS].reset_index(drop=True)
    oof_df = val_keys.copy()
    oof_df["y_true"] = y_val.reset_index(drop=True).values
    oof_df["pred_3"] = val_pred
    val_oofs_3.append(oof_df)

    # TEST
    X_test = te_sub.drop(columns=[c for c in BASE_DROP_COMMON + BASE_DROP_TEST if c in te_sub.columns], errors="ignore")
    X_test = coerce_numeric_objects(X_test).fillna(0.0)
    X_test = reorder_like(X_test, X_all).fillna(0.0)
    te_pred = m3.predict(X_test)
    test_preds_3.append(te_sub[KEYS].assign(pred_3=te_pred))

[호텔] Validation SMAPE: 5.1850
[상용] Validation SMAPE: 2.2325
[병원] Validation SMAPE: 2.6472
[학교] Validation SMAPE: 2.9510
[건물기타] Validation SMAPE: 4.7772
[아파트] Validation SMAPE: 4.6186
[연구소] Validation SMAPE: 3.7907
[백화점] Validation SMAPE: 4.9227
[IDC(전화국)] Validation SMAPE: 0.8186
[공공] Validation SMAPE: 4.4485


In [38]:
# 모델 1 결과 확인
final_preds_1 = pd.concat(test_preds_1, axis=0).sort_values(['num_date_time', '건물번호'])
print("=== Model 1 (CatBoost-건물유형) ===")
print("Validation SMAPE by type:", {k: round(v, 4) for k, v in val_scores_1.items()})
print("평균 SMAPE:", round(np.mean(list(val_scores_1.values())), 4))

# 모델 2 결과 확인
final_preds_2 = pd.concat(test_preds_2, axis=0).sort_values(['num_date_time', '건물번호'])
print("\n=== Model 2 (CatBoost-cluster_id) ===")
print("Validation SMAPE by cluster:", {k: round(v, 4) for k, v in val_scores_2.items()})
print("평균 SMAPE:", round(np.mean(list(val_scores_2.values())), 4))

# 모델 3 결과 확인
final_preds_3 = pd.concat(test_preds_3, axis=0).sort_values(['num_date_time', '건물번호'])
print("\n=== Model 3 (XGBoost-건물유형) ===")
print("Validation SMAPE by type:", {k: round(v, 4) for k, v in val_scores_3.items()})
print("평균 SMAPE:", round(np.mean(list(val_scores_3.values())), 4))


=== Model 1 (CatBoost-건물유형) ===
Validation SMAPE by type: {'호텔': np.float64(5.8292), '상용': np.float64(1.6775), '병원': np.float64(2.623), 'CLUSTER10': np.float64(7.6803), '학교': np.float64(2.5522), 'CLUSTER11': np.float64(5.5123), '건물기타': np.float64(4.1811), '아파트': np.float64(2.8773), '연구소': np.float64(2.8718), '백화점': np.float64(5.2003), 'IDC(전화국)': np.float64(1.1797), '공공': np.float64(5.1687)}
평균 SMAPE: 3.9461

=== Model 2 (CatBoost-cluster_id) ===
Validation SMAPE by cluster: {'호텔': np.float64(5.5505), '상용': np.float64(1.7613), '병원': np.float64(2.5763), 'CLUSTER10': np.float64(5.1974), '학교': np.float64(2.8666), 'CLUSTER11': np.float64(5.5608), '건물기타': np.float64(4.1807), '아파트': np.float64(3.6995), '연구소': np.float64(3.1672), '백화점': np.float64(4.2196), 'IDC(전화국)': np.float64(0.7764), '공공': np.float64(5.0731)}
평균 SMAPE: 3.7191


ValueError: No objects to concatenate

#### 기존  
=== Model 1 (CatBoost-건물유형) ===  
평균 SMAPE: 4.4898

=== Model 2 (CatBoost-cluster_id) ===  
평균 SMAPE: 2.6234

=== Model 3 (XGBoost-건물유형) ===  
V평균 SMAPE: 3.4165


In [41]:
import numpy as np
import pandas as pd

# --- 설정 ---
# KEYS는 이미 전역에 있다고 하셨으니 그대로 사용합니다.
# 예: KEYS = ["num_date_time"] 또는 ["num_date_time","건물번호"] 등

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

def concat_if_any(lst):
    return pd.concat(lst, ignore_index=True) if len(lst) else pd.DataFrame(columns=KEYS+["y_true", "pred"])

# 1) OOF 결합 (리스트 → 단일 DF) & 컬럼명 표준화
oof1 = concat_if_any(val_oofs_1).rename(columns={"pred": "pred_1"})
oof2 = concat_if_any(val_oofs_2).rename(columns={"pred": "pred_2"})

# 2) 동일 샘플만 비교하도록 inner merge
need_cols = KEYS + ["y_true"]
assert all(c in oof1.columns for c in need_cols+["pred_1"]), "oof1 컬럼 확인 필요"
assert all(c in oof2.columns for c in need_cols+["pred_2"]), "oof2 컬럼 확인 필요"

val_join = oof1.merge(oof2, on=KEYS+["y_true"], how="inner").copy()
assert len(val_join) > 0, "OOF 조인 결과가 비었습니다. KEYS/y_true 일치 여부를 확인하세요."

# 3) 가중치 최적화 (Σw=1, w>=0) — 모델 2개
y_true = val_join["y_true"].values
p1 = val_join["pred_1"].values
p2 = val_join["pred_2"].values

grid = np.linspace(0, 1, 51)  # 0.02 step
best = {"w": (0.5, 0.5), "smape": 1e9}

for w1 in grid:
    w2 = 1.0 - w1
    y_hat = w1 * p1 + w2 * p2
    s = smape(y_true, y_hat)
    if s < best["smape"]:
        best = {"w": (float(w1), float(w2)), "smape": float(s)}

print(f"[OOF 기반 최적 가중치] w={best['w']} | OOF SMAPE={best['smape']:.4f}")

# 4) test 예측 결합
# 각 리스트를 하나로 합치고, KEYS + answer만 남긴 뒤 inner merge로 정렬/정합성 보장
def concat_test_preds(lst, answer_col="answer", alias=None):
    """
    - lst: 여러 조각의 test 예측 DataFrame 리스트
    - answer_col: 기본적으로 기대하는 예측 컬럼명 (ex. 'answer')
    - alias: 반환 시 예측 컬럼을 이 이름으로 맞춰줌 (ex. 'ans_1', 'ans_2')
    """
    if len(lst) == 0:
        # 빈 리스트면 빈 DF 반환
        col = alias if alias else answer_col
        return pd.DataFrame(columns=KEYS + [col])

    df = pd.concat(lst, ignore_index=True)

    # 1) 예측 컬럼 자동 탐지
    candidates = [answer_col, "pred", "prediction", "y_pred", "ans", "value"]
    pred_col = next((c for c in candidates if c in df.columns), None)

    # 2) 그래도 못 찾으면, KEYS를 제외한 단 하나의 컬럼이 예측이라고 간주
    if pred_col is None:
        non_key_cols = [c for c in df.columns if c not in KEYS]
        if len(non_key_cols) == 1:
            pred_col = non_key_cols[0]
        else:
            raise KeyError(
                f"예측 컬럼을 찾지 못했습니다. 후보 {candidates} 중 하나가 있어야 하거나, "
                f"KEYS({KEYS}) 제외 단일 컬럼만 남아야 합니다. 현재 컬럼: {list(df.columns)}"
            )

    # 3) alias로 이름 통일
    out_col = alias if alias else answer_col
    if pred_col != out_col:
        df = df.rename(columns={pred_col: out_col})

    # 4) 필요한 컬럼만 반환 (정합성 보장)
    need_cols = KEYS + [out_col]
    missing = [c for c in need_cols if c not in df.columns]
    if missing:
        raise KeyError(f"필요 컬럼 누락: {missing}. 현재 컬럼: {list(df.columns)}")

    return df[need_cols]

test1 = concat_test_preds(test_preds_1, answer_col="answer", alias="ans_1")
test2 = concat_test_preds(test_preds_2, answer_col="answer", alias="ans_2")

# 정합성 체크 후 조인
test_join = test1.merge(test2, on=KEYS, how="inner").copy()

w1, w2 = best["w"]
test_join["answer"] = w1 * test_join["ans_1"].values + w2 * test_join["ans_2"].values

submission = test_join[KEYS + ["answer"]].copy()
# submission["answer"] = submission["answer"].clip(lower=0)  # 필요 시
# submission.to_csv("submit_blend_1_2.csv", index=False)


[OOF 기반 최적 가중치] w=(0.24, 0.76) | OOF SMAPE=4.0342


In [46]:
# 5) 제출 형식으로 정리 (예: num_date_time, answer)
submission = test_join[KEYS + ["answer"]].copy()

submit = pd.read_csv('Data/sample_submission.csv')

sub_out = submit.merge(
    submission[['num_date_time', 'answer']],
    on='num_date_time',
    how="left"
).copy()

# 컬럼 이름 변경
sub_out = sub_out[['num_date_time', 'answer_y']].rename(columns={'answer_y': 'answer'})

# 최종 제출 파일 저장
sub_out.to_csv("Submit/submit0822_newcluster.csv", index=False)
sub_out

Unnamed: 0,num_date_time,answer
0,1_20240825 00,4611.274684
1,1_20240825 01,4209.557334
2,1_20240825 02,3988.090061
3,1_20240825 03,3627.912793
4,1_20240825 04,3364.295444
...,...,...
16795,100_20240831 19,2240.111938
16796,100_20240831 20,2352.154591
16797,100_20240831 21,2214.481248
16798,100_20240831 22,2441.752701


In [18]:
def apply_min_clip_by_weekday_hour(train: pd.DataFrame,
                                   test: pd.DataFrame,
                                   preds_df: pd.DataFrame,
                                   pred_col: str = "pred_ens",
                                   target_col: str = "전력소비량(kWh)") -> pd.DataFrame:
    """6-8월 train에서 (건물번호, 요일, 시)별 타깃의 '최저값'을 구해
       예측이 그 값보다 작으면 교체(상향)한다.
    """
    # 1) train에서 요일/시/월 파생
    tr = train.copy()
    tr["dt"] = pd.to_datetime(tr["일시"])
    tr["month"] = tr["dt"].dt.month
    tr["weekday"] = tr["dt"].dt.dayofweek
    tr["hour"] = tr["dt"].dt.hour

    # 2) 6~8월만 필터
    tr_summer = tr[(tr["month"].between(6, 8))].copy()

    # 3) (건물번호, weekday, hour)별 최저 타깃
    grp_min = (tr_summer
               .groupby(["건물번호", "weekday", "hour"], as_index=False)[target_col]
               .min()
               .rename(columns={target_col: "min_hist"}))

    # 4) test에도 요일/시 파생
    te = test[["num_date_time", "건물번호", "일시"]].copy()
    te["dt"] = pd.to_datetime(te["일시"])
    te["weekday"] = te["dt"].dt.dayofweek
    te["hour"] = te["dt"].dt.hour
    te = te[["num_date_time", "건물번호", "weekday", "hour"]]

    # 5) preds_df와 key merge
    base = preds_df.merge(te, on=["num_date_time", "건물번호"], how="left")

    # 6) (건물번호, weekday, hour)로 min_hist merge
    base = base.merge(grp_min, on=["건물번호", "weekday", "hour"], how="left")

    # 7) 하한 클리핑
    before = base[pred_col].copy()
    base[pred_col] = np.where(
        (~base["min_hist"].isna()) & (base[pred_col] < base["min_hist"]),
        base["min_hist"],
        base[pred_col]
    )

    # 차이 계산
    diff = (base[pred_col] - before).clip(lower=0)  # 교체된 만큼의 증가량
    changed = int((diff > 0).sum())
    total_increase = diff.sum()

    print(f"[Clip] {changed} rows were raised. 누적 증가량={total_increase:.2f}")

    return base[preds_df.columns]

In [19]:
clipped_preds = apply_min_clip_by_weekday_hour(train, test, test_join[["num_date_time",'건물번호',"pred_ens"]], pred_col="pred_ens")
clipped_preds

[Clip] 125 rows were raised. 누적 증가량=16409.42


Unnamed: 0,num_date_time,건물번호,pred_ens
0,100_20240825 00,100,2861.548567
1,100_20240825 01,100,2735.605156
2,100_20240825 02,100,2526.012037
3,100_20240825 03,100,2097.608341
4,100_20240825 04,100,2390.432590
...,...,...,...
16795,9_20240831 19,9,2960.160710
16796,9_20240831 20,9,2783.665425
16797,9_20240831 21,9,2656.394903
16798,9_20240831 22,9,2333.739335


NameError: name 'clipped_preds' is not defined