In [1]:
import warnings
warnings.filterwarnings('ignore')

import koreanize_matplotlib

# 모델 학습

In [2]:
import pandas as pd

# 저장
train = pd.read_csv('Data/train_0809.csv')
test = pd.read_csv('Data/test_0809.csv')

In [3]:
test_X =  test.drop(['건물유형','일시'], axis = 1)

In [4]:
X = train[test_X.columns]
Y = train[['건물번호', '전력소비량(kWh)']]

In [11]:
import numpy as np

def smape(gt, preds):
    gt = np.array(gt)
    preds = np.array(preds)
    v = 2 * abs(preds - gt) / (abs(preds) + abs(gt))
    score = np.mean(v) * 100
    return score

def weighted_mse(alpha=1.0):
    # xgb.train 전용: f(preds, dtrain) 반환
    def fobj(preds, dtrain):
        labels = dtrain.get_label()                # (n,)
        residual = labels - preds                  # (n,)
        grad = np.where(residual > 0, -2*alpha*residual, -2*residual).astype(np.float32)
        hess = np.where(residual > 0,  2*alpha,                 2.0).astype(np.float32)
        return grad, hess
    return fobj


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

# --- 전제: 아래 3개 함수가 이미 정의되어 있어야 함 ---
# smape(gt, preds), weighted_mse(alpha), custom_smape(preds, dtrain)  # (feval은 이번 코드에선 사용하지 않음)

RANDOM_SEED  = 42
KFOLD_SPLITS = 7
TARGET = '전력소비량(kWh)'

# ===== 0) 전역 모델 (fallback) - DMatrix 버전 =====
X_global = X.drop(columns=['건물번호','건물유형'], errors='ignore').values
y_global_log = np.log(np.maximum(Y[TARGET].values, 1e-6))

dtrain_global = xgb.DMatrix(X_global, label=y_global_log)

params_global = {
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'tree_method': 'hist',
    'seed': RANDOM_SEED,
    'eval_metric': 'rmse',   # 모니터링 지표 (내장)
}
# 전역모델은 홀드아웃 없이 깔끔하게 학습(조기종료 없이)
global_bst = xgb.train(
    params=params_global,
    dtrain=dtrain_global,
    num_boost_round=2000,
    obj=weighted_mse(3),
    verbose_eval=False
)

# ===== 1) 건물번호별 학습/예측 =====
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df   = pd.DataFrame(index=X.index,       columns=["pred"],   dtype=float)

bnum_list = X["건물번호"].unique()

from sklearn.model_selection import KFold
kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for bnum in bnum_list:
    x_df  = X  [X['건물번호'] == bnum].copy()
    y_ser = Y  [Y['건물번호'] == bnum][TARGET].copy()
    xt_df = test_X[test_X['건물번호'] == bnum].copy()

    # 상수/식별자 제거
    drop_cols = ['건물번호']
    if '건물유형' in x_df.columns: drop_cols.append('건물유형')
    x_df  = x_df .drop(columns=drop_cols, errors='ignore')
    xt_df = xt_df.drop(columns=drop_cols, errors='ignore')

    preds_valid = pd.Series(index=y_ser.index, dtype=float)
    preds_test_list = []

    X_mat = x_df.values
    y_arr = y_ser.values

    if len(x_df) < KFOLD_SPLITS:
        # 표본 부족 → 전역 모델 fallback
        dX  = xgb.DMatrix(X_mat)
        preds_valid[:] = np.exp(global_bst.predict(dX))
        if len(xt_df):
            dXt = xgb.DMatrix(xt_df.values)
            answer_df.loc[xt_df.index, "answer"] = np.exp(global_bst.predict(dXt))
        pred_df.loc[preds_valid.index, "pred"] = preds_valid
        print(f'건물번호 {bnum}: 표본 {len(x_df)}개 → 전역 모델 사용')
        continue

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X_mat), 1):
        X_tr, X_va = X_mat[tr_idx], X_mat[va_idx]
        y_tr, y_va = y_arr[tr_idx], y_arr[va_idx]

        # 로그 타깃
        y_tr_log = np.log(np.maximum(y_tr, 1e-6))
        y_va_log = np.log(np.maximum(y_va, 1e-6))

        dtrain = xgb.DMatrix(X_tr, label=y_tr_log)
        dvalid = xgb.DMatrix(X_va, label=y_va_log)

        params = {
            'learning_rate': 0.05,
            'max_depth': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.5,
            'min_child_weight': 3,
            'tree_method': 'hist',
            'seed': RANDOM_SEED,
            'eval_metric': 'rmse',   # 내장 지표로 모니터링
        }

        bst = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=5000,
            evals=[(dvalid, 'valid')],
            obj=weighted_mse(3),        # 커스텀 손실
            # feval=custom_smape,       # ← 원하면 커스텀 평가(표시용)도 함께 사용 가능
            early_stopping_rounds=100,
            verbose_eval=False
        )

        # 검증 예측 (best_iteration까지)
        va_pred_log = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
        va_pred = np.exp(va_pred_log)
        preds_valid.iloc[va_idx] = va_pred

        # SMAPE로 fold 점수 기록
        fold_scores.append(smape(y_va, va_pred))

        # 테스트 예측
        if len(xt_df):
            dtest = xgb.DMatrix(xt_df.values)
            te_pred_log = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
            preds_test_list.append(np.exp(te_pred_log))

    pred_df.loc[preds_valid.index, "pred"] = preds_valid
    if len(xt_df):
        answer_df.loc[xt_df.index, "answer"] = np.mean(preds_test_list, axis=0)

    print(f'건물번호 {bnum} : XGB SMAPE = {np.mean(fold_scores):.4f}')



In [None]:
# ===== 2) 전체 SMAPE =====
total_smape = smape(
    Y.sort_index()[TARGET].values,
    pred_df.sort_index()["pred"].values
)
print(f"Total SMAPE = {total_smape:.4f}")


In [53]:
test_X

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822400 entries, 0 to 2822399
Data columns (total 31 columns):
 #   Column             Dtype  
---  ------             -----  
 0   건물번호               int64  
 1   기온(°C)             float64
 2   강수량(mm)            float64
 3   풍속(m/s)            float64
 4   습도(%)              float64
 5   일조(hr)             int64  
 6   일사(MJ/m2)          int64  
 7   연면적(m2)            float64
 8   냉방면적(m2)           float64
 9   태양광용량(kW)          float64
 10  ESS저장용량(kWh)       float64
 11  PCS용량(kW)          float64
 12  월                  int64  
 13  일                  int64  
 14  요일                 int64  
 15  시간                 int64  
 16  월_sin              float64
 17  월_cos              float64
 18  요일_sin             float64
 19  요일_cos             float64
 20  시간_sin             float64
 21  시간_cos             float64
 22  일_sin              float64
 23  일_cos              float64
 24  건물유형_max           float64
 25  건물유형_mean         

In [None]:
print("answer_df NA:", answer_df['answer'].isna().sum())
print("test_X 시간 분포:", test_X['시간'].value_counts().sort_index().head(10))
print("예시 1개 건물 시간 샘플:", test_X[test_X['건물번호']==bnum_list[0]]['시간'].unique()[:10])


In [None]:
submission = pd.read_csv('Data/sample_submission.csv')

In [66]:
# ===== 파라미터 =====
YEAR = 2024
MONTH = 8                     # 8월만 사용 (필요시 바꿔줘)
BUILDINGS = list(range(1, 101))
DAYS = list(range(25, 32))    # 25~31
HOURS = list(range(0, 24))

# ===== 1) test_X 커버리지 점검 (월/일/시간/건물 필터) =====
mask = (
    test_X['건물번호'].isin(BUILDINGS) &
    (test_X['월'] == MONTH) &
    test_X['일'].isin(DAYS) &
    test_X['시간'].isin(HOURS)
)
test_sub = test_X.loc[mask].copy()

# 건물별 기대 행수 = 7일 * 24시간 = 168
cnt = (test_sub.groupby('건물번호')
                .size()
                .reindex(BUILDINGS, fill_value=0))
print(f"[CHECK] 기대 168행/건물, 실제 건물별 개수 요약:\n{cnt.describe()}")
missing_buildings = cnt[cnt != 168]
if not missing_buildings.empty:
    print("[WARN] 다음 건물은 누락된 (일,시간) 조합이 있습니다(표시는 상위 10개):")
    print(missing_buildings.head(10))

# ===== 2) 전역모델로 예측 누락분 보충 (answer_df 기준) =====
feat_cols = [c for c in test_sub.columns if c not in ['건물번호','건물유형']]
nan_mask = answer_df.loc[test_sub.index, 'answer'].isna()
print(f"[INFO] 예측 누락 행수: {nan_mask.sum()}")

if nan_mask.any():
    X_fb = xgb.DMatrix(test_sub.loc[nan_mask, feat_cols].values)
    answer_df.loc[nan_mask.index, 'answer'] = np.exp(global_bst.predict(X_fb))

# 음수 보정
answer_df.loc[test_sub.index, 'answer'] = answer_df.loc[test_sub.index, 'answer'].clip(lower=0)

# ===== 3) ID(num_date_time) 생성: 건물번호_YYYYMMDD HH =====
year_str  = pd.Series(str(YEAR), index=test_sub.index)
month_str = test_sub['월'].astype(int).map(lambda m: f"{m:02d}")
day_str   = test_sub['일'].astype(int).map(lambda d: f"{d:02d}")
hour_str  = test_sub['시간'].astype(int).map(lambda h: f"{h:02d}")
date_str  = year_str + month_str + day_str
num_date_time = test_sub['건물번호'].astype(str) + '_' + date_str + ' ' + hour_str

# ===== 4) 제출용 DF (필수 조합만, 정렬/중복 제거) =====
submission = pd.DataFrame({
    '건물번호': test_sub['건물번호'].values,
    '월': test_sub['월'].values,
    '일': test_sub['일'].values,
    '시간': test_sub['시간'].values,
    'num_date_time': num_date_time.values,
    'answer': answer_df.loc[test_sub.index, 'answer'].values
})

# 혹시 모를 중복 (같은 num_date_time이 두 번 이상) 제거: 최신값 우선
submission = submission.sort_values(['건물번호','일','시간']).drop_duplicates('num_date_time', keep='last')

# 건물별 행수 재확인 (모두 168이면 정상)
check = submission.groupby('건물번호').size()
print('[CHECK] 제출용 건물별 행수 unique:', sorted(check.unique()))

[CHECK] 기대 168행/건물, 실제 건물별 개수 요약:
count      100.0
mean     28224.0
std          0.0
min      28224.0
25%      28224.0
50%      28224.0
75%      28224.0
max      28224.0
dtype: float64
[WARN] 다음 건물은 누락된 (일,시간) 조합이 있습니다(표시는 상위 10개):
건물번호
1     28224
2     28224
3     28224
4     28224
5     28224
6     28224
7     28224
8     28224
9     28224
10    28224
dtype: int64
[INFO] 예측 누락 행수: 0
[CHECK] 제출용 건물별 행수 unique: [np.int64(168)]


In [67]:
# ===== 5) 저장 (대회 포맷에 맞춰 컬럼만 남기기) =====
submission[['num_date_time', 'answer']].to_csv('submission_t0809.csv', index=False)

In [72]:
t = pd.read_csv('submission_t0809.csv')
t

Unnamed: 0,num_date_time,answer
0,1_20240825 00,3256.193115
1,1_20240825 01,3365.232910
2,1_20240825 02,3345.837402
3,1_20240825 03,3312.782715
4,1_20240825 04,3311.425049
...,...,...
16795,100_20240831 19,1442.896484
16796,100_20240831 20,1417.942749
16797,100_20240831 21,1412.959961
16798,100_20240831 22,1423.716187


In [73]:
submit = pd.read_csv('Data/sample_submission.csv')
submit

Unnamed: 0,num_date_time,answer
0,1_20240825 00,0
1,1_20240825 01,0
2,1_20240825 02,0
3,1_20240825 03,0
4,1_20240825 04,0
...,...,...
16795,100_20240831 19,0
16796,100_20240831 20,0
16797,100_20240831 21,0
16798,100_20240831 22,0


In [74]:
submit.drop('answer',axis = 1, inplace = True)

final = submit.merge(t, on ='num_date_time',how = 'left')
final

Unnamed: 0,num_date_time,answer
0,1_20240825 00,3256.193115
1,1_20240825 01,3365.232910
2,1_20240825 02,3345.837402
3,1_20240825 03,3312.782715
4,1_20240825 04,3311.425049
...,...,...
16795,100_20240831 19,1442.896484
16796,100_20240831 20,1417.942749
16797,100_20240831 21,1412.959961
16798,100_20240831 22,1423.716187


In [75]:
final.to_csv('Submit/submission_t0809.csv', index=False)

In [16]:
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

RANDOM_SEED = 42
TARGET = '전력소비량(kWh)'
sort_keys = ['건물번호','월','일','시간']

# 시간 컬럼 복원(옵션): test_X['시간']이 전부 0이고 sin/cos가 있으면 복원
if (test_X['시간'] == 0).all() and {'시간_sin','시간_cos'}.issubset(test_X.columns):
    ang = np.mod(np.arctan2(test_X['시간_sin'], test_X['시간_cos']), 2*np.pi)
    test_X['시간'] = (np.round(ang/(2*np.pi)*24).astype(int) % 24)

# === 1) 랙/롤링 피처 생성 ===
def add_lags_per_bnum(df, value_col, lags=(1,24), roll_win=24):
    df = df.sort_values(sort_keys).copy()
    g = df.groupby('건물번호', group_keys=False)
    for L in lags:
        df[f'lag_{L}h'] = g[value_col].shift(L)
    # 직전값부터 rolling 평균
    df[f'roll{roll_win}h_mean'] = g[value_col].apply(lambda s: s.shift(1).rolling(roll_win).mean())
    return df

# train: 타깃 기반 랙 생성
train_all  = X.join(Y[[TARGET]], how='left')        # 인덱스 정렬 가정
train_feat = add_lags_per_bnum(train_all, TARGET, lags=(1,24), roll_win=24)

# 피처 선택/결측 처리
feature_cols = [c for c in train_feat.columns if c not in ['건물번호','건물유형', TARGET]]
lag_cols     = [c for c in feature_cols if c.startswith('lag_') or c.startswith('roll')]

X_lag = train_feat[feature_cols].copy()
X_lag[lag_cols] = X_lag[lag_cols].fillna(0)

y_log = np.log(np.maximum(train_feat[TARGET].values, 1e-6))

# test: 타깃이 없으므로 프록시로 랙 생성(예: 건물번호_mean; 없으면 0으로 대체)
proxy_col = '건물번호_mean' if '건물번호_mean' in test_X.columns else None
test_tmp = test_X.copy()
if proxy_col is None:
    test_tmp['__proxy__'] = 0.0
    proxy_col_use = '__proxy__'
else:
    proxy_col_use = proxy_col

test_feat = add_lags_per_bnum(test_tmp, proxy_col_use, lags=(1,24), roll_win=24)

# train과 동일한 컬럼 정렬로 맞추기
test_feat   = test_feat.reindex(columns=train_feat.columns, fill_value=0)
X_test_lag  = test_feat[feature_cols].copy()
X_test_lag[lag_cols] = X_test_lag[lag_cols].fillna(0)


In [17]:
# === 2) TSCV로 best_round 선택 (랙/롤링 피처 기준) ===
order_idx = np.lexsort((train_feat['시간'].values,
                        train_feat['일'].values,
                        train_feat['월'].values))
X_sorted = X_lag.iloc[order_idx].values
y_sorted = np.log(np.maximum(train_feat.iloc[order_idx][TARGET].values, 1e-6))

params = dict(
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    tree_method='hist',
    seed=RANDOM_SEED,
    eval_metric='rmse'
)

tscv = TimeSeriesSplit(n_splits=7)
cv_smapes, best_ests = [], []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(X_sorted), 1):
    dtr = xgb.DMatrix(X_sorted[tr_idx], label=y_sorted[tr_idx])
    dva = xgb.DMatrix(X_sorted[va_idx], label=y_sorted[va_idx])

    bst_cv = xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=5000,
        evals=[(dva, 'valid')],
        obj=weighted_mse(3),
        early_stopping_rounds=100,
        verbose_eval=False
    )

    va_pred = np.exp(bst_cv.predict(dva, iteration_range=(0, bst_cv.best_iteration+1)))
    gt      = np.exp(y_sorted[va_idx])
    cv_smapes.append(smape(gt, va_pred))
    best_ests.append(bst_cv.best_iteration + 1)

print(f"[TSCV] SMAPE mean ± std: {np.mean(cv_smapes):.4f} ± {np.std(cv_smapes):.4f}")
best_round = int(np.median(best_ests))
print(f"[TSCV] chosen num_boost_round = {best_round}")

[TSCV] SMAPE mean ± std: 9.6817 ± 2.6873
[TSCV] chosen num_boost_round = 112


In [18]:
# === 3) 전역 모델 최종 학습(전체 train), 예측 ===
dtrain = xgb.DMatrix(X_lag.values, label=y_log)
bst_global = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_round,
    obj=weighted_mse(3),
    verbose_eval=False
)

# train 예측(글로벌), 잔차 계산
yhat_global_train = np.exp(bst_global.predict(xgb.DMatrix(X_lag.values)))
residual = train_feat[TARGET].values - yhat_global_train   # 원 스케일

# test 예측(글로벌)
pred_test_global = np.exp(bst_global.predict(xgb.DMatrix(X_test_lag.values)))

# === 4) 건물별 "작은" 잔차 모델 학습 & 보정 ===
from xgboost import XGBRegressor

res_models = {}
for b in train_feat['건물번호'].unique():
    idx_b = (train_feat['건물번호'] == b).values
    if idx_b.sum() < 200:   # 표본 부족 시 생략 (필요시 조정)
        continue
    Xb = X_lag.loc[idx_b, feature_cols].values
    rb = residual[idx_b]
    m = XGBRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=4,
        subsample=0.8, colsample_bytree=0.8, random_state=RANDOM_SEED,
        objective='reg:squarederror', tree_method='hist'
    )
    m.fit(Xb, rb)
    res_models[b] = m

# 테스트 보정
pred_final = pred_test_global.copy()
for b in test_feat['건물번호'].unique():
    idx_b = (test_feat['건물번호'] == b).values
    if b in res_models:
        Xb_te = X_test_lag.loc[idx_b, feature_cols].values
        pred_final[idx_b] += res_models[b].predict(Xb_te)

answer_df = pd.DataFrame({'answer': np.clip(pred_final, 0, None)}, index=test_X.index)

In [19]:
# === 5) 8월 25~31일 × 24h 제출 파일 생성 (num_date_time) ===
YEAR  = 2024
MONTH = 8
BUILDINGS = list(range(1, 101))
DAYS  = list(range(25, 32))
HOURS = list(range(0, 24))

mask = (
    test_X['건물번호'].isin(BUILDINGS) &
    (test_X['월'] == MONTH) &
    test_X['일'].isin(DAYS) &
    test_X['시간'].isin(HOURS)
)
test_sub = test_X.loc[mask].copy()

# 누락 예측 보충(혹시 모를 NaN)
nan_mask = answer_df.loc[test_sub.index, 'answer'].isna()
if nan_mask.any():
    # 글로벌 예측으로 보충
    X_fb = X_test_lag.loc[nan_mask.values].values
    answer_df.loc[nan_mask.index, 'answer'] = np.exp(bst_global.predict(xgb.DMatrix(X_fb)))

# ID 생성: 건물번호_YYYYMMDD HH
year_str  = pd.Series(str(YEAR), index=test_sub.index)
month_str = test_sub['월'].astype(int).map(lambda m: f"{m:02d}")
day_str   = test_sub['일'].astype(int).map(lambda d: f"{d:02d}")
hour_str  = test_sub['시간'].astype(int).map(lambda h: f"{h:02d}")
date_str  = year_str + month_str + day_str

num_date_time = test_sub['건물번호'].astype(str) + '_' + date_str + ' ' + hour_str

submission = pd.DataFrame({
    'num_date_time': num_date_time.values,
    'answer': answer_df.loc[test_sub.index, 'answer'].values
}).sort_values('num_date_time')

# 건물별 행수 체크(168이어야 정상)
chk = test_sub.groupby('건물번호').size()
print('[CHECK] per-building rows (unique):', sorted(chk.unique()))

# 저장
submission.to_csv('submission.csv', index=False)
print('Saved: submission.csv')
print(submission.head())

[CHECK] per-building rows (unique): [np.int64(28224)]
Saved: submission.csv
           num_date_time      answer
2794342  100_20240825 00    0.000000
2794282  100_20240825 00  147.469360
2794283  100_20240825 00  115.195312
2794284  100_20240825 00  112.552872
2794285  100_20240825 00    0.000000


In [20]:

# 0) 필요하면 시간 복원 (시간이 전부 0이고 sin/cos가 있으면)
if (test_X['시간'] == 0).all() and {'시간_sin','시간_cos'}.issubset(test_X.columns):
    ang = np.mod(np.arctan2(test_X['시간_sin'], test_X['시간_cos']), 2*np.pi)
    test_X['시간'] = (np.round(ang/(2*np.pi)*24).astype(int) % 24)

# 1) 비교 전에 dtype 강제 정규화 (int로)
tx = test_X.copy()
tx['월'] = tx['월'].astype(int)
tx['일'] = tx['일'].astype(int)
tx['시간'] = tx['시간'].astype(int)
tx['건물번호'] = tx['건물번호'].astype(int)

# 2) 원하는 구간만 정확히 마스크
MONTH  = 8
DAYS   = range(25, 32)   # 25..31
HOURS  = range(0, 24)    # 0..23
BLD_OK = range(1, 101)

mask = (
    tx['건물번호'].isin(BLD_OK) &
    tx['월'].eq(MONTH) &
    tx['일'].isin(DAYS) &
    tx['시간'].isin(HOURS)
)
test_sub = tx.loc[mask].copy()

print("총 선택 행수:", mask.sum())
print("건물별 행수 unique:", sorted(test_sub.groupby('건물번호').size().unique())[:10])

# 기대 출력: 건물별 행수 unique: [168]


총 선택 행수: 2822400
건물별 행수 unique: [np.int64(28224)]
