In [2]:
import warnings
warnings.filterwarnings('ignore')

import koreanize_matplotlib

In [18]:
import pandas as pd

# Columns removed from both splits; these are the features the notebook's
# feature-importance section lists as the bottom-20% removal targets.
drop_cols = [
    'T2', 'T_x_rain', 'RH2', '습도(%)', '풍속(m/s)',
    'is_rain', 'wind_level', '강수량(mm)',
]

# The training frame additionally drops the "일조(hr)" and "일사(MJ/m2)" columns;
# the test frame keeps them.
train = pd.read_csv('Data/train_0820.csv').drop(columns=drop_cols + ["일조(hr)", "일사(MJ/m2)"])
test = pd.read_csv('Data/test_0820.csv').drop(columns=drop_cols)
submit = pd.read_csv('Data/sample_submission.csv')

In [19]:
import random
import numpy as np
import os
from xgboost import XGBRegressor
def seed_everything(seed):
    """Seed NumPy's and Python's global random generators with the same value.

    seed: value handed to ``np.random.seed`` and ``random.seed``; its string
          form is also written to the ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): setting PYTHONHASHSEED here presumably only matters for
    # processes started afterwards -- confirm if hash stability is required.
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(42)  # fix the seed

In [20]:
# Target column: electricity consumption (kWh).
train_y = train['전력소비량(kWh)']

# Features: remove 'num_date_time', '일시' and the target, then cut off the last
# 10 remaining columns.
# NOTE(review): which columns the trailing 10 are is not visible here -- confirm.
train_x = (
    train
    .drop(columns=['num_date_time', '일시', '전력소비량(kWh)'])
    .iloc[:, :-10]
)

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Display the shapes of the train / validation feature and target splits.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((163200, 30), (40800, 30), (163200,), (40800,))

In [23]:
import numpy as np

def smape(y_true, y_pred):
    """
    SMAPE (Symmetric Mean Absolute Percentage Error).

    y_true: actual values (array-like)
    y_pred: predicted values (array-like)
    Returns: SMAPE in percent, i.e. 100 * mean(2 * |pred - true| / (|true| + |pred|)).
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)

    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)

    # A zero scale means both values are zero, so the error is zero as well;
    # dividing by 1 instead makes that term contribute 0.
    safe_scale = np.where(scale == 0, 1, scale)

    return 100 * np.mean(2 * abs_error / safe_scale)


## 일조/일사량 예측값을 제외하는 것이 결과가 더 좋음

In [24]:
def smape_xgb(y_pred, dtrain):
    """SMAPE evaluation callback working on predictions and a labelled dataset.

    y_pred: predicted values (array)
    dtrain: object whose ``get_label()`` returns the true values
    Returns: the tuple ``('smape', <SMAPE in percent>, False)``.
    """
    labels = dtrain.get_label()

    abs_error = np.abs(y_pred - labels)
    scale = np.abs(labels) + np.abs(y_pred)
    # Where the scale is zero, divide by 1 so the term contributes 0.
    ratio = 2 * abs_error / np.where(scale == 0, 1, scale)

    # NOTE(review): the trailing False looks like an "is higher better" flag
    # from a three-element metric convention -- confirm this matches what the
    # XGBoost version in use expects before wiring this callback in.
    return 'smape', 100 * np.mean(ratio), False


In [25]:
from sklearn.metrics import make_scorer
import xgboost as xgb

# SMAPE-based scorer
def smape_scorer(y_true, y_pred):
    """Return the negated SMAPE so that a larger score means a better fit."""
    return -smape(y_true, y_pred)  # negate: turns the minimisation into a maximisation

scorer = make_scorer(smape_scorer, greater_is_better=True)

building_types = train['건물유형'].unique()
val_scores = {}    # building type -> validation SMAPE (%)
models = {}        # building type -> fitted XGBRegressor
preds_list = []    # per-building-type test predictions, concatenated later

# Columns that are never model inputs (identifier, timestamp, grouping key,
# target). Loop-invariant, so it is defined once outside the loop.
drop_cols = ['num_date_time', '일시', '건물유형', '전력소비량(kWh)']

for btype in building_types:
    # 1. Select the train/test rows of this building type.
    train_sub = train[train['건물유형'] == btype]
    # Explicit copy: a 'pred' column is added to this frame below, and writing
    # into a boolean-mask slice of `test` is a chained assignment.
    test_sub = test[test['건물유형'] == btype].copy()

    # 2. Split features and target.
    X = train_sub.drop(columns=drop_cols)
    y = train_sub['전력소비량(kWh)']

    # 3. Train/validation split (random, row-wise).
    # NOTE(review): if the rows are consecutive time steps per building, a
    # random split may make validation scores optimistic -- confirm.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Model with the tuned hyper-parameters.
    xgb_model = xgb.XGBRegressor(
        n_estimators=700,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",   # use "gpu_hist" on a GPU machine
        random_state=42,
        n_jobs=-1,
        eval_metric="mae"
    )

    # 5. Fit; the validation MAE is logged every 100 rounds.
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    # 6. Store the validation SMAPE.
    y_val_pred = xgb_model.predict(X_val)
    val_scores[btype] = smape(y_val, y_val_pred)

    # 7. Keep the fitted model.
    models[btype] = xgb_model

    # 8. Predict on the test rows, using exactly the training feature columns
    #    in training order (the test frame may carry extra columns).
    X_test = test_sub[X_train.columns.tolist()]

    test_sub['pred'] = xgb_model.predict(X_test)

    preds_list.append(test_sub[['num_date_time', '건물번호', 'pred']])

[0]	validation_0-mae:2184.36144
[100]	validation_0-mae:161.38125
[200]	validation_0-mae:148.18900
[300]	validation_0-mae:145.28334
[400]	validation_0-mae:144.17324
[500]	validation_0-mae:143.61062
[600]	validation_0-mae:143.25235
[699]	validation_0-mae:142.98732
[0]	validation_0-mae:1107.72151
[100]	validation_0-mae:51.62928
[200]	validation_0-mae:45.41212
[300]	validation_0-mae:44.09951
[400]	validation_0-mae:43.62120
[500]	validation_0-mae:43.41040
[600]	validation_0-mae:43.27412
[699]	validation_0-mae:43.24257
[0]	validation_0-mae:2790.36692
[100]	validation_0-mae:109.22240
[200]	validation_0-mae:98.35267
[300]	validation_0-mae:96.36259
[400]	validation_0-mae:95.74119
[500]	validation_0-mae:95.30420
[600]	validation_0-mae:95.02053
[699]	validation_0-mae:94.87468
[0]	validation_0-mae:2080.34753
[100]	validation_0-mae:92.68717
[200]	validation_0-mae:77.84216
[300]	validation_0-mae:75.47867
[400]	validation_0-mae:74.49196
[500]	validation_0-mae:74.07326
[600]	validation_0-mae:73.83363


In [26]:
# Stack the per-building-type predictions into a single frame.
final_preds = pd.concat(preds_list)
final_preds = final_preds.sort_values(by=['num_date_time', '건물번호'])

# Report the validation SMAPE per building type and its unweighted mean.
mean_val_smape = sum(val_scores.values()) / len(val_scores)
print("건물유형별 SMAPE:", val_scores)
print("평균 SMAPE:", mean_val_smape)

건물유형별 SMAPE: {'호텔': np.float64(5.21109363498322), '상용': np.float64(2.1804464712564835), '병원': np.float64(2.7191599460559464), '학교': np.float64(2.8839442545904204), '건물기타': np.float64(4.685194708837909), '아파트': np.float64(5.129326656353152), '연구소': np.float64(3.8915649668417935), '백화점': np.float64(4.8134564761527425), 'IDC(전화국)': np.float64(0.8511624382569762), '공공': np.float64(4.553821277338601)}
평균 SMAPE: 3.6919170830667243


SMAPE가 3.9일 때 점수가 7.62였음 (현재까지 가장 좋은 결과)  

기본: 3.80  

피처 중요도 하위 20% 제외 후  
일조/일사 예측값이 존재하는 경우 SMAPE: 3.73  
일조/일사 예측값이 존재하지 않는 경우 SMAPE: 3.69  
--> SMAPE 감소, 성능 소폭 향상    

In [27]:
# Attach the predictions to the sample submission.
# validate='one_to_one' makes the merge fail loudly if 'num_date_time' is
# duplicated on either side instead of silently multiplying rows.
submit_final = (
    submit
    .merge(final_preds, on='num_date_time', how='left', validate='one_to_one')
    .drop(columns=['answer', '건물번호'])      # discard the placeholder answer
    .rename(columns={'pred': 'answer'})        # pred -> answer
)

# A left merge leaves NaN where a submission row found no prediction; stop
# before writing an incomplete submission file.
n_missing = int(submit_final['answer'].isna().sum())
assert n_missing == 0, f"{n_missing} submission rows have no prediction"

submit_final.to_csv('Submit/submit0820_피처임포턴스 반영.csv', index=False)
submit_final

Unnamed: 0,num_date_time,answer
0,1_20240825 00,4551.306152
1,1_20240825 01,4080.455566
2,1_20240825 02,3707.581299
3,1_20240825 03,3351.706055
4,1_20240825 04,3167.836182
...,...,...
16795,100_20240831 19,2651.156738
16796,100_20240831 20,2745.572266
16797,100_20240831 21,2448.667725
16798,100_20240831 22,2648.843262


In [28]:
# Persist the current train / test frames next to the source data.
for frame, out_path in (
    (train, 'Data/train_0820_피처임포턴스 반영.csv'),
    (test, 'Data/test_0820_피처임포턴스 반영.csv'),
):
    frame.to_csv(out_path, index=False)

### 피처임포턴스 확인

In [14]:
import pandas as pd

# Accumulator: feature name -> sample-weighted sum of gain importances.
all_importances = {}

for btype, model in models.items():
    booster = model.get_booster()

    # Per-building-type feature importance (gain based).
    importance = booster.get_score(importance_type='gain')

    # Weight = number of training rows of this building type.
    weight = len(train[train['건물유형'] == btype])

    # get_score() leaves out features that were never used in a split.
    # Register every trained feature with 0 so that those features still show
    # up in the ranking and can fall into the low-importance cut.
    for feat in booster.feature_names or []:
        all_importances.setdefault(feat, 0)

    for feat, val in importance.items():
        all_importances[feat] = all_importances.get(feat, 0) + val * weight   # weighted sum

# Convert to a DataFrame.
fi_all = pd.DataFrame(list(all_importances.items()), columns=['feature', 'weighted_importance'])

# Normalise so the importances sum to 1.
fi_all['weighted_importance'] /= fi_all['weighted_importance'].sum()

# Sort by importance, highest first.
fi_all = fi_all.sort_values(by='weighted_importance', ascending=False).reset_index(drop=True)

print(fi_all.head(20))  # show the top 20


               feature  weighted_importance
0         ESS저장용량(kWh)             0.481901
1            태양광용량(kW)             0.106081
2              연면적(m2)             0.105408
3             냉방면적(m2)             0.094437
4                 건물번호             0.062065
5     time_usage_level             0.038372
6            month_cos             0.026623
7      cool_area_ratio             0.018263
8        cdh_base_used             0.014801
9                 hour             0.008188
10            hour_cos             0.007761
11               month             0.007505
12           month_sin             0.006452
13          is_weekend             0.004195
14  cooling_load_index             0.003001
15     day_usage_level             0.002412
16             weekday             0.002158
17             holiday             0.001930
18            hour_sin             0.001777
19                  DI             0.001341


In [15]:
# Cut-off at the 0.2 quantile of the weighted importances (bottom 20%).
threshold = fi_all['weighted_importance'].quantile(0.2)

# Features at or below the cut-off are the removal candidates.
is_low = fi_all['weighted_importance'] <= threshold
low_importance_feats = fi_all.loc[is_low, 'feature'].tolist()

print("제거 대상 feature:", low_importance_feats)

제거 대상 feature: ['T2', 'T_x_rain', 'RH2', '습도(%)', '풍속(m/s)', 'is_rain', 'wind_level', '강수량(mm)']


In [16]:
from sklearn.base import clone

val_scores_reduced = {}   # building type -> validation SMAPE (%) on the reduced feature set
models_reduced = {}       # building type -> model fitted on the reduced feature set

# Non-feature columns plus the two columns "일사(MJ/m2)" / "일조(hr)"; loop-invariant.
drop_cols = ['num_date_time','일시','건물유형','일사(MJ/m2)','일조(hr)','전력소비량(kWh)']

for btype in building_types:
    train_sub = train[train['건물유형'] == btype]

    # Base drop list + low-importance features; errors='ignore' tolerates
    # columns that are already absent from the frame.
    X = train_sub.drop(columns=drop_cols + low_importance_feats, errors='ignore')
    y = train_sub['전력소비량(kWh)']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit an unfitted copy carrying the same hyper-parameters. Refitting
    # models[btype] itself would overwrite the baseline model in place, leaving
    # `models` and `models_reduced` pointing at the same estimator.
    best_model = clone(models[btype])

    # Fit on the reduced feature set.
    best_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    y_val_pred = best_model.predict(X_val)
    val_scores_reduced[btype] = smape(y_val, y_val_pred)
    models_reduced[btype] = best_model

In [17]:
print("원래 SMAPE:", val_scores)
print("줄인 SMAPE:", val_scores_reduced)

print("평균 SMAPE(원래):", sum(val_scores.values())/len(val_scores))
print("평균 SMAPE(줄인):", sum(val_scores_reduced.values())/len(val_scores_reduced))

원래 SMAPE: {'호텔': np.float64(5.326782941497591), '상용': np.float64(2.22477739791447), '병원': np.float64(2.7814560388802883), '학교': np.float64(3.0476565187468028), '건물기타': np.float64(4.784803668382681), '아파트': np.float64(5.354087777884839), '연구소': np.float64(3.992798567664872), '백화점': np.float64(4.965716893522701), 'IDC(전화국)': np.float64(0.8658179908224422), '공공': np.float64(4.682470541325397)}
줄인 SMAPE: {'호텔': np.float64(5.21109363498322), '상용': np.float64(2.1804464712564835), '병원': np.float64(2.7191599460559464), '학교': np.float64(2.8839442545904204), '건물기타': np.float64(4.685194708837909), '아파트': np.float64(5.129326656353152), '연구소': np.float64(3.8915649668417935), '백화점': np.float64(4.8134564761527425), 'IDC(전화국)': np.float64(0.8511624382569762), '공공': np.float64(4.553821277338601)}
평균 SMAPE(원래): 3.8026368336642093
평균 SMAPE(줄인): 3.6919170830667243
