In [1]:
import warnings
warnings.filterwarnings('ignore')

import koreanize_matplotlib

In [19]:
import pandas as pd

# Columns previously considered for removal based on feature importance
# (kept for reference only; nothing below is executed):
# drop_cols = ['T2', 'T_x_rain', 'RH2', '습도(%)', '풍속(m/s)', 'is_rain', 'wind_level', '강수량(mm)']
# additional_drop_cols = ['기온(°C)', '강수량_전일', 'ΔT', 'ess_hours']
# ["일조(hr)", "일사(MJ/m2)"]

# Read the training frame, the test frame and the submission template, in that order.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/train_0820with변화량변수.csv',
        'Data/test_0820with변화량변수.csv',
        'Data/sample_submission.csv',
    )
)

In [20]:
# Display the column labels of the loaded training frame.
train.columns

Index(['num_date_time', '건물번호', '일시', '전력소비량(kWh)', '건물유형', '연면적(m2)',
       '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'month', 'day',
       'hour', 'weekday', 'is_weekend', 'holiday', 'cdh_base_used', 'DI',
       'THI', 'CDH', 'cool_area_ratio', 'kWh_per_m2', 'is_peak_hour',
       'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'T_x_RH',
       'cooling_load_index', 'time_usage_level', 'day_usage_level', '기온_MA3'],
      dtype='object')

In [21]:
import random
import numpy as np
import os
from xgboost import XGBRegressor
def seed_everything(seed):
    """Seed the random-number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [22]:
# Target series.
train_y = train['전력소비량(kWh)']

# Features: remove the id, timestamp and target columns, then cut the last ten
# remaining columns by position.
# NOTE(review): the positional cut depends on the CSV column order — confirm
# which columns it is meant to exclude.
train_x = (
    train
    .drop(columns=['num_date_time', '일시', '전력소비량(kWh)'])
    .iloc[:, :-10]
)

In [23]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split over all rows, with a fixed random_state of 42.
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Shapes of the four split pieces, in the order X_train, X_val, y_train, y_val.
tuple(piece.shape for piece in (X_train, X_val, y_train, y_val))

((163200, 19), (40800, 19), (163200,), (40800,))

In [25]:
import numpy as np

def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like).

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
        Positions where the denominator is zero use a denominator of 1
        instead; the numerator is zero there, so they contribute 0.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)
    # Swap zero denominators for 1 to avoid dividing by zero.
    safe_scale = np.where(scale == 0, 1, scale)
    return 100 * np.mean(2 * abs_error / safe_scale)


## 일조/일사량 예측값을 제외하는 것이 결과가 더 좋음

In [26]:
def smape_xgb(y_pred, dtrain):
    """SMAPE (in percent) computed against the labels held by ``dtrain``.

    Args:
        y_pred: Predicted values.
        dtrain: Object exposing ``get_label()`` that returns the true values.

    Returns:
        The tuple ``('smape', value, False)``.
        NOTE(review): the trailing ``False`` looks like a "higher is better"
        flag — confirm that the consumer of this callback expects three items.
    """
    labels = dtrain.get_label()
    abs_error = np.abs(y_pred - labels)
    scale = np.abs(labels) + np.abs(y_pred)
    # Zero denominators are swapped for 1; the numerator is zero there as well.
    safe_scale = np.where(scale == 0, 1, scale)
    score = 100 * np.mean(2 * abs_error / safe_scale)
    return 'smape', score, False


In [27]:
from sklearn.metrics import make_scorer
import xgboost as xgb

# SMAPE-based scorer
def smape_scorer(y_true, y_pred):
    """Return the negated SMAPE so that a larger value means a better fit."""
    error_pct = smape(y_true, y_pred)
    return -error_pct

# Wrap the negated SMAPE as a scikit-learn scorer (larger is better).
scorer = make_scorer(smape_scorer, greater_is_better=True)

building_types = train['건물유형'].unique()
val_scores = {}   # building type -> validation SMAPE
models = {}       # building type -> fitted XGBRegressor
preds_list = []   # one test-prediction frame per building type

for btype in building_types:
    # 1. Select the train/test rows that belong to this building type.
    train_sub = train[train['건물유형'] == btype]
    test_sub = test[test['건물유형'] == btype]

    # 2. Separate features and target.
    drop_cols = ['num_date_time', '일시', '건물유형', '전력소비량(kWh)']
    X = train_sub.drop(columns=drop_cols)
    y = train_sub['전력소비량(kWh)']

    # 3. Train/validation split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Define the model with the tuned hyper-parameters.
    xgb_model = xgb.XGBRegressor(
        n_estimators=700,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",   # use "gpu_hist" in a GPU environment
        random_state=42,
        n_jobs=-1,
        eval_metric="mae"
    )

    # 5. Fit, logging the validation MAE every 100 rounds.
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    # 6. Record the validation score.
    y_val_pred = xgb_model.predict(X_val)
    val_scores[btype] = smape(y_val, y_val_pred)

    # 7. Keep the fitted model.
    models[btype] = xgb_model

    # 8. Predict on the test rows, with columns in the same order as training.
    X_test = test_sub.drop(columns=['num_date_time', '일시', '건물유형'])
    X_test = X_test[X_train.columns.tolist()]

    # test_sub is a filtered slice of `test`; assigning a column onto it is a
    # chained assignment. Build the prediction frame with assign() instead,
    # which returns a new frame and leaves the slice untouched.
    preds_list.append(
        test_sub[['num_date_time', '건물번호']].assign(pred=xgb_model.predict(X_test))
    )

[0]	validation_0-mae:2184.00045
[100]	validation_0-mae:161.93601
[200]	validation_0-mae:148.11304
[300]	validation_0-mae:145.24335
[400]	validation_0-mae:144.08548
[500]	validation_0-mae:143.36837
[600]	validation_0-mae:142.92314
[699]	validation_0-mae:142.67703
[0]	validation_0-mae:1107.99945
[100]	validation_0-mae:51.73281
[200]	validation_0-mae:45.69198
[300]	validation_0-mae:44.56521
[400]	validation_0-mae:44.13901
[500]	validation_0-mae:43.86722
[600]	validation_0-mae:43.68799
[699]	validation_0-mae:43.64665
[0]	validation_0-mae:2790.17069
[100]	validation_0-mae:110.00830
[200]	validation_0-mae:98.54826
[300]	validation_0-mae:96.55566
[400]	validation_0-mae:95.68912
[500]	validation_0-mae:95.25271
[600]	validation_0-mae:94.96228
[699]	validation_0-mae:94.77579
[0]	validation_0-mae:2080.54489
[100]	validation_0-mae:93.85615
[200]	validation_0-mae:78.54743
[300]	validation_0-mae:76.25614
[400]	validation_0-mae:75.43742
[500]	validation_0-mae:74.92696
[600]	validation_0-mae:74.65774


In [28]:
# Stack the per-building-type prediction frames and order the rows by id, then building number.
final_preds = (
    pd.concat(preds_list)
    .sort_values(by=['num_date_time', '건물번호'])
)

# Report the validation SMAPE per building type and its unweighted mean.
print("건물유형별 SMAPE:", val_scores)
print("평균 SMAPE:", sum(val_scores.values()) / len(val_scores))

건물유형별 SMAPE: {'호텔': np.float64(5.199020050753658), '상용': np.float64(2.196179026349523), '병원': np.float64(2.7183314085929475), '학교': np.float64(2.9018324945811957), '건물기타': np.float64(4.671065966882287), '아파트': np.float64(4.997275768477072), '연구소': np.float64(3.8653902697491604), '백화점': np.float64(4.767196478222327), 'IDC(전화국)': np.float64(0.832500503033067), '공공': np.float64(4.5460138750278105)}
평균 SMAPE: 3.669480584166905


In [29]:
# Left-join the predictions onto the submission template by id, discard the
# template's `answer` and the building-number column, and expose `pred` as the
# new `answer`.
submit_final = (
    submit
    .merge(final_preds, how='left', on='num_date_time')
    .drop(columns=['answer', '건물번호'])
    .rename(columns={'pred': 'answer'})
)
submit_final.to_csv('Submit/submit0820_추가파생변수반영_3.66.csv', index=False)
submit_final

Unnamed: 0,num_date_time,answer
0,1_20240825 00,4377.240723
1,1_20240825 01,3872.577393
2,1_20240825 02,3681.821777
3,1_20240825 03,3282.305176
4,1_20240825 04,3008.535400
...,...,...
16795,100_20240831 19,2640.962646
16796,100_20240831 20,2745.410156
16797,100_20240831 21,2440.273926
16798,100_20240831 22,2613.595459


### 피처임포턴스 확인

In [13]:
import pandas as pd

# Per-feature running total of (gain importance x number of training rows of the building type).
all_importances = {}

for btype, model in models.items():
    # Weight = number of training rows of this building type.
    weight = len(train[train['건물유형']==btype])

    # Gain-based importance reported by this building type's booster.
    gain_by_feature = model.get_booster().get_score(importance_type='gain')

    for feat, val in gain_by_feature.items():
        all_importances[feat] = all_importances.get(feat, 0) + val * weight

# Build the table, normalise the weighted sums so they add up to 1, and sort
# from most to least important.
fi_all = (
    pd.DataFrame(list(all_importances.items()), columns=['feature', 'weighted_importance'])
    .assign(
        weighted_importance=lambda frame: (
            frame['weighted_importance'] / frame['weighted_importance'].sum()
        )
    )
    .sort_values(by='weighted_importance', ascending=False)
    .reset_index(drop=True)
)

print(fi_all.head(20))  # top 20 rows


               feature  weighted_importance
0         ESS저장용량(kWh)             0.370155
1            태양광용량(kW)             0.142299
2             냉방면적(m2)             0.127049
3              연면적(m2)             0.106666
4                 건물번호             0.088276
5     time_usage_level             0.055766
6      cool_area_ratio             0.022530
7        cdh_base_used             0.017881
8      day_usage_level             0.012147
9                 hour             0.011178
10            hour_cos             0.009878
11               month             0.008278
12          is_weekend             0.005622
13  cooling_load_index             0.003584
14             holiday             0.002662
15             weekday             0.002349
16           PCS용량(kW)             0.002310
17           month_sin             0.002075
18           month_cos             0.001664
19                  DI             0.001644


In [14]:
# Features whose weighted importance is at or below the 10th percentile are
# listed as removal candidates.
threshold = fi_all['weighted_importance'].quantile(0.1)
low_importance_feats = fi_all.loc[
    fi_all['weighted_importance'] <= threshold, 'feature'
].tolist()

print("제거 대상 feature:", low_importance_feats)

제거 대상 feature: ['기온(°C)', '강수량_전일', 'ΔT', 'ess_hours']


In [15]:
from sklearn.base import clone

val_scores_reduced = {}   # building type -> validation SMAPE on the reduced feature set
models_reduced = {}       # building type -> model fitted on the reduced feature set

for btype in building_types:
    train_sub = train[train['건물유형']==btype]

    # Base drop list plus the low-importance features; errors='ignore' skips
    # any listed column that is not present in the frame.
    drop_cols = ['num_date_time','일시','건물유형','일사(MJ/m2)','일조(hr)','전력소비량(kWh)']
    X = train_sub.drop(columns=drop_cols+low_importance_feats, errors='ignore')
    y = train_sub['전력소비량(kWh)']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Refit an unfitted copy with the same hyper-parameters. Fitting
    # models[btype] directly would overwrite the original model in place and
    # make `models` and `models_reduced` point at the same estimator.
    best_model = clone(models[btype])
    best_model.fit(X_train, y_train, eval_set=[(X_val,y_val)], verbose=False)

    y_val_pred = best_model.predict(X_val)
    val_scores_reduced[btype] = smape(y_val, y_val_pred)
    models_reduced[btype] = best_model

In [16]:
# Per-building-type SMAPE before and after removing the low-importance features.
print("원래 SMAPE:", val_scores)
print("줄인 SMAPE:", val_scores_reduced)

# Unweighted means over building types.
print("평균 SMAPE(원래):", sum(val_scores.values()) / len(val_scores))
print("평균 SMAPE(줄인):", sum(val_scores_reduced.values()) / len(val_scores_reduced))

원래 SMAPE: {'호텔': np.float64(5.4241610950788655), '상용': np.float64(2.2336027793754427), '병원': np.float64(2.8414851604968807), '학교': np.float64(3.076577360429025), '건물기타': np.float64(4.771029682744779), '아파트': np.float64(5.283267304975024), '연구소': np.float64(4.05076788113873), '백화점': np.float64(4.94146629044023), 'IDC(전화국)': np.float64(0.8814767396823233), '공공': np.float64(4.676453281515664)}
줄인 SMAPE: {'호텔': np.float64(5.199020050753658), '상용': np.float64(2.196179026349523), '병원': np.float64(2.7183314085929475), '학교': np.float64(2.9018324945811957), '건물기타': np.float64(4.671065966882287), '아파트': np.float64(4.997275768477072), '연구소': np.float64(3.8653902697491604), '백화점': np.float64(4.767196478222327), 'IDC(전화국)': np.float64(0.832500503033067), '공공': np.float64(4.5460138750278105)}
평균 SMAPE(원래): 3.818028757587696
평균 SMAPE(줄인): 3.669480584166905


SMAPE 3.9일 때 점수는 7.62였음 (현재까지 가장 좋은 결과)  

--> 현재 SMAPE 3.66으로 성능 소폭 향상    

In [18]:
# Write the pruned frames back to the same CSV paths that the loading cell reads.
# errors='ignore' keeps this cell re-runnable: once the files have been
# rewritten, a reloaded frame no longer contains these columns and a strict
# drop would raise KeyError.
additional_drop_cols = ['기온(°C)', '강수량_전일', 'ΔT', 'ess_hours']
train.drop(columns=additional_drop_cols, errors='ignore').to_csv('Data/train_0820with변화량변수.csv', index=False)
test.drop(columns=additional_drop_cols, errors='ignore').to_csv('Data/test_0820with변화량변수.csv', index=False)