# **Setting**

In [None]:
!pip install xgboost==1.6.1

Collecting xgboost==1.6.1
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.9/192.9 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 3.0.4
    Uninstalling xgboost-3.0.4:
      Successfully uninstalled xgboost-3.0.4
Successfully installed xgboost-1.6.1


# **Library**

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sklearn
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
import random as rn
from datetime import datetime
import warnings
import math

# **Utilities**

In [None]:
def smape(gt, preds):
    """Symmetric mean absolute percentage error, expressed in percent.

    Each element contributes ``2 * |pred - gt| / (|pred| + |gt|)``; the mean of
    those terms is multiplied by 100.

    Args:
        gt: array-like of ground-truth values.
        preds: array-like of predictions, broadcastable against ``gt``.

    Returns:
        The SMAPE score as a float (0 means a perfect match).
    """
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    numerator = 2.0 * np.abs(preds - gt)
    denominator = np.abs(preds) + np.abs(gt)

    # A pair where both values are exactly zero is a perfect prediction. Without
    # this guard it evaluates to 0/0 = NaN and poisons the whole mean.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    score = np.mean(ratio) * 100
    return score

def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    The returned callable takes ``(label, pred)`` and returns the gradient and
    hessian of a squared error whose weight is ``alpha`` wherever the label is
    above the prediction (under-prediction) and 1 everywhere else.

    Args:
        alpha: weight applied to under-predicted samples.

    Returns:
        A function ``(label, pred) -> (grad, hess)``.
    """
    def weighted_mse_fixed(label, pred):
        error = (label - pred).astype("float")
        # alpha where the model under-predicts (label > pred), 1.0 otherwise.
        weight = np.where(error > 0, alpha, 1.0)
        grad = -2 * weight * error
        hess = 2 * weight
        return grad, hess
    return weighted_mse_fixed

def custom_smape(preds, dtrain):
    """SMAPE evaluation metric in the ``(preds, dtrain) -> (name, value)`` form.

    Args:
        preds: array of predictions.
        dtrain: object exposing ``get_label()`` that returns the matching labels.

    Returns:
        The tuple ``('custom_smape', score)`` where ``score`` is in percent.

    Note:
        The metric is computed on whatever scale ``preds`` and the labels are
        in; the training cells in this notebook pass log-transformed targets.
    """
    labels = dtrain.get_label()
    absolute_error = abs(preds - labels)
    magnitude = abs(preds) + abs(labels)
    score = np.mean(2 * absolute_error / magnitude) * 100
    return 'custom_smape', score

# **Config**

In [None]:
# Cross-validation / model seed and number of folds used by every training cell.
RANDOM_SEED = 42
KFOLD_SPLITS = 10

# Notebook display: show every column and silence library warnings.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Seed the global stdlib and NumPy generators (separate from RANDOM_SEED above).
rn.seed(2025)
np.random.seed(2025)

In [None]:
# Resolve where the input CSVs live and where outputs are written.
# On Colab the Drive folder is mounted; anywhere else (or if mounting fails)
# the notebook falls back to a local ./data directory.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    # No trailing slash: every path below is built as f'{DATA_DIR}/<file>'.
    DATA_DIR = "/content/drive/MyDrive/Colab Notebooks"
except Exception:
    # `Exception` keeps the best-effort fallback but, unlike a bare `except:`,
    # lets KeyboardInterrupt / SystemExit propagate.
    DATA_DIR = "./data"

Mounted at /content/drive


# **Preprocessing**

In [None]:
def Preprocessing(summer = False, cluster = False):
    """Load the raw competition files and build the train/test feature frames.

    Reads ``train.csv``, ``test.csv``, ``building_info.csv`` and
    ``outlier (4).xlsx`` from ``DATA_DIR``, renames the Korean columns to
    English, joins the building metadata and derives calendar, temperature,
    weather-index and historical-consumption features.

    Args:
        summer: when true, add ``summer_cos`` / ``summer_sin``, a single cycle
            spanning 2024-06-01 to 2024-09-14.
        cluster: when true, run KMeans (k=5) on each building's mean
            consumption per (day_of_week, hour), add the resulting ``cluster``
            column to both frames and print the cluster sizes.

    Returns:
        ``(train, test)`` as pandas DataFrames. ``train`` has zero-consumption
        rows and the (building, date) pairs listed in the outlier file removed.

    Notes:
        * ``day_hour_mean`` / ``day_hour_std`` / ``hour_mean`` / ``hour_std``
          are computed from every remaining training row.
          NOTE(review): under cross-validation a validation row therefore
          contributes to its own feature values.
        * CDH is a running sum over the rows that are present for a building;
          for ``train`` it is computed after outlier rows were removed, so a
          window can span a gap in time.
    """
    # Korean -> English column names shared by train.csv and test.csv
    # (test.csv simply lacks some of these columns; rename ignores them).
    weather_columns = {
        '건물번호': 'building_number',
        '일시': 'date_time',
        '기온(°C)': 'temperature',
        '강수량(mm)': 'rainfall',
        '풍속(m/s)': 'windspeed',
        '습도(%)': 'humidity',
        '일조(hr)': 'sunshine',
        '일사(MJ/m2)': 'solar_radiation',
        '전력소비량(kWh)': 'power_consumption'
    }
    building_columns = {
        '건물번호': 'building_number',
        '건물유형': 'building_type',
        '연면적(m2)': 'total_area',
        '냉방면적(m2)': 'cooling_area',
        '태양광용량(kW)': 'solar_power_capacity',
        'ESS저장용량(kWh)': 'ess_capacity',
        'PCS용량(kW)': 'pcs_capacity'
    }
    translation_dict = {
        '건물기타': 'Other Buildings',
        '공공': 'Public',
        '학교': 'University',
        '백화점': 'Department Store',
        '병원': 'Hospital',
        '상용': 'Commercial',
        '아파트': 'Apartment',
        '연구소': 'Research Institute',
        'IDC(전화국)': 'IDC',
        '호텔': 'Hotel'
    }

    # Weekday public holidays inside the data range (weekends are handled separately).
    holi_weekday = ['2024-06-06', '2024-08-15']

    # Bounds of the optional "summer" cycle.
    summer_start = datetime(2024, 6, 1)
    summer_end = datetime(2024, 9, 14)
    summer_period = (summer_end - summer_start).total_seconds()

    def add_calendar_features(frame):
        # Parse the timestamp and split it into its calendar components.
        frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')
        frame['hour'] = frame['date_time'].dt.hour
        frame['day'] = frame['date_time'].dt.day
        frame['month'] = frame['date_time'].dt.month
        frame['day_of_week'] = frame['date_time'].dt.dayofweek

    def add_daily_temperature(frame):
        # Per building and calendar day: max / mean / min temperature and range.
        # groupby().transform broadcasts the daily value back onto every hourly
        # row without a Python-level loop over rows.
        daily = frame.groupby(['building_number', 'month', 'day'])['temperature']
        frame['day_max_temperature'] = daily.transform('max')
        frame['day_mean_temperature'] = daily.transform('mean')
        frame['day_min_temperature'] = daily.transform('min')
        frame['day_temperature_range'] = frame['day_max_temperature'] - frame['day_min_temperature']

    def add_cyclic_encodings(frame):
        # Holiday flag: weekend or one of the listed weekday holidays.
        is_weekend = frame['day_of_week'] >= 5
        is_listed_holiday = frame['date_time'].dt.strftime('%Y-%m-%d').isin(holi_weekday)
        frame['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)

        # Hours run 0..23, so the cycle length is 24. Dividing by 23 would map
        # hour 0 and hour 23 onto the same point of the circle.
        hour_angle = 2 * np.pi * frame['hour'] / 24.0
        frame['sin_hour'] = np.sin(hour_angle)
        frame['cos_hour'] = np.cos(hour_angle)

        date_angle = 2 * np.pi * (frame['month'] + frame['day'] / 31) / 12
        frame['sin_date'] = -np.sin(date_angle)
        frame['cos_date'] = -np.cos(date_angle)

        month_angle = 2 * np.pi * frame['month'] / 12.0
        frame['sin_month'] = -np.sin(month_angle)
        frame['cos_month'] = -np.cos(month_angle)

        dow_angle = 2 * np.pi * (frame['day_of_week'] + 1) / 7.0
        frame['sin_dayofweek'] = -np.sin(dow_angle)
        frame['cos_dayofweek'] = -np.cos(dow_angle)

    def summer_angle(moment):
        # Position of `moment` within the summer cycle, in radians.
        return 2 * math.pi * (moment - summer_start).total_seconds() / summer_period

    def add_summer_features(frame):
        frame['summer_cos'] = frame['date_time'].apply(lambda moment: math.cos(summer_angle(moment)))
        frame['summer_sin'] = frame['date_time'].apply(lambda moment: math.sin(summer_angle(moment)))

    def CDH(xs):
        # Running sum of (temperature - 26): cumulative from the start for the
        # first 11 rows, then the difference of cumulative sums 11 rows apart.
        cumsum = np.cumsum(xs - 26)
        return np.concatenate((cumsum[:11], cumsum[11:] - cumsum[:-11]))

    def add_weather_indices(frame):
        # CDH per building. transform() aligns the result on the frame's index,
        # so this does not depend on row order or on building ids being 1..100.
        frame['CDH'] = (
            frame.groupby('building_number')['temperature']
            .transform(lambda temps: CDH(temps.to_numpy()))
        )
        # Temperature-humidity index: 9/5*T - 0.55*(1 - RH)*(9/5*T - 26) + 32.
        # The last factor must use temperature (it previously used humidity).
        frame['THI'] = (
            9/5*frame['temperature']
            - 0.55*(1 - frame['humidity']/100)*(9/5*frame['temperature'] - 26)
            + 32
        )
        # Wind-chill style index from temperature and wind speed.
        frame['WCT'] = (
            13.12 + 0.6125*frame['temperature']
            - 11.37*(frame['windspeed']**0.16)
            + 0.3965*(frame['windspeed']**0.16)*frame['temperature']
        )

    # ---- Load and rename -------------------------------------------------
    train          = pd.read_csv(f'{DATA_DIR}/train.csv', encoding='utf-8-sig')
    test           = pd.read_csv(f'{DATA_DIR}/test.csv',  encoding='utf-8-sig')
    building_info  = pd.read_csv(f'{DATA_DIR}/building_info.csv', encoding='utf-8-sig')

    train = train.rename(columns=weather_columns).drop(columns='num_date_time')
    test = test.rename(columns=weather_columns).drop(columns='num_date_time')
    building_info = building_info.rename(columns=building_columns)

    building_info['building_type'] = building_info['building_type'].replace(translation_dict)
    # '-' marks "no installation" in the capacity columns.
    building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity !='-',1,0)
    building_info['ess_utility'] = np.where(building_info.ess_capacity !='-',1,0)

    train = pd.merge(train, building_info, on='building_number', how='left')
    test = pd.merge(test, building_info, on='building_number', how='left')

    # ---- Calendar + daily temperature (before any rows are removed) ------
    for frame in (train, test):
        add_calendar_features(frame)
        add_daily_temperature(frame)

    # ---- Outlier removal (train only) ------------------------------------
    # Rows with exactly zero consumption are dropped.
    train = train[train['power_consumption'] != 0].copy()

    # Every (building, date) pair listed in the outlier workbook is dropped too.
    outlier_df = pd.read_excel(f'{DATA_DIR}/outlier (4).xlsx')
    outlier_df['date'] = pd.to_datetime(outlier_df['date'], format='%Y%m%d')

    train_dates = train['date_time'].dt.date  # computed once, reused per outlier
    is_outlier = pd.Series(False, index=train.index)
    for building_num, outlier_date in zip(outlier_df['num'], outlier_df['date'].dt.date):
        is_outlier |= (train['building_number'] == building_num) & (train_dates == outlier_date)
    train = train[~is_outlier].copy()

    # ---- Remaining row-level features ------------------------------------
    for frame in (train, test):
        add_cyclic_encodings(frame)
        if summer:
            add_summer_features(frame)
        add_weather_indices(frame)

    # ---- Historical consumption statistics (from train, joined to both) --
    stat_specs = [
        (['building_number', 'hour', 'day_of_week'], 'mean', 'day_hour_mean'),
        (['building_number', 'hour', 'day_of_week'], 'std',  'day_hour_std'),
        (['building_number', 'hour'],                'mean', 'hour_mean'),
        (['building_number', 'hour'],                'std',  'hour_std'),
    ]
    stat_tables = []
    for keys, agg_name, feature_name in stat_specs:
        # String aggregators select pandas' own mean/std (sample std, ddof=1).
        table = pd.pivot_table(train, values='power_consumption', index=keys, aggfunc=agg_name).reset_index()
        table.columns = keys + [feature_name]
        stat_tables.append((keys, table))

    for keys, table in stat_tables:
        train = train.merge(table, on=keys, how='left')
        test = test.merge(table, on=keys, how='left')

    train = train.reset_index(drop=True)

    # ---- Optional building clustering ------------------------------------
    if cluster:
        # One row per building, one column per (day_of_week, hour) mean load.
        pivot_table = train.pivot_table(
            values='power_consumption',
            index='building_number',
            columns=['day_of_week', 'hour'],
            aggfunc='mean'
        ).fillna(0)

        pivot_table.columns = [f'dow_{dow}_hour_{hour}' for (dow, hour) in pivot_table.columns]

        k = 5
        kmeans = KMeans(n_clusters=k, random_state=2025, n_init=10)
        clusters = kmeans.fit_predict(pivot_table)

        # Attach the label to building_info by building number, then to both frames.
        building_info = building_info.set_index('building_number')
        building_info['cluster'] = pd.Series(clusters, index=pivot_table.index)
        building_info = building_info.reset_index()

        train = pd.merge(train, building_info[['building_number', 'cluster']], on='building_number', how='left')
        test = pd.merge(test, building_info[['building_number', 'cluster']], on='building_number', how='left')

        cluster_counts = building_info['cluster'].value_counts().sort_index()
        print("Cluster-wise building count:")
        print(cluster_counts)

        total_buildings = building_info['building_number'].nunique()
        print("\nTotal number of buildings:", total_buildings)

    return train, test

# **No Summer Feature**

In [None]:
# Baseline feature set: no summer-cycle features, no cluster label.
train, test = Preprocessing(summer=False, cluster=False)

In [None]:
# Training features: remove the raw capacity columns, the target, the weather
# columns that are dropped from the model inputs, and the raw calendar fields.
X = train.drop(
    columns=[
        'solar_power_capacity', 'ess_capacity', 'pcs_capacity',
        'power_consumption', 'rainfall', 'sunshine', 'solar_radiation',
        'hour', 'day', 'month', 'day_of_week', 'date_time',
    ]
)

# Target together with the key used to split it per building type.
Y = train.loc[:, ['building_type', 'power_consumption']]

# Test features; columns are re-aligned to the training columns before prediction.
test_X = test.drop(
    columns=[
        'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
        'hour', 'month', 'day_of_week', 'day', 'date_time',
    ]
)

In [None]:
# Distinct building types in order of first appearance.
# unique() replaces a Python loop with a list-membership test over every row.
type_list = train['building_type'].unique().tolist()

In [None]:
# XGBoost max_depth per building type: 10 unless overridden below.
max_depth_dict = dict.fromkeys(
    [
        'Other Buildings', 'Public', 'University', 'IDC', 'Department Store',
        'Hospital', 'Commercial', 'Apartment', 'Research Institute', 'Hotel',
    ],
    10,
)
max_depth_dict.update({
    'University': 8, 'Department Store': 8, 'Hospital': 8,
    'IDC': 6, 'Apartment': 6,
})

In [None]:
# Model per building type: for every building_type a K-fold set of XGBoost
# regressors is trained on that type's rows, with the building number
# one-hot encoded inside the type.
type_list = X["building_type"].unique()

# answer_df collects the fold-averaged test predictions, pred_df the
# out-of-fold predictions for the training rows; both are filled type by type.
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df   = pd.DataFrame(index=X.index,       columns=["pred"],   dtype=float)

# Shuffled K-fold; this `kf` object is also reused by the following cells.
kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for btype in type_list:
    # Rows of the current building type (train features, target, test features).
    x  = X  [X['building_type'] == btype].copy()
    y  = Y  [Y['building_type'] == btype]['power_consumption'].copy()
    xt = test_X[test_X['building_type'] == btype].copy()

    x  = pd.get_dummies(x,  columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)

    # Give the test frame exactly the training columns, in the same order;
    # columns missing from test are filled with 0, extra test columns are dropped.
    xt = xt.reindex(columns=x.columns, fill_value=0)

    # building_type is constant within this subset, so it is not a feature.
    drop_cols = ["building_type"]
    x  = x .drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test  = []

    x_values = x.values
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # The model is fitted on log(consumption); predictions are mapped back
        # with exp() below.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(
            learning_rate     = 0.05,
            n_estimators      = 5000,
            max_depth         = max_depth_dict[btype],
            subsample         = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            objective         = weighted_mse(3),
            tree_method       = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

        # The validation fold drives early stopping; custom_smape receives the
        # log-scale targets given in eval_set.
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # Out-of-fold predictions on the original scale; va_idx is positional
        # within this building type, hence .iloc.
        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        preds_test.append(np.exp(model.predict(xt.values)))

    pred_df.loc[preds_valid.index, "pred"] = preds_valid

    # Test prediction = mean over the fold models.
    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)

    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f}")

# Out-of-fold SMAPE over all training rows (both frames sorted by index so the
# rows line up).
total_smape = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df.sort_index()["pred"].values
)
print(f"Total SMAPE = {total_smape:.4f}")

# Saved without the index: row order is the only link back to train / test rows.
pred_df.to_csv(f'{DATA_DIR}/pred_valid_nosummer{RANDOM_SEED}.csv', index=False)
answer_df.to_csv(f'{DATA_DIR}/answer_test_nosummer{RANDOM_SEED}.csv', index=False)

Building type = Hotel : XGB SMAPE = 4.3935
Building type = Commercial : XGB SMAPE = 1.8392
Building type = Hospital : XGB SMAPE = 2.0445
Building type = University : XGB SMAPE = 2.1886
Building type = Other Buildings : XGB SMAPE = 3.3996
Building type = Apartment : XGB SMAPE = 2.7940
Building type = Research Institute : XGB SMAPE = 3.0153
Building type = Department Store : XGB SMAPE = 3.3539
Building type = IDC : XGB SMAPE = 0.6311
Building type = Public : XGB SMAPE = 3.6438
Total SMAPE = 2.7712


In [None]:
# Model per building: a separate K-fold set of XGBoost regressors for every
# building number. Relies on `kf`, `X`, `test_X` and `max_depth_dict` from the
# cells above.

# Y is rebound here so that the target can be filtered by building number.
Y = train[['building_number','power_consumption']]

answer_df_by_building = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df_by_building = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

building_numbers = X["building_number"].unique()

for bnum in building_numbers:
    x_building = X[X['building_number'] == bnum].copy()
    y_building = Y[Y['building_number'] == bnum]['power_consumption'].copy()
    xt_building = test_X[test_X['building_number'] == bnum].copy()

    # Tree depth follows the building's type; 10 if the type is not in the dict.
    current_building_type = X[X['building_number'] == bnum]['building_type'].iloc[0]
    current_max_depth = max_depth_dict.get(current_building_type, 10)

    # Both identifiers are constant within one building, so they are dropped.
    drop_cols_building = ["building_type", "building_number"]
    x_building = x_building.drop(columns=drop_cols_building, errors='ignore')
    xt_building = xt_building.drop(columns=drop_cols_building, errors='ignore')

    # Align test columns to the training columns (missing ones filled with 0).
    xt_building = xt_building.reindex(columns=x_building.columns, fill_value=0)

    preds_valid_building = pd.Series(index=y_building.index, dtype=float)
    preds_test_building = []

    x_values_building = x_building.values
    y_values_building = y_building.values

    fold_scores_building = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values_building), 1):
        X_tr, X_va = x_values_building[tr_idx], x_values_building[va_idx]
        y_tr, y_va = y_values_building[tr_idx], y_values_building[va_idx]

        # Fit in log space; predictions are exponentiated below.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model_building = XGBRegressor(
            learning_rate     = 0.05,
            n_estimators      = 5000,
            max_depth         = current_max_depth,
            subsample         = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            objective         = weighted_mse(3),
            tree_method       = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

        # Early stopping is evaluated on the validation fold (log-scale targets).
        model_building.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # va_idx is positional within this building, hence .iloc.
        va_pred = np.exp(model_building.predict(X_va))
        preds_valid_building.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores_building.append(fold_smape)

        preds_test_building.append(np.exp(model_building.predict(xt_building.values)))

    pred_df_by_building.loc[preds_valid_building.index, "pred"] = preds_valid_building
    # Test prediction = mean over the fold models.
    answer_df_by_building.loc[xt_building.index, "answer"] = np.mean(preds_test_building, axis=0)

    print(f"Building number = {bnum} : XGB SMAPE = {np.mean(fold_scores_building):.4f}")

# Out-of-fold SMAPE over all training rows.
total_smape_by_building = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df_by_building.sort_index()["pred"].values
)
print(f"Total SMAPE (by Building) = {total_smape_by_building:.4f}")

# Saved without the index: row order is the only link back to train / test rows.
pred_df_by_building.to_csv(f'{DATA_DIR}/pred_valid_by_building_nosummer{RANDOM_SEED}.csv', index=False)
answer_df_by_building.to_csv(f'{DATA_DIR}/answer_test_by_building_nosummer{RANDOM_SEED}.csv', index=False)

Building number = 1 : XGB SMAPE = 6.8060
Building number = 2 : XGB SMAPE = 4.4365
Building number = 3 : XGB SMAPE = 1.6559
Building number = 4 : XGB SMAPE = 4.0734
Building number = 5 : XGB SMAPE = 1.0035
Building number = 6 : XGB SMAPE = 5.2791
Building number = 7 : XGB SMAPE = 2.2819
Building number = 8 : XGB SMAPE = 2.6724
Building number = 9 : XGB SMAPE = 3.9303
Building number = 10 : XGB SMAPE = 3.6799
Building number = 11 : XGB SMAPE = 2.0568
Building number = 12 : XGB SMAPE = 1.2134
Building number = 13 : XGB SMAPE = 2.3945
Building number = 14 : XGB SMAPE = 1.4533
Building number = 15 : XGB SMAPE = 2.6818
Building number = 16 : XGB SMAPE = 1.4676
Building number = 17 : XGB SMAPE = 1.6931
Building number = 18 : XGB SMAPE = 3.0366
Building number = 19 : XGB SMAPE = 4.2689
Building number = 20 : XGB SMAPE = 0.6946
Building number = 21 : XGB SMAPE = 1.2818
Building number = 22 : XGB SMAPE = 2.1128
Building number = 23 : XGB SMAPE = 6.2833
Building number = 24 : XGB SMAPE = 3.3303
B

In [None]:
# Global model: a single K-fold set of XGBoost regressors trained on all
# buildings at once. Relies on `kf` and on `Y` as rebound by the per-building
# cell above.

answer_df_global = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df_global = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

# One-hot encode both identifiers: building type first, then building number.
x_global = pd.get_dummies(X.copy(), columns=["building_type"], drop_first=False)
xt_global = pd.get_dummies(test_X.copy(), columns=["building_type"], drop_first=False)

x_global = pd.get_dummies(x_global, columns=["building_number"], drop_first=False)
xt_global = pd.get_dummies(xt_global, columns=["building_number"], drop_first=False)

# Empty list: the two drop() calls below currently remove nothing.
drop_cols_global = []

x_global = x_global.drop(columns=drop_cols_global, errors='ignore')
xt_global = xt_global.drop(columns=drop_cols_global, errors='ignore')

# Align test columns to the training columns (missing ones filled with 0).
xt_global = xt_global.reindex(columns=x_global.columns, fill_value=0)

y_global = Y['power_consumption'].copy()

preds_valid_global = pd.Series(index=y_global.index, dtype=float)
preds_test_global = []

x_values_global = x_global.values
y_values_global = y_global.values

fold_scores_global = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values_global), 1):
    X_tr, X_va = x_values_global[tr_idx], x_values_global[va_idx]
    y_tr, y_va = y_values_global[tr_idx], y_values_global[va_idx]

    # Fit in log space; predictions are exponentiated below.
    y_tr_log = np.log(y_tr)
    y_va_log = np.log(y_va)

    model_global = XGBRegressor(
            learning_rate     = 0.05,
            n_estimators      = 5000,
            max_depth         = 10,
            subsample         = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            objective         = weighted_mse(3),
            tree_method       = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

    # Early stopping is evaluated on the validation fold (log-scale targets).
    model_global.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

    va_pred = np.exp(model_global.predict(X_va))
    preds_valid_global.iloc[va_idx] = va_pred

    fold_smape = smape(y_va, va_pred)
    fold_scores_global.append(fold_smape)

    preds_test_global.append(np.exp(model_global.predict(xt_global.values)))

pred_df_global.loc[preds_valid_global.index, "pred"] = preds_valid_global
# Test prediction = mean over the fold models.
answer_df_global.loc[xt_global.index, "answer"] = np.mean(preds_test_global, axis=0)

print(f"Global Model : XGB SMAPE = {np.mean(fold_scores_global):.4f}")

# Out-of-fold SMAPE over all training rows.
total_smape_global = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df_global.sort_index()["pred"].values
)
print(f"Total SMAPE (Global) = {total_smape_global:.4f}")

# Saved without the index: row order is the only link back to train / test rows.
pred_df_global.to_csv(f'{DATA_DIR}/pred_valid_by_global_nosummer{RANDOM_SEED}.csv', index=False)
answer_df_global.to_csv(f'{DATA_DIR}/answer_test_by_global_nosummer{RANDOM_SEED}.csv', index=False)

Global Model : XGB SMAPE = 2.7776
Total SMAPE (Global) = 2.7776


## **Ensemble of no summer features across building-type, building-specific, and global models**

In [None]:
# Reload the saved test predictions of the three no-summer models
# (per building type, per building, global).
answer_df = pd.read_csv(f'{DATA_DIR}/answer_test_nosummer{RANDOM_SEED}.csv')
answer_df_by_building = pd.read_csv(f'{DATA_DIR}/answer_test_by_building_nosummer{RANDOM_SEED}.csv')
answer_df_global = pd.read_csv(f'{DATA_DIR}/answer_test_by_global_nosummer{RANDOM_SEED}.csv')

# Weighted blend: 25% type model, 25% building model, 50% global model.
final_ensemble_test_pred = (
    0.25 * answer_df["answer"].sort_index().to_numpy()
    + 0.25 * answer_df_by_building["answer"].sort_index().to_numpy()
    + 0.5 * answer_df_global["answer"].sort_index().to_numpy()
)

In [None]:
# Floor the blended predictions at zero, then write them into the sample
# submission's 'answer' column and save the result.
final_ensemble_test_pred_fixed = list(map(lambda value: max(0, value), final_ensemble_test_pred))
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv').assign(
    answer=final_ensemble_test_pred_fixed
)
submission.to_csv(f'{DATA_DIR}/5.16_nosummer{RANDOM_SEED}.csv', index=False)

# **No Summer Feature Cluster Model**

In [None]:
# Same features as the baseline, plus the KMeans cluster label per building.
train, test = Preprocessing(summer=False, cluster=True)

Cluster-wise building count:
cluster
0    12
1     4
2    47
3     9
4    28
Name: count, dtype: int64

Total number of buildings: 100


In [None]:
# Training features for the cluster models: as in the baseline, but
# building_type is removed as well (the cluster label takes its place).
X = train.drop(
    columns=[
        'solar_power_capacity', 'ess_capacity', 'pcs_capacity',
        'power_consumption', 'rainfall', 'sunshine', 'solar_radiation',
        'hour', 'day', 'month', 'day_of_week', 'date_time', 'building_type',
    ]
)

# Target together with the key used to split it per cluster.
Y = train.loc[:, ['cluster', 'power_consumption']]

# Test features; columns are re-aligned to the training columns before prediction.
test_X = test.drop(
    columns=[
        'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
        'hour', 'month', 'day_of_week', 'day', 'date_time',
    ]
)

In [None]:
# Model per cluster: for every KMeans cluster a K-fold set of XGBoost
# regressors is trained on that cluster's rows, with the building number
# one-hot encoded inside the cluster.
cluster_list = sorted(train["cluster"].unique())

# XGBoost max_depth per cluster label.
max_depth_dict_cluster = {
    0: 10,
    1: 8,
    2: 10,
    3: 8,
    4: 10
}

# Fold-averaged test predictions and out-of-fold training predictions.
answer_df_cluster = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df_cluster   = pd.DataFrame(index=X.index,         columns=["pred"],    dtype=float)

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for cluster_num in cluster_list:
    x  = X[X['cluster'] == cluster_num].copy()
    y  = Y[Y['cluster'] == cluster_num]['power_consumption'].copy()
    xt = test_X[test_X['cluster'] == cluster_num].copy()

    x  = pd.get_dummies(x,  columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)

    # Align test columns to the training columns; this also discards columns
    # that exist only in test_X (building_type was dropped from X but not
    # from test_X).
    xt = xt.reindex(columns=x.columns, fill_value=0)

    # The cluster label is constant within this subset, so it is not a feature.
    drop_cols = ["cluster"]
    x  = x.drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test  = []

    x_values = x.values
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # Fit in log space; predictions are exponentiated below.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(
            learning_rate      = 0.05,
            n_estimators      = 5000,
            max_depth          = max_depth_dict_cluster[cluster_num],
            subsample          = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            objective          = weighted_mse(3),
            tree_method        = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

        # Early stopping is evaluated on the validation fold (log-scale targets).
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # va_idx is positional within this cluster, hence .iloc.
        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        preds_test.append(np.exp(model.predict(xt.values)))

    pred_df_cluster.loc[preds_valid.index, "pred"] = preds_valid
    # Test prediction = mean over the fold models.
    answer_df_cluster.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)

    print(f"Building Cluster = {cluster_num} : XGB SMAPE = {np.mean(fold_scores):.4f}")

# Out-of-fold SMAPE over all training rows.
total_smape = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df_cluster.sort_index()["pred"].values
)
print(f"Total SMAPE = {total_smape:.4f}")

# Saved without the index: row order is the only link back to train / test rows.
pred_df_cluster.to_csv(f'{DATA_DIR}/pred_valid_by_cluster_nosummer{RANDOM_SEED}.csv', index=False)
answer_df_cluster.to_csv(f'{DATA_DIR}/answer_test_by_cluster_nosummer{RANDOM_SEED}.csv', index=False)

Building Cluster = 0 : XGB SMAPE = 2.2067
Building Cluster = 1 : XGB SMAPE = 0.8337
Building Cluster = 2 : XGB SMAPE = 3.2006
Building Cluster = 3 : XGB SMAPE = 1.4139
Building Cluster = 4 : XGB SMAPE = 3.1558
Total SMAPE = 2.8129


In [None]:
# Reload the saved cluster-model test predictions; this submission uses them
# as-is, without blending.
answer_df_cluster = pd.read_csv(f'{DATA_DIR}/answer_test_by_cluster_nosummer{RANDOM_SEED}.csv')
final_ensemble_test_pred = answer_df_cluster["answer"].sort_index().to_numpy()

In [None]:
# Floor the predictions at zero, then write them into the sample submission's
# 'answer' column and save the result.
final_ensemble_test_pred_fixed = list(map(lambda value: max(0, value), final_ensemble_test_pred))
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv').assign(
    answer=final_ensemble_test_pred_fixed
)
submission.to_csv(f'{DATA_DIR}/5.16_nosummer_cluster{RANDOM_SEED}.csv', index=False)

# **Summer Feature**

In [None]:
# Feature set with the summer-cycle features, without the cluster label.
train, test = Preprocessing(summer=True, cluster=False)

In [None]:
# Training features: remove the raw capacity columns, the target, the weather
# columns that are dropped from the model inputs, and the raw calendar fields.
X = train.drop(
    columns=[
        'solar_power_capacity', 'ess_capacity', 'pcs_capacity',
        'power_consumption', 'rainfall', 'sunshine', 'solar_radiation',
        'hour', 'day', 'month', 'day_of_week', 'date_time',
    ]
)

# Target together with the key used to split it per building type.
Y = train.loc[:, ['building_type', 'power_consumption']]

# Test features; columns are re-aligned to the training columns before prediction.
test_X = test.drop(
    columns=[
        'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
        'hour', 'month', 'day_of_week', 'day', 'date_time',
    ]
)

In [None]:
# Distinct building types in order of first appearance.
# unique() replaces a Python loop with a list-membership test over every row.
type_list = train['building_type'].unique().tolist()

In [None]:
# XGBoost max_depth per building type: 10 unless overridden below.
max_depth_dict = dict.fromkeys(
    [
        'Other Buildings', 'Public', 'University', 'IDC', 'Department Store',
        'Hospital', 'Commercial', 'Apartment', 'Research Institute', 'Hotel',
    ],
    10,
)
max_depth_dict.update({
    'University': 8, 'Department Store': 8, 'Hospital': 8,
    'IDC': 6, 'Apartment': 6,
})

In [None]:
# Model per building type (summer feature set): for every building_type a
# K-fold set of XGBoost regressors is trained on that type's rows, with the
# building number one-hot encoded inside the type.

type_list = X["building_type"].unique()

# answer_df collects the fold-averaged test predictions, pred_df the
# out-of-fold predictions for the training rows; both are filled type by type.
answer_df = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df   = pd.DataFrame(index=X.index,       columns=["pred"],   dtype=float)

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for btype in type_list:
    # Rows of the current building type (train features, target, test features).
    x  = X  [X['building_type'] == btype].copy()
    y  = Y  [Y['building_type'] == btype]['power_consumption'].copy()
    xt = test_X[test_X['building_type'] == btype].copy()

    x  = pd.get_dummies(x,  columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)

    # Give the test frame exactly the training columns, in the same order;
    # columns missing from test are filled with 0, extra test columns are dropped.
    xt = xt.reindex(columns=x.columns, fill_value=0)

    # building_type is constant within this subset, so it is not a feature.
    drop_cols = ["building_type"]
    x  = x .drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test  = []

    x_values = x.values
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # The model is fitted on log(consumption); predictions are mapped back
        # with exp() below.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(
            learning_rate     = 0.05,
            n_estimators      = 5000,
            max_depth         = max_depth_dict[btype],
            subsample         = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            objective         = weighted_mse(3),
            tree_method       = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

        # The validation fold drives early stopping; custom_smape receives the
        # log-scale targets given in eval_set.
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # Out-of-fold predictions on the original scale; va_idx is positional
        # within this building type, hence .iloc.
        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        preds_test.append(np.exp(model.predict(xt.values)))

    pred_df.loc[preds_valid.index, "pred"] = preds_valid

    # Test prediction = mean over the fold models.
    answer_df.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)

    print(f"Building type = {btype} : XGB SMAPE = {np.mean(fold_scores):.4f}")

# Out-of-fold SMAPE over all training rows (both frames sorted by index so the
# rows line up).
total_smape = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df.sort_index()["pred"].values
)
print(f"Total SMAPE = {total_smape:.4f}")

# Saved without the index: row order is the only link back to train / test rows.
pred_df.to_csv(f'{DATA_DIR}/pred_valid_summer{RANDOM_SEED}.csv', index=False)
answer_df.to_csv(f'{DATA_DIR}/answer_test_summer{RANDOM_SEED}.csv', index=False)

Building type = Hotel : XGB SMAPE = 4.2469
Building type = Commercial : XGB SMAPE = 1.8077
Building type = Hospital : XGB SMAPE = 1.9830
Building type = University : XGB SMAPE = 2.1372
Building type = Other Buildings : XGB SMAPE = 3.3220
Building type = Apartment : XGB SMAPE = 2.6941
Building type = Research Institute : XGB SMAPE = 2.9108
Building type = Department Store : XGB SMAPE = 3.2780
Building type = IDC : XGB SMAPE = 0.6285
Building type = Public : XGB SMAPE = 3.5128
Total SMAPE = 2.6937


In [None]:
# Per-building models
#
# Same procedure as the building-type cell, but one model is cross-validated
# per individual building. Reuses the `kf` splitter created in that cell.

# NOTE: rebinds the notebook-level `Y`; cells that run after this one see
# this two-column version.
Y = train[['building_number','power_consumption']]

# Result holders aligned to the test / train indices for label-based write-back.
answer_df_by_building = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df_by_building = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

building_numbers = X["building_number"].unique()

for bnum in building_numbers:
    # Train features, train target and test features for this building only.
    x_building = X[X['building_number'] == bnum].copy()
    y_building = Y[Y['building_number'] == bnum]['power_consumption'].copy()
    xt_building = test_X[test_X['building_number'] == bnum].copy()

    # Tree depth follows the building's type; falls back to 10 when the type
    # is not a key of max_depth_dict.
    current_building_type = X[X['building_number'] == bnum]['building_type'].iloc[0]
    current_max_depth = max_depth_dict.get(current_building_type, 10)

    # Both identifiers are constant within a single building. errors='ignore'
    # tolerates a frame that lacks one of the columns.
    drop_cols_building = ["building_type", "building_number"]
    x_building = x_building.drop(columns=drop_cols_building, errors='ignore')
    xt_building = xt_building.drop(columns=drop_cols_building, errors='ignore')

    # Align test columns to the training layout (missing -> 0, extras dropped).
    xt_building = xt_building.reindex(columns=x_building.columns, fill_value=0)

    # Out-of-fold predictions (indexed like the target) and per-fold test predictions.
    preds_valid_building = pd.Series(index=y_building.index, dtype=float)
    preds_test_building = []

    # kf.split yields positional indices, so work on plain arrays.
    x_values_building = x_building.values
    y_values_building = y_building.values

    fold_scores_building = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values_building), 1):
        X_tr, X_va = x_values_building[tr_idx], x_values_building[va_idx]
        y_tr, y_va = y_values_building[tr_idx], y_values_building[va_idx]

        # Train on the natural log of the target; predictions are
        # exponentiated below.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model_building = XGBRegressor(
            learning_rate     = 0.05,
            n_estimators      = 5000,
            max_depth         = current_max_depth,
            subsample         = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            # Custom squared-error objective (defined earlier in the notebook)
            # whose gradient/hessian are multiplied by 3 when label > prediction.
            objective         = weighted_mse(3),
            tree_method       = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

        # NOTE(review): custom_smape is evaluated on the log-scale eval_set
        # values; fold_smape below is on the original scale.
        model_building.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # Back-transform and store at the fold's positional validation indices.
        va_pred = np.exp(model_building.predict(X_va))
        preds_valid_building.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores_building.append(fold_smape)

        preds_test_building.append(np.exp(model_building.predict(xt_building.values)))

    # Write back by index label; the test prediction is the mean over folds.
    pred_df_by_building.loc[preds_valid_building.index, "pred"] = preds_valid_building
    answer_df_by_building.loc[xt_building.index, "answer"] = np.mean(preds_test_building, axis=0)

    print(f"Building number = {bnum} : XGB SMAPE = {np.mean(fold_scores_building):.4f}")

# Overall out-of-fold SMAPE; both sides are sorted by index so rows line up.
total_smape_by_building = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df_by_building.sort_index()["pred"].values
)
print(f"Total SMAPE (by Building) = {total_smape_by_building:.4f}")

# index=False: rows in these files are identified only by their order.
pred_df_by_building.to_csv(f'{DATA_DIR}/pred_valid_by_building_summer{RANDOM_SEED}.csv', index=False)
answer_df_by_building.to_csv(f'{DATA_DIR}/answer_test_by_building_summer{RANDOM_SEED}.csv', index=False)

Building number = 1 : XGB SMAPE = 6.5859
Building number = 2 : XGB SMAPE = 4.3622
Building number = 3 : XGB SMAPE = 1.5629
Building number = 4 : XGB SMAPE = 3.9032
Building number = 5 : XGB SMAPE = 0.9665
Building number = 6 : XGB SMAPE = 5.0152
Building number = 7 : XGB SMAPE = 2.2129
Building number = 8 : XGB SMAPE = 2.5517
Building number = 9 : XGB SMAPE = 3.6938
Building number = 10 : XGB SMAPE = 3.3381
Building number = 11 : XGB SMAPE = 2.0084
Building number = 12 : XGB SMAPE = 1.1604
Building number = 13 : XGB SMAPE = 2.3091
Building number = 14 : XGB SMAPE = 1.3709
Building number = 15 : XGB SMAPE = 2.6360
Building number = 16 : XGB SMAPE = 1.4285
Building number = 17 : XGB SMAPE = 1.6203
Building number = 18 : XGB SMAPE = 3.0421
Building number = 19 : XGB SMAPE = 4.1602
Building number = 20 : XGB SMAPE = 0.6824
Building number = 21 : XGB SMAPE = 1.2103
Building number = 22 : XGB SMAPE = 2.0327
Building number = 23 : XGB SMAPE = 5.8610
Building number = 24 : XGB SMAPE = 3.1796
B

In [None]:
# Global model
#
# A single XGBoost model cross-validated on all buildings at once, with
# building type and building number one-hot encoded. Reuses the `kf` splitter
# and the `Y` frame bound by earlier cells.

# Result holders aligned to the test / train indices.
answer_df_global = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df_global = pd.DataFrame(index=X.index, columns=["pred"], dtype=float)

# One-hot encode both categorical identifiers (on copies, X / test_X stay intact).
x_global = pd.get_dummies(X.copy(), columns=["building_type"], drop_first=False)
xt_global = pd.get_dummies(test_X.copy(), columns=["building_type"], drop_first=False)

x_global = pd.get_dummies(x_global, columns=["building_number"], drop_first=False)
xt_global = pd.get_dummies(xt_global, columns=["building_number"], drop_first=False)

# Empty list: the two drops below are currently no-ops.
drop_cols_global = []

x_global = x_global.drop(columns=drop_cols_global, errors='ignore')
xt_global = xt_global.drop(columns=drop_cols_global, errors='ignore')

# Align test columns to the training layout (missing -> 0, extras dropped).
xt_global = xt_global.reindex(columns=x_global.columns, fill_value=0)

# Only the power_consumption column of the current `Y` is used here.
y_global = Y['power_consumption'].copy()

# Out-of-fold predictions (indexed like the target) and per-fold test predictions.
preds_valid_global = pd.Series(index=y_global.index, dtype=float)
preds_test_global = []

# kf.split yields positional indices, so work on plain arrays.
x_values_global = x_global.values
y_values_global = y_global.values

fold_scores_global = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values_global), 1):
    X_tr, X_va = x_values_global[tr_idx], x_values_global[va_idx]
    y_tr, y_va = y_values_global[tr_idx], y_values_global[va_idx]

    # Train on the natural log of the target; predictions are exponentiated below.
    y_tr_log = np.log(y_tr)
    y_va_log = np.log(y_va)

    model_global = XGBRegressor(
            learning_rate     = 0.05,
            n_estimators      = 5000,
            max_depth         = 10,
            subsample         = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            # Custom squared-error objective (defined earlier in the notebook)
            # whose gradient/hessian are multiplied by 3 when label > prediction.
            objective         = weighted_mse(3),
            tree_method       = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

    # NOTE(review): custom_smape is evaluated on the log-scale eval_set values;
    # fold_smape below is on the original scale.
    model_global.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

    # Back-transform and store at the fold's positional validation indices.
    va_pred = np.exp(model_global.predict(X_va))
    preds_valid_global.iloc[va_idx] = va_pred

    fold_smape = smape(y_va, va_pred)
    fold_scores_global.append(fold_smape)

    preds_test_global.append(np.exp(model_global.predict(xt_global.values)))

# Write back by index label; the test prediction is the mean over folds.
pred_df_global.loc[preds_valid_global.index, "pred"] = preds_valid_global
answer_df_global.loc[xt_global.index, "answer"] = np.mean(preds_test_global, axis=0)

print(f"Global Model : XGB SMAPE = {np.mean(fold_scores_global):.4f}")

# Overall out-of-fold SMAPE; both sides are sorted by index so rows line up.
total_smape_global = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df_global.sort_index()["pred"].values
)
print(f"Total SMAPE (Global) = {total_smape_global:.4f}")

# index=False: rows in these files are identified only by their order.
pred_df_global.to_csv(f'{DATA_DIR}/pred_valid_by_global_summer{RANDOM_SEED}.csv', index=False)
answer_df_global.to_csv(f'{DATA_DIR}/answer_test_by_global_summer{RANDOM_SEED}.csv', index=False)

Global Model : XGB SMAPE = 2.7091
Total SMAPE (Global) = 2.7091


## **Ensemble of summer features across building-type, building-specific, and global models**

In [None]:
# Reload the test predictions saved by the three training cells for this seed.
answer_df = pd.read_csv(f'{DATA_DIR}/answer_test_summer{RANDOM_SEED}.csv')
answer_df_by_building = pd.read_csv(f'{DATA_DIR}/answer_test_by_building_summer{RANDOM_SEED}.csv')
answer_df_global = pd.read_csv(f'{DATA_DIR}/answer_test_by_global_summer{RANDOM_SEED}.csv')

# Fixed blend: 0.2 building-type model + 0.3 per-building model + 0.5 global model.
weighted_parts = [
    frame.sort_index()["answer"].values * weight
    for frame, weight in (
        (answer_df, 0.2),
        (answer_df_by_building, 0.3),
        (answer_df_global, 0.5),
    )
]
final_ensemble_test_pred = weighted_parts[0] + weighted_parts[1] + weighted_parts[2]

In [None]:
# Replace anything that is not strictly positive with 0 before submitting.
final_ensemble_test_pred_fixed = [
    value if value > 0 else 0 for value in final_ensemble_test_pred
]
# Fill the sample submission's `answer` column positionally and save it under
# a seed-specific name.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv').assign(
    answer=final_ensemble_test_pred_fixed
)
submission.to_csv(f'{DATA_DIR}/5.16_summer{RANDOM_SEED}.csv', index=False)

# **Summer Cluster Model**

In [None]:
# Rebuild train/test through Preprocessing (defined earlier in the notebook).
# NOTE(review): the meaning of the two True flags is not visible in this cell;
# the cluster summary printed below suggests clustering is enabled - confirm
# against the Preprocessing definition.
train, test = Preprocessing(True, True)

Cluster-wise building count:
cluster
0    12
1     4
2    47
3     9
4    28
Name: count, dtype: int64

Total number of buildings: 100


In [None]:
# Columns removed from the training frame to form the feature matrix.
train_drop_cols = [
    'solar_power_capacity', 'ess_capacity', 'pcs_capacity',
    'power_consumption', 'rainfall', 'sunshine', 'solar_radiation',
    'hour', 'day', 'month', 'day_of_week', 'date_time', 'building_type',
]
X = train.drop(columns=train_drop_cols)

# Target plus the cluster label used to slice it per cluster.
Y = train[['cluster', 'power_consumption']]

# Columns removed from the test frame; any remaining column that is not a
# training feature is discarded later when test is reindexed to the train layout.
test_drop_cols = [
    'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'rainfall',
    'hour', 'month', 'day_of_week', 'day', 'date_time',
]
test_X = test.drop(columns=test_drop_cols)

In [None]:
# Per-cluster models
#
# For every building cluster: fit XGBoost with K-fold cross-validation on
# log(power_consumption), store out-of-fold predictions in `pred_df_cluster`
# and fold-averaged test predictions in `answer_df_cluster`, then report SMAPE
# and save both frames to DATA_DIR.

cluster_list = sorted(train["cluster"].unique())

# Tree depth per cluster id; indexed directly below, so every id in
# cluster_list must be a key here.
max_depth_dict_cluster = {
    0: 10,
    1: 8,
    2: 10,
    3: 8,
    4: 10
}

# Result holders aligned to the test / train indices for label-based write-back.
answer_df_cluster = pd.DataFrame(index=test_X.index, columns=["answer"], dtype=float)
pred_df_cluster   = pd.DataFrame(index=X.index,         columns=["pred"],    dtype=float)

# Shuffled K-fold with a fixed seed (rebinds the notebook-level `kf`).
kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for cluster_num in cluster_list:
    # Train features, train target and test features restricted to this cluster.
    x  = X[X['cluster'] == cluster_num].copy()
    y  = Y[Y['cluster'] == cluster_num]['power_consumption'].copy()
    xt = test_X[test_X['cluster'] == cluster_num].copy()

    # One-hot encode the building id so one model can separate the cluster's buildings.
    x  = pd.get_dummies(x,  columns=["building_number"], drop_first=False)
    xt = pd.get_dummies(xt, columns=["building_number"], drop_first=False)

    # Force the test frame onto the training column layout: columns missing
    # from test are created and filled with 0, extra test columns are dropped.
    xt = xt.reindex(columns=x.columns, fill_value=0)

    # The cluster id is constant within this subset.
    drop_cols = ["cluster"]
    x  = x.drop(columns=drop_cols)
    xt = xt.drop(columns=drop_cols)

    # Out-of-fold predictions (indexed like y) and one test prediction per fold.
    preds_valid = pd.Series(index=y.index, dtype=float)
    preds_test  = []

    # kf.split yields positional indices, so work on plain arrays.
    x_values = x.values
    y_values = y.values

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(x_values), 1):
        X_tr, X_va = x_values[tr_idx], x_values[va_idx]
        y_tr, y_va = y_values[tr_idx], y_values[va_idx]

        # Train on the natural log of the target; predictions are
        # exponentiated below.
        y_tr_log = np.log(y_tr)
        y_va_log = np.log(y_va)

        model = XGBRegressor(
            learning_rate      = 0.05,
            n_estimators      = 5000,
            max_depth          = max_depth_dict_cluster[cluster_num],
            subsample          = 0.7,
            colsample_bytree  = 0.5,
            min_child_weight  = 3,
            random_state      = RANDOM_SEED,
            # Custom squared-error objective (defined earlier in the notebook)
            # whose gradient/hessian are multiplied by 3 when label > prediction.
            objective          = weighted_mse(3),
            tree_method        = "gpu_hist",
            gpu_id            = 0,
            early_stopping_rounds = 100,
        )

        # NOTE(review): custom_smape is evaluated on the log-scale eval_set
        # values; fold_smape below is on the original scale.
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_va, y_va_log)],
            eval_metric=custom_smape,
            verbose=False,
        )

        # Back-transform and store at the fold's positional validation indices.
        va_pred = np.exp(model.predict(X_va))
        preds_valid.iloc[va_idx] = va_pred

        fold_smape = smape(y_va, va_pred)
        fold_scores.append(fold_smape)

        preds_test.append(np.exp(model.predict(xt.values)))

    # Write back by index label; the test prediction is the mean over folds.
    pred_df_cluster.loc[preds_valid.index, "pred"] = preds_valid
    answer_df_cluster.loc[xt.index, "answer"] = np.mean(preds_test, axis=0)

    print(f"Building Cluster = {cluster_num} : XGB SMAPE = {np.mean(fold_scores):.4f}")

# Overall out-of-fold SMAPE; both sides are sorted by index so rows line up.
total_smape = smape(
    Y.sort_index()["power_consumption"].values,
    pred_df_cluster.sort_index()["pred"].values
)
print(f"Total SMAPE = {total_smape:.4f}")

# index=False: rows in these files are identified only by their order.
pred_df_cluster.to_csv(f'{DATA_DIR}/pred_valid_by_cluster_summer{RANDOM_SEED}.csv', index=False)
answer_df_cluster.to_csv(f'{DATA_DIR}/answer_test_by_cluster_summer{RANDOM_SEED}.csv', index=False)

Building Cluster = 0 : XGB SMAPE = 2.1547
Building Cluster = 1 : XGB SMAPE = 0.8068
Building Cluster = 2 : XGB SMAPE = 3.0776
Building Cluster = 3 : XGB SMAPE = 1.3513
Building Cluster = 4 : XGB SMAPE = 3.0664
Total SMAPE = 2.7171


In [None]:
# Reload the cluster-model test predictions saved for this seed; no blending
# is applied here, the cluster model's output is used as-is.
cluster_answer_path = f'{DATA_DIR}/answer_test_by_cluster_summer{RANDOM_SEED}.csv'
answer_df_cluster = pd.read_csv(cluster_answer_path)
final_ensemble_test_pred = answer_df_cluster.sort_index()["answer"].values

In [None]:
# Replace anything that is not strictly positive with 0 before submitting.
final_ensemble_test_pred_fixed = [
    value if value > 0 else 0 for value in final_ensemble_test_pred
]
# Fill the sample submission's `answer` column positionally and save it under
# a seed-specific name.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv').assign(
    answer=final_ensemble_test_pred_fixed
)
submission.to_csv(f'{DATA_DIR}/5.16_summer_cluster{RANDOM_SEED}.csv', index=False)

# **Seed Ensemble**
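## The following steps must be executed after repeating the previous stages with `RANDOM_SEED` set to 42, 8, 15, 2025, and 1, for both the summer and no-summer variants (the cells below read the submission files from all of those runs).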

In [None]:
# Seeds whose per-run submission files are averaged below.
ensemble_seeds = (42, 8, 15, 2025, 1)


def _seed_paths(variant):
    """Return the submission path of `variant` for every seed in `ensemble_seeds`."""
    return [f'{DATA_DIR}/5.16_{variant}{seed}.csv' for seed in ensemble_seeds]


summer_paths = _seed_paths('summer')
nosummer_paths = _seed_paths('nosummer')
summer_cluster_paths = _seed_paths('summer_cluster')
nosummer_cluster_paths = _seed_paths('nosummer_cluster')

In [None]:
def split_num_date_time(df):
    """Split `num_date_time` at its first underscore into two new columns.

    The part before the underscore becomes the integer column `building_num`,
    the remainder becomes `date_time`. The frame is modified in place and the
    same object is returned.
    """
    pieces = df['num_date_time'].str.split('_', n=1, expand=True)
    df[['building_num', 'date_time']] = pieces
    df['building_num'] = df['building_num'].astype(int)
    return df

def merge_and_average(file_paths):
    """Average the `answer` column of several submission CSVs, matched on `num_date_time`.

    Parameters
    ----------
    file_paths : sequence of str
        CSV paths; each file is read with pandas and is expected to provide
        `num_date_time` and `answer` columns. At least one path is required.

    Returns
    -------
    pandas.DataFrame
        Columns `num_date_time` and `answer`, where `answer` is the row-wise
        mean over all files. Files are combined with pandas' default (inner)
        merge, so only keys present in every file are kept.
    """
    combined = pd.read_csv(file_paths[0])
    for position, path in enumerate(file_paths[1:], start=1):
        # The first file keeps the plain `answer` name; later ones get `_<position>`.
        combined = pd.merge(
            combined,
            pd.read_csv(path),
            on='num_date_time',
            suffixes=('', f'_{position}'),
        )

    answer_columns = [name for name in combined.columns if 'answer' in name]
    return pd.DataFrame({
        'num_date_time': combined['num_date_time'],
        'answer': combined[answer_columns].mean(axis=1),
    })


In [None]:
# Seed-average each model family, then order every result by `num_date_time`
# with a fresh 0..n-1 index so the four frames can be combined row by row.
df_summer_avg, df_nosummer_avg, df_cluster_avg, df_nosummer_cluster_avg = (
    merge_and_average(paths).sort_values('num_date_time').reset_index(drop=True)
    for paths in (summer_paths, nosummer_paths, summer_cluster_paths, nosummer_cluster_paths)
)

# Two-level blend: summer/no-summer inside each family, then 0.8 for the
# type/building/global family and 0.2 for the cluster family.
main_blend = df_summer_avg['answer'] * 0.8 + df_nosummer_avg['answer'] * 0.2
cluster_blend = df_cluster_avg['answer'] * 0.7 + df_nosummer_cluster_avg['answer'] * 0.3
ensemble_answer = main_blend * 0.8 + cluster_blend * 0.2

# Order rows by numeric building id and then by the timestamp text (a plain
# string sort of `num_date_time` would put building 10 before building 2);
# the helper columns are removed again before saving.
df_ensemble = (
    split_num_date_time(
        pd.DataFrame({
            'num_date_time': df_summer_avg['num_date_time'],
            'answer': ensemble_answer,
        })
    )
    .sort_values(by=['building_num', 'date_time'])
    .reset_index(drop=True)
    .drop(columns=['building_num', 'date_time'])
)

df_ensemble.to_csv(f'{DATA_DIR}/all_ensemble.csv', index=False)

# **Post-processing (holiday adjustment)**

In [None]:
# Reload the raw training data and building metadata for the holiday
# adjustment (both files are read as UTF-8 with a BOM).
train, building_info = (
    pd.read_csv(f'{DATA_DIR}/{name}.csv', encoding='utf-8-sig')
    for name in ('train', 'building_info')
)

In [None]:
def _add_date_parts(frame):
    """Parse `date_time` ('%Y%m%d %H') in place and add hour / day / month columns."""
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')
    for part in ('hour', 'day', 'month'):
        frame[part] = getattr(frame['date_time'].dt, part)


# Korean -> English column names for the raw training data.
train_column_names = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
}
train = train.rename(columns=train_column_names).drop(columns='num_date_time')

# Korean -> English column names for the building metadata.
building_info_column_names = {
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
}
building_info = building_info.rename(columns=building_info_column_names)

# Korean -> English building type labels.
translation_dict = {
    '건물기타': 'Other Buildings', '공공': 'Public', '학교': 'University',
    '백화점': 'Department Store', '병원': 'Hospital', '상용': 'Commercial',
    '아파트': 'Apartment', '연구소': 'Research Institute',
    'IDC(전화국)': 'IDC', '호텔': 'Hotel'
}
building_info['building_type'] = building_info['building_type'].replace(translation_dict)

# Attach building metadata to every training row and derive calendar parts.
train = train.merge(building_info, on='building_number', how='left')
_add_date_parts(train)

# Rows with exactly zero consumption are treated as outliers and removed.
outlier_idx = train.index[train['power_consumption'] == 0].tolist()
train = train.drop(index=outlier_idx)

# Seed-ensembled predictions produced by the previous section.
submission_file_path = f'{DATA_DIR}/all_ensemble.csv'
submit = pd.read_csv(submission_file_path)

test_raw = pd.read_csv(f'{DATA_DIR}/test.csv', encoding='utf-8-sig').rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time'
})
_add_date_parts(test_raw)

# Index-on-index join: this relies on `submit` rows being in the same order
# as the rows of test.csv.
test_time_cols = test_raw[['building_number', 'date_time', 'hour', 'day', 'month']]
submit = submit.merge(test_time_cols, left_index=True, right_index=True, how='left')

In [None]:
def _holiday_rule(target, sources):
    """Build one replacement rule.

    target  : (month, day) of the test date whose predictions are overwritten.
    sources : list of (month, [days]) groups of training dates to average.
    """
    return {
        'target_date': {'month': target[0], 'day': target[1]},
        'source_dates': [{'month': month, 'days': list(days)} for month, days in sources],
    }


# Source dates shared by buildings 27, 40, 59 and 63.
_shared_sources = [(6, [9, 23]), (7, [14, 28]), (8, [11])]

# Holiday adjustment rules, keyed by building number.
replacement_rules = {
    29: _holiday_rule((8, 25), [(6, [23]), (7, [28])]),
    27: _holiday_rule((8, 25), _shared_sources),
    32: _holiday_rule((8, 26), [(6, [10, 24]), (7, [8, 22]), (8, [12])]),
    40: _holiday_rule((8, 25), _shared_sources),
    59: _holiday_rule((8, 25), _shared_sources),
    63: _holiday_rule((8, 25), _shared_sources),
}

In [None]:
# Holiday adjustment: for each building listed in `replacement_rules`, overwrite
# the 24 hourly predictions of its target date with a trimmed mean of that
# building's observed consumption at the same hour on the rule's source dates.
for building_num, rules in replacement_rules.items():
    target_month = rules['target_date']['month']
    target_day = rules['target_date']['day']

    # OR together every (month, days) source group. Works for any number of
    # groups, so no per-building special case is needed.
    source_day_mask = pd.Series(False, index=train.index)
    for src_date_info in rules['source_dates']:
        source_day_mask |= (
            (train['month'] == src_date_info['month']) &
            (train['day'].isin(src_date_info['days']))
        )

    # Hour-independent selections, computed once per building.
    building_source = train[(train['building_number'] == building_num) & source_day_mask]
    target_day_mask = (
        (submit['building_number'] == building_num) &
        (submit['month'] == target_month) &
        (submit['day'] == target_day)
    )

    for hour in range(24):
        target_indices = submit[target_day_mask & (submit['hour'] == hour)].index
        if target_indices.empty:
            continue

        values = building_source.loc[
            building_source['hour'] == hour, 'power_consumption'
        ].values

        if len(values) == 0:
            # No observed history for this hour: keep the model prediction
            # instead of writing NaN into the submission.
            continue

        if len(values) > 2:
            # Trimmed mean: drop one maximum and one minimum, average the rest.
            trimmed_mean = (values.sum() - values.max() - values.min()) / (len(values) - 2)
        else:
            # With one or two observations there is nothing to trim.
            trimmed_mean = values.mean()

        submit.loc[target_indices, 'answer'] = trimmed_mean


In [None]:
# Remove the helper columns that were joined in for the holiday adjustment,
# then save the final submission.
helper_columns = ['date_time', 'hour', 'day', 'month', 'building_number']
submit = submit.drop(columns=helper_columns)
output_file_path = f'{DATA_DIR}/final.csv'
submit.to_csv(output_file_path, index=False)