In [1]:
# 데이터 로딩
from src.data_loader import(
load_building_info, load_and_merge,
split_building_types, encode_building)
from src.features import add_time_features, drop_unused_features

building = load_building_info()

# 건물 유형에 따른 데이터셋 분리
building_types = ['IDC(전화국)']
selected, rest = split_building_types(building, building_types=building_types)

In [2]:
# 인코딩
rest_encoded = encode_building(rest)

train_df = load_and_merge(building_info=rest_encoded)
# 이상치 제거: 전력 소비량 0 제외
train_df = train_df[train_df['전력소비량(kWh)'] != 0]
train_df = add_time_features(train_df)
train_df = drop_unused_features(train_df)

test_df = load_and_merge(path='data/test.csv', building_info=rest_encoded)
test_df = add_time_features(test_df)
test_df = drop_unused_features(test_df, columns=['건물번호', '냉방면적(m2)'])

In [3]:
# 전처리
from src.features import (
    fit_weather_pca, transform_weather_pca,
    split_features_target,
    scale_features, train_valid_split
)

scaler_w, pca_w = fit_weather_pca(train_df)
train_df = transform_weather_pca(train_df, scaler_w, pca_w)

X, y = split_features_target(train_df)
X_scaled, scaler = scale_features(X)
# X_train, X_valid, y_train, y_valid = train_valid_split(X_scaled, y)
X_train, y_train = X_scaled, y # 검증 데이터셋을 나누지 않고 전수 학습

# print(X_train.shape,X_valid.shape)  ## 열의 갯수가 같아야한다.
#                                     ## y의 열 갯수는 1이여야함.
# print(y_train.shape,y_valid.shape)  ## train , test 끼리는 행의 갯수가 같아야한다.

#### Random Forest 모델링

In [4]:
from src.models import train_model
from sklearn.ensemble import RandomForestRegressor

model = train_model(RandomForestRegressor, X_train, y_train, random_state=42, n_estimators=250)

#### 예측

In [5]:
test_df = transform_weather_pca(test_df, scaler_w, pca_w)

X_test = test_df.drop(columns=['기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', 'PCS용량(kW)', 'ESS저장용량(kWh)', 'num_date_time'])
X_test_rest = scaler.transform(X_test)

In [6]:
y_pred_rest = model.predict(X_test_rest)
submission_rest = test_df[['num_date_time']]
submission_rest['answer'] = y_pred_rest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_rest['answer'] = y_pred_rest


In [7]:
if building_types:
    # 인코딩
    select_encoded = encode_building(selected)
    
    train_df = load_and_merge(building_info=select_encoded)
    # 이상치 제거: 전력 소비량 0 제외
    train_df = train_df[train_df['전력소비량(kWh)'] != 0]
    train_df = add_time_features(train_df)
    train_df = drop_unused_features(train_df)

    test_df = load_and_merge(path='data/test.csv', building_info=select_encoded)
    test_df = add_time_features(test_df)
    test_df = drop_unused_features(test_df, columns=['건물번호', '냉방면적(m2)'])
    
    scaler_w, pca_w = fit_weather_pca(train_df)
    train_df = transform_weather_pca(train_df, scaler_w, pca_w)
    
    X, y = split_features_target(train_df)
    X_scaled, scaler = scale_features(X)
    X_train_sel, y_train_sel = X_scaled, y
    
    model_sel = train_model(RandomForestRegressor, X_train_sel, y_train_sel, random_state=42, n_estimators=250)

    test_df = transform_weather_pca(test_df, scaler_w, pca_w)

    X_test = test_df.drop(columns=['기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', 'PCS용량(kW)', 'ESS저장용량(kWh)', 'num_date_time'])
    X_test_sel = scaler.transform(X_test)

    y_pred_sel = model_sel.predict(X_test_sel)
    submission_sel = test_df[['num_date_time']]
    submission_sel['answer'] = y_pred_sel

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_sel['answer'] = y_pred_sel


#### 데이터 저장

In [8]:
import pandas as pd
submission_all = pd.concat([submission_rest, submission_sel], axis=0)
ndt = pd.read_csv('sample_submission.csv', encoding='utf-8-sig')
submission = pd.merge(ndt, submission_all, on='num_date_time', how='left')

In [9]:
submission.head(10)

Unnamed: 0,num_date_time,answer
0,1_20240825 00,4183.6926
1,1_20240825 01,3763.4154
2,1_20240825 02,3873.48852
3,1_20240825 03,3602.63064
4,1_20240825 04,3610.88688
5,1_20240825 05,3661.40748
6,1_20240825 06,3691.16688
7,1_20240825 07,4387.59492
8,1_20240825 08,5138.40468
9,1_20240825 09,6352.0344


In [10]:
submission.to_csv('submission.csv', index=False)