## Import

In [1]:
# preprocessing
# NOTE(review): tqdm is not referenced in the visible cells — confirm it is still needed.
import numpy as np
import pandas as pd
import tqdm
import random
import os

# imputer
# enable_iterative_imputer must be imported before IterativeImputer (experimental API opt-in)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

# suppress warnings
# NOTE: this silences every warning for the whole session, including pandas' chained-assignment warnings.
import warnings
warnings.filterwarnings('ignore')

# model learning
# NOTE(review): train_test_split and compute_sample_weight are not referenced in the visible cells.
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.utils.class_weight import compute_sample_weight
import statsmodels.api as sm

# evaluation metric
# NOTE(review): mean_absolute_error is not referenced in the visible cells.
from sklearn.metrics import mean_absolute_error

# model persistence
# NOTE(review): pickle is not referenced in the visible cells.
import pickle

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds NumPy's legacy global generator and Python's ``random`` module with
    ``seed`` and exports the same value as the ``PYTHONHASHSEED`` environment
    variable.
    """
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(42)  # fix the seed for the whole notebook

## Load Data

In [3]:
# Read the three competition tables in one pass: train, test, building metadata.
data_paths = (
    '../data/train.csv',
    '../data/test.csv',
    '../data/building_info.csv',
)
train_df, test_df, building_info = map(pd.read_csv, data_paths)

## Change Names

In [4]:
# Mapping from the Korean building-type label to its English label.
translation_dict = dict([
    ('건물기타', 'Other Buildings'),
    ('공공', 'Public'),
    ('대학교', 'University'),
    ('데이터센터', 'Data Center'),
    ('백화점및아울렛', 'Department Store and Outlet'),
    ('병원', 'Hospital'),
    ('상용', 'Commercial'),
    ('아파트', 'Apartment'),
    ('연구소', 'Research Institute'),
    ('지식산업센터', 'Knowledge Industry Center'),
    ('할인마트', 'Discount Mart'),
    ('호텔및리조트', 'Hotel and Resort'),
])

# Labels that are not keys of the mapping are left unchanged by replace().
building_type_english = building_info['건물유형'].replace(translation_dict)
building_info['건물유형'] = building_type_english
# building_info.drop('Unnamed: 0', axis = 1 , inplace=True)

In [5]:
# Positional rename of the ten training columns to short names without units.
train_column_names = [
    'num_date_time',
    '건물번호',
    '일시',
    '기온',
    '강수량',
    '풍속',
    '습도',
    '일조',
    '일사',
    '전력소비량',
]
train_df.columns = train_column_names

In [6]:
# Positional rename of the seven test columns (same short names as the training frame).
test_column_names = [
    'num_date_time',
    '건물번호',
    '일시',
    '기온',
    '강수량',
    '풍속',
    '습도',
]
test_df.columns = test_column_names

In [7]:
# Positional rename of the seven building-metadata columns to short names.
building_info_column_names = [
    '건물번호',
    '건물유형',
    '연면적',
    '냉방면적',
    '태양광용량',
    'ESS저장용량',
    'PCS용량',
]
building_info.columns = building_info_column_names

## Encoding

In [8]:
# One-hot encoding of the building type (disabled; label encoding is used in the next cell instead)
# building_info = pd.get_dummies(building_info, columns=['건물유형'], drop_first=True)

In [9]:
# Fit a LabelEncoder on the building-type labels, then overwrite the column
# with the resulting integer codes. `encoder` is kept so the codes can be
# mapped back to labels later.
encoder = LabelEncoder().fit(building_info['건물유형'])
building_type_codes = encoder.transform(building_info['건물유형'])
building_info['건물유형'] = building_type_codes

## Merge building info

In [10]:
# The capacity columns use the string '-' as a "no value" placeholder; turn it into NaN.
# A single .loc assignment is used instead of chained indexing (df[col][mask] = ...),
# which writes through a temporary Series and is a no-op under pandas Copy-on-Write.
# The columns are left otherwise untouched (no numeric conversion is attempted here).
for capacity_column in ['태양광용량', 'ESS저장용량', 'PCS용량']:
    placeholder_rows = building_info[capacity_column] == '-'
    building_info.loc[placeholder_rows, capacity_column] = np.nan

In [11]:
train_df = pd.merge(train_df, building_info, on='건물번호', how='left')
test_df = pd.merge(test_df, building_info, on='건물번호', how='left')

## Train Data Pre-Processing

In [12]:
# Number of missing values per column of the merged training frame.
train_df.isnull().sum()

num_date_time         0
건물번호                  0
일시                    0
기온                    0
강수량              160069
풍속                   19
습도                    9
일조                75182
일사                87913
전력소비량                 0
건물유형                  0
연면적                   0
냉방면적                  0
태양광용량            130560
ESS저장용량          193800
PCS용량            193800
dtype: int64

- 결측치가 시급하다. 아무리봐도 처리 해야할 것 같다.

In [13]:
# Split the timestamp string into month, day and hour so the model can use
# the time-series structure. The slices index the 'YYYYMMDD HH' layout that
# the later to_datetime cell parses.
train_timestamp_text = train_df['일시']
train_df['월'] = train_timestamp_text.str[4:6].astype('int64')
train_df['일'] = train_timestamp_text.str[6:8].astype('int64')
train_df['시'] = train_timestamp_text.str[9:11].astype('int64')

In [14]:
# Parse the 'YYYYMMDD HH' strings into datetimes and overwrite the column.
train_timestamps = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')
train_df['일시'] = train_timestamps

In [15]:
# Add the day of the week (0 = Monday, 6 = Sunday).
train_weekday = train_df['일시'].dt.weekday
train_df['주'] = train_weekday

In [16]:
# Keep only the modelling columns: drop the row id, the raw timestamp,
# sunshine/solar radiation, the three capacity columns and precipitation.
train_columns_to_drop = [
    'num_date_time',
    '일시',
    '일조',
    '일사',
    '태양광용량',
    'ESS저장용량',
    'PCS용량',
    '강수량',
]
train_df2 = train_df.drop(columns=train_columns_to_drop)

In [17]:
# Fill missing precipitation with 0, assuming it did not rain (disabled; precipitation is dropped above instead)
# train_df2['강수량(mm)'][train_df2['강수량(mm)'].isna()] = 0

In [18]:
# MICE-style imputation of the remaining missing values.
# NOTE(review): train_df2 still contains the target column '전력소비량' here, so
# the target takes part in imputing the features — confirm this is intended.
imputer_mice = IterativeImputer(random_state=42).fit(train_df2)

# transform() returns a plain array, so rebuild a DataFrame with the original column names.
imputed_values = imputer_mice.transform(train_df2)
train_df2 = pd.DataFrame(imputed_values, columns=train_df2.columns)

## Append Value

In [19]:
# Apparent (wind-chill) temperature (℃)
# NOTE(review): this formula is normally stated for wind speed in km/h — confirm
# the unit of '풍속'. The test cell uses the same expression and must stay in sync.
train_temp = train_df2['기온']
train_wind_factor = train_df2['풍속']**0.16
train_df2['체감온도'] = (
    13.12
    + 0.6215*train_temp
    - 11.37*train_wind_factor
    + 0.3965*train_wind_factor*train_temp
)

In [20]:
# Saturation vapor pressure as a function of temperature (mb)
train_temp = train_df2['기온']
train_svp_exponent = (7.5*train_temp)/(237.3 + train_temp)
train_df2['포화수증기압'] = 6.11*10**train_svp_exponent

In [21]:
# Actual vapor pressure of the air (mb): relative humidity (%) times saturation vapor pressure.
# The column keeps its existing name '대기압'.
train_humidity_weighted = train_df2['습도'] * train_df2['포화수증기압']
train_df2['대기압'] = train_humidity_weighted / 100

In [22]:
# Absolute humidity (g/m^3), from the vapor pressure column and temperature.
train_abs_hum_numerator = 0.794*train_df2['대기압']
train_abs_hum_denominator = 1+0.00366*train_df2['기온']
train_df2['절대습도'] = train_abs_hum_numerator/train_abs_hum_denominator

In [23]:
# Mixing ratio of the air (disabled feature)
# train_df2['혼합비'] = 0.622 * train_df2['대기압']/(1013.25 - train_df2['대기압'])

In [24]:
# Wet-bulb temperature from air temperature and relative humidity.
# NOTE(review): Stull's (2011) empirical formula multiplies 0.151977 by the
# square root, while this code adds them — confirm whether that is intended.
# The test cell uses the same expression, so both must change together.
train_temp = train_df2['기온']
train_humidity = train_df2['습도']
train_wb_temp_term = train_temp*np.arctan(0.151977+(train_humidity + 8.313659)**0.5)
train_wb_sum_term = np.arctan(train_temp + train_humidity)
train_wb_humidity_term = np.arctan(train_humidity - 1.676331)
train_wb_power_term = 0.00391838*(train_humidity**1.5)*np.arctan(0.023101*train_humidity)
train_df2['습구온도'] = (
    train_wb_temp_term
    + train_wb_sum_term
    - train_wb_humidity_term
    + train_wb_power_term
    - 4.686035
)

In [25]:
# Discomfort index from temperature and relative humidity.
train_scaled_temp = 9/5*train_df2['기온']
train_dryness = 1-0.01*train_df2['습도']
train_df2['불쾌지수'] = (
    train_scaled_temp
    - 0.55*train_dryness*(train_scaled_temp - 26)
    + 32
)

In [26]:
# Perceived-heat index: quadratic polynomial in wet-bulb temperature and air temperature.
train_wet_bulb = train_df2['습구온도']
train_temp = train_df2['기온']
train_df2['더위체감지수'] = (
    -0.24418
    + 0.553991*train_wet_bulb
    + 0.455346*train_temp
    - 0.00217*train_wet_bulb**2
    + 0.002782*train_wet_bulb*train_temp
)

In [27]:
# Heat index (disabled feature; polynomial in Fahrenheit temperature and humidity, converted back with 5/9*(... - 32))
# train_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*train_df2['기온']+32)) + (10.14333127 * train_df2['습도']) - (0.22475541 * ((9/5)*train_df2['기온']+32)*train_df2['습도']) - (6.83783e-3 * ((9/5)*train_df2['기온']+32)**2) - (5.481717e-2 * train_df2['습도']**2) + (1.22874e-3 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']) + (8.5282e-4 * ((9/5)*train_df2['기온']+32)*train_df2['습도']**2) - (1.99e-6 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']**2) - 32)

In [28]:
# Split the training frame into features and target.
# Index.difference returns the remaining column names in sorted order, so
# train_x's columns are sorted rather than in train_df2's original order.
target_column = '전력소비량'
feature_columns = train_df2.columns.difference([target_column])
train_x = train_df2[feature_columns]
train_y = train_df2[target_column]

## BoxCox

In [29]:
# Exponent of the power transform applied to the target (0.25 = fourth root).
boxcox = 1 / 4

In [30]:
# Apply the power transform to the target.
train_y_root_twice = train_y.pow(boxcox)

## Value Check

In [31]:
# Diagnostic OLS fit of the transformed target on all features plus an intercept.
# The already-transformed target is reused instead of a second hard-coded **0.25,
# so this check cannot drift from the `boxcox` exponent used for training.
# The previous filter on a 'mixture' column was dropped: no such column exists
# and train_x's columns are already in sorted order, so the design matrix is unchanged.
ols_design = sm.add_constant(train_x)
results = sm.OLS(train_y_root_twice, ols_design).fit()

In [32]:
# Show the OLS diagnostics. The recorded output below reports R-squared 0.144,
# Durbin-Watson 0.045 and a condition number of about 3.3e9.
# NOTE(review): the very large condition number suggests strongly collinear features — worth confirming.
results.summary()

0,1,2,3
Dep. Variable:,전력소비량,R-squared:,0.144
Model:,OLS,Adj. R-squared:,0.144
Method:,Least Squares,F-statistic:,1910.0
Date:,"Mon, 31 Jul 2023",Prob (F-statistic):,0.0
Time:,00:20:55,Log-Likelihood:,-326250.0
No. Observations:,204000,AIC:,652500.0
Df Residuals:,203981,BIC:,652700.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,39.8555,4.799,8.305,0.000,30.450,49.261
건물번호,-0.0071,9.68e-05,-73.275,0.000,-0.007,-0.007
건물유형,0.0387,0.001,46.686,0.000,0.037,0.040
기온,-15.2436,0.999,-15.262,0.000,-17.201,-13.286
냉방면적,1.121e-06,5.18e-08,21.639,0.000,1.02e-06,1.22e-06
대기압,-6.6013,0.861,-7.663,0.000,-8.290,-4.913
더위체감지수,25.8804,1.424,18.170,0.000,23.089,28.672
불쾌지수,-0.3138,0.081,-3.855,0.000,-0.473,-0.154
습구온도,-11.8957,0.565,-21.039,0.000,-13.004,-10.788

0,1,2,3
Omnibus:,26371.301,Durbin-Watson:,0.045
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43041.581
Skew:,0.897,Prob(JB):,0.0
Kurtosis:,4.357,Cond. No.,3310000000.0


## Regression Model Fit

In [33]:
# AutoML configuration: regression scored by MAE, with results written to results_path.
# NOTE(review): the results directory is labelled "12h" while the time budget
# below is 6 hours — confirm which one is intended.
automl_algorithms = [
    "Baseline",
    "CatBoost",
    "Xgboost",
    "Random Forest",
    "Extra Trees",
    "LightGBM",
    "Neural Network",
]
automl = AutoML(
    mode="Compete",
    total_time_limit=6 * 3600,  # seconds
    algorithms=automl_algorithms,
    ml_task="regression",
    eval_metric='mae',
    random_state=42,
    results_path='../model/AutoML_SelectValue_extend_12h/',
)

In [None]:
# Fit the AutoML search on the features and the power-transformed target.
# The recorded log below shows a TerminatedWorkerError, failed GoldenFeatures and
# KMeansFeatures models, and feature selection skipped for lack of time; the cell
# has no execution count, so this run did not finish in the saved notebook.
automl.fit(train_x, train_y_root_twice)

AutoML directory: ../model/AutoML_SelectValue_extend_12h/
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 0.738748 trained in 1.13 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline mae 1.00311 trained in 3.28 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM mae 0.05981 trained in 1055.06 seconds
3_Default_Xg

2023-07-31 06:25:10,701 concurrent.futures ERROR exception calling callback for <Future at 0x2d4ecc87048 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Users\abc\anaconda3\envs\please\lib\site-packages\joblib\externals\loky\_base.py", line 26, in _invoke_callbacks
    callback(self)
  File "C:\Users\abc\anaconda3\envs\please\lib\site-packages\joblib\parallel.py", line 385, in __call__
    self.parallel.dispatch_next()
  File "C:\Users\abc\anaconda3\envs\please\lib\site-packages\joblib\parallel.py", line 834, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Users\abc\anaconda3\envs\please\lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\abc\anaconda3\envs\please\lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\abc\anaconda3\envs\please\lib\site-packages\jo

There was an error during 2_Default_LightGBM_GoldenFeatures training.
Please check ../model/AutoML_SelectValue_extend_12h/errors.md for details.
There was an error during 10_Xgboost_GoldenFeatures training.
Please check ../model/AutoML_SelectValue_extend_12h/errors.md for details.
There was an error during 19_LightGBM_GoldenFeatures training.
Please check ../model/AutoML_SelectValue_extend_12h/errors.md for details.
* Step kmeans_features will try to check up to 3 models
There was an error during 2_Default_LightGBM_KMeansFeatures training.
Please check ../model/AutoML_SelectValue_extend_12h/errors.md for details.
There was an error during 10_Xgboost_KMeansFeatures training.
Please check ../model/AutoML_SelectValue_extend_12h/errors.md for details.
There was an error during 19_LightGBM_KMeansFeatures training.
Please check ../model/AutoML_SelectValue_extend_12h/errors.md for details.
Not enough time to perform features selection. Skip
Time needed for features selection ~ 7205.0 seconds


## Test Data Pre-Processing

In [None]:
# Split the timestamp string into month, day and hour, mirroring the training
# preprocessing. The slices index the 'YYYYMMDD HH' layout parsed in the next cell.
test_timestamp_text = test_df['일시']
test_df['월'] = test_timestamp_text.str[4:6].astype('int64')
test_df['일'] = test_timestamp_text.str[6:8].astype('int64')
test_df['시'] = test_timestamp_text.str[9:11].astype('int64')

In [None]:
# Parse the 'YYYYMMDD HH' strings into datetimes and overwrite the column.
test_timestamps = pd.to_datetime(test_df['일시'], format='%Y%m%d %H')
test_df['일시'] = test_timestamps

In [None]:
# Add the day of the week (0 = Monday, 6 = Sunday).
test_weekday = test_df['일시'].dt.weekday
test_df['주'] = test_weekday

In [None]:
# Keep only the modelling columns: drop the row id, the raw timestamp,
# the three capacity columns and precipitation.
test_columns_to_drop = [
    'num_date_time',
    '일시',
    '태양광용량',
    'ESS저장용량',
    'PCS용량',
    '강수량',
]
test_df2 = test_df.drop(columns=test_columns_to_drop)

- test는 결측치가 없기 때문에 impute 안해도됨.

## Append Value

In [None]:
# Apparent (wind-chill) temperature (℃)
# NOTE(review): this formula is normally stated for wind speed in km/h — confirm
# the unit of '풍속'. The training cell uses the same expression and must stay in sync.
test_temp = test_df2['기온']
test_wind_factor = test_df2['풍속']**0.16
test_df2['체감온도'] = (
    13.12
    + 0.6215*test_temp
    - 11.37*test_wind_factor
    + 0.3965*test_wind_factor*test_temp
)

In [None]:
# Saturation vapor pressure as a function of temperature (mb)
test_temp = test_df2['기온']
test_svp_exponent = (7.5*test_temp)/(237.3 + test_temp)
test_df2['포화수증기압'] = 6.11*10**test_svp_exponent

In [None]:
# Actual vapor pressure of the air (mb): relative humidity (%) times saturation vapor pressure.
# The column keeps its existing name '대기압'.
test_humidity_weighted = test_df2['습도'] * test_df2['포화수증기압']
test_df2['대기압'] = test_humidity_weighted / 100

In [None]:
# Absolute humidity (g/m^3), from the vapor pressure column and temperature.
test_abs_hum_numerator = 0.794*test_df2['대기압']
test_abs_hum_denominator = 1+0.00366*test_df2['기온']
test_df2['절대습도'] = test_abs_hum_numerator/test_abs_hum_denominator

In [None]:
# Mixing ratio of the air (disabled feature)
# test_df2['혼합비'] = 0.622 * test_df2['대기압']/(1013.25 - test_df2['대기압'])

In [None]:
# Wet-bulb temperature from air temperature and relative humidity.
# NOTE(review): Stull's (2011) empirical formula multiplies 0.151977 by the
# square root, while this code adds them — confirm whether that is intended.
# The training cell uses the same expression, so both must change together.
test_temp = test_df2['기온']
test_humidity = test_df2['습도']
test_wb_temp_term = test_temp*np.arctan(0.151977+(test_humidity + 8.313659)**0.5)
test_wb_sum_term = np.arctan(test_temp + test_humidity)
test_wb_humidity_term = np.arctan(test_humidity - 1.676331)
test_wb_power_term = 0.00391838*(test_humidity**1.5)*np.arctan(0.023101*test_humidity)
test_df2['습구온도'] = (
    test_wb_temp_term
    + test_wb_sum_term
    - test_wb_humidity_term
    + test_wb_power_term
    - 4.686035
)

In [None]:
# Discomfort index from temperature and relative humidity.
test_scaled_temp = 9/5*test_df2['기온']
test_dryness = 1-0.01*test_df2['습도']
test_df2['불쾌지수'] = (
    test_scaled_temp
    - 0.55*test_dryness*(test_scaled_temp - 26)
    + 32
)

In [None]:
# Perceived-heat index: quadratic polynomial in wet-bulb temperature and air temperature.
test_wet_bulb = test_df2['습구온도']
test_temp = test_df2['기온']
test_df2['더위체감지수'] = (
    -0.24418
    + 0.553991*test_wet_bulb
    + 0.455346*test_temp
    - 0.00217*test_wet_bulb**2
    + 0.002782*test_wet_bulb*test_temp
)

In [None]:
# Heat index (disabled feature; polynomial in Fahrenheit temperature and humidity, converted back with 5/9*(... - 32))
# test_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*test_df2['기온']+32)) + (10.14333127 * test_df2['습도']) - (0.22475541 * ((9/5)*test_df2['기온']+32)*test_df2['습도']) - (6.83783e-3 * ((9/5)*test_df2['기온']+32)**2) - (5.481717e-2 * test_df2['습도']**2) + (1.22874e-3 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']) + (8.5282e-4 * ((9/5)*test_df2['기온']+32)*test_df2['습도']**2) - (1.99e-6 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']**2) - 32)

## Inference

In [None]:
# Predict on the test features, explicitly aligned to the training feature set.
# train_x's columns are in sorted order while test_df2 keeps the raw column order,
# so select by name here instead of relying on the library to realign them, and
# fail early with a clear message if a training feature was not built for the test set.
missing_test_features = [
    column for column in train_x.columns if column not in test_df2.columns
]
if missing_test_features:
    raise ValueError(
        f"test_df2 is missing features used for training: {missing_test_features}"
    )
preds = automl.predict(test_df2[train_x.columns])

## Submission

In [None]:
# Load the sample submission file that the predictions will be written into.
sample_submission_path = '../data/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission

In [None]:
# Invert the power transform (target ** boxcox) to get back to the original scale.
# Predictions are clipped at 0 first: the transformed target is non-negative, and
# with boxcox = 0.25 the inverse is an even power (4), which would otherwise turn a
# slightly negative prediction into a positive value instead of ~0.
nonnegative_preds = np.clip(preds, 0, None)
submission['answer'] = nonnegative_preds**(1/boxcox)
submission

In [None]:
# Write the submission file without the index column.
# NOTE(review): the file name says "24h" while the AutoML results path says "12h"
# and the configured time budget is 6 hours — confirm the intended label.
submission_output_path = '../data/select_value_mae_extend_24h_submission.csv'
submission.to_csv(submission_output_path, index=False)