## Import

In [1]:
# preprocessing
import numpy as np
import pandas as pd
import tqdm
import random
import os

# imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# model learning
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.utils.class_weight import compute_sample_weight

# 평가 지표
from sklearn.metrics import mean_absolute_error

# 모델 저장
import pickle

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
building_info = pd.read_csv('../data/building_info.csv')

In [4]:
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['건물유형'] = building_info['건물유형'].replace(translation_dict)
# building_info.drop('Unnamed: 0', axis = 1 , inplace=True)

## Encoding

In [5]:
# one hot encoding 생성
building_info = pd.get_dummies(building_info, columns=['건물유형'], drop_first=True)

## Merge building info

In [6]:
building_info['태양광용량(kW)'][building_info['태양광용량(kW)'] == '-'] = np.nan
building_info['ESS저장용량(kWh)'][building_info['ESS저장용량(kWh)'] == '-'] = np.nan
building_info['PCS용량(kW)'][building_info['PCS용량(kW)'] == '-'] = np.nan

In [7]:
train_df = pd.merge(train_df, building_info, on='건물번호', how='left')
test_df = pd.merge(test_df, building_info, on='건물번호', how='left')

## Train Data Pre-Processing

In [8]:
train_df.isna().sum()

num_date_time                            0
건물번호                                     0
일시                                       0
기온(C)                                    0
강수량(mm)                             160069
풍속(m/s)                                 19
습도(%)                                    9
일조(hr)                               75182
일사(MJ/m2)                            87913
전력소비량(kWh)                               0
연면적(m2)                                  0
냉방면적(m2)                                 0
태양광용량(kW)                           130560
ESS저장용량(kWh)                        193800
PCS용량(kW)                           193800
건물유형_Commercial                          0
건물유형_Data Center                         0
건물유형_Department Store and Outlet         0
건물유형_Discount Mart                       0
건물유형_Hospital                            0
건물유형_Hotel and Resort                    0
건물유형_Knowledge Industry Center           0
건물유형_Other Buildings                     0
건물유형_Public

- 결측치가 시급하다. 아무리봐도 처리 해야할 것 같다.

In [9]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['월'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['일'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['시'] = train_df['일시'].apply(lambda x : int(x[9:11]))

In [10]:
train_df['일시'] = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')

In [11]:
# 요일 추가 (0이 월요일 6이 일요일)
train_df['주'] = train_df['일시'].dt.dayofweek

In [12]:
# 변수 정리
train_df2 = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)',
                                 '태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)', '강수량(mm)'])

In [13]:
# 강수량 결측치는 0으로 보정 -> 비 안왔을것이라 예상하고 진행함
# train_df2['강수량(mm)'][train_df2['강수량(mm)'].isna()] = 0

In [14]:
# imputation for mice
imputer_mice = IterativeImputer(random_state=42)
imputer_mice.fit(train_df2)

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
train_df2 = pd.DataFrame(imputer_mice.transform(train_df2),
                         columns=train_df2.columns)

## Change Name

In [15]:
train_df2.columns

Index(['건물번호', '기온(C)', '풍속(m/s)', '습도(%)', '전력소비량(kWh)', '연면적(m2)',
       '냉방면적(m2)', '건물유형_Commercial', '건물유형_Data Center',
       '건물유형_Department Store and Outlet', '건물유형_Discount Mart',
       '건물유형_Hospital', '건물유형_Hotel and Resort',
       '건물유형_Knowledge Industry Center', '건물유형_Other Buildings', '건물유형_Public',
       '건물유형_Research Institute', '건물유형_University', '월', '일', '시', '주'],
      dtype='object')

In [16]:
train_df2.columns = ['건물번호', '기온', '풍속', '습도', '전력소비량', '연면적',
       '냉방면적', 'Commercial', 'Data Center',
       'Department Store and Outlet', 'Discount Mart',
       'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University', '월', '일', '시', '주']

## Append Value

In [17]:
train_df2.isna().sum()

건물번호                           0
기온                             0
풍속                             0
습도                             0
전력소비량                          0
연면적                            0
냉방면적                           0
Commercial                     0
Data Center                    0
Department Store and Outlet    0
Discount Mart                  0
Hospital                       0
Hotel and Resort               0
Knowledge Industry Center      0
Other Buildings                0
Public                         0
Research Institute             0
University                     0
월                              0
일                              0
시                              0
주                              0
dtype: int64

In [18]:
# 체감온도(℃)
train_df2['체감온도'] = 13.12 + 0.6215*train_df2['기온'] - 11.37*(train_df2['풍속']**0.16) + 0.3965*(train_df2['풍속']**0.16)*(train_df2['기온'])

In [19]:
# 온도에 따른 포화 수증기압(mb)
train_df2['포화수증기압'] = 6.11*10**((7.5*train_df2['기온'])/(237.3 + train_df2['기온']))

In [20]:
# 현재 대기의 수증기압(mb)
train_df2['대기압'] = train_df2['습도'] * train_df2['포화수증기압'] / 100

In [21]:
# 절대습도(g/m^3)
train_df2['절대습도'] = (0.794*train_df2['대기압'])/(1+0.00366*train_df2['기온'])

In [22]:
# 공기중의 혼합비(mb)
train_df2['혼합비'] = 0.622 * train_df2['대기압']/(1013.25 - train_df2['대기압'])

In [23]:
# 습구온도
train_df2['습구온도'] = train_df2['기온']*np.arctan(0.151977+(train_df2['습도'] + 8.313659)**0.5) + np.arctan(train_df2['기온'] + train_df2['습도']) - np.arctan(train_df2['습도'] - 1.676331) +0.00391838*(train_df2['습도']**1.5)*np.arctan(0.023101*train_df2['습도'])-4.686035

In [24]:
# 불쾌지수
train_df2['불쾌지수'] = 9/5*train_df2['기온'] - 0.55*(1-0.01*train_df2['습도'])*(9/5*train_df2['기온'] - 26) + 32

In [25]:
# 더위체감지수
train_df2['더위체감지수'] = -0.24418 + 0.553991*train_df2['습구온도'] + 0.455346*train_df2['기온'] - 0.00217*train_df2['습구온도']**2 + 0.002782*train_df2['습구온도']*train_df2['기온']

In [26]:
# 열지수
train_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*train_df2['기온']+32)) + (10.14333127 * train_df2['습도']) - (0.22475541 * ((9/5)*train_df2['기온']+32)*train_df2['습도']) - (6.83783e-3 * ((9/5)*train_df2['기온']+32)**2) - (5.481717e-2 * train_df2['습도']**2) + (1.22874e-3 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']) + (8.5282e-4 * ((9/5)*train_df2['기온']+32)*train_df2['습도']**2) - (1.99e-6 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']**2) - 32)

In [27]:
# 훈련 및 테스트 데이터 셋
train_x = train_df2[train_df2.columns.difference(['전력소비량'])]
train_y = train_df2['전력소비량']

In [28]:
train_x.columns

Index(['Commercial', 'Data Center', 'Department Store and Outlet',
       'Discount Mart', 'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University', '건물번호', '기온', '냉방면적', '대기압',
       '더위체감지수', '불쾌지수', '습구온도', '습도', '시', '연면적', '열지수', '월', '일', '절대습도',
       '주', '체감온도', '포화수증기압', '풍속', '혼합비'],
      dtype='object')

In [29]:
train_x.columns = ['Commercial', 'Data Center', 'Department Store and Outlet',
       'Discount Mart', 'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University','buildnum', 'temperature', 'icearea', 'air', 'hotskinindex', 'madindex', 'humidtemp',
       'humid', 'hh', 'area', 'heatindex', 'mm', 'dd', 'abshumid', 'week', 'skinindex', 'humidairpressure', 'wind',
       'mixture']

In [30]:
train_y.name = 'power'

In [31]:
train_y_root_twice = train_y**0.25

## Regression Model Fit

In [32]:
# smape
def smape(y_true, y_predicted, sample_weight=None):
    score = 100 / len(y_true) * np.sum(np.abs(y_predicted - y_true) / (np.abs(y_true) + np.abs(y_predicted)))
    return score

In [33]:
AutoML

supervised.automl.AutoML

In [34]:
# automl modeling
automl = AutoML(mode="Compete",
                total_time_limit = 3600,
                algorithms=["Baseline",
                            "CatBoost",
                            "Xgboost",
                            "Random Forest",
                            "Extra Trees",
                            "LightGBM",
                            "Neural Network",
                            "CatBoost"],
                ml_task = "regression",
                eval_metric = 'mae',
                random_state = 42,
                results_path = '../model/AutoML_UpValue/'
               )

In [35]:
# fitting
automl.fit(train_x, train_y_root_twice)

AutoML directory: ../model/AutoML_UpValue/
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 0.692193 trained in 2.05 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline mae 1.00311 trained in 5.78 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM mae 0.059964 trained in 1863.55 seconds
Skip not_so_ra

AutoML(algorithms=['Baseline', 'CatBoost', 'Xgboost', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Neural Network', 'CatBoost'],
       eval_metric='mae', ml_task='regression', mode='Compete', random_state=42,
       results_path='../model/AutoML_UpValue/')

## Test Data Pre-Processing

In [36]:
test_df['월'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['일'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['시'] = test_df['일시'].apply(lambda x : int(x[9:11]))

In [37]:
test_df['일시'] = pd.to_datetime(test_df['일시'], format='%Y%m%d %H')

In [38]:
# 요일 추가 (0이 월요일 6이 일요일)
test_df['주'] = test_df['일시'].dt.dayofweek

In [39]:
# 변수 정리
test_df2 = test_df.drop(columns=['num_date_time', '일시',
                                 '태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)','강수량(mm)'])

- test는 결측치가 없기 때문에 impute 안해도됨.

## Change Name

In [40]:
test_df2.columns

Index(['건물번호', '기온(C)', '풍속(m/s)', '습도(%)', '연면적(m2)', '냉방면적(m2)',
       '건물유형_Commercial', '건물유형_Data Center',
       '건물유형_Department Store and Outlet', '건물유형_Discount Mart',
       '건물유형_Hospital', '건물유형_Hotel and Resort',
       '건물유형_Knowledge Industry Center', '건물유형_Other Buildings', '건물유형_Public',
       '건물유형_Research Institute', '건물유형_University', '월', '일', '시', '주'],
      dtype='object')

In [41]:
test_df2.columns = ['건물번호', '기온', '풍속', '습도', '연면적', '냉방면적',
       'Commercial', 'Data Center',
       'Department Store and Outlet', 'Discount Mart',
       'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University', '월', '일', '시', '주']

## Append Value

In [42]:
# 체감온도(℃)
test_df2['체감온도'] = 13.12 + 0.6215*test_df2['기온'] - 11.37*(test_df2['풍속']**0.16) + 0.3965*(test_df2['풍속']**0.16)*(test_df2['기온'])

In [43]:
# 온도에 따른 포화 수증기압(mb)
test_df2['포화수증기압'] = 6.11*10**((7.5*test_df2['기온'])/(237.3 + test_df2['기온']))

In [44]:
# 현재 대기의 수증기압(mb)
test_df2['대기압'] = test_df2['습도'] * test_df2['포화수증기압'] / 100

In [45]:
# 절대습도(g/m^3)
test_df2['절대습도'] = (0.794*test_df2['대기압'])/(1+0.00366*test_df2['기온'])

In [46]:
# 공기중의 혼합비(mb)
test_df2['혼합비'] = 0.622 * test_df2['대기압']/(1013.25 - test_df2['대기압'])

In [47]:
# 습구온도
test_df2['습구온도'] = test_df2['기온']*np.arctan(0.151977+(test_df2['습도'] + 8.313659)**0.5) + np.arctan(test_df2['기온'] + test_df2['습도']) - np.arctan(test_df2['습도'] - 1.676331) +0.00391838*(test_df2['습도']**1.5)*np.arctan(0.023101*test_df2['습도'])-4.686035

In [48]:
# 불쾌지수
test_df2['불쾌지수'] = 9/5*test_df2['기온'] - 0.55*(1-0.01*test_df2['습도'])*(9/5*test_df2['기온'] - 26) + 32

In [49]:
# 더위체감지수
test_df2['더위체감지수'] = -0.24418 + 0.553991*test_df2['습구온도'] + 0.455346*test_df2['기온'] - 0.00217*test_df2['습구온도']**2 + 0.002782*test_df2['습구온도']*test_df2['기온']

In [50]:
# 열지수
test_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*test_df2['기온']+32)) + (10.14333127 * test_df2['습도']) - (0.22475541 * ((9/5)*test_df2['기온']+32)*test_df2['습도']) - (6.83783e-3 * ((9/5)*test_df2['기온']+32)**2) - (5.481717e-2 * test_df2['습도']**2) + (1.22874e-3 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']) + (8.5282e-4 * ((9/5)*test_df2['기온']+32)*test_df2['습도']**2) - (1.99e-6 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']**2) - 32)

In [51]:
test_x = test_df2[['Commercial', 'Data Center', 'Department Store and Outlet',
       'Discount Mart', 'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University', '건물번호', '기온', '냉방면적', '대기압',
       '더위체감지수', '불쾌지수', '습구온도', '습도', '시', '연면적', '열지수', '월', '일', '절대습도',
       '주', '체감온도', '포화수증기압', '풍속', '혼합비']]

In [52]:
test_x.columns

Index(['Commercial', 'Data Center', 'Department Store and Outlet',
       'Discount Mart', 'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University', '건물번호', '기온', '냉방면적', '대기압',
       '더위체감지수', '불쾌지수', '습구온도', '습도', '시', '연면적', '열지수', '월', '일', '절대습도',
       '주', '체감온도', '포화수증기압', '풍속', '혼합비'],
      dtype='object')

In [53]:
test_x.columns = ['Commercial', 'Data Center', 'Department Store and Outlet',
       'Discount Mart', 'Hospital', 'Hotel and Resort',
       'Knowledge Industry Center', 'Other Buildings', 'Public',
       'Research Institute', 'University','buildnum', 'temperature', 'icearea', 'air', 'hotskinindex', 'madindex', 'humidtemp',
       'humid', 'hh', 'area', 'heatindex', 'mm', 'dd', 'abshumid', 'week', 'skinindex', 'humidairpressure', 'wind',
       'mixture']

## Inference

In [54]:
preds = automl.predict(test_x)

## Submission

In [55]:
submission = pd.read_csv('../data/sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [56]:
submission['answer'] = preds**4
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2090.829482
1,1_20220825 01,2150.440273
2,1_20220825 02,1994.058556
3,1_20220825 03,1806.047785
4,1_20220825 04,1779.849622
...,...,...
16795,100_20220831 19,907.067036
16796,100_20220831 20,862.938483
16797,100_20220831 21,781.049005
16798,100_20220831 22,665.198841


In [57]:
submission.to_csv('../data/upvalue_mape_norain_submission.csv', index=False)