## Import

In [1]:
# preprocessing
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook  # `import tqdm` alone does not load the notebook submodule used below
import random
import os

# imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# model learning
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.utils.class_weight import compute_sample_weight
import statsmodels.api as sm

# evaluation metric
from sklearn.metrics import mean_absolute_error

# model saving
import pickle

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Sets the PYTHONHASHSEED environment variable and seeds both Python's
    `random` module and NumPy's global random generator with `seed`.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

## Load Data

In [3]:
# Read the train / test measurements and the per-building metadata.
train_df, test_df, building_info = map(
    pd.read_csv,
    ('../data/train.csv', '../data/test.csv', '../data/building_info.csv'),
)

## Change Names

In [4]:
#translation_dict = {
#    '건물기타': 'Other Buildings',
#    '공공': 'Public',
#    '대학교': 'University',
#    '데이터센터': 'Data Center',
#    '백화점및아울렛': 'Department Store and Outlet',
#    '병원': 'Hospital',
#    '상용': 'Commercial',
#    '아파트': 'Apartment',
#    '연구소': 'Research Institute',
#    '지식산업센터': 'Knowledge Industry Center',
#    '할인마트': 'Discount Mart',
#    '호텔및리조트': 'Hotel and Resort'
#}

#building_info['건물유형'] = building_info['건물유형'].replace(translation_dict)
# building_info.drop('Unnamed: 0', axis = 1 , inplace=True)

In [5]:
# Replace the training column labels. Assignment is positional, so this list
# must stay in the same order as the columns of train.csv.
train_column_names = [
    'num_date_time', '건물번호', '일시',
    '기온', '강수량', '풍속', '습도', '일조', '일사',
    '전력소비량',
]
train_df.columns = train_column_names

In [6]:
# Replace the test column labels (positional; the test set has no sunshine,
# solar-radiation or target columns).
test_column_names = [
    'num_date_time', '건물번호', '일시',
    '기온', '강수량', '풍속', '습도',
]
test_df.columns = test_column_names

In [7]:
# Replace the building-metadata column labels (positional assignment).
building_column_names = [
    '건물번호', '건물유형',
    '연면적', '냉방면적',
    '태양광용량', 'ESS저장용량', 'PCS용량',
]
building_info.columns = building_column_names

## Encoding

In [8]:
# one hot encoding 생성
#building_info = pd.get_dummies(building_info, columns=['건물유형'], drop_first=True)

In [9]:
# LabelEncoder를 객체로 생성
#encoder = LabelEncoder()

# fit, transform 메소드를 통한 레이블 인코딩
#encoder.fit(building_info['건물유형'])
#building_info['건물유형'] = encoder.transform(building_info['건물유형'])

## Merge building info

In [10]:
#building_info['태양광용량'][building_info['태양광용량'] == '-'] = np.nan
#building_info['ESS저장용량'][building_info['ESS저장용량'] == '-'] = np.nan
#building_info['PCS용량'][building_info['PCS용량'] == '-'] = np.nan

In [11]:
# 태양광 사용하는지 마는지 여부
#building_info['태양광사용여부'] = 0

In [12]:
#building_info['태양광사용여부'][(building_info['태양광용량'].notnull()) | (building_info['ESS저장용량'].notnull()) | (building_info['PCS용량'].notnull())] = 1

In [13]:
# Attach the building metadata to every measurement row; a left join keeps all
# measurement rows, matched on the building number.
train_df = train_df.merge(building_info, on='건물번호', how='left')
test_df = test_df.merge(building_info, on='건물번호', how='left')

## Train Data Pre-Processing

In [14]:
# Number of missing values in each column of the merged training frame.
train_df.isnull().sum()

num_date_time         0
건물번호                  0
일시                    0
기온                    0
강수량              160069
풍속                   19
습도                    9
일조                75182
일사                87913
전력소비량                 0
건물유형                  0
연면적                   0
냉방면적                  0
태양광용량                 0
ESS저장용량               0
PCS용량                 0
dtype: int64

- 결측치 처리가 시급하다. 강수량·일조·일사에 결측치가 특히 많아, 아무리 봐도 처리해야 할 것 같다.

In [15]:
# Split the timestamp into month and hour so the model can use the time-series
# structure of the data.
# The values are parsed with pd.to_datetime instead of slicing the raw
# 'YYYYMMDD HH' string, so this cell still works when it is re-run after '일시'
# has already been converted to datetime64 in the next cell.
train_timestamps = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')
train_df['월'] = train_timestamps.dt.month.astype('int64')
# train_df['일'] = train_timestamps.dt.day.astype('int64')  # day-of-month feature (disabled)
train_df['시'] = train_timestamps.dt.hour.astype('int64')

In [16]:
# Convert the 'YYYYMMDD HH' strings in '일시' into timestamps.
timestamp_format = '%Y%m%d %H'
train_df['일시'] = pd.to_datetime(train_df['일시'], format=timestamp_format)

In [17]:
# Day of the week as an integer feature (0 = Monday ... 6 = Sunday)
train_df['주'] = train_df['일시'].dt.weekday

In [18]:
#train_df['일조'][(train_df['일조'].isna()) & ((train_df['시'] <= 5) | (train_df['시'] >= 21))] = 0
#train_df['일사'][(train_df['일사'].isna()) & ((train_df['시'] <= 5) | (train_df['시'] >= 21))] = 0

In [19]:
# Fill the missing wind-speed and humidity readings with 0.
# The filled column is assigned back to the frame instead of using chained
# indexing (train_df[col][mask] = 0), which raises SettingWithCopyWarning and
# silently leaves train_df unchanged when pandas Copy-on-Write is enabled.
for weather_col in ('풍속', '습도'):
    train_df[weather_col] = train_df[weather_col].fillna(0)

In [20]:
# imputation for mice
#imputer_mice = IterativeImputer(random_state=42)
#imputer_mice.fit(train_df[['기온', '풍속', '습도', '일조', '일사']])

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
#train_df[['기온', '풍속', '습도', '일조', '일사']] = pd.DataFrame(imputer_mice.transform(train_df[['기온', '풍속', '습도', '일조', '일사']]),
#                         columns=['기온', '풍속', '습도', '일조', '일사'])

In [21]:
# 숫자 보정
#train_df['일조'][train_df['일조'] < 0] = 0
#train_df['일사'][train_df['일사'] < 0] = 0

In [22]:
# Keep only the columns used for modelling: drop the row id, the raw timestamp,
# the solar/ESS/PCS capacity columns, precipitation, sunshine, solar radiation
# and the building type.
unused_train_columns = [
    'num_date_time', '일시',
    '태양광용량', 'ESS저장용량', 'PCS용량',
    '강수량', '일조', '일사',
    '건물유형',
]
train_df2 = train_df.drop(unused_train_columns, axis=1)

In [23]:
# 강수량 결측치는 0으로 보정 -> 비 안왔을것이라 예상하고 진행함
# train_df2['강수량(mm)'][train_df2['강수량(mm)'].isna()] = 0

In [24]:
# imputation for mice
#imputer_mice = IterativeImputer(random_state=42)
#imputer_mice.fit(train_df2)

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
#train_df2 = pd.DataFrame(imputer_mice.transform(train_df2),
#                         columns=train_df2.columns)

In [25]:
# 그냥 0으로 채우기
#train_df2[train_df2.isna()] = 0

## Append Value

In [26]:
# 온도에 따른 포화 수증기압(mb)
#train_df2['포화수증기압'] = 6.11*10**((7.5*train_df2['기온'])/(237.3 + train_df2['기온']))

In [27]:
# 현재 대기의 수증기압(mb)
#train_df2['대기압'] = train_df2['습도'] * train_df2['포화수증기압'] / 100

In [28]:
# 절대습도(g/m^3)
#train_df2['절대습도'] = (0.794*train_df2['대기압'])/(1+0.00366*train_df2['기온'])

In [29]:
# 공기중의 혼합비(mb)
#train_df2['혼합비'] = 0.622 * train_df2['대기압']/(1013.25 - train_df2['대기압'])

In [30]:
# 습구온도
#train_df2['습구온도'] = train_df2['기온']*np.arctan(0.151977+(train_df2['습도'] + 8.313659)**0.5) + np.arctan(train_df2['기온'] + train_df2['습도']) - np.arctan(train_df2['습도'] - 1.676331) +0.00391838*(train_df2['습도']**1.5)*np.arctan(0.023101*train_df2['습도'])-4.686035

In [31]:
# 불쾌지수
#train_df2['불쾌지수'] = 9/5*train_df2['기온'] - 0.55*(1-0.01*train_df2['습도'])*(9/5*train_df2['기온'] - 26) + 32

In [32]:
# 더위체감지수
#train_df2['더위체감지수'] = -0.24418 + 0.553991*train_df2['습구온도'] + 0.455346*train_df2['기온'] - 0.00217*train_df2['습구온도']**2 + 0.002782*train_df2['습구온도']*train_df2['기온'] + 3

In [33]:
# np.select
#condlist = [
#    (train_df2['더위체감지수'] < 21),
#    (train_df2['더위체감지수'] < 25) & (train_df2['더위체감지수'] >= 21),
#    (train_df2['더위체감지수'] < 28) & (train_df2['더위체감지수'] >= 25),
#    (train_df2['더위체감지수'] < 31) & (train_df2['더위체감지수'] >= 28),
#    (train_df2['더위체감지수'] >= 31)
#            ]

#choicelist = [0, 1, 2, 3, 4]

In [34]:
# 범주형 체감지수
#train_df2['더위체감지수_범주'] = np.select(condlist, choicelist)

In [35]:
# 열지수
#train_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*train_df2['기온']+32)) + (10.14333127 * train_df2['습도']) - (0.22475541 * ((9/5)*train_df2['기온']+32)*train_df2['습도']) - (6.83783e-3 * ((9/5)*train_df2['기온']+32)**2) - (5.481717e-2 * train_df2['습도']**2) + (1.22874e-3 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']) + (8.5282e-4 * ((9/5)*train_df2['기온']+32)*train_df2['습도']**2) - (1.99e-6 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']**2) - 32)

In [36]:
# 평균냉방면적
#train_df2['평균냉방면적'] = train_df2['냉방면적']/train_df2['연면적']

In [37]:
# onehot 주
#train_df2 = pd.get_dummies(train_df2, columns=['주'], drop_first=True)

In [38]:
# Split the training frame into features and target.
# Index.difference returns the remaining labels sorted, which fixes the column
# order of train_x.
target_col = '전력소비량'
feature_cols = train_df2.columns.difference([target_col, '냉방면적', '연면적'])
train_x = train_df2[feature_cols]
train_y = train_df2[target_col]

## BoxCox (Power Transform of the Target)

In [39]:
# Exponent of the power transform applied to the target: y -> y ** boxcox.
# The same constant is used to invert the transform when building the submission.
boxcox = 0.21

In [40]:
# Apply the power transform to the target.
train_y_boxcox = train_y.pow(boxcox)

## Test Data Pre-Processing

In [43]:
# Derive the same month / hour features as for the training data.
# Parsing with pd.to_datetime (rather than slicing the raw 'YYYYMMDD HH' string)
# keeps this cell re-runnable after '일시' has been converted to datetime64 in
# the next cell.
test_timestamps = pd.to_datetime(test_df['일시'], format='%Y%m%d %H')
test_df['월'] = test_timestamps.dt.month.astype('int64')
# test_df['일'] = test_timestamps.dt.day.astype('int64')  # day-of-month feature (disabled)
test_df['시'] = test_timestamps.dt.hour.astype('int64')

In [44]:
# Convert the 'YYYYMMDD HH' strings in '일시' into timestamps.
timestamp_format = '%Y%m%d %H'
test_df['일시'] = pd.to_datetime(test_df['일시'], format=timestamp_format)

In [45]:
# Day of the week as an integer feature (0 = Monday ... 6 = Sunday)
test_df['주'] = test_df['일시'].dt.weekday

In [46]:
# 없는 변수 추가하기
#test_df['일조'] = np.nan
#test_df['일사'] = np.nan

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
#test_df[['기온', '풍속', '습도', '일조', '일사']] = pd.DataFrame(imputer_mice.transform(test_df[['기온', '풍속', '습도', '일조', '일사']]),
#                         columns=['기온', '풍속', '습도', '일조', '일사'])

In [47]:
# 숫자 보정
#test_df['일조'][test_df['일조'] < 0] = 0
#test_df['일사'][test_df['일사'] < 0] = 0

In [48]:
# Keep only the columns used for prediction: drop the row id, the raw timestamp,
# the solar/ESS/PCS capacity columns, precipitation, the building type and both
# floor-area columns.
unused_test_columns = [
    'num_date_time', '일시',
    '태양광용량', 'ESS저장용량', 'PCS용량',
    '강수량', '건물유형',
    '연면적', '냉방면적',
]
test_df2 = test_df.drop(unused_test_columns, axis=1)

## Append Value

In [49]:
# 온도에 따른 포화 수증기압(mb)
#test_df2['포화수증기압'] = 6.11*10**((7.5*test_df2['기온'])/(237.3 + test_df2['기온']))

In [50]:
# 현재 대기의 수증기압(mb)
#test_df2['대기압'] = test_df2['습도'] * test_df2['포화수증기압'] / 100

In [51]:
# 절대습도(g/m^3)
#test_df2['절대습도'] = (0.794*test_df2['대기압'])/(1+0.00366*test_df2['기온'])

In [52]:
# 공기중의 혼합비(mb)
#test_df2['혼합비'] = 0.622 * test_df2['대기압']/(1013.25 - test_df2['대기압'])

In [53]:
# 습구온도
#test_df2['습구온도'] = test_df2['기온']*np.arctan(0.151977+(test_df2['습도'] + 8.313659)**0.5) + np.arctan(test_df2['기온'] + test_df2['습도']) - np.arctan(test_df2['습도'] - 1.676331) +0.00391838*(test_df2['습도']**1.5)*np.arctan(0.023101*test_df2['습도'])-4.686035

In [54]:
# 불쾌지수
#test_df2['불쾌지수'] = 9/5*test_df2['기온'] - 0.55*(1-0.01*test_df2['습도'])*(9/5*test_df2['기온'] - 26) + 32

In [55]:
# 더위체감지수
#test_df2['더위체감지수'] = -0.24418 + 0.553991*test_df2['습구온도'] + 0.455346*test_df2['기온'] - 0.00217*test_df2['습구온도']**2 + 0.002782*test_df2['습구온도']*test_df2['기온'] + 3

In [56]:
# np.select
#condlist = [
#    (test_df2['더위체감지수'] < 21),
#    (test_df2['더위체감지수'] < 25) & (test_df2['더위체감지수'] >= 21),
#    (test_df2['더위체감지수'] < 28) & (test_df2['더위체감지수'] >= 25),
#    (test_df2['더위체감지수'] < 31) & (test_df2['더위체감지수'] >= 28),
#    (test_df2['더위체감지수'] >= 31)
#           ]

#choicelist = [0, 1, 2, 3, 4]

In [57]:
# 범주형 체감지수
#test_df2['더위체감지수_범주'] = np.select(condlist, choicelist)

In [58]:
# 열지수
#test_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*test_df2['기온']+32)) + (10.14333127 * test_df2['습도']) - (0.22475541 * ((9/5)*test_df2['기온']+32)*test_df2['습도']) - (6.83783e-3 * ((9/5)*test_df2['기온']+32)**2) - (5.481717e-2 * test_df2['습도']**2) + (1.22874e-3 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']) + (8.5282e-4 * ((9/5)*test_df2['기온']+32)*test_df2['습도']**2) - (1.99e-6 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']**2) - 32)

In [59]:
# 평균냉방면적
#test_df2['평균냉방면적'] = test_df2['냉방면적']/test_df2['연면적']

In [60]:
# onehot 주
#test_df2 = pd.get_dummies(test_df2, columns=['주'], drop_first=True)

In [61]:
# test_df2 = std_scaler.fit_transform(test_df2)

In [62]:
# 훈련 및 테스트 데이터 셋
#test_df2 = test_df2[test_df2.columns.difference(['냉방면적', '연면적', '습도', '월', '건물번호', '기온'])]

In [63]:
# Preview the prepared test features.
test_df2

Unnamed: 0,건물번호,기온,풍속,습도,월,시,주
0,1,23.5,2.2,72,8,0,3
1,1,23.0,0.9,72,8,1,3
2,1,22.7,1.5,75,8,2,3
3,1,22.1,1.3,78,8,3,3
4,1,21.8,1.0,77,8,4,3
...,...,...,...,...,...,...,...
16795,100,22.5,0.9,84,8,19,2
16796,100,20.7,0.4,95,8,20,2
16797,100,20.2,0.4,98,8,21,2
16798,100,20.1,1.1,97,8,22,2


In [64]:
# Preview the training feature matrix.
train_x

Unnamed: 0,건물번호,기온,습도,시,월,주,풍속
0,1,18.6,42.0,0,6,2,0.9
1,1,18.0,45.0,1,6,2,1.1
2,1,17.7,45.0,2,6,2,1.5
3,1,16.7,48.0,3,6,2,1.4
4,1,18.4,43.0,4,6,2,2.8
...,...,...,...,...,...,...,...
203995,100,23.1,86.0,19,8,2,0.9
203996,100,22.4,86.0,20,8,2,1.3
203997,100,21.3,92.0,21,8,2,1.0
203998,100,21.0,94.0,22,8,2,0.3


## Regression Model Fit by Building

In [66]:
# Fit one AutoML model per building and save that building's test predictions.
# NOTE: this run only covers buildings 53-70, while the "Data Collecting" cell
# reads split_build1.csv ... split_build100.csv, so the files for the other
# buildings have to be produced by running this cell with other ranges.
model_feature_cols = ['기온', '습도', '시', '월', '주', '풍속']

# Create the prediction folder up front so a finished fit (about an hour per
# building in the logged run) is not lost to a missing-directory error on save.
os.makedirs('../data/split_build', exist_ok=True)

for i in tqdm.notebook.tqdm(range(53, 71)):
    # Select the rows of building i (plain locals instead of names generated
    # through globals()).
    building_mask = train_x['건물번호'] == i
    building_train_x = train_x.loc[building_mask, model_feature_cols]
    building_train_y = train_y_boxcox[building_mask]
    building_test_x = test_df2.loc[test_df2['건물번호'] == i, model_feature_cols]

    # automl modeling; each building gets its own results directory
    automl = AutoML(mode="Compete",
                    algorithms=["Baseline",
                                "CatBoost",
                                "Xgboost",
                                "Random Forest",
                                "Extra Trees",
                                "LightGBM",
                                "Neural Network"],
                    ml_task="regression",
                    eval_metric='mae',
                    random_state=42,
                    results_path='../model/split_build/' + str(i) + '/'
                    )

    # fitting (the target is on the power-transformed scale)
    automl.fit(building_train_x, building_train_y)

    # predict
    building_pred = automl.predict(building_test_x)

    # Save right away so completed buildings survive an interrupted run.
    pd.DataFrame(building_pred).to_csv('../data/split_build/split_build{}.csv'.format(i), index=False)
    

  0%|          | 0/18 [00:00<?, ?it/s]

AutoML directory: ../model/split_build/53/
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 0.170836 trained in 0.34 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline mae 0.558141 trained in 1.56 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM mae 0.100976 trained in 8.68 seconds
3_Default_Xgboost mae 0.0935

28_CatBoost_GoldenFeatures_BoostOnErrors mae 0.083144 trained in 116.09 seconds
* Step ensemble will try to check up to 1 model
Ensemble mae 0.080733 trained in 33.46 seconds
* Step stack will try to check up to 59 models
28_CatBoost_GoldenFeatures_Stacked mae 0.078387 trained in 269.7 seconds
94_Xgboost_Stacked mae 0.078147 trained in 25.59 seconds
70_LightGBM_Stacked mae 0.077446 trained in 20.77 seconds
103_NeuralNetwork_Stacked mae 0.167912 trained in 19.3 seconds
105_RandomForest_Stacked mae 0.075608 trained in 27.82 seconds
112_ExtraTrees_Stacked mae 0.07693 trained in 22.86 seconds
89_CatBoost_GoldenFeatures_Stacked mae 0.078619 trained in 239.87 seconds
67_Xgboost_Stacked mae 0.07866 trained in 26.0 seconds
96_LightGBM_Stacked mae 0.077446 trained in 21.71 seconds
5_Default_NeuralNetwork_Stacked mae 0.166156 trained in 18.09 seconds
78_RandomForest_Stacked mae 0.076435 trained in 30.39 seconds
111_ExtraTrees_Stacked mae 0.076275 trained in 22.73 seconds
93_CatBoost_Stacked not 

81_RandomForest mae 0.219741 trained in 25.78 seconds
82_RandomForest mae 0.136341 trained in 27.67 seconds
83_ExtraTrees mae 0.255761 trained in 26.94 seconds
84_ExtraTrees mae 0.203542 trained in 28.14 seconds
85_ExtraTrees mae 0.318389 trained in 26.18 seconds
* Step hill_climbing_2 will try to check up to 11 models
86_CatBoost mae 0.089168 trained in 28.77 seconds
87_CatBoost mae 0.08572 trained in 63.74 seconds
88_CatBoost mae 0.089054 trained in 34.73 seconds
89_LightGBM mae 0.088901 trained in 28.57 seconds
90_Xgboost mae 0.092382 trained in 27.2 seconds
91_Xgboost mae 0.104765 trained in 22.88 seconds
92_NeuralNetwork mae 0.13223 trained in 27.51 seconds
93_NeuralNetwork mae 0.204227 trained in 24.63 seconds
* Step boost_on_errors will try to check up to 1 model
87_CatBoost_BoostOnErrors mae 0.085638 trained in 64.42 seconds
* Step ensemble will try to check up to 1 model
Ensemble mae 0.082012 trained in 22.08 seconds
* Step stack will try to check up to 59 models
87_CatBoost_S

73_CatBoost mae 0.011025 trained in 38.67 seconds
74_CatBoost mae 0.011059 trained in 34.08 seconds
75_CatBoost mae 0.01128 trained in 47.29 seconds
76_Xgboost mae 0.011361 trained in 30.1 seconds
77_Xgboost mae 0.011495 trained in 29.5 seconds
78_LightGBM mae 0.011611 trained in 28.28 seconds
79_LightGBM mae 0.011768 trained in 27.6 seconds
80_LightGBM mae 0.011768 trained in 27.73 seconds
* Step boost_on_errors will try to check up to 1 model
61_CatBoost_BoostOnErrors mae 0.010814 trained in 42.67 seconds
* Step ensemble will try to check up to 1 model
Ensemble mae 0.010421 trained in 15.54 seconds
* Step stack will try to check up to 56 models
61_CatBoost_Stacked mae 0.01073 trained in 84.66 seconds
76_Xgboost_Stacked mae 0.010633 trained in 37.83 seconds
24_LightGBM_Stacked mae 0.010828 trained in 32.75 seconds
5_Default_NeuralNetwork_Stacked mae 0.044382 trained in 33.92 seconds
41_RandomForest_Stacked mae 0.010494 trained in 45.08 seconds
50_ExtraTrees_Stacked mae 0.010359 traine

59_CatBoost_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 15.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
68_Xgboost_Stacked mae 0.00876 trained in 44.03 seconds
22_LightGBM_Stacked mae 0.008709 trained in 36.69 seconds
5_Default_NeuralNetwork_Stacked mae 0.042902 trained in 38.78 seconds
6_Default_RandomForest_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 2.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
49_ExtraTrees_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 1.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
60_CatBoost_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 24.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
* Step en

7_Default_ExtraTrees mae 0.014292 trained in 46.39 seconds
* Step not_so_random will try to check up to 54 models
17_LightGBM mae 0.0063 trained in 42.76 seconds
8_Xgboost mae 0.006757 trained in 45.88 seconds
26_CatBoost mae 0.005126 trained in 49.77 seconds
35_RandomForest mae 0.009721 trained in 47.89 seconds
44_ExtraTrees mae 0.01429 trained in 47.29 seconds
53_NeuralNetwork mae 0.007983 trained in 43.33 seconds
18_LightGBM mae 0.005463 trained in 41.53 seconds
9_Xgboost mae 0.006408 trained in 43.52 seconds
27_CatBoost mae 0.00523 trained in 45.68 seconds
36_RandomForest mae 0.007858 trained in 50.78 seconds
45_ExtraTrees mae 0.012826 trained in 49.57 seconds
54_NeuralNetwork mae 0.007256 trained in 44.79 seconds
19_LightGBM mae 0.005618 trained in 43.03 seconds
10_Xgboost mae 0.00625 trained in 48.87 seconds
28_CatBoost mae 0.004978 trained in 98.12 seconds
37_RandomForest mae 0.013576 trained in 48.38 seconds
46_ExtraTrees mae 0.017926 trained in 51.04 seconds
55_NeuralNetwork m

60_CatBoost mae 0.053794 trained in 64.56 seconds
61_CatBoost mae 0.054862 trained in 53.52 seconds
62_Xgboost mae 0.055951 trained in 52.96 seconds
63_LightGBM mae 0.067918 trained in 52.74 seconds
64_LightGBM mae 0.056278 trained in 50.27 seconds
65_LightGBM mae 0.063663 trained in 53.14 seconds
* Step hill_climbing_2 will try to check up to 27 models
66_CatBoost mae 0.05159 trained in 97.1 seconds
67_Xgboost mae 0.054441 trained in 54.79 seconds
68_Xgboost mae 0.055091 trained in 52.13 seconds
69_Xgboost mae 0.055418 trained in 53.4 seconds
* Step boost_on_errors will try to check up to 1 model
26_CatBoost_BoostOnErrors mae 0.052446 trained in 61.7 seconds
* Step ensemble will try to check up to 1 model
Ensemble mae 0.049517 trained in 5.64 seconds
* Step stack will try to check up to 43 models
26_CatBoost_Stacked mae 0.051969 trained in 75.0 seconds
67_Xgboost_Stacked mae 0.05077 trained in 58.07 seconds
18_LightGBM_Stacked mae 0.050063 trained in 49.62 seconds
55_NeuralNetwork_Sta

83_LightGBM_Stacked mae 0.049003 trained in 30.99 seconds
58_NeuralNetwork_Stacked mae 0.271548 trained in 30.07 seconds
41_RandomForest_Stacked mae 0.047323 trained in 34.03 seconds
50_ExtraTrees_Stacked mae 0.047075 trained in 33.85 seconds
76_CatBoost_Stacked mae 0.048885 trained in 145.18 seconds
3_Default_Xgboost_Stacked mae 0.049638 trained in 32.8 seconds
81_LightGBM_Stacked mae 0.049003 trained in 31.4 seconds
5_Default_NeuralNetwork_Stacked mae 0.300736 trained in 29.84 seconds
36_RandomForest_Stacked mae 0.04717 trained in 34.74 seconds
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked mae 0.046396 trained in 22.14 seconds
AutoML fit time: 3640.73 seconds
AutoML best model: Ensemble_Stacked
AutoML directory: ../model/split_build/61/
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network']
AutoML will stack models
AutoML will ensemble av

2_Default_LightGBM mae 0.037172 trained in 36.92 seconds
3_Default_Xgboost mae 0.035797 trained in 36.62 seconds
4_Default_CatBoost mae 0.030493 trained in 38.7 seconds
5_Default_NeuralNetwork mae 0.047342 trained in 33.34 seconds
6_Default_RandomForest mae 0.08653 trained in 35.28 seconds
7_Default_ExtraTrees mae 0.103839 trained in 35.07 seconds
* Step not_so_random will try to check up to 54 models
17_LightGBM mae 0.036004 trained in 34.49 seconds
8_Xgboost mae 0.045655 trained in 36.39 seconds
26_CatBoost mae 0.030382 trained in 41.18 seconds
35_RandomForest mae 0.089596 trained in 36.65 seconds
44_ExtraTrees mae 0.099002 trained in 35.47 seconds
53_NeuralNetwork mae 0.053963 trained in 33.03 seconds
18_LightGBM mae 0.034622 trained in 33.57 seconds
9_Xgboost mae 0.037134 trained in 33.98 seconds
27_CatBoost mae 0.031759 trained in 36.76 seconds
36_RandomForest mae 0.070538 trained in 38.8 seconds
45_ExtraTrees mae 0.094824 trained in 37.44 seconds
54_NeuralNetwork mae 0.050645 tra

38_RandomForest mae 0.093085 trained in 39.32 seconds
47_ExtraTrees mae 0.110116 trained in 38.72 seconds
56_NeuralNetwork mae 0.068036 trained in 37.21 seconds
21_LightGBM mae 0.04847 trained in 36.49 seconds
12_Xgboost mae 0.052474 trained in 38.35 seconds
30_CatBoost mae 0.037904 trained in 53.46 seconds
39_RandomForest mae 0.106969 trained in 40.0 seconds
48_ExtraTrees mae 0.117368 trained in 37.96 seconds
57_NeuralNetwork mae 0.060452 trained in 36.73 seconds
22_LightGBM mae 0.042214 trained in 37.9 seconds
13_Xgboost mae 0.044968 trained in 38.6 seconds
31_CatBoost mae 0.043375 trained in 40.73 seconds
40_RandomForest mae 0.092461 trained in 39.46 seconds
49_ExtraTrees mae 0.108171 trained in 38.9 seconds
58_NeuralNetwork mae 0.053004 trained in 38.09 seconds
23_LightGBM mae 0.04222 trained in 38.27 seconds
* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: 시_sum_기온
Add Golden Feature: 월_multiply_시
Add Golden Feature: 시_multiply_기온
Add Golden Feat

60_CatBoost mae 0.026642 trained in 68.03 seconds
61_CatBoost mae 0.026692 trained in 44.37 seconds
62_CatBoost mae 0.025766 trained in 45.8 seconds
63_LightGBM mae 0.028949 trained in 41.95 seconds
64_LightGBM mae 0.029683 trained in 39.87 seconds
65_LightGBM mae 0.029818 trained in 40.54 seconds
66_Xgboost mae 0.030082 trained in 42.91 seconds
* Step hill_climbing_2 will try to check up to 28 models
67_CatBoost mae 0.025837 trained in 54.0 seconds
68_CatBoost mae 0.025806 trained in 51.99 seconds
69_CatBoost mae 0.02664 trained in 44.46 seconds
70_CatBoost mae 0.025966 trained in 49.21 seconds
71_LightGBM mae 0.029307 trained in 41.49 seconds
72_LightGBM mae 0.029523 trained in 40.89 seconds
* Step boost_on_errors will try to check up to 1 model
59_CatBoost_BoostOnErrors mae 0.026068 trained in 63.75 seconds
* Step ensemble will try to check up to 1 model
Ensemble mae 0.02496 trained in 7.97 seconds
* Step stack will try to check up to 46 models
59_CatBoost_Stacked mae 0.027285 train

1_DecisionTree mae 0.052412 trained in 8.69 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline mae 0.114735 trained in 49.96 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM mae 0.027464 trained in 45.0 seconds
3_Default_Xgboost mae 0.027419 trained in 45.22 seconds
4_Default_CatBoost mae 0.025557 trained in 46.39 seconds
5_Default_NeuralNetwork mae 0.036675 trained in 45.36 seconds
6_Default_RandomForest mae 0.049104 trained in 47.92 seconds
7_Default_ExtraTrees mae 0.057577 trained in 47.49 seconds
* Step not_so_random will try to check up to 54 models
17_LightGBM mae 0.028312 trained in 43.84 seconds
8_Xgboost mae 0.031012 trained in 45.56 seconds
26_CatBoost mae 0.025223 trained in 48.33 seconds
35_RandomForest mae 0.049678 trained in 48.62 seconds
44_ExtraTrees mae 0.055617 trained in 46.51 seconds
53_NeuralNetwork mae 0.035944 trained in 4

* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: 시_sum_기온
Add Golden Feature: 월_multiply_시
Add Golden Feature: 시_multiply_기온
Add Golden Feature: 월_sum_시
Add Golden Feature: 시_multiply_습도
Add Golden Feature: 시_diff_월
Add Golden Feature: 시_ratio_월
Add Golden Feature: 월_ratio_시
Add Golden Feature: 습도_ratio_시
Add Golden Feature: 시_ratio_습도
Created 10 Golden Features in 16.13 seconds.
26_CatBoost_GoldenFeatures mae 0.031468 trained in 76.62 seconds
30_CatBoost_GoldenFeatures mae 0.031091 trained in 73.8 seconds
* Step kmeans_features will try to check up to 3 models
26_CatBoost_KMeansFeatures mae 0.031642 trained in 62.21 seconds
30_CatBoost_KMeansFeatures mae 0.030484 trained in 89.57 seconds
* Step insert_random_feature will try to check up to 1 model
26_CatBoost_RandomFeature mae 0.031425 trained in 60.29 seconds
Drop features ['random_feature']
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 

* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked mae 0.028368 trained in 8.56 seconds
AutoML fit time: 3624.55 seconds
AutoML best model: Ensemble_Stacked
AutoML directory: ../model/split_build/69/
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 0.073784 trained in 10.05 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline m

None 10
Add Golden Feature: 월_sum_시
Add Golden Feature: 시_diff_월
Add Golden Feature: 월_multiply_시
Add Golden Feature: 시_diff_주
Add Golden Feature: 풍속_sum_시
Add Golden Feature: 시_ratio_월
Add Golden Feature: 월_ratio_시
Add Golden Feature: 시_diff_풍속
Add Golden Feature: 시_multiply_습도
Add Golden Feature: 기온_ratio_시
Created 10 Golden Features in 16.46 seconds.
28_CatBoost_GoldenFeatures mae 0.069546 trained in 120.26 seconds
* Step kmeans_features will try to check up to 3 models
28_CatBoost_KMeansFeatures mae 0.068508 trained in 120.8 seconds
Not enough time to perform features selection. Skip
Time needed for features selection ~ 462.0 seconds
Please increase total_time_limit to at least (4679 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 29 models
57_CatBoost mae 0.061433 trained in 92.82 seconds
58_CatBoost mae 0.068392 trained 

## Submission

In [67]:
# Load the submission template; its 'answer' column is filled in below.
sample_submission_path = '../data/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


## Data Collecting

In [85]:
# Load the saved predictions of buildings 1-100 in order and stack them.
# The frames are gathered in a list and concatenated once; calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
building_preds = [
    pd.read_csv('../data/split_build/split_build' + str(i) + '.csv')
    for i in tqdm.notebook.tqdm(range(1, 101))
]
preds = pd.concat(building_preds, ignore_index=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [86]:
# Name the single prediction column so it can be referenced as preds['pred'].
preds.columns = ['pred']

In [88]:
# Invert the power transform (y ** boxcox) to get predictions on the original scale.
# Check the row count first: the assignment aligns on the index, so a missing or
# extra prediction file would otherwise produce NaN / dropped rows without an error.
assert len(preds) == len(submission), 'prediction rows do not match the submission template'
# A negative model output raised to a fractional power is NaN, so clip at 0 first.
submission['answer'] = preds['pred'].clip(lower=0) ** (1 / boxcox)
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2014.297444
1,1_20220825 01,2023.343391
2,1_20220825 02,1855.747162
3,1_20220825 03,1808.738063
4,1_20220825 04,1835.059301
...,...,...
16795,100_20220831 19,876.176108
16796,100_20220831 20,779.115975
16797,100_20220831 21,698.410587
16798,100_20220831 22,639.890263


In [89]:
# Write the final submission file (without the index column).
submission_path = '../data/AutoML_0.21_NA0_split_build.csv'
submission.to_csv(submission_path, index=False)