## Import

In [1]:
# preprocessing
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook  # explicit: `import tqdm` alone does not load the submodule behind tqdm.notebook.tqdm
import random
import os

# imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# model learning
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.utils.class_weight import compute_sample_weight
import statsmodels.api as sm

# 평가 지표
from sklearn.metrics import mean_absolute_error

# 모델 저장
import pickle

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.

    NOTE(review): ``PYTHONHASHSEED`` is normally read when the interpreter
    starts, so setting it here presumably only affects child processes,
    not the running kernel - confirm if hash stability matters.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

## Load Data

In [3]:
# Read the three input tables; the generator is unpacked in the same order
# as the paths are listed (train, test, building metadata).
train_df, test_df, building_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/building_info.csv',
    )
)

## Change Names

In [4]:
#translation_dict = {
#    '건물기타': 'Other Buildings',
#    '공공': 'Public',
#    '대학교': 'University',
#    '데이터센터': 'Data Center',
#    '백화점및아울렛': 'Department Store and Outlet',
#    '병원': 'Hospital',
#    '상용': 'Commercial',
#    '아파트': 'Apartment',
#    '연구소': 'Research Institute',
#    '지식산업센터': 'Knowledge Industry Center',
#    '할인마트': 'Discount Mart',
#    '호텔및리조트': 'Hotel and Resort'
#}

#building_info['건물유형'] = building_info['건물유형'].replace(translation_dict)
# building_info.drop('Unnamed: 0', axis = 1 , inplace=True)

In [5]:
# Relabel the training table's columns by position with short names
# (ten columns; the target '전력소비량' is the last one).
train_df = train_df.set_axis(
    ['num_date_time', '건물번호', '일시', '기온', '강수량', '풍속', '습도',
     '일조', '일사', '전력소비량'],
    axis='columns',
)

In [6]:
# Relabel the test table's columns by position; only seven names are given
# here (no '일조', '일사' or target column).
test_df = test_df.set_axis(
    ['num_date_time', '건물번호', '일시', '기온', '강수량', '풍속', '습도'],
    axis='columns',
)

In [7]:
# Relabel the building metadata columns by position with short names.
building_info = building_info.set_axis(
    ['건물번호', '건물유형', '연면적', '냉방면적', '태양광용량', 'ESS저장용량',
     'PCS용량'],
    axis='columns',
)

## Encoding

In [8]:
# one hot encoding 생성
#building_info = pd.get_dummies(building_info, columns=['건물유형'], drop_first=True)

In [9]:
# LabelEncoder를 객체로 생성
#encoder = LabelEncoder()

# fit, transform 메소드를 통한 레이블 인코딩
#encoder.fit(building_info['건물유형'])
#building_info['건물유형'] = encoder.transform(building_info['건물유형'])

## Merge building info

In [10]:
#building_info['태양광용량'][building_info['태양광용량'] == '-'] = np.nan
#building_info['ESS저장용량'][building_info['ESS저장용량'] == '-'] = np.nan
#building_info['PCS용량'][building_info['PCS용량'] == '-'] = np.nan

In [11]:
# 태양광 사용하는지 마는지 여부
#building_info['태양광사용여부'] = 0

In [12]:
#building_info['태양광사용여부'][(building_info['태양광용량'].notnull()) | (building_info['ESS저장용량'].notnull()) | (building_info['PCS용량'].notnull())] = 1

In [13]:
# Attach the building metadata to every train/test row via a left join on
# the building number ('건물번호').
train_df = train_df.merge(building_info, how='left', on='건물번호')
test_df = test_df.merge(building_info, how='left', on='건물번호')

## Train Data Pre-Processing

In [14]:
# Count missing values per column of the merged training table.
train_df.isnull().sum()

num_date_time         0
건물번호                  0
일시                    0
기온                    0
강수량              160069
풍속                   19
습도                    9
일조                75182
일사                87913
전력소비량                 0
건물유형                  0
연면적                   0
냉방면적                  0
태양광용량                 0
ESS저장용량               0
PCS용량                 0
dtype: int64

- 결측치 처리가 시급하다. 특히 강수량·일조·일사에 결측이 많아 반드시 처리해야 한다.

In [15]:
# Split the raw timestamp string into calendar features so the model can use
# time-of-year / time-of-day patterns: characters 4-5 become the month ('월')
# and characters 9-10 the hour ('시').
# The day-of-month feature (characters 6-7) is intentionally not extracted.
train_df['월'] = train_df['일시'].str.slice(4, 6).astype('int64')
train_df['시'] = train_df['일시'].str.slice(9, 11).astype('int64')

In [16]:
# Parse the 'YYYYMMDD HH' timestamp strings into datetime values.
train_df['일시'] = train_df['일시'].pipe(pd.to_datetime, format='%Y%m%d %H')

In [17]:
# Add the day of the week ('주'): 0 is Monday, 6 is Sunday.
train_df['주'] = train_df['일시'].dt.weekday

In [18]:
#train_df['일조'][(train_df['일조'].isna()) & ((train_df['시'] <= 5) | (train_df['시'] >= 21))] = 0
#train_df['일사'][(train_df['일사'].isna()) & ((train_df['시'] <= 5) | (train_df['시'] >= 21))] = 0

In [19]:
# Replace missing wind-speed ('풍속') and humidity ('습도') readings with 0.
# fillna + column assignment is used instead of chained indexing
# (df[col][mask] = value): the chained form may write to a temporary copy and
# is a silent no-op under pandas Copy-on-Write, and its warning is hidden
# here because warnings are globally ignored.
train_df['풍속'] = train_df['풍속'].fillna(0)
train_df['습도'] = train_df['습도'].fillna(0)

In [20]:
# imputation for mice
#imputer_mice = IterativeImputer(random_state=42)
#imputer_mice.fit(train_df[['기온', '풍속', '습도', '일조', '일사']])

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
#train_df[['기온', '풍속', '습도', '일조', '일사']] = pd.DataFrame(imputer_mice.transform(train_df[['기온', '풍속', '습도', '일조', '일사']]),
#                         columns=['기온', '풍속', '습도', '일조', '일사'])

In [21]:
# 숫자 보정
#train_df['일조'][train_df['일조'] < 0] = 0
#train_df['일사'][train_df['일사'] < 0] = 0

In [22]:
# Trim the training table: drop the row id, the parsed timestamp, the three
# solar/ESS/PCS capacity columns, the precipitation/sunshine/irradiance
# columns and the building type.
train_df2 = train_df.drop(
    ['num_date_time', '일시',
     '태양광용량', 'ESS저장용량', 'PCS용량', '강수량', '일조', '일사', '건물유형'],
    axis='columns',
)

In [23]:
# 강수량 결측치는 0으로 보정 -> 비 안왔을것이라 예상하고 진행함
# train_df2['강수량(mm)'][train_df2['강수량(mm)'].isna()] = 0

In [24]:
# imputation for mice
#imputer_mice = IterativeImputer(random_state=42)
#imputer_mice.fit(train_df2)

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
#train_df2 = pd.DataFrame(imputer_mice.transform(train_df2),
#                         columns=train_df2.columns)

In [25]:
# 그냥 0으로 채우기
#train_df2[train_df2.isna()] = 0

## Append Value

In [26]:
# 온도에 따른 포화 수증기압(mb)
#train_df2['포화수증기압'] = 6.11*10**((7.5*train_df2['기온'])/(237.3 + train_df2['기온']))

In [27]:
# 현재 대기의 수증기압(mb)
#train_df2['대기압'] = train_df2['습도'] * train_df2['포화수증기압'] / 100

In [28]:
# 절대습도(g/m^3)
#train_df2['절대습도'] = (0.794*train_df2['대기압'])/(1+0.00366*train_df2['기온'])

In [29]:
# 공기중의 혼합비(mb)
#train_df2['혼합비'] = 0.622 * train_df2['대기압']/(1013.25 - train_df2['대기압'])

In [30]:
# 습구온도
#train_df2['습구온도'] = train_df2['기온']*np.arctan(0.151977+(train_df2['습도'] + 8.313659)**0.5) + np.arctan(train_df2['기온'] + train_df2['습도']) - np.arctan(train_df2['습도'] - 1.676331) +0.00391838*(train_df2['습도']**1.5)*np.arctan(0.023101*train_df2['습도'])-4.686035

In [31]:
# 불쾌지수
#train_df2['불쾌지수'] = 9/5*train_df2['기온'] - 0.55*(1-0.01*train_df2['습도'])*(9/5*train_df2['기온'] - 26) + 32

In [32]:
# 더위체감지수
#train_df2['더위체감지수'] = -0.24418 + 0.553991*train_df2['습구온도'] + 0.455346*train_df2['기온'] - 0.00217*train_df2['습구온도']**2 + 0.002782*train_df2['습구온도']*train_df2['기온'] + 3

In [33]:
# np.select
#condlist = [
#    (train_df2['더위체감지수'] < 21),
#    (train_df2['더위체감지수'] < 25) & (train_df2['더위체감지수'] >= 21),
#    (train_df2['더위체감지수'] < 28) & (train_df2['더위체감지수'] >= 25),
#    (train_df2['더위체감지수'] < 31) & (train_df2['더위체감지수'] >= 28),
#    (train_df2['더위체감지수'] >= 31)
#            ]

#choicelist = [0, 1, 2, 3, 4]

In [34]:
# 범주형 체감지수
#train_df2['더위체감지수_범주'] = np.select(condlist, choicelist)

In [35]:
# 열지수
#train_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*train_df2['기온']+32)) + (10.14333127 * train_df2['습도']) - (0.22475541 * ((9/5)*train_df2['기온']+32)*train_df2['습도']) - (6.83783e-3 * ((9/5)*train_df2['기온']+32)**2) - (5.481717e-2 * train_df2['습도']**2) + (1.22874e-3 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']) + (8.5282e-4 * ((9/5)*train_df2['기온']+32)*train_df2['습도']**2) - (1.99e-6 * ((9/5)*train_df2['기온']+32)**2*train_df2['습도']**2) - 32)

In [36]:
# 평균냉방면적
#train_df2['평균냉방면적'] = train_df2['냉방면적']/train_df2['연면적']

In [37]:
# onehot 주
#train_df2 = pd.get_dummies(train_df2, columns=['주'], drop_first=True)

In [38]:
# Build the model inputs and the target.
# Features: every column of train_df2 except the target and the two
# floor-area columns (Index.difference yields the remaining names sorted).
train_x = train_df2.loc[:, train_df2.columns.difference(['전력소비량', '냉방면적', '연면적'])]
# Target: power consumption.
train_y = train_df2.loc[:, '전력소비량']

## Power Transform (BoxCox-style exponent)

In [39]:
# Exponent of the power transform applied to the target (train_y ** boxcox);
# the same value is needed again to invert the predictions.
# NOTE(review): despite the name this is a plain power transform, not the
# Box-Cox formula ((y ** lambda - 1) / lambda).
boxcox = 0.21

In [40]:
# Apply the power transform to the target: y -> y ** boxcox.
train_y_boxcox = train_y.pow(boxcox)

## Test Data Pre-Processing

In [41]:
# Same calendar features as for the training table: characters 4-5 of the
# raw timestamp string become the month ('월'), characters 9-10 the hour ('시').
# The day-of-month feature (characters 6-7) is intentionally not extracted.
test_df['월'] = test_df['일시'].str.slice(4, 6).astype('int64')
test_df['시'] = test_df['일시'].str.slice(9, 11).astype('int64')

In [42]:
# Parse the 'YYYYMMDD HH' timestamp strings into datetime values.
test_df['일시'] = test_df['일시'].pipe(pd.to_datetime, format='%Y%m%d %H')

In [43]:
# Add the day of the week ('주'): 0 is Monday, 6 is Sunday.
test_df['주'] = test_df['일시'].dt.weekday

In [44]:
# 없는 변수 추가하기
#test_df['일조'] = np.nan
#test_df['일사'] = np.nan

# 데이터 변환 (array로 반환하기 때문에 필요에 맞는 형태로 변환 후 사용)
#test_df[['기온', '풍속', '습도', '일조', '일사']] = pd.DataFrame(imputer_mice.transform(test_df[['기온', '풍속', '습도', '일조', '일사']]),
#                         columns=['기온', '풍속', '습도', '일조', '일사'])

In [45]:
# 숫자 보정
#test_df['일조'][test_df['일조'] < 0] = 0
#test_df['일사'][test_df['일사'] < 0] = 0

In [46]:
# Trim the test table: drop the row id, the parsed timestamp, the three
# solar/ESS/PCS capacity columns, precipitation, building type and both
# floor-area columns.
test_df2 = test_df.drop(
    ['num_date_time', '일시',
     '태양광용량', 'ESS저장용량', 'PCS용량', '강수량', '건물유형', '연면적',
     '냉방면적'],
    axis='columns',
)

## Append Value

In [47]:
# 온도에 따른 포화 수증기압(mb)
#test_df2['포화수증기압'] = 6.11*10**((7.5*test_df2['기온'])/(237.3 + test_df2['기온']))

In [48]:
# 현재 대기의 수증기압(mb)
#test_df2['대기압'] = test_df2['습도'] * test_df2['포화수증기압'] / 100

In [49]:
# 절대습도(g/m^3)
#test_df2['절대습도'] = (0.794*test_df2['대기압'])/(1+0.00366*test_df2['기온'])

In [50]:
# 공기중의 혼합비(mb)
#test_df2['혼합비'] = 0.622 * test_df2['대기압']/(1013.25 - test_df2['대기압'])

In [51]:
# 습구온도
#test_df2['습구온도'] = test_df2['기온']*np.arctan(0.151977+(test_df2['습도'] + 8.313659)**0.5) + np.arctan(test_df2['기온'] + test_df2['습도']) - np.arctan(test_df2['습도'] - 1.676331) +0.00391838*(test_df2['습도']**1.5)*np.arctan(0.023101*test_df2['습도'])-4.686035

In [52]:
# 불쾌지수
#test_df2['불쾌지수'] = 9/5*test_df2['기온'] - 0.55*(1-0.01*test_df2['습도'])*(9/5*test_df2['기온'] - 26) + 32

In [53]:
# 더위체감지수
#test_df2['더위체감지수'] = -0.24418 + 0.553991*test_df2['습구온도'] + 0.455346*test_df2['기온'] - 0.00217*test_df2['습구온도']**2 + 0.002782*test_df2['습구온도']*test_df2['기온'] + 3

In [54]:
# np.select
#condlist = [
#    (test_df2['더위체감지수'] < 21),
#    (test_df2['더위체감지수'] < 25) & (test_df2['더위체감지수'] >= 21),
#    (test_df2['더위체감지수'] < 28) & (test_df2['더위체감지수'] >= 25),
#    (test_df2['더위체감지수'] < 31) & (test_df2['더위체감지수'] >= 28),
#    (test_df2['더위체감지수'] >= 31)
#           ]

#choicelist = [0, 1, 2, 3, 4]

In [55]:
# 범주형 체감지수
#test_df2['더위체감지수_범주'] = np.select(condlist, choicelist)

In [56]:
# 열지수
#test_df2['열지수'] = (5/9)*(-42.379 + (2.04901523 * ((9/5)*test_df2['기온']+32)) + (10.14333127 * test_df2['습도']) - (0.22475541 * ((9/5)*test_df2['기온']+32)*test_df2['습도']) - (6.83783e-3 * ((9/5)*test_df2['기온']+32)**2) - (5.481717e-2 * test_df2['습도']**2) + (1.22874e-3 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']) + (8.5282e-4 * ((9/5)*test_df2['기온']+32)*test_df2['습도']**2) - (1.99e-6 * ((9/5)*test_df2['기온']+32)**2*test_df2['습도']**2) - 32)

In [57]:
# 평균냉방면적
#test_df2['평균냉방면적'] = test_df2['냉방면적']/test_df2['연면적']

In [58]:
# onehot 주
#test_df2 = pd.get_dummies(test_df2, columns=['주'], drop_first=True)

In [59]:
# test_df2 = std_scaler.fit_transform(test_df2)

In [60]:
# 훈련 및 테스트 데이터 셋
#test_df2 = test_df2[test_df2.columns.difference(['냉방면적', '연면적', '습도', '월', '건물번호', '기온'])]

In [61]:
# Display the prepared test features (the cell's last expression is rendered).
test_df2

Unnamed: 0,건물번호,기온,풍속,습도,월,시,주
0,1,23.5,2.2,72,8,0,3
1,1,23.0,0.9,72,8,1,3
2,1,22.7,1.5,75,8,2,3
3,1,22.1,1.3,78,8,3,3
4,1,21.8,1.0,77,8,4,3
...,...,...,...,...,...,...,...
16795,100,22.5,0.9,84,8,19,2
16796,100,20.7,0.4,95,8,20,2
16797,100,20.2,0.4,98,8,21,2
16798,100,20.1,1.1,97,8,22,2


In [62]:
# Display the training features (the cell's last expression is rendered).
train_x

Unnamed: 0,건물번호,기온,습도,시,월,주,풍속
0,1,18.6,42.0,0,6,2,0.9
1,1,18.0,45.0,1,6,2,1.1
2,1,17.7,45.0,2,6,2,1.5
3,1,16.7,48.0,3,6,2,1.4
4,1,18.4,43.0,4,6,2,2.8
...,...,...,...,...,...,...,...
203995,100,23.1,86.0,19,8,2,0.9
203996,100,22.4,86.0,20,8,2,1.3
203997,100,21.3,92.0,21,8,2,1.0
203998,100,21.0,94.0,22,8,2,0.3


## Regression Model Fit by Building

In [63]:
# Fit one AutoML regressor per building number and predict that building's
# test rows.
# NOTE(review): range(1, 2) only covers building 1 - apparently a trial run;
# widen the range to train the remaining buildings.

# Hoisted: the same input columns are used for fitting and for predicting.
feature_cols = ['기온', '습도', '시', '월', '주', '풍속']

# building number -> predictions, still on the power-transformed target scale.
# Replaces the dynamically named globals (train_buildnum{i}_x, pred_buildnum{i}).
preds_by_building = {}

for i in tqdm.notebook.tqdm(range(1, 2)):
    # Training rows of this building; .loc picks rows and columns in one step
    # instead of selecting columns and then masking the result.
    train_mask = train_x['건물번호'] == i
    building_x = train_x.loc[train_mask, feature_cols]
    building_y = train_y_boxcox[train_mask]

    # AutoML configuration; every building writes to its own results directory.
    automl = AutoML(mode="Compete",
                    algorithms=["Baseline",
                                "CatBoost",
                                "Xgboost",
                                "Random Forest",
                                "Extra Trees",
                                "LightGBM",
                                "Neural Network"],
                    ml_task="regression",
                    eval_metric='mape',
                    random_state=42,
                    results_path='../model/test_sample{}/'.format(i)
                    )

    # Fit on the power-transformed target.
    automl.fit(building_x, building_y)

    # Predict the test rows that belong to the same building.
    test_mask = test_df2['건물번호'] == i
    building_pred = automl.predict(test_df2.loc[test_mask, feature_cols])
    preds_by_building[i] = building_pred

    # Save each building's predictions immediately, as a safeguard.
    pd.DataFrame(building_pred).to_csv('../data/test_sample{}.csv'.format(i), index=False)
    

  0%|          | 0/1 [00:00<?, ?it/s]

AutoML directory: ../model/test_sample1/
The task is regression with evaluation metric mape
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mape 0.026914 trained in 0.75 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline mape 0.08025 trained in 2.88 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM mape 0.011329 trained in 13.51 seconds
3_Default_Xgboost mape 0.0


KeyboardInterrupt



## Submission

In [None]:
# Load the sample submission file; its 'answer' column is filled in a later cell.
submission = pd.read_csv('../data/sample_submission.csv')
# Display the loaded template.
submission

## Data Collecting

In [None]:
# Gather the per-building prediction files (buildings 1..100) into one frame,
# in building order, with a fresh 0..n-1 index.
# All files are read first and concatenated once: calling pd.concat inside the
# loop re-copies the accumulated rows on every iteration, and starting from an
# empty DataFrame triggers pandas' empty-entry concat deprecation.
preds = pd.concat(
    [
        pd.read_csv('../data/split_build/split_build' + str(i) + '.csv')
        for i in tqdm.notebook.tqdm(range(1, 101))
    ],
    ignore_index=True,
)

In [None]:
# Name the single prediction column 'pred'.
preds = preds.set_axis(['pred'], axis='columns')

In [None]:
# Undo the power transform (y ** boxcox) so the answers are back on the
# original target scale.
# The assignment aligns on the index, so a row-count mismatch would silently
# produce NaN or dropped answers - fail loudly instead.
if len(preds) != len(submission):
    raise ValueError(
        'prediction rows ({}) do not match submission rows ({})'.format(
            len(preds), len(submission)))

# A negative base raised to a fractional exponent evaluates to NaN, so clamp
# predictions at 0 before inverting.
submission['answer'] = preds['pred'].clip(lower=0) ** (1 / boxcox)
submission

In [None]:
# Write the filled-in submission to CSV without the index column.
submission.to_csv('../data/AutoML_0.21_NA0_split_build.csv', index=False)