In [93]:
import pandas as pd
import datetime as dt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
import optuna
from catboost import CatBoostRegressor
from pycaret.regression import *

In [94]:
train = pd.read_csv('./data/train_0823.csv', encoding='cp949')

train.solar_yes = train.solar_yes.astype(int)
train.ess_yes = train.ess_yes.astype(int)
train.sy_ey = train.sy_ey.astype(int)
train.sy_en = train.sy_en.astype(int)

# train.drop(['building_number','date_time','building_type'])

train['hour_sin'] = np.sin(2 * np.pi * train['hour']/23.0)
train['hour_cos'] = np.cos(2 * np.pi * train['hour']/23.0)

train['date_sin'] = -np.sin(2 * np.pi * (train['month']+train['day']/31)/12)
train['date_cos'] = -np.cos(2 * np.pi * (train['month']+train['day']/31)/12)

train['month_sin'] = -np.sin(2 * np.pi * train['month']/12.0)
train['month_cos'] = -np.cos(2 * np.pi * train['month']/12.0)
## cyclic transformation on weekday
train['weekday_sin'] = -np.sin(2 * np.pi * (train['weekday']+1)/7.0)
train['weekday_cos'] = -np.cos(2 * np.pi * (train['weekday']+1)/7.0)


train['work_time'] = 0
train['low_day'] = 0
train['particular'] = 0
trains = [train[train.building_number == num] for num in range(1,101,1)]
# aa = trains[9]
# aa['work_time']  = (aa['hour'].isin([6,7,8])).astype(int)
# aa.describe()

## cluster 0
for num in [10,13]:
    temp_df = trains[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7,8])).astype(int)
    
    trains[num-1] = temp_df
for num in [14]:
    temp_df = trains[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7])).astype(int)
    temp_df['particular'] = temp_df['hour'].isin([0,8,12,18]).astype(int)
    trains[num-1] = temp_df
for num in [15]:
    temp_df = trains[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([4,5,6,7,8])).astype(int)
    trains[num-1] = temp_df

## cluster 1
for num in [17,18,19,21,22,69,70,72]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [26,28,76,78,80,82,83,84]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [71]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)*(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = (((temp_df['weekday']==5)&(temp_df['hour'].isin([10,11,13,14])))
                               |((temp_df['weekday'].isin([0,1,2,3,4,5]))&(temp_df['hour'].isin([0,1,2,3,4])))).astype(int)
    trains[num-1] = temp_df
for num in [53]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=4)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [55]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=19)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [50]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [56]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = ((temp_df['weekday']==2)&(temp_df['hour']==16)).astype(int)
    trains[num-1] = temp_df
for num in [60]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [20,23]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [58,59,77]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [27,74,79]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [57]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [24,25]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [29,30]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [31]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [50]:
    temp_df = trains[num-1]
    temp_df['particular'] = (((temp_df['weekday']==2)&(temp_df['hour']==16))|((temp_df['weekday']==2)&(temp_df['hour']==16))).astype(int)
    trains[num-1] = temp_df
for num in [1]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    trains[num-1] = temp_df
for num in [8]:
    temp_df = trains[num-1]
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    trains[num-1] = temp_df
for num in [12]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    trains[num-1] = temp_df
for num in [88,90]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    trains[num-1] = temp_df
for num in [86]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    temp_df['particular'] = ((temp_df['month']==6)&(temp_df['day']==26)).astype(int)
    trains[num-1] = temp_df
for num in [87,89,91]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=9)).astype(int)
    trains[num-1] = temp_df
for num in [4]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    trains[num-1] = temp_df
for num in [7,16]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=5)).astype(int)
    trains[num-1] = temp_df
for num in [9]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=23)&(temp_df['hour']>=7)).astype(int)
    trains[num-1] = temp_df
for num in [11]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    trains[num-1] = temp_df
for num in [36]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    trains[num-1] = temp_df
for num in [75]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=15)&(temp_df['hour']>=7)).astype(int)
    trains[num-1] = temp_df
for num in [92]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=6)).astype(int)
    trains[num-1] = temp_df

## cluster 2
for num in [32,33,34]:
    temp_df = trains[num-1]
    temp_df = temp_df[temp_df.month>=7]
    trains[num-1] = temp_df
for num in [61,65,66,67,68]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    trains[num-1] = temp_df
for num in [62,63,64]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=23)).astype(int)
    trains[num-1] = temp_df

## cluster 3
for num in [93,94,96,97,98,100]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=6)&(temp_df['hour']<=22)).astype(int)
    trains[num-1] = temp_df
for num in [42,43,44]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=20)).astype(int)
    trains[num-1] = temp_df
for num in [2,85,99]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=21)).astype(int)
    trains[num-1] = temp_df
for num in [38,40,95]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=20)).astype(int)
    trains[num-1] = temp_df
for num in [3,81]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=21)).astype(int)
    trains[num-1] = temp_df
for num in [6,41]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=22)).astype(int)
    trains[num-1] = temp_df
for num in [37,39]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=21)).astype(int)
    trains[num-1] = temp_df
for num in [35]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    trains[num-1] = temp_df
for num in [54]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=5)&(temp_df['hour']<=20)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==0).astype(int)
    trains[num-1] = temp_df
for num in [5]:
    temp_df = trains[num-1]
    temp_df['particular'] = (((temp_df['weekday']==4)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==5)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==6)&(temp_df['hour'].isin([8,9,10,11,12,13,14,15,16,17,18])))).astype(int)
    trains[num-1] = temp_df
    

df = pd.concat(trains, ignore_index=True)
df = df.drop(['date_time','hour','day','month','weekday','date'], axis=1)



df_0 = df[df.km_cluster==0]
df_1 = df[df.km_cluster==1]
df_2 = df[df.km_cluster==2]
df_3 = df[df.km_cluster==3]





X_0 = df_0.drop('power_consumption', axis=1)
y_0 = df_0['power_consumption']
y_0_log = np.log(y_0)
X_0_train = X_0.drop('km_cluster', axis=1)

X_1 = df_1.drop('power_consumption', axis=1)
y_1 = df_1['power_consumption']
y_1_log = np.log(y_1)
X_1_train = X_1.drop('km_cluster', axis=1)

X_2 = df_2.drop('power_consumption', axis=1)
y_2 = df_2['power_consumption']
y_2_log = np.log(y_2)
X_2_train = X_2.drop('km_cluster', axis=1)

X_3 = df_3.drop('power_consumption', axis=1)
y_3 = df_3['power_consumption']
y_3_log = np.log(y_3)
X_3_train = X_3.drop('km_cluster', axis=1)



# Standard Scaling for continuous data
con_li = ['hour_sin', 'hour_cos', 'date_sin', 'date_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos', 'temperature', 'windspeed', 
'humidity', 'total_area', 'cooling_area', 'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'discomfort', 'temperature_3', 
'discomfort_3', 'ma_dis_3', 'ma_dis_5', 'CDH', 'temperature_mean', 'discomfort_mean', 'CDH_mean', 'sensory_temperature']

# ## one-hot encoding for categorical data
cat_li = ['building_number', 'holiday','air_yes','solar_yes',
         'ess_yes','sy_ey','sy_en','low_day','particular']

T = [X_0_train,X_1_train,X_2_train,X_3_train]

for train in T :


    for col in con_li :
        if train[col].std() == 0:
            train = train.drop(col, axis=1)
        else :
            train[col] = (train[col] - train[col].mean())/train[col].std()

    for col in cat_li :
        if train[col].nunique() == 1:        
            train = train.drop(col,axis=1)


X_0_train.building_type = X_0_train.building_type.astype('category')
X_0_train = pd.get_dummies(X_0_train)
X_1_train.building_type = X_1_train.building_type.astype('category')
X_1_train = pd.get_dummies(X_1_train)
X_2_train.building_type = X_2_train.building_type.astype('category')
X_2_train = pd.get_dummies(X_2_train)
X_3_train.building_type = X_3_train.building_type.astype('category')
X_3_train = pd.get_dummies(X_3_train)
        
        
train = pd.read_csv('./data/train_0823.csv', encoding='cp949')
test = pd.read_csv('./data/test_0823.csv', encoding='cp949')
# cluster_dict = train.groupby('building_number')['km_cluster'].first().to_dict()
# test['km_cluster'] = test['building_number'].map(cluster_dict)

test.solar_yes = test.solar_yes.astype(int)
test.ess_yes = test.ess_yes.astype(int)
test.sy_ey = test.sy_ey.astype(int)
test.sy_en = test.sy_en.astype(int)

# test.drop(['building_number','date_time','building_type'])

test['hour_sin'] = np.sin(2 * np.pi * test['hour']/23.0)
test['hour_cos'] = np.cos(2 * np.pi * test['hour']/23.0)

test['date_sin'] = -np.sin(2 * np.pi * (test['month']+test['day']/31)/12)
test['date_cos'] = -np.cos(2 * np.pi * (test['month']+test['day']/31)/12)

test['month_sin'] = -np.sin(2 * np.pi * test['month']/12.0)
test['month_cos'] = -np.cos(2 * np.pi * test['month']/12.0)
## cyclic transformation on weekday
test['weekday_sin'] = -np.sin(2 * np.pi * (test['weekday']+1)/7.0)
test['weekday_cos'] = -np.cos(2 * np.pi * (test['weekday']+1)/7.0)


test['work_time'] = 0
test['low_day'] = 0
test['particular'] = 0
tests = [test[test.building_number == num] for num in range(1,101,1)]
# aa = tests[9]
# aa['work_time']  = (aa['hour'].isin([6,7,8])).astype(int)
# aa.describe()

## cluster 0
for num in [10,13]:
    temp_df = tests[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7,8])).astype(int)
    
    tests[num-1] = temp_df
for num in [14]:
    temp_df = tests[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7])).astype(int)
    temp_df['particular'] = temp_df['hour'].isin([0,8,12,18]).astype(int)
    tests[num-1] = temp_df
for num in [15]:
    temp_df = tests[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([4,5,6,7,8])).astype(int)
    tests[num-1] = temp_df

## cluster 1
for num in [17,18,19,21,22,69,70,72]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [26,28,76,78,80,82,83,84]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [71]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)*(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = (((temp_df['weekday']==5)&(temp_df['hour'].isin([10,11,13,14])))
                               |((temp_df['weekday'].isin([0,1,2,3,4,5]))&(temp_df['hour'].isin([0,1,2,3,4])))).astype(int)
    tests[num-1] = temp_df
for num in [53]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=4)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [55]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=19)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [50]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [56]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = ((temp_df['weekday']==2)&(temp_df['hour']==16)).astype(int)
    tests[num-1] = temp_df
for num in [60]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [20,23]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [58,59,77]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [27,74,79]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [57]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [24,25]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [29,30]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [31]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [50]:
    temp_df = tests[num-1]
    temp_df['particular'] = (((temp_df['weekday']==2)&(temp_df['hour']==16))|((temp_df['weekday']==2)&(temp_df['hour']==16))).astype(int)
    tests[num-1] = temp_df
for num in [1]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    tests[num-1] = temp_df
for num in [8]:
    temp_df = tests[num-1]
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    tests[num-1] = temp_df
for num in [12]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    tests[num-1] = temp_df
for num in [88,90]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    tests[num-1] = temp_df
for num in [86]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    temp_df['particular'] = ((temp_df['month']==6)&(temp_df['day']==26)).astype(int)
    tests[num-1] = temp_df
for num in [87,89,91]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=9)).astype(int)
    tests[num-1] = temp_df
for num in [4]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    tests[num-1] = temp_df
for num in [7,16]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=5)).astype(int)
    tests[num-1] = temp_df
for num in [9]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=23)&(temp_df['hour']>=7)).astype(int)
    tests[num-1] = temp_df
for num in [11]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    tests[num-1] = temp_df
for num in [36]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    tests[num-1] = temp_df
for num in [75]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=15)&(temp_df['hour']>=7)).astype(int)
    tests[num-1] = temp_df
for num in [92]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=6)).astype(int)
    tests[num-1] = temp_df

## cluster 2
for num in [32,33,34]:
    temp_df = tests[num-1]
    temp_df = temp_df[temp_df.month>=7]
    tests[num-1] = temp_df
for num in [61,65,66,67,68]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    tests[num-1] = temp_df
for num in [62,63,64]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=23)).astype(int)
    tests[num-1] = temp_df

## cluster 3
for num in [93,94,96,97,98,100]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=6)&(temp_df['hour']<=22)).astype(int)
    tests[num-1] = temp_df
for num in [42,43,44]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=20)).astype(int)
    tests[num-1] = temp_df
for num in [2,85,99]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=21)).astype(int)
    tests[num-1] = temp_df
for num in [38,40,95]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=20)).astype(int)
    tests[num-1] = temp_df
for num in [3,81]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=21)).astype(int)
    tests[num-1] = temp_df
for num in [6,41]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=22)).astype(int)
    tests[num-1] = temp_df
for num in [37,39]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=21)).astype(int)
    tests[num-1] = temp_df
for num in [35]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    tests[num-1] = temp_df
for num in [54]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=5)&(temp_df['hour']<=20)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==0).astype(int)
    tests[num-1] = temp_df
for num in [5]:
    temp_df = tests[num-1]
    temp_df['particular'] = (((temp_df['weekday']==4)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==5)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==6)&(temp_df['hour'].isin([8,9,10,11,12,13,14,15,16,17,18])))).astype(int)
    tests[num-1] = temp_df
    

df = pd.concat(tests, ignore_index=True)
df = df.drop(['date_time','hour','day','month','weekday','date'], axis=1)
df.building_type = df.building_type.astype('category')

df_0 = df[df.km_cluster==0]
df_1 = df[df.km_cluster==1]
df_2 = df[df.km_cluster==2]
df_3 = df[df.km_cluster==3]

X_0_test = df_0.drop('km_cluster', axis=1)
X_1_test = df_1.drop('km_cluster', axis=1)
X_2_test = df_2.drop('km_cluster', axis=1)
X_3_test = df_3.drop('km_cluster', axis=1)


# Standard Scaling for continuous data
con_li = ['hour_sin', 'hour_cos', 'date_sin', 'date_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos', 'temperature', 'windspeed', 
'humidity', 'total_area', 'cooling_area', 'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'discomfort', 'temperature_3', 
'discomfort_3', 'ma_dis_3', 'ma_dis_5', 'CDH', 'temperature_mean', 'discomfort_mean', 'CDH_mean', 'sensory_temperature']


cat_li = ['building_number','building_type','holiday','air_yes','solar_yes',
         'ess_yes','sy_ey','sy_en','low_day','particular']

T = [X_0_test, X_1_test, X_2_test, X_3_test]

for test in T :
    for col in con_li :
        if test[col].std() == 0:
            test = test.drop(col, axis=1)
        else :
            test[col] = (test[col] - test[col].mean())/test[col].std()

    for col in cat_li :
        if test[col].nunique() == 1:        
            test = test.drop(col,axis=1)
    test = pd.get_dummies(test)


X_0_test.building_type = X_0_test.building_type.astype('category')
X_0_test = pd.get_dummies(X_0_test)
X_1_test.building_type = X_1_test.building_type.astype('category')
X_1_test = pd.get_dummies(X_1_test)
X_2_test.building_type = X_2_test.building_type.astype('category')
X_2_test = pd.get_dummies(X_2_test)
X_3_test.building_type = X_3_test.building_type.astype('category')
X_3_test = pd.get_dummies(X_3_test)


In [95]:
col_0 = ['cooling_area','work_time','building_type_Hospital','building_type_University','building_type_Research Institute',
        'solar_power_capacity','low_day','total_area','sy_en','holiday','building_number','building_type_Commercial',
 'hour_cos','temperature','hour_sin','date_sin','discomfort','weekday_sin','weekday_cos','CDH']
col_1 = ['building_number','total_area','cooling_area','work_time','ma_dis_5','hour_cos','date_sin','hour_sin','CDH','holiday',
 'humidity','ma_dis_3','discomfort','weekday_cos','weekday_sin','discomfort_3','CDH_mean','temperature_3','discomfort_mean',
'temperature_mean','sensory_temperature','temperature']
col_2 = ['work_time','total_area','cooling_area','building_number','hour_cos','hour_sin','temperature','holiday',
 'discomfort','weekday_sin','date_sin','CDH_mean','weekday_cos','discomfort_mean','temperature_mean','solar_power_capacity',
 'ma_dis_3','CDH','windspeed','humidity','sensory_temperature','ma_dis_5','discomfort_3','temperature_3']
col_3 = ['work_time','building_number','total_area','particular','hour_cos','cooling_area','hour_sin','ess_capacity',
 'solar_power_capacity','weekday_cos', 'date_sin','low_day','temperature','weekday_sin','discomfort','temperature_mean',
 'holiday','CDH_mean','ma_dis_5','discomfort_mean','humidity','ma_dis_3','CDH','sensory_temperature','temperature_3',
'windspeed','discomfort_3']

In [96]:
X_0_train = X_0_train[col_0]
X_1_train = X_1_train[col_1]
X_2_train = X_2_train[col_2]
X_3_train = X_3_train[col_3]

X_0_test = X_0_test[col_0]
X_1_test = X_1_test[col_1]
X_2_test = X_2_test[col_2]
X_3_test = X_3_test[col_3]

In [97]:
def smape(true, pred):
    v = 2 * abs(pred - true) / ((abs(pred) + abs(true)))
    result = np.mean(v) * 100
    return result

# Cluster 0

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(X_0_train, y_0_log, test_size=168*X_0_train.building_number.nunique())

train_0_merged = pd.DataFrame(X_train, columns=col_0)
train_0_merged['power_consumption'] = y_train
train_0_merged

Unnamed: 0,cooling_area,work_time,building_type_Hospital,building_type_University,building_type_Research Institute,solar_power_capacity,low_day,total_area,sy_en,holiday,building_number,building_type_Commercial,hour_cos,temperature,hour_sin,date_sin,discomfort,weekday_sin,weekday_cos,CDH,power_consumption
102840,-0.164737,0,1,0,0,-0.269641,0,-0.181926,1,0,52,0,1.330118,0.652404,5.346294e-17,0.248048,0.902991,0.623126,1.254713,1.428397,7.461813
139027,-0.132025,0,0,0,1,-0.237409,1,-0.154809,1,1,70,0,0.580715,1.526421,-1.282659e+00,0.989727,1.312575,1.117023,-0.893562,1.924104,7.926935
31156,-0.148019,0,0,0,0,-0.316491,0,-0.100430,1,0,16,0,0.580715,0.784830,1.282659e+00,1.033992,0.847780,-1.102532,-0.893562,1.022121,7.439042
116257,-0.154960,0,0,0,0,-0.503889,1,-0.174311,0,1,59,1,1.278649,-0.142158,3.897546e-01,-0.283633,0.055796,1.117023,-0.893562,0.812595,6.478878
157314,-0.142903,1,0,0,0,-0.503889,0,-0.115280,0,0,79,0,0.224555,1.314538,-1.414407e+00,0.966294,1.451647,1.391116,0.298641,1.326189,8.068854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50726,0.028922,1,0,1,0,-0.035393,0,0.057713,1,0,25,0,-1.134479,1.473450,-9.116841e-01,0.653550,1.076161,-1.102532,-0.893562,0.204460,9.326725
110959,-0.204661,1,0,0,0,-0.503889,1,-0.208816,0,1,56,1,-0.522627,-0.221614,1.361212e+00,1.129201,-0.036380,1.117023,-0.893562,-0.061280,8.316730
152810,-0.184564,0,0,0,1,0.433104,0,-0.211012,1,0,76,0,1.128059,0.043240,7.506029e-01,-0.428720,0.312206,0.623126,1.254713,-0.058725,6.977020
117637,-0.154960,1,0,0,0,-0.503889,0,-0.174311,0,0,59,1,-1.330873,-0.565924,-5.755393e-01,0.516157,-0.452288,0.623126,1.254713,-1.055250,7.949035


In [19]:
exp_0 = setup(train_0_merged, target='power_consumption',fold=5)

Unnamed: 0,Description,Value
0,Session id,1708
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(89856, 21)"
4,Transformed data shape,"(89856, 21)"
5,Transformed train set shape,"(62899, 21)"
6,Transformed test set shape,"(26957, 21)"
7,Numeric features,20
8,Preprocess,True
9,Imputation type,simple


In [13]:
best_0 = compare_models(sort='mae')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0348,0.0038,0.0617,0.9926,0.0075,0.0047,15.268
rf,Random Forest Regressor,0.036,0.0038,0.0615,0.9926,0.0075,0.0048,13.17
dt,Decision Tree Regressor,0.0463,0.007,0.0838,0.9864,0.0102,0.0062,0.464
catboost,CatBoost Regressor,0.0486,0.0058,0.0763,0.9887,0.0093,0.0065,7.46
xgboost,Extreme Gradient Boosting,0.0529,0.0065,0.0803,0.9875,0.0098,0.0071,3.656
lightgbm,Light Gradient Boosting Machine,0.0669,0.0096,0.0978,0.9814,0.0118,0.009,1.1
gbr,Gradient Boosting Regressor,0.1417,0.0368,0.1918,0.9286,0.0231,0.019,4.706
ada,AdaBoost Regressor,0.3425,0.1613,0.4016,0.6869,0.0467,0.0447,4.056
lr,Linear Regression,0.4572,0.3479,0.5898,0.3247,0.0669,0.0595,2.452
ridge,Ridge Regression,0.4574,0.3479,0.5898,0.3247,0.0669,0.0595,0.21


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [12]:
bestli = []
paramli = []

# Cluster 1

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_1_train, y_1_log, test_size=168*X_1_train.building_number.nunique())

train_1_merged = pd.DataFrame(X_train, columns=col_1)
train_1_merged['power_consumption'] = y_train
exp_1 = setup(train_1_merged, target='power_consumption',fold=5)

Unnamed: 0,Description,Value
0,Session id,5952
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(22176, 24)"
4,Transformed data shape,"(22176, 24)"
5,Transformed train set shape,"(15523, 24)"
6,Transformed test set shape,"(6653, 24)"
7,Numeric features,23
8,Preprocess,True
9,Imputation type,simple


In [7]:
best_1 = compare_models(sort='mae')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.024,0.0015,0.0388,0.9982,0.005,0.0035,4.622
et,Extra Trees Regressor,0.0271,0.0021,0.0453,0.9976,0.0058,0.0039,2.308
xgboost,Extreme Gradient Boosting,0.0307,0.0023,0.0475,0.9973,0.0061,0.0044,2.002
lightgbm,Light Gradient Boosting Machine,0.0313,0.0023,0.0476,0.9973,0.0061,0.0045,0.912
rf,Random Forest Regressor,0.0326,0.0028,0.0528,0.9967,0.0067,0.0047,4.972
dt,Decision Tree Regressor,0.0456,0.0063,0.0794,0.9926,0.0102,0.0066,0.296
gbr,Gradient Boosting Regressor,0.0638,0.0079,0.0889,0.9907,0.0114,0.0091,1.956
ada,AdaBoost Regressor,0.1631,0.0441,0.2098,0.9485,0.0263,0.0232,1.294
ridge,Ridge Regression,0.3718,0.2029,0.4505,0.7627,0.0547,0.051,0.204
br,Bayesian Ridge,0.3718,0.2029,0.4505,0.7626,0.0547,0.051,0.226


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

# Cluster 2

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X_2_train, y_2_log, test_size=168*X_2_train.building_number.nunique())

train_2_merged = pd.DataFrame(X_train, columns=col_2)
train_2_merged['power_consumption'] = y_train
exp_2 = setup(train_2_merged, target='power_consumption',fold=5)

Unnamed: 0,Description,Value
0,Session id,3348
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(29952, 25)"
4,Transformed data shape,"(29952, 25)"
5,Transformed train set shape,"(20966, 25)"
6,Transformed test set shape,"(8986, 25)"
7,Numeric features,24
8,Preprocess,True
9,Imputation type,simple


In [9]:
best_2 = compare_models(sort='mae')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0555,0.0092,0.0956,0.9838,0.012,0.0079,3.198
rf,Random Forest Regressor,0.0595,0.0109,0.1043,0.9807,0.013,0.0085,9.234
catboost,CatBoost Regressor,0.0644,0.0097,0.0985,0.9828,0.0123,0.0091,5.734
xgboost,Extreme Gradient Boosting,0.068,0.0104,0.102,0.9815,0.0127,0.0096,2.614
lightgbm,Light Gradient Boosting Machine,0.0705,0.011,0.105,0.9804,0.0131,0.01,0.876
dt,Decision Tree Regressor,0.077,0.0194,0.139,0.9656,0.0173,0.011,0.49
gbr,Gradient Boosting Regressor,0.125,0.0325,0.1803,0.9423,0.0223,0.0177,2.862
ada,AdaBoost Regressor,0.2944,0.1242,0.3524,0.7798,0.0422,0.0402,1.62
ridge,Ridge Regression,0.3426,0.1748,0.418,0.6899,0.0517,0.0481,0.292
br,Bayesian Ridge,0.3426,0.1748,0.418,0.6899,0.0517,0.0481,0.334


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

# Cluster 3

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X_3_train, y_3_log, test_size=168*X_3_train.building_number.nunique())

train_3_merged = pd.DataFrame(X_train, columns=col_3)
train_3_merged['power_consumption'] = y_train
exp_3 = setup(train_3_merged, target='power_consumption',fold=5)

Unnamed: 0,Description,Value
0,Session id,4740
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(43056, 28)"
4,Transformed data shape,"(43056, 28)"
5,Transformed train set shape,"(30139, 28)"
6,Transformed test set shape,"(12917, 28)"
7,Numeric features,27
8,Preprocess,True
9,Imputation type,simple


In [11]:
best_3 = compare_models(sort='mae')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0564,0.0093,0.0962,0.9729,0.012,0.0079,7.228
rf,Random Forest Regressor,0.0585,0.0091,0.0955,0.9733,0.0118,0.0081,13.002
catboost,CatBoost Regressor,0.0686,0.01,0.1,0.9708,0.0123,0.0095,6.046
xgboost,Extreme Gradient Boosting,0.0743,0.0114,0.1069,0.9666,0.0131,0.0103,2.574
dt,Decision Tree Regressor,0.0759,0.0174,0.1318,0.9492,0.0162,0.0105,0.38
lightgbm,Light Gradient Boosting Machine,0.0809,0.0136,0.1165,0.9603,0.0143,0.0112,0.844
gbr,Gradient Boosting Regressor,0.1275,0.0323,0.1797,0.9056,0.0223,0.0178,5.406
ada,AdaBoost Regressor,0.2768,0.1104,0.3321,0.6775,0.0404,0.038,3.406
ridge,Ridge Regression,0.3566,0.2065,0.4544,0.3964,0.0554,0.0497,0.202
br,Bayesian Ridge,0.3566,0.2065,0.4544,0.3963,0.0554,0.0497,0.216


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [74]:
trains = [X_0_train, X_2_train, X_3_train]
ys = [y_0_log, y_2_log, y_3_log]
cols = [col_0, col_2, col_3]

for i, (train, y, col) in enumerate(zip(trains, ys, cols)) :

    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique(), random_state=168)
    def objective(trial):
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 800),
    'max_depth' : trial.suggest_int('max_depth', 10, len(col)),
    'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
    'n_jobs' : -1
        }

        model = ExtraTreesRegressor(**param)
        model.fit(X_train,y_train)

        preds = model.predict(X_valid)

        sm = smape(np.exp(np.array(y_valid)), np.exp(np.array(preds)))

        return sm


    study = optuna.create_study(study_name = f'ET Optuna_cluster')
    study.optimize(objective, n_trials=30, timeout=24000)

    et = study.best_trial
    et_param = et.params
    print('Best Trial : score {},\nparams {}'.format(et.value, et_param))

    bestli.append(et.value)
    paramli.append(et_param)

[I 2023-08-23 23:11:38,077] A new study created in memory with name: ET Optuna_cluster
[I 2023-08-23 23:11:59,364] Trial 0 finished with value: 3.5759347176711067 and parameters: {'n_estimators': 214, 'max_depth': 17, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 3.5759347176711067.
[I 2023-08-23 23:12:55,080] Trial 1 finished with value: 5.840103215648814 and parameters: {'n_estimators': 702, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 0 with value: 3.5759347176711067.
[W 2023-08-23 23:13:01,401] Trial 2 failed with parameters: {'n_estimators': 574, 'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 2} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\magne\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\magne\AppData\Local\Temp/ipykernel_2880/4213458422.py", line 18, in obje

KeyboardInterrupt: 

In [75]:
trains = [X_2_train, X_3_train]
ys = [y_2_log, y_3_log]
cols = [col_2, col_3]

for i, (train, y, col) in enumerate(zip(trains, ys, cols)) :

    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique(), random_state=168)
    def objective(trial):
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 800),
    'max_depth' : trial.suggest_int('max_depth', 10, len(col)),
    'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
    'n_jobs' : -1
        }

        model = ExtraTreesRegressor(**param)
        model.fit(X_train,y_train)

        preds = model.predict(X_valid)

        sm = smape(np.exp(np.array(y_valid)), np.exp(np.array(preds)))

        return sm


    study = optuna.create_study(study_name = f'ET Optuna_cluster')
    study.optimize(objective, n_trials=30, timeout=24000)

    et = study.best_trial
    et_param = et.params
    print('Best Trial : score {},\nparams {}'.format(et.value, et_param))

    bestli.append(et.value)
    paramli.append(et_param)

[I 2023-08-23 23:13:07,195] A new study created in memory with name: ET Optuna_cluster
[I 2023-08-23 23:13:22,974] Trial 0 finished with value: 5.3003230297941375 and parameters: {'n_estimators': 538, 'max_depth': 24, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 0 with value: 5.3003230297941375.
[I 2023-08-23 23:13:47,407] Trial 1 finished with value: 5.5365045308504355 and parameters: {'n_estimators': 728, 'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 5.3003230297941375.
[I 2023-08-23 23:14:05,548] Trial 2 finished with value: 5.013532636071367 and parameters: {'n_estimators': 529, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 2 with value: 5.013532636071367.
[I 2023-08-23 23:14:10,338] Trial 3 finished with value: 5.870033262446116 and parameters: {'n_estimators': 202, 'max_depth': 13, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 2 with value: 5.013532636071367.
[I 2023-0

Best Trial : score 4.825135504512769,
params {'n_estimators': 323, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 1}


[I 2023-08-23 23:21:20,739] Trial 0 finished with value: 6.502220662430354 and parameters: {'n_estimators': 207, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 6.502220662430354.
[I 2023-08-23 23:21:44,891] Trial 1 finished with value: 5.766408490409107 and parameters: {'n_estimators': 409, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 5.766408490409107.
[I 2023-08-23 23:22:21,528] Trial 2 finished with value: 5.708326731739091 and parameters: {'n_estimators': 533, 'max_depth': 25, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 2 with value: 5.708326731739091.
[I 2023-08-23 23:23:02,658] Trial 3 finished with value: 6.325133683507343 and parameters: {'n_estimators': 467, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 5.708326731739091.
[I 2023-08-23 23:23:37,197] Trial 4 finished with value: 5.601037053066203 and parameters: {'n_estim

Best Trial : score 4.9292244147944215,
params {'n_estimators': 695, 'max_depth': 25, 'min_samples_split': 3, 'min_samples_leaf': 1}


# 1. 캣부스트로만 제출

# 2. pycaret에서 0,2,3이 et가 좋다했으니 0,2,3만 et 적용후 제출

# 3. 상위 알고리즘 4~5개 스태킹, 앙상블 제출

In [78]:
trains = [X_0_train, X_2_train, X_3_train]
ys = [y_0_log, y_2_log, y_3_log]

for i, (train, y) in enumerate(zip(trains, ys)) :

    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique(), random_state=168)

    model = ExtraTreesRegressor()
    model.fit(X_train,y_train)
    preds = model.predict(X_valid)
    print(f'Cluster {i} Train SMAPE : {smape(np.exp(np.array(y_valid)), np.exp(np.array(preds)))}')

Cluster 0 Train SMAPE : 3.0163961900910063
Cluster 1 Train SMAPE : 4.8728633345587395
Cluster 2 Train SMAPE : 4.916899391925533


# cluster 0 ET 파라미터
3.222776207160076
params {'n_estimators': 552, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2}
3.0163961900910063
default

# cluster 1 catboost 파라미터
1.79
{'learning_rate': 0.028, 'depth': 10, 'l2_leaf_reg': 3.0, 'min_child_samples': 1}

# cluster 2 ET 파라미터
Best Trial : score 4.825135504512769,
params {'n_estimators': 323, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 1}

# cluster 3 ET 파라미터
Best Trial : score 4.9292244147944215,
params {'n_estimators': 695, 'max_depth': 25, 'min_samples_split': 3, 'min_samples_leaf': 1}
4.916899391925533
default

In [98]:
trains = [X_0_train, X_3_train]
ys = [y_0_log, y_3_log]
tests = [X_0_test, X_3_test]
nums = ['0','3']

for i, (train, y, test, num) in enumerate(zip(trains, ys, tests, nums)) :

    model = ExtraTreesRegressor()
    model.fit(train,y)
    pred = model.predict(test)
    test['answer'] = np.exp(np.array(pred))
    test.to_csv(f'./data/Cluster_{num}_0823.csv', encoding='cp949', index=False)

In [99]:
model = CatBoostRegressor(learning_rate= 0.028, depth= 10, l2_leaf_reg= 3.0, min_child_samples= 1)
model.fit(X_1_train,y_1_log)
preds = model.predict(X_1_train)
pred = model.predict(X_1_test)
X_1_test['answer'] = np.exp(np.array(pred))
X_1_test.to_csv(f'./data/Cluster_1_0823.csv', encoding='cp949', index=False)

0:	learn: 0.9024304	total: 53.7ms	remaining: 53.6s
1:	learn: 0.8784514	total: 104ms	remaining: 52.1s
2:	learn: 0.8552608	total: 156ms	remaining: 51.8s
3:	learn: 0.8331856	total: 165ms	remaining: 41s
4:	learn: 0.8109217	total: 216ms	remaining: 43s
5:	learn: 0.7897113	total: 266ms	remaining: 44s
6:	learn: 0.7690663	total: 323ms	remaining: 45.9s
7:	learn: 0.7498004	total: 331ms	remaining: 41.1s
8:	learn: 0.7300727	total: 382ms	remaining: 42s
9:	learn: 0.7106496	total: 432ms	remaining: 42.8s
10:	learn: 0.6918713	total: 496ms	remaining: 44.6s
11:	learn: 0.6737925	total: 509ms	remaining: 41.9s
12:	learn: 0.6564154	total: 569ms	remaining: 43.2s
13:	learn: 0.6397177	total: 583ms	remaining: 41.1s
14:	learn: 0.6230179	total: 638ms	remaining: 41.9s
15:	learn: 0.6067504	total: 701ms	remaining: 43.1s
16:	learn: 0.5906395	total: 781ms	remaining: 45.2s
17:	learn: 0.5753896	total: 803ms	remaining: 43.8s
18:	learn: 0.5606658	total: 859ms	remaining: 44.3s
19:	learn: 0.5461972	total: 915ms	remaining: 44.

164:	learn: 0.0582632	total: 8.65s	remaining: 43.8s
165:	learn: 0.0579927	total: 8.71s	remaining: 43.7s
166:	learn: 0.0576905	total: 8.77s	remaining: 43.7s
167:	learn: 0.0574957	total: 8.83s	remaining: 43.7s
168:	learn: 0.0572350	total: 8.89s	remaining: 43.7s
169:	learn: 0.0569050	total: 8.95s	remaining: 43.7s
170:	learn: 0.0567031	total: 9.01s	remaining: 43.7s
171:	learn: 0.0565422	total: 9.06s	remaining: 43.6s
172:	learn: 0.0562569	total: 9.12s	remaining: 43.6s
173:	learn: 0.0560945	total: 9.17s	remaining: 43.6s
174:	learn: 0.0558341	total: 9.23s	remaining: 43.5s
175:	learn: 0.0556334	total: 9.28s	remaining: 43.5s
176:	learn: 0.0554686	total: 9.34s	remaining: 43.4s
177:	learn: 0.0553486	total: 9.39s	remaining: 43.4s
178:	learn: 0.0551340	total: 9.45s	remaining: 43.3s
179:	learn: 0.0549329	total: 9.5s	remaining: 43.3s
180:	learn: 0.0547692	total: 9.56s	remaining: 43.3s
181:	learn: 0.0545839	total: 9.62s	remaining: 43.2s
182:	learn: 0.0544024	total: 9.68s	remaining: 43.2s
183:	learn: 0

323:	learn: 0.0395974	total: 17.4s	remaining: 36.4s
324:	learn: 0.0395400	total: 17.5s	remaining: 36.4s
325:	learn: 0.0394701	total: 17.7s	remaining: 36.5s
326:	learn: 0.0394233	total: 17.8s	remaining: 36.6s
327:	learn: 0.0393837	total: 17.8s	remaining: 36.5s
328:	learn: 0.0393309	total: 17.9s	remaining: 36.5s
329:	learn: 0.0392471	total: 18s	remaining: 36.5s
330:	learn: 0.0391878	total: 18.1s	remaining: 36.6s
331:	learn: 0.0391687	total: 18.1s	remaining: 36.5s
332:	learn: 0.0391122	total: 18.3s	remaining: 36.6s
333:	learn: 0.0390266	total: 18.4s	remaining: 36.6s
334:	learn: 0.0389797	total: 18.5s	remaining: 36.6s
335:	learn: 0.0389235	total: 18.5s	remaining: 36.6s
336:	learn: 0.0388420	total: 18.6s	remaining: 36.7s
337:	learn: 0.0387854	total: 18.7s	remaining: 36.7s
338:	learn: 0.0387222	total: 18.9s	remaining: 36.9s
339:	learn: 0.0386540	total: 19s	remaining: 36.9s
340:	learn: 0.0386019	total: 19.1s	remaining: 36.8s
341:	learn: 0.0385599	total: 19.1s	remaining: 36.8s
342:	learn: 0.03

485:	learn: 0.0322021	total: 28s	remaining: 29.6s
486:	learn: 0.0321565	total: 28s	remaining: 29.5s
487:	learn: 0.0321179	total: 28.1s	remaining: 29.5s
488:	learn: 0.0320893	total: 28.1s	remaining: 29.4s
489:	learn: 0.0320525	total: 28.2s	remaining: 29.4s
490:	learn: 0.0320213	total: 28.3s	remaining: 29.3s
491:	learn: 0.0319990	total: 28.3s	remaining: 29.2s
492:	learn: 0.0319731	total: 28.4s	remaining: 29.2s
493:	learn: 0.0319349	total: 28.4s	remaining: 29.1s
494:	learn: 0.0319049	total: 28.5s	remaining: 29s
495:	learn: 0.0318564	total: 28.5s	remaining: 29s
496:	learn: 0.0318236	total: 28.6s	remaining: 28.9s
497:	learn: 0.0318012	total: 28.6s	remaining: 28.9s
498:	learn: 0.0317475	total: 28.7s	remaining: 28.8s
499:	learn: 0.0317115	total: 28.7s	remaining: 28.7s
500:	learn: 0.0316637	total: 28.8s	remaining: 28.7s
501:	learn: 0.0316313	total: 28.8s	remaining: 28.6s
502:	learn: 0.0315918	total: 28.9s	remaining: 28.6s
503:	learn: 0.0315652	total: 28.9s	remaining: 28.5s
504:	learn: 0.031510

644:	learn: 0.0277544	total: 37.4s	remaining: 20.6s
645:	learn: 0.0277341	total: 37.5s	remaining: 20.5s
646:	learn: 0.0277143	total: 37.5s	remaining: 20.5s
647:	learn: 0.0276833	total: 37.6s	remaining: 20.4s
648:	learn: 0.0276725	total: 37.6s	remaining: 20.3s
649:	learn: 0.0276502	total: 37.7s	remaining: 20.3s
650:	learn: 0.0276230	total: 37.7s	remaining: 20.2s
651:	learn: 0.0276071	total: 37.8s	remaining: 20.2s
652:	learn: 0.0275866	total: 37.8s	remaining: 20.1s
653:	learn: 0.0275617	total: 37.9s	remaining: 20.1s
654:	learn: 0.0275399	total: 38s	remaining: 20s
655:	learn: 0.0275282	total: 38s	remaining: 19.9s
656:	learn: 0.0275077	total: 38.1s	remaining: 19.9s
657:	learn: 0.0274876	total: 38.1s	remaining: 19.8s
658:	learn: 0.0274637	total: 38.2s	remaining: 19.8s
659:	learn: 0.0274316	total: 38.3s	remaining: 19.7s
660:	learn: 0.0273979	total: 38.3s	remaining: 19.6s
661:	learn: 0.0273637	total: 38.4s	remaining: 19.6s
662:	learn: 0.0273519	total: 38.4s	remaining: 19.5s
663:	learn: 0.0273

804:	learn: 0.0246274	total: 46.7s	remaining: 11.3s
805:	learn: 0.0246114	total: 46.8s	remaining: 11.3s
806:	learn: 0.0245978	total: 46.8s	remaining: 11.2s
807:	learn: 0.0245914	total: 46.9s	remaining: 11.1s
808:	learn: 0.0245794	total: 46.9s	remaining: 11.1s
809:	learn: 0.0245593	total: 47s	remaining: 11s
810:	learn: 0.0245422	total: 47s	remaining: 11s
811:	learn: 0.0245267	total: 47.1s	remaining: 10.9s
812:	learn: 0.0245210	total: 47.1s	remaining: 10.8s
813:	learn: 0.0245048	total: 47.2s	remaining: 10.8s
814:	learn: 0.0244908	total: 47.2s	remaining: 10.7s
815:	learn: 0.0244844	total: 47.3s	remaining: 10.7s
816:	learn: 0.0244628	total: 47.3s	remaining: 10.6s
817:	learn: 0.0244421	total: 47.4s	remaining: 10.5s
818:	learn: 0.0244323	total: 47.4s	remaining: 10.5s
819:	learn: 0.0244086	total: 47.5s	remaining: 10.4s
820:	learn: 0.0243915	total: 47.5s	remaining: 10.4s
821:	learn: 0.0243736	total: 47.6s	remaining: 10.3s
822:	learn: 0.0243550	total: 47.7s	remaining: 10.3s
823:	learn: 0.024346

963:	learn: 0.0223008	total: 56.3s	remaining: 2.1s
964:	learn: 0.0222932	total: 56.4s	remaining: 2.04s
965:	learn: 0.0222781	total: 56.4s	remaining: 1.99s
966:	learn: 0.0222554	total: 56.5s	remaining: 1.93s
967:	learn: 0.0222422	total: 56.6s	remaining: 1.87s
968:	learn: 0.0222304	total: 56.6s	remaining: 1.81s
969:	learn: 0.0222238	total: 56.7s	remaining: 1.75s
970:	learn: 0.0222063	total: 56.7s	remaining: 1.69s
971:	learn: 0.0222018	total: 56.8s	remaining: 1.64s
972:	learn: 0.0221857	total: 56.9s	remaining: 1.58s
973:	learn: 0.0221780	total: 56.9s	remaining: 1.52s
974:	learn: 0.0221722	total: 56.9s	remaining: 1.46s
975:	learn: 0.0221659	total: 57s	remaining: 1.4s
976:	learn: 0.0221523	total: 57s	remaining: 1.34s
977:	learn: 0.0221358	total: 57.1s	remaining: 1.28s
978:	learn: 0.0221285	total: 57.2s	remaining: 1.23s
979:	learn: 0.0221124	total: 57.2s	remaining: 1.17s
980:	learn: 0.0220934	total: 57.3s	remaining: 1.11s
981:	learn: 0.0220778	total: 57.3s	remaining: 1.05s
982:	learn: 0.0220

In [100]:
model = ExtraTreesRegressor(n_estimators=323, max_depth=22, min_samples_split=2, min_samples_leaf=1)
model.fit(X_2_train,y_2_log)
preds = model.predict(X_2_train)
pred = model.predict(X_2_test)
X_2_test['answer'] = np.exp(np.array(pred))
X_2_test.to_csv(f'./data/Cluster_2_0823.csv', encoding='cp949', index=False)