In [19]:
import pandas as pd
import datetime as dt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
import optuna
from catboost import CatBoostRegressor
from pycaret.regression import *

In [20]:
train = pd.read_csv('./data/train_0821.csv', encoding='cp949')

train.solar_yes = train.solar_yes.astype(int)
train.ess_yes = train.ess_yes.astype(int)
train.sy_ey = train.sy_ey.astype(int)
train.sy_en = train.sy_en.astype(int)

# train.drop(['building_number','date_time','building_type'])

train['hour_sin'] = np.sin(2 * np.pi * train['hour']/23.0)
train['hour_cos'] = np.cos(2 * np.pi * train['hour']/23.0)

train['date_sin'] = -np.sin(2 * np.pi * (train['month']+train['day']/31)/12)
train['date_cos'] = -np.cos(2 * np.pi * (train['month']+train['day']/31)/12)

train['month_sin'] = -np.sin(2 * np.pi * train['month']/12.0)
train['month_cos'] = -np.cos(2 * np.pi * train['month']/12.0)
## cyclic transformation on weekday
train['weekday_sin'] = -np.sin(2 * np.pi * (train['weekday']+1)/7.0)
train['weekday_cos'] = -np.cos(2 * np.pi * (train['weekday']+1)/7.0)


train['work_time'] = 0
train['low_day'] = 0
train['particular'] = 0
trains = [train[train.building_number == num] for num in range(1,101,1)]
# aa = trains[9]
# aa['work_time']  = (aa['hour'].isin([6,7,8])).astype(int)
# aa.describe()

## cluster 0
for num in [10,13]:
    temp_df = trains[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7,8])).astype(int)
    
    trains[num-1] = temp_df
for num in [14]:
    temp_df = trains[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7])).astype(int)
    temp_df['particular'] = temp_df['hour'].isin([0,8,12,18]).astype(int)
    trains[num-1] = temp_df
for num in [15]:
    temp_df = trains[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([4,5,6,7,8])).astype(int)
    trains[num-1] = temp_df

## cluster 1
for num in [17,18,19,21,22,69,70,72]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [26,28,76,78,80,82,83,84]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [71]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)*(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = (((temp_df['weekday']==5)&(temp_df['hour'].isin([10,11,13,14])))
                               |((temp_df['weekday'].isin([0,1,2,3,4,5]))&(temp_df['hour'].isin([0,1,2,3,4])))).astype(int)
    trains[num-1] = temp_df
for num in [53]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=4)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [55]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=19)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [50]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [56]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = ((temp_df['weekday']==2)&(temp_df['hour']==16)).astype(int)
    trains[num-1] = temp_df
for num in [60]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [20,23]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [58,59,77]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [27,74,79]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [57]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [24,25]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [29,30]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [31]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    trains[num-1] = temp_df
for num in [50]:
    temp_df = trains[num-1]
    temp_df['particular'] = (((temp_df['weekday']==2)&(temp_df['hour']==16))|((temp_df['weekday']==2)&(temp_df['hour']==16))).astype(int)
    trains[num-1] = temp_df
for num in [1]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    trains[num-1] = temp_df
for num in [8]:
    temp_df = trains[num-1]
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    trains[num-1] = temp_df
for num in [12]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    trains[num-1] = temp_df
for num in [88,90]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    trains[num-1] = temp_df
for num in [86]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    temp_df['particular'] = ((temp_df['month']==6)&(temp_df['day']==26)).astype(int)
    trains[num-1] = temp_df
for num in [87,89,91]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=9)).astype(int)
    trains[num-1] = temp_df
for num in [4]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    trains[num-1] = temp_df
for num in [7,16]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=5)).astype(int)
    trains[num-1] = temp_df
for num in [9]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=23)&(temp_df['hour']>=7)).astype(int)
    trains[num-1] = temp_df
for num in [11]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    trains[num-1] = temp_df
for num in [36]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    trains[num-1] = temp_df
for num in [75]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=15)&(temp_df['hour']>=7)).astype(int)
    trains[num-1] = temp_df
for num in [92]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=6)).astype(int)
    trains[num-1] = temp_df

## cluster 2
for num in [32,33,34]:
    temp_df = trains[num-1]
    temp_df = temp_df[temp_df.month>=7]
    trains[num-1] = temp_df
for num in [61,65,66,67,68]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    trains[num-1] = temp_df
for num in [62,63,64]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=23)).astype(int)
    trains[num-1] = temp_df

## cluster 3
for num in [93,94,96,97,98,100]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=6)&(temp_df['hour']<=22)).astype(int)
    trains[num-1] = temp_df
for num in [42,43,44]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=20)).astype(int)
    trains[num-1] = temp_df
for num in [2,85,99]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=21)).astype(int)
    trains[num-1] = temp_df
for num in [38,40,95]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=20)).astype(int)
    trains[num-1] = temp_df
for num in [3,81]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=21)).astype(int)
    trains[num-1] = temp_df
for num in [6,41]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=22)).astype(int)
    trains[num-1] = temp_df
for num in [37,39]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=21)).astype(int)
    trains[num-1] = temp_df
for num in [35]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    trains[num-1] = temp_df
for num in [54]:
    temp_df = trains[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=5)&(temp_df['hour']<=20)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==0).astype(int)
    trains[num-1] = temp_df
for num in [5]:
    temp_df = trains[num-1]
    temp_df['particular'] = (((temp_df['weekday']==4)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==5)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==6)&(temp_df['hour'].isin([8,9,10,11,12,13,14,15,16,17,18])))).astype(int)
    trains[num-1] = temp_df
    

df = pd.concat(trains, ignore_index=True)
df = df.drop(['date_time','hour','day','month','weekday','date'], axis=1)



df_0 = df[df.km_cluster==0]
df_1 = df[df.km_cluster==1]
df_2 = df[df.km_cluster==2]
df_3 = df[df.km_cluster==3]
df_4 = df[df.km_cluster==4]
df_5 = df[df.km_cluster==5]




X_0 = df_0.drop('power_consumption', axis=1)
y_0 = df_0['power_consumption']
y_0_log = np.log(y_0)
X_0_train = X_0.drop('km_cluster', axis=1)

X_1 = df_1.drop('power_consumption', axis=1)
y_1 = df_1['power_consumption']
y_1_log = np.log(y_1)
X_1_train = X_1.drop('km_cluster', axis=1)

X_2 = df_2.drop('power_consumption', axis=1)
y_2 = df_2['power_consumption']
y_2_log = np.log(y_2)
X_2_train = X_2.drop('km_cluster', axis=1)

X_3 = df_3.drop('power_consumption', axis=1)
y_3 = df_3['power_consumption']
y_3_log = np.log(y_3)
X_3_train = X_3.drop('km_cluster', axis=1)

X_4 = df_4.drop('power_consumption', axis=1)
y_4 = df_4['power_consumption']
y_4_log = np.log(y_4)
X_4_train = X_4.drop('km_cluster', axis=1)

X_5 = df_5.drop('power_consumption', axis=1)
y_5 = df_5['power_consumption']
y_5_log = np.log(y_5)
X_5_train = X_5.drop('km_cluster', axis=1)


# Standard Scaling for continuous data
con_li = ['hour_sin', 'hour_cos', 'date_sin', 'date_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos', 'temperature', 'windspeed', 
'humidity', 'total_area', 'cooling_area', 'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'discomfort', 'temperature_3', 
'discomfort_3', 'ma_dis_3', 'ma_dis_5', 'CDH', 'temperature_mean', 'discomfort_mean', 'CDH_mean', 'sensory_temperature']

# ## one-hot encoding for categorical data
cat_li = ['building_number', 'holiday','air_yes','solar_yes',
         'ess_yes','sy_ey','sy_en','low_day','particular']

T = [X_0_train,X_1_train,X_2_train,X_3_train,X_4_train,X_5_train]

for train in T :


    for col in con_li :
        if train[col].std() == 0:
            train = train.drop(col, axis=1)
        else :
            train[col] = (train[col] - train[col].mean())/train[col].std()

    for col in cat_li :
        if train[col].nunique() == 1:        
            train = train.drop(col,axis=1)
        
    
X_0_train = X_0_train.drop('building_type', axis=1)
X_2_train = X_2_train.drop('building_type', axis=1)
X_5_train = X_5_train.drop('building_type', axis=1)

X_1_train.building_type = X_1_train.building_type.astype('category')
X_1_train = pd.get_dummies(X_1_train)
X_3_train.building_type = X_3_train.building_type.astype('category')
X_3_train = pd.get_dummies(X_3_train)
X_4_train.building_type = X_4_train.building_type.astype('category')
X_4_train = pd.get_dummies(X_4_train)

        
        
train = pd.read_csv('./data/train_0821.csv', encoding='cp949')
test = pd.read_csv('./data/test_0821.csv', encoding='cp949')
# cluster_dict = train.groupby('building_number')['km_cluster'].first().to_dict()
# test['km_cluster'] = test['building_number'].map(cluster_dict)

test.solar_yes = test.solar_yes.astype(int)
test.ess_yes = test.ess_yes.astype(int)
test.sy_ey = test.sy_ey.astype(int)
test.sy_en = test.sy_en.astype(int)

# test.drop(['building_number','date_time','building_type'])

test['hour_sin'] = np.sin(2 * np.pi * test['hour']/23.0)
test['hour_cos'] = np.cos(2 * np.pi * test['hour']/23.0)

test['date_sin'] = -np.sin(2 * np.pi * (test['month']+test['day']/31)/12)
test['date_cos'] = -np.cos(2 * np.pi * (test['month']+test['day']/31)/12)

test['month_sin'] = -np.sin(2 * np.pi * test['month']/12.0)
test['month_cos'] = -np.cos(2 * np.pi * test['month']/12.0)
## cyclic transformation on weekday
test['weekday_sin'] = -np.sin(2 * np.pi * (test['weekday']+1)/7.0)
test['weekday_cos'] = -np.cos(2 * np.pi * (test['weekday']+1)/7.0)


test['work_time'] = 0
test['low_day'] = 0
test['particular'] = 0
tests = [test[test.building_number == num] for num in range(1,101,1)]
# aa = tests[9]
# aa['work_time']  = (aa['hour'].isin([6,7,8])).astype(int)
# aa.describe()

## cluster 0
for num in [10,13]:
    temp_df = tests[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7,8])).astype(int)
    
    tests[num-1] = temp_df
for num in [14]:
    temp_df = tests[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([6,7])).astype(int)
    temp_df['particular'] = temp_df['hour'].isin([0,8,12,18]).astype(int)
    tests[num-1] = temp_df
for num in [15]:
    temp_df = tests[num-1]
    temp_df['work_time'] = (temp_df['hour'].isin([4,5,6,7,8])).astype(int)
    tests[num-1] = temp_df

## cluster 1
for num in [17,18,19,21,22,69,70,72]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [26,28,76,78,80,82,83,84]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [71]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)*(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = (((temp_df['weekday']==5)&(temp_df['hour'].isin([10,11,13,14])))
                               |((temp_df['weekday'].isin([0,1,2,3,4,5]))&(temp_df['hour'].isin([0,1,2,3,4])))).astype(int)
    tests[num-1] = temp_df
for num in [53]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=4)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [55]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=19)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [50]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [56]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    temp_df['particular'] = ((temp_df['weekday']==2)&(temp_df['hour']==16)).astype(int)
    tests[num-1] = temp_df
for num in [60]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [20,23]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [58,59,77]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [45,46,47,48,49,51,52,73]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=6)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [27,74,79]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [57]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [24,25]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=8)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [29,30]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=17)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [31]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    temp_df['low_day'] = (temp_df['weekday'].isin([5,6])).astype(int)
    tests[num-1] = temp_df
for num in [50]:
    temp_df = tests[num-1]
    temp_df['particular'] = (((temp_df['weekday']==2)&(temp_df['hour']==16))|((temp_df['weekday']==2)&(temp_df['hour']==16))).astype(int)
    tests[num-1] = temp_df
for num in [1]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    tests[num-1] = temp_df
for num in [8]:
    temp_df = tests[num-1]
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    tests[num-1] = temp_df
for num in [12]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=5)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==6).astype(int)
    tests[num-1] = temp_df
for num in [88,90]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    tests[num-1] = temp_df
for num in [86]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=8)).astype(int)
    temp_df['particular'] = ((temp_df['month']==6)&(temp_df['day']==26)).astype(int)
    tests[num-1] = temp_df
for num in [87,89,91]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=9)).astype(int)
    tests[num-1] = temp_df
for num in [4]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=18)&(temp_df['hour']>=8)).astype(int)
    tests[num-1] = temp_df
for num in [7,16]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=5)).astype(int)
    tests[num-1] = temp_df
for num in [9]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=23)&(temp_df['hour']>=7)).astype(int)
    tests[num-1] = temp_df
for num in [11]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=21)&(temp_df['hour']>=7)).astype(int)
    tests[num-1] = temp_df
for num in [36]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=20)&(temp_df['hour']>=9)).astype(int)
    tests[num-1] = temp_df
for num in [75]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=15)&(temp_df['hour']>=7)).astype(int)
    tests[num-1] = temp_df
for num in [92]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']<=22)&(temp_df['hour']>=6)).astype(int)
    tests[num-1] = temp_df

## cluster 2
for num in [32,33,34]:
    temp_df = tests[num-1]
    temp_df = temp_df[temp_df.month>=7]
    tests[num-1] = temp_df
for num in [61,65,66,67,68]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    tests[num-1] = temp_df
for num in [62,63,64]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=23)).astype(int)
    tests[num-1] = temp_df

## cluster 3
for num in [93,94,96,97,98,100]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=6)&(temp_df['hour']<=22)).astype(int)
    tests[num-1] = temp_df
for num in [42,43,44]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=20)).astype(int)
    tests[num-1] = temp_df
for num in [2,85,99]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=21)).astype(int)
    tests[num-1] = temp_df
for num in [38,40,95]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=20)).astype(int)
    tests[num-1] = temp_df
for num in [3,81]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=9)&(temp_df['hour']<=21)).astype(int)
    tests[num-1] = temp_df
for num in [6,41]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=8)&(temp_df['hour']<=22)).astype(int)
    tests[num-1] = temp_df
for num in [37,39]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=21)).astype(int)
    tests[num-1] = temp_df
for num in [35]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=7)&(temp_df['hour']<=23)).astype(int)
    tests[num-1] = temp_df
for num in [54]:
    temp_df = tests[num-1]
    temp_df['work_time'] = ((temp_df['hour']>=5)&(temp_df['hour']<=20)).astype(int)
    temp_df['low_day'] = (temp_df['weekday']==0).astype(int)
    tests[num-1] = temp_df
for num in [5]:
    temp_df = tests[num-1]
    temp_df['particular'] = (((temp_df['weekday']==4)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==5)&(temp_df['hour'].isin([11,12,13,14,15,16,17,18,19,20])))|
                              ((temp_df['weekday']==6)&(temp_df['hour'].isin([8,9,10,11,12,13,14,15,16,17,18])))).astype(int)
    tests[num-1] = temp_df
    

df = pd.concat(tests, ignore_index=True)
df = df.drop(['date_time','hour','day','month','weekday','date'], axis=1)
df.building_type = df.building_type.astype('category')

df_0 = df[df.km_cluster==0]
df_1 = df[df.km_cluster==1]
df_2 = df[df.km_cluster==2]
df_3 = df[df.km_cluster==3]
df_4 = df[df.km_cluster==4]
df_5 = df[df.km_cluster==5]

X_0_test = df_0.drop('km_cluster', axis=1)
X_1_test = df_1.drop('km_cluster', axis=1)
X_2_test = df_2.drop('km_cluster', axis=1)
X_3_test = df_3.drop('km_cluster', axis=1)
X_4_test = df_4.drop('km_cluster', axis=1)
X_5_test = df_5.drop('km_cluster', axis=1)


# Standard Scaling for continuous data
con_li = ['hour_sin', 'hour_cos', 'date_sin', 'date_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos', 'temperature', 'windspeed', 
'humidity', 'total_area', 'cooling_area', 'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'discomfort', 'temperature_3', 
'discomfort_3', 'ma_dis_3', 'ma_dis_5', 'CDH', 'temperature_mean', 'discomfort_mean', 'CDH_mean', 'sensory_temperature']


cat_li = ['building_number','building_type','holiday','air_yes','solar_yes',
         'ess_yes','sy_ey','sy_en','low_day','particular']

T = [X_0_test, X_1_test, X_2_test, X_3_test, X_4_test, X_5_test]

for test in T :
    for col in con_li :
        if test[col].std() == 0:
            test = test.drop(col, axis=1)
        else :
            test[col] = (test[col] - test[col].mean())/test[col].std()

    for col in cat_li :
        if test[col].nunique() == 1:        
            test = test.drop(col,axis=1)
    test = pd.get_dummies(test)

X_0_test = X_0_test.drop('building_type', axis=1)
X_2_test = X_2_test.drop('building_type', axis=1)
X_5_test = X_5_test.drop('building_type', axis=1)

X_1_test.building_type = X_1_test.building_type.astype('category')
X_1_test = pd.get_dummies(X_1_test)
X_3_test.building_type = X_3_test.building_type.astype('category')
X_3_test = pd.get_dummies(X_3_test)
X_4_test.building_type = X_4_test.building_type.astype('category')
X_4_test = pd.get_dummies(X_4_test)


In [21]:
col_0 = ['building_number', 'total_area','hour_cos','holiday','date_sin','hour_sin','work_time','low_day','discomfort','temperature_mean']
col_1 = ['building_number', 'cooling_area','work_time','total_area','hour_cos','building_type_Hospital','low_day','solar_power_capacity',
        'hour_sin','holiday','building_type_University']
col_2 = ['building_number','cooling_area']
col_3 = ['building_number', 'work_time','total_area','cooling_area','hour_cos','hour_sin', 'date_sin','temperature']
col_4 = ['building_number','work_time','total_area','hour_cos','hour_sin','date_sin','cooling_area','solar_power_capacity','weekday_cos',
        'ess_capacity','discomfort']
col_5 = ['building_number','cooling_area','total_area','work_time','hour_sin','ma_dis_5','date_sin','hour_cos','CDH','holiday']

In [22]:
X_0_train = X_0_train[col_0]
X_1_train = X_1_train[col_1]
X_2_train = X_2_train[col_2]
X_3_train = X_3_train[col_3]
X_4_train = X_4_train[col_4]
X_5_train = X_5_train[col_5]

X_0_test = X_0_test[col_0]
X_1_test = X_1_test[col_1]
X_2_test = X_2_test[col_2]
X_3_test = X_3_test[col_3]
X_4_test = X_4_test[col_4]
X_5_test = X_5_test[col_5]

In [23]:
def smape(true, pred):
    v = 2 * abs(pred - true) / ((abs(pred) + abs(true)))
    result = np.mean(v) * 100
    return result

In [24]:
trains = [X_0_train, X_1_train, X_2_train, X_3_train, X_4_train, X_5_train]
tests = [X_0_test,X_1_test,X_2_test,X_3_test,X_4_test,X_5_test]
ys = [y_0_log, y_1_log,y_2_log,y_3_log,y_4_log,y_5_log]

In [None]:
cat_bestli = []
cat_paramli = []
for i, (train, y) in enumerate(zip(trains, ys)) :
    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique())
    def objective(trial):
        param = {}
        param['learning_rate'] = trial.suggest_float('learning_rate',0.02, 0.08, step=0.002)
        param['depth'] = trial.suggest_int('depth', 10, 16)
        param['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg',low=2.0, high=4, step=0.5)
        param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 2])
        param['grow_policy'] = 'Depthwise'
        param['iterations'] = 1000
        param['eval_metric'] = 'SMAPE'
        param['od_type'] = 'iter'
        param['od_wait'] = 20
        param['random_state'] = 1
        param['logging_level'] = 'Silent'
        regressor = CatBoostRegressor(**param)
        regressor.fit(X_train, y_train,early_stopping_rounds=50)
        sm = smape(np.exp(np.array(y_valid)), np.exp(np.array(regressor.predict(X_valid))))
        return sm

    study = optuna.create_study(study_name=f'catboost-seed')
    study.optimize(objective, n_trials=30, n_jobs=-1, timeout=24000)

    cat = study.best_trial
    cat_param = cat.params
    print('Best Trial: score {},\nparams {}'.format(cat.value, cat_param))

    cat_bestli.append(cat.value)
    cat_paramli.append(cat_3_param)

[I 2023-08-21 23:30:43,428] A new study created in memory with name: catboost-seed


In [7]:
et_bestli = []
et_paramli = []

for i, (train, y) in enumerate(zip(trains, ys)) :

    def objective(trial):
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 500),
            'max_depth' : trial.suggest_int('max_depth', 20, 30),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 5)
        }

        model = ExtraTreesRegressor(**param)
        model.fit(X_train,y_train)

        preds = model.predict(X_valid)

        sm = smape(np.exp(np.array(y_valid)), np.exp(np.array(preds)))

        return sm

    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique())

    study = optuna.create_study(study_name = f'ET Optuna_{i}')
    study.optimize(objective, n_trials=30, timeout=24000)

    et = study.best_trial
    et_param = et.params
    print('Best Trial: score {},\nparams {}'.format(et.value, et_param))

    et_bestli.append(et.value)
    et_paramli.append(et_param)

[I 2023-08-21 23:32:57,911] A new study created in memory with name: ET Optuna_0
[I 2023-08-21 23:33:16,628] Trial 0 finished with value: 3.774810216574874 and parameters: {'n_estimators': 304, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 0 with value: 3.774810216574874.
[I 2023-08-21 23:33:47,715] Trial 1 finished with value: 3.5854508967519454 and parameters: {'n_estimators': 426, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 1 with value: 3.5854508967519454.
[I 2023-08-21 23:34:01,508] Trial 2 finished with value: 3.947209006006102 and parameters: {'n_estimators': 256, 'max_depth': 21, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 1 with value: 3.5854508967519454.
[I 2023-08-21 23:34:32,893] Trial 3 finished with value: 3.378141133767868 and parameters: {'n_estimators': 332, 'max_depth': 22, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 3 with value: 3.378141133767868.
[I 2023-08-21 23

Best Trial: score 3.378141133767868,
params {'n_estimators': 332, 'max_depth': 22, 'min_samples_split': 3, 'min_samples_leaf': 2}


[I 2023-08-21 23:50:05,831] Trial 0 finished with value: 8.358423392223829 and parameters: {'n_estimators': 483, 'max_depth': 29, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 0 with value: 8.358423392223829.
[I 2023-08-21 23:51:02,956] Trial 1 finished with value: 8.373545068286598 and parameters: {'n_estimators': 338, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 8.358423392223829.
[I 2023-08-21 23:52:05,341] Trial 2 finished with value: 8.354473253797973 and parameters: {'n_estimators': 392, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 2 with value: 8.354473253797973.
[I 2023-08-21 23:52:47,631] Trial 3 finished with value: 8.371983492410457 and parameters: {'n_estimators': 259, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 2 with value: 8.354473253797973.
[I 2023-08-21 23:53:25,983] Trial 4 finished with value: 8.354552152109768 and parameters: {'n_estim

Best Trial: score 8.352258533650996,
params {'n_estimators': 206, 'max_depth': 26, 'min_samples_split': 4, 'min_samples_leaf': 4}


[I 2023-08-22 00:07:31,819] Trial 0 finished with value: 1.530537494573521 and parameters: {'n_estimators': 331, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.530537494573521.
[I 2023-08-22 00:07:32,282] Trial 1 finished with value: 1.5305374945735308 and parameters: {'n_estimators': 294, 'max_depth': 27, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.530537494573521.
[I 2023-08-22 00:07:33,000] Trial 2 finished with value: 1.5305374945734858 and parameters: {'n_estimators': 480, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 2 with value: 1.5305374945734858.
[I 2023-08-22 00:07:33,751] Trial 3 finished with value: 1.5305374945734898 and parameters: {'n_estimators': 486, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 2 with value: 1.5305374945734858.
[I 2023-08-22 00:07:34,823] Trial 4 finished with value: 1.5305374945734795 and parameters: {'n

Best Trial: score 1.5305374945734722,
params {'n_estimators': 495, 'max_depth': 24, 'min_samples_split': 5, 'min_samples_leaf': 3}


[I 2023-08-22 00:08:16,196] Trial 0 finished with value: 6.196489723313432 and parameters: {'n_estimators': 352, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 0 with value: 6.196489723313432.
[I 2023-08-22 00:08:37,586] Trial 1 finished with value: 6.054923003443085 and parameters: {'n_estimators': 241, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 1 with value: 6.054923003443085.
[I 2023-08-22 00:08:51,856] Trial 2 finished with value: 6.203381986811397 and parameters: {'n_estimators': 201, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 6.054923003443085.
[I 2023-08-22 00:09:14,768] Trial 3 finished with value: 6.067734262390501 and parameters: {'n_estimators': 257, 'max_depth': 28, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 1 with value: 6.054923003443085.
[I 2023-08-22 00:09:44,860] Trial 4 finished with value: 6.058009364806557 and parameters: {'n_estim

Best Trial: score 6.038840652673782,
params {'n_estimators': 324, 'max_depth': 27, 'min_samples_split': 3, 'min_samples_leaf': 2}


[I 2023-08-22 00:21:59,599] Trial 0 finished with value: 6.247664719439594 and parameters: {'n_estimators': 281, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 6.247664719439594.
[I 2023-08-22 00:22:25,494] Trial 1 finished with value: 6.764975333516476 and parameters: {'n_estimators': 229, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 6.247664719439594.
[I 2023-08-22 00:23:23,596] Trial 2 finished with value: 6.537984516313989 and parameters: {'n_estimators': 443, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 6.247664719439594.
[I 2023-08-22 00:24:24,326] Trial 3 finished with value: 6.524218389570298 and parameters: {'n_estimators': 470, 'max_depth': 26, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 0 with value: 6.247664719439594.
[I 2023-08-22 00:25:08,889] Trial 4 finished with value: 6.286083009193412 and parameters: {'n_estim

Best Trial: score 6.241914709256115,
params {'n_estimators': 340, 'max_depth': 29, 'min_samples_split': 2, 'min_samples_leaf': 2}


[I 2023-08-22 00:47:01,297] Trial 0 finished with value: 3.7498939161832 and parameters: {'n_estimators': 216, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 3.7498939161832.
[I 2023-08-22 00:47:17,988] Trial 1 finished with value: 3.75757363134062 and parameters: {'n_estimators': 379, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 0 with value: 3.7498939161832.
[I 2023-08-22 00:47:33,755] Trial 2 finished with value: 3.753908233008272 and parameters: {'n_estimators': 328, 'max_depth': 28, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 0 with value: 3.7498939161832.
[I 2023-08-22 00:47:50,690] Trial 3 finished with value: 3.7486110502207395 and parameters: {'n_estimators': 378, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 3 with value: 3.7486110502207395.
[I 2023-08-22 00:48:01,926] Trial 4 finished with value: 4.165210443852942 and parameters: {'n_estimators':

Best Trial: score 3.5027303012436617,
params {'n_estimators': 477, 'max_depth': 26, 'min_samples_split': 2, 'min_samples_leaf': 2}


In [10]:
pip install pycaret --user

Looking in indexes: http://ftp.daumkakao.com/pypi/simple
Collecting pycaret
  Downloading http://mirror.kakao.com/pypi/packages/f5/4b/2002980b046ac396618dfc152d384b812a78182b776ca77fe0ae5f80deac/pycaret-3.0.4-py3-none-any.whl (484 kB)
Collecting pyod>=1.0.8
  Using cached pyod-1.1.0-py3-none-any.whl
Collecting xxhash
  Downloading http://mirror.kakao.com/pypi/packages/a7/1e/88549866793ab18c9dd187a713cfe3c34fd344811600c0ecdee9ae587793/xxhash-3.2.0-cp39-cp39-win_amd64.whl (30 kB)
Collecting requests>=2.27.1
  Downloading http://mirror.kakao.com/pypi/packages/70/8e/0e2d847013cb52cd35b38c009bb167a1a26b2ce6cd6965bf26b47bc0bf44/requests-2.31.0-py3-none-any.whl (62 kB)
Collecting imbalanced-learn>=0.8.1
  Downloading http://mirror.kakao.com/pypi/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
Collecting tbats>=1.1.3
  Downloading http://mirror.kakao.com/pypi/packages/63/94/1949dc644c3fa05b736b988dc8058122f8c0187778f

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.1.5 requires pyqtwebengine<5.13, which is not installed.
conda-repo-cli 1.0.4 requires pathlib, which is not installed.
anaconda-project 0.10.1 requires ruamel-yaml, which is not installed.
jupyterlab-server 2.8.2 requires jupyter-server~=1.4, but you have jupyter-server 2.5.0 which is incompatible.
cookiecutter 1.7.2 requires Jinja2<3.0.0, but you have jinja2 3.1.2 which is incompatible.
cookiecutter 1.7.2 requires MarkupSafe<2.0.0, but you have markupsafe 2.1.2 which is incompatible.


In [8]:
from pycaret.regression import *

X_train, X_valid, y_train, y_valid = train_test_split(X_0_train, y_0_log, test_size=168*X_0_train.building_number.nunique(), random_state=42)

result_0 = setup(data=X_train, target=y_train, fold=5)

best_0 = compare_models(sort='mse')

In [17]:
train_smapeli = []

for i, (train, test, y) in enumerate(zip(trains, tests, ys)) :
    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique(), random_state=42)

    result = setup(data=X_train, target=y_train, fold=5)
    
    best = compare_models(sort='mse')
    best_tune = tune_model(best)
    pred = predict_model(best_tune)
    
    train_smape = smape(np.exp(np.array(y_valid)), np.exp(np.array(pred)))
    print(f"train {i} SMAPE : {train_smape}")
    train_smapeli.append(train_smape)

Unnamed: 0,Description,Value
0,Session id,2172
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(14976, 11)"
4,Transformed data shape,"(14976, 11)"
5,Transformed train set shape,"(10483, 11)"
6,Transformed test set shape,"(4493, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

ValueError: Estimator [] does not have the required fit() method.

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X_3_train, y_3_log, test_size=168*X_3_train.building_number.nunique())

train_3_merged = pd.DataFrame(X_train, columns=col_3)
train_3_merged['power_consumption'] = y_train
exp_3 = setup(train_3_merged, target='power_consumption',fold=5)

Unnamed: 0,Description,Value
0,Session id,496
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(29952, 9)"
4,Transformed data shape,"(29952, 9)"
5,Transformed train set shape,"(20966, 9)"
6,Transformed test set shape,"(8986, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [10]:
best_3 = compare_models(sort='mae')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.0613,0.0116,0.1074,0.9794,0.0134,0.0087,2.114
catboost,CatBoost Regressor,0.0639,0.0098,0.099,0.9825,0.0124,0.009,3.974
xgboost,Extreme Gradient Boosting,0.0641,0.0102,0.1007,0.9819,0.0126,0.0091,1.664
et,Extra Trees Regressor,0.0689,0.0159,0.126,0.9717,0.0155,0.0098,1.614
lightgbm,Light Gradient Boosting Machine,0.0731,0.0137,0.1171,0.9756,0.0145,0.0103,0.982
dt,Decision Tree Regressor,0.0743,0.0189,0.1375,0.9663,0.0171,0.0106,0.308
knn,K Neighbors Regressor,0.0836,0.0214,0.1462,0.9619,0.0178,0.0117,0.324
gbr,Gradient Boosting Regressor,0.1225,0.033,0.1816,0.9413,0.0223,0.0173,0.838
ada,AdaBoost Regressor,0.3029,0.1347,0.367,0.7602,0.044,0.0413,0.486
huber,Huber Regressor,0.3505,0.193,0.4392,0.6564,0.0544,0.0494,0.494


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [11]:
best_3

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X_0_train, y_0_log, test_size=168*X_3_train.building_number.nunique())

train_3_merged = pd.DataFrame(X_train, columns=col_0)
train_3_merged['power_consumption'] = y_train
exp_3 = setup(train_3_merged, target='power_consumption',fold=5)

Unnamed: 0,Description,Value
0,Session id,5236
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(13632, 11)"
4,Transformed data shape,"(13632, 11)"
5,Transformed train set shape,"(9542, 11)"
6,Transformed test set shape,"(4090, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


In [16]:
best_3 = compare_models(sort='mae')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0385,0.0045,0.0668,0.9871,0.0079,0.005,0.952
rf,Random Forest Regressor,0.0393,0.0044,0.066,0.9874,0.0078,0.0051,1.246
catboost,CatBoost Regressor,0.0415,0.0043,0.0653,0.9876,0.0078,0.0054,1.744
xgboost,Extreme Gradient Boosting,0.0439,0.0047,0.0683,0.9865,0.0081,0.0057,0.566
lightgbm,Light Gradient Boosting Machine,0.0455,0.0052,0.0718,0.9851,0.0085,0.006,0.386
dt,Decision Tree Regressor,0.0483,0.0081,0.0901,0.9767,0.0106,0.0063,0.296
knn,K Neighbors Regressor,0.0612,0.0114,0.1068,0.9672,0.0125,0.008,0.3
gbr,Gradient Boosting Regressor,0.0768,0.0126,0.1124,0.9637,0.0132,0.01,0.646
ada,AdaBoost Regressor,0.1529,0.0349,0.1869,0.8997,0.0212,0.0196,0.45
br,Bayesian Ridge,0.2203,0.0779,0.279,0.7763,0.0319,0.0284,0.248


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [17]:
best_3

In [32]:
trains = [X_0_train, X_2_train, X_3_train]
ys = [y_0_log, y_2_log, y_3_log]
cols = [col_0, col_2, col_3]

for i, (train, y, col) in enumerate(zip(trains, ys, cols)) :

    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=168*train.building_number.nunique())
    def objective(trial):
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 500),
        'criterion': 'absolute_error',
        'max_depth' : trial.suggest_int('max_depth', 20, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'bootstrap' : True
        }

        model = ExtraTreesRegressor(**param)
        model.fit(X_train,y_train)

        preds = model.predict(X_valid)

        sm = smape(np.exp(np.array(y_valid)), np.exp(np.array(preds)))

        return sm


    study = optuna.create_study(study_name = f'ET Optuna_cluster')
    study.optimize(objective, n_trials=30, timeout=24000)

    et = study.best_trial
    et_param = et.params
    print('Best Trial : score {},\nparams {}'.format(et.value, et_param))

    bestli.append(et.value)
    paramli.append(et_param)

[I 2023-08-23 22:06:30,815] A new study created in memory with name: ET Optuna_cluster
[W 2023-08-23 22:07:31,720] Trial 0 failed with parameters: {'n_estimators': 428, 'max_depth': 29, 'min_samples_split': 5, 'min_samples_leaf': 2} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\magne\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\magne\AppData\Local\Temp/ipykernel_8028/1633510177.py", line 19, in objective
    model.fit(X_train,y_train)
  File "C:\Users\magne\AppData\Roaming\Python\Python39\site-packages\sklearn\ensemble\_forest.py", line 473, in fit
    trees = Parallel(
  File "C:\Users\magne\AppData\Roaming\Python\Python39\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "C:\Users\magne\anaconda3\lib\site-packages\joblib\parallel.py", line 1855, in __call__
    return outp

KeyboardInterrupt: 