In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor,ExtraTreesRegressor, VotingRegressor, StackingRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV, RFE
import xgboost
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from skopt.space import Real, Categorical, Integer
from lightgbm import LGBMRegressor

from skopt import BayesSearchCV

# Target frames, one per building directory (A, B, C).
train_a, train_b, train_c = (
    pd.read_parquet(path)
    for path in (
        'A/train_targets.parquet',
        'B/train_targets.parquet',
        'C/train_targets.parquet',
    )
)

# "Estimated" training features for each building.
X_train_estimated_a, X_train_estimated_b, X_train_estimated_c = (
    pd.read_parquet(path)
    for path in (
        'A/X_train_estimated.parquet',
        'B/X_train_estimated.parquet',
        'C/X_train_estimated.parquet',
    )
)

# "Observed" training features for each building.
X_train_observed_a, X_train_observed_b, X_train_observed_c = (
    pd.read_parquet(path)
    for path in (
        'A/X_train_observed.parquet',
        'B/X_train_observed.parquet',
        'C/X_train_observed.parquet',
    )
)

# "Estimated" features for the test period of each building.
X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = (
    pd.read_parquet(path)
    for path in (
        'A/X_test_estimated.parquet',
        'B/X_test_estimated.parquet',
        'C/X_test_estimated.parquet',
    )
)


# Stamp every loaded frame with the letter of the building it belongs to, so
# rows stay attributable after the per-building frames are concatenated.
frames_by_building = {
    'a': (train_a, X_train_estimated_a, X_train_observed_a, X_test_estimated_a),
    'b': (train_b, X_train_estimated_b, X_train_observed_b, X_test_estimated_b),
    'c': (train_c, X_train_estimated_c, X_train_observed_c, X_test_estimated_c),
}
for building, frames in frames_by_building.items():
    for frame in frames:
        frame['building_id'] = building

# Stack the three test frames, bucket each row into the hour its
# `date_forecast` falls in, and average every column per (building, hour).
X_test = (
    pd.concat([X_test_estimated_a, X_test_estimated_b, X_test_estimated_c])
    .assign(time=lambda frame: frame['date_forecast'].dt.floor('H'))
    .groupby(['building_id', 'time'])
    .mean()
    .reset_index()
)

# Hours between the (averaged) forecast calculation time and the hourly bucket.
# Kept as a separate Series because `date_calc` is removed from X_test below.
X_test_delta = (X_test['time'] - X_test['date_calc']).apply(
    lambda gap: gap.total_seconds() / 3600
)

X_test = X_test.drop(columns=['date_calc'])

# Observed features of all buildings, bucketed into the hour of `date_forecast`.
X_observed = pd.concat([X_train_observed_a, X_train_observed_b, X_train_observed_c])
X_observed['time'] = X_observed['date_forecast'].dt.floor('H')

# Targets of all buildings in one frame.
y_train = pd.concat([train_a, train_b, train_c])

# Hourly mean of every feature per building; the group keys stay in the index
# and are matched by name in the merge below.
X_observed_grouped = X_observed.groupby(['building_id', 'time']).mean()

# Keep only hours present in both features and targets, then discard rows
# whose target is missing.
Xy_observed = X_observed_grouped.merge(y_train, on=['time', 'building_id'], how='inner')
Xy_observed = Xy_observed.dropna(subset=['pv_measurement'])

# Split into model inputs and target.
y = Xy_observed['pv_measurement']
X = Xy_observed.drop(columns=['pv_measurement'])

# Columns excluded from the model inputs (timestamps plus a set of weather
# variables).
drop_cols = [
    'time',
    'date_forecast',
    'snow_density:kgm3',
    'clear_sky_energy_1h:J',
    'direct_rad_1h:J',
    'fresh_snow_24h:cm',
    'fresh_snow_1h:cm',
    'fresh_snow_12h:cm',
    'diffuse_rad_1h:J',
    'dew_point_2m:K',
    'dew_or_rime:idx',
    'precip_5min:mm',
    'fresh_snow_6h:cm',
    'prob_rime:p',
    'ceiling_height_agl:m',
    'rain_water:kgm2',
    'sfc_pressure:hPa',
    'snow_depth:cm',
    'snow_drift:idx',
    'snow_melt_10min:mm',
    'snow_water:kgm2',
    'pressure_50m:hPa',
    'wind_speed_w_1000hPa:ms',
    'pressure_100m:hPa',
    'fresh_snow_3h:cm',
]

# Add the calendar month as a feature (it must be derived before `time` is
# dropped), then remove the excluded columns from both train and test inputs.
X = X.assign(time_month=X['time'].dt.month).drop(columns=drop_cols)
X_test = X_test.assign(time_month=X_test['time'].dt.month).drop(columns=drop_cols)
# Mean target per building; reused later to normalise other target series.
building_labels = Xy_observed['building_id']
mean_y_per_building = y.groupby(building_labels).mean()

# Express each target value as a multiple of its building's mean, so the
# three buildings share a common scale.
y = y.groupby(building_labels).transform(
    lambda group: group / mean_y_per_building[group.name]
)

# Column roles consumed by the ColumnTransformer definitions further down.

# Columns that get one-hot encoded.
categorical_features = ['building_id']

# Columns whose missing values are filled by the imputer.
# ("ceiling_height_agl:m" is not listed: it is removed via drop_cols.)
impute_features = ["cloud_base_agl:m"]


In [2]:
# Seed shared by every estimator below. Without it ExtraTreesRegressor and
# HistGradientBoostingRegressor draw fresh randomness on every fit, so a
# Restart & Run All would not reproduce the same models.
RANDOM_STATE = 42

# LightGBM regressor with fixed hyperparameters.
# NOTE(review): `subsample` only takes effect in LightGBM when
# `subsample_freq` > 0, which is not set here - confirm whether row
# subsampling was intended.
LGB = LGBMRegressor(
    colsample_bytree = 0.5,
    learning_rate = 0.1,
    max_depth = 11,
    min_child_samples = 20,
    min_child_weight = 10,
    n_estimators = 100,
    num_leaves = 100,
    reg_alpha = 0.57,
    reg_lambda = 0.45,
    subsample = 0.9,
    random_state = RANDOM_STATE
)

# Extra-trees regressor with fixed hyperparameters.
XT = ExtraTreesRegressor(
    max_depth=10,
    min_samples_split= 3,
    n_estimators=173,
    random_state=RANDOM_STATE
)

# XGBoost regressor with fixed hyperparameters.
XGB = xgboost.XGBRegressor(
    colsample_bytree = 0.5,
    gamma = 0,
    learning_rate = 0.07,
    max_depth = 10,
    n_estimators = 174,
    reg_alpha = 0,
    reg_lambda = 5,
    subsample = 1,
    random_state = RANDOM_STATE
)

# Histogram-based gradient boosting regressor with fixed hyperparameters.
HGB = HistGradientBoostingRegressor(
    learning_rate=0.04,
    max_depth=10,
    max_iter=1000,
    min_samples_leaf=20,
    random_state=RANDOM_STATE
)

In [4]:
# Preprocessing: mean-impute the columns in `impute_features`, one-hot encode
# the columns in `categorical_features`, and hand every other column through
# unchanged.
columnTransformer = ColumnTransformer(
    [
        ('imputer', SimpleImputer(strategy='mean'), impute_features),
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Preprocessing followed by the LightGBM regressor.
pipeline = Pipeline([
    ('columnTransformer', columnTransformer),
    ('estimator', LGB),
])

# Train on the observed-weather data with the per-building normalised target.
pipeline.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4420
[LightGBM] [Info] Number of data points in the train set: 82026, number of used features: 26
[LightGBM] [Info] Start training from score 1.000000


In [91]:
# Named base learners shared by both ensembles.
estimators = [
    ('lgbgf', LGB),
    ('xgb', XGB),
    ('hgb', HGB),
]

# Voting ensemble over the base learners; it receives its own list object.
Vote = VotingRegressor(list(estimators))

# Stacking ensemble over the same base learners (final estimator left at the
# library default).
Stack = StackingRegressor(estimators=estimators)

In [92]:
# Fresh preprocessing for the voting pipeline: mean-impute the columns in
# `impute_features`, one-hot encode the columns in `categorical_features`,
# and hand every other column through unchanged.
columnTransformer = ColumnTransformer(
    [
        ('imputer', SimpleImputer(strategy='mean'), impute_features),
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Preprocessing followed by the voting ensemble.
pipeline = Pipeline([
    ('columnTransformer', columnTransformer),
    ('estimator', Vote),
])

# First-stage model, trained on the observed-weather data.
# `fit` returns the pipeline itself, so `vote_model1` and `pipeline` are the
# same object.
vote_model1 = pipeline.fit(X, y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4420
[LightGBM] [Info] Number of data points in the train set: 82026, number of used features: 26
[LightGBM] [Info] Start training from score 1.000000


In [93]:
# Build the second-stage training set from the "estimated" weather features.
X_train_estimated = pd.concat([X_train_estimated_a, X_train_estimated_b, X_train_estimated_c])

# Bucket rows into the hour of `date_forecast` and average per (building, hour).
X_train_estimated['time'] = X_train_estimated['date_forecast'].dt.floor('H')
X_train_estimated = X_train_estimated.groupby(['building_id', 'time']).mean().reset_index()

# Forecast lead time in hours. It is computed AFTER aggregation, from the
# floored hour and the averaged `date_calc`, because that is exactly how
# X_test_delta is derived for the test set. Computing it per raw row from the
# un-floored `date_forecast` and then averaging would shift the training
# feature by the mean intra-hour offset relative to the test feature.
X_train_estimated['delta_forecasting'] = (
    X_train_estimated['time'] - X_train_estimated['date_calc']
).apply(lambda x: x.total_seconds() / 3600)
X_test['delta_forecasting'] = X_test_delta

# Attach targets; keep only hours present in both frames with a non-missing target.
Xy_estimated = pd.merge(X_train_estimated, y_train, on=['time', 'building_id'], how='inner')
Xy_estimated = Xy_estimated[Xy_estimated['pv_measurement'].notna()]

Xe = Xy_estimated.drop(['pv_measurement'], axis=1)
ye = Xy_estimated['pv_measurement']
Xe['time_month'] = Xe['time'].dt.month

# Normalise with the per-building means computed on the observed-weather
# targets, so both stages predict on the same scale.
ye = ye.groupby(Xy_estimated['building_id']).transform(lambda x: x / mean_y_per_building[x.name])

# Same exclusions as for the observed-weather features, plus `date_calc`.
drop_cols = ['time', 'date_forecast', 'snow_density:kgm3','date_calc','clear_sky_energy_1h:J',
 'direct_rad_1h:J',
 'fresh_snow_24h:cm',
 'fresh_snow_1h:cm',
 'fresh_snow_12h:cm',
 'diffuse_rad_1h:J',
 'dew_point_2m:K',
 'dew_or_rime:idx',
 'precip_5min:mm',
 'fresh_snow_6h:cm',
 'prob_rime:p',
 'ceiling_height_agl:m',
 'rain_water:kgm2',
 'sfc_pressure:hPa',
 'snow_depth:cm',
 'snow_drift:idx',
 'snow_melt_10min:mm',
 'snow_water:kgm2',
 'pressure_50m:hPa',
 'wind_speed_w_1000hPa:ms',
 'pressure_100m:hPa',
 'fresh_snow_3h:cm']
Xe = Xe.drop(drop_cols, axis=1)

In [94]:
# First-stage predictions for the estimated-weather training rows and for the
# test rows, in that order.
y_pred_e, y_pred_test = (
    vote_model1.predict(features) for features in (Xe, X_test)
)




In [95]:
# Expose the first-stage prediction as an input feature for the second stage.
Xe = Xe.assign(first_pred=y_pred_e)
X_test = X_test.assign(first_pred=y_pred_test)

In [96]:
# Second-stage regressor. Seeded so that re-running the notebook reproduces
# the same forest.
rf = RandomForestRegressor(random_state=42)

# Dedicated preprocessing for the second stage: mean-impute the columns in
# `impute_features`, one-hot encode the columns in `categorical_features`,
# and pass every other column through unchanged.
columnTransformer2 = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='mean'),impute_features),
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',  # Include other columns
    n_jobs=-1
)

# build the pipeline
# The pipeline must use `columnTransformer2`. `columnTransformer` is the
# object inside the already-fitted first-stage pipeline (`vote_model1`);
# fitting it again here on `Xe`, which has extra columns, would refit it in
# place and break any later `vote_model1.predict` call.
pipeline2 = Pipeline(steps=[
    ('columnTransformer', columnTransformer2),
    ('estimator', rf)
])

# Train on the estimated-weather rows (including the first-stage prediction)
# against the per-building normalised target.
second_model = pipeline2.fit(Xe,ye)