In [241]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV, RFE
import xgboost

from skopt import BayesSearchCV

# Every site directory (A, B, C) contains the same four parquet files.
# Each file kind is read for the three sites in turn and unpacked into
# variables carrying the lower-case site suffix.

# Target files.
train_a, train_b, train_c = (
    pd.read_parquet(f'{site}/train_targets.parquet') for site in ('A', 'B', 'C')
)

# Training features from the X_train_estimated files.
X_train_estimated_a, X_train_estimated_b, X_train_estimated_c = (
    pd.read_parquet(f'{site}/X_train_estimated.parquet') for site in ('A', 'B', 'C')
)

# Training features from the X_train_observed files.
X_train_observed_a, X_train_observed_b, X_train_observed_c = (
    pd.read_parquet(f'{site}/X_train_observed.parquet') for site in ('A', 'B', 'C')
)

# Test features from the X_test_estimated files.
X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = (
    pd.read_parquet(f'{site}/X_test_estimated.parquet') for site in ('A', 'B', 'C')
)

# Adding building ID

In [242]:
# Quick look at the (rows, columns) dimensions of the site C target frame.
train_c.shape

(32155, 2)

In [243]:

# Label every per-site frame with its lower-case site letter so that rows can
# still be told apart (and grouped) once the sites are concatenated.
frames_by_building = {
    'a': (train_a, X_train_estimated_a, X_train_observed_a, X_test_estimated_a),
    'b': (train_b, X_train_estimated_b, X_train_observed_b, X_test_estimated_b),
    'c': (train_c, X_train_estimated_c, X_train_observed_c, X_test_estimated_c),
}

for building_label, site_frames in frames_by_building.items():
    for site_frame in site_frames:
        # Adds (or overwrites) the column in place on the original frame.
        site_frame['building_id'] = building_label

# Aggregating data

In [244]:
# --- Test features -----------------------------------------------------------
# Stack the three sites, bucket each row into its hour and average all rows
# that share the same (building, hour).
X_test = pd.concat([X_test_estimated_a, X_test_estimated_b, X_test_estimated_c])
# 'h' is the current hourly alias; the upper-case 'H' spelling is deprecated
# in recent pandas releases and emits a FutureWarning there.
X_test['time'] = X_test['date_forecast'].dt.floor('h')
X_test = X_test.groupby(['building_id', 'time']).mean().reset_index()
# date_calc is not used downstream; drop it without mutating in place.
X_test = X_test.drop(columns=['date_calc'])

# --- Training features (observed files only) ---------------------------------
X_observed = pd.concat([X_train_observed_a, X_train_observed_b, X_train_observed_c])
X_observed['time'] = X_observed['date_forecast'].dt.floor('h')

# Targets of all three sites in one frame.
y_train = pd.concat([train_a, train_b, train_c])

# Hourly mean per building; building_id and time stay in the index here and
# are matched by name in the merge below.
X_observed_grouped = X_observed.groupby(['building_id', 'time']).mean()

# Join features with targets on (time, building) and keep only rows that
# actually have a measured target.
Xy_observed = pd.merge(X_observed_grouped, y_train, on=['time', 'building_id'], how='inner')
Xy_observed = Xy_observed[Xy_observed['pv_measurement'].notna()]

# Split into feature matrix and target vector.
X = Xy_observed.drop(['pv_measurement'], axis=1)
y = Xy_observed['pv_measurement']

# Preprocessing

In [245]:

# Columns excluded from the model input: both timestamps and snow density.
drop_cols = ['time', 'date_forecast', 'snow_density:kgm3']
X = X.drop(columns=drop_cols)

# Building label of every training row, used as the grouping key below.
building_ids = Xy_observed['building_id']

# Mean target per building; kept so predictions can be scaled back later.
mean_y_per_building = y.groupby(building_ids).mean()


def _scale_by_building_mean(group):
    """Divide one building's targets by that building's mean target."""
    # Relies on ``group.name`` holding the group key (the building id), which
    # is then used to look up the matching mean.
    return group / mean_y_per_building[group.name]


# Express every target relative to its building's mean.
y = y.groupby(building_ids).transform(_scale_by_building_mean)



In [246]:
# Column roles used when the ColumnTransformer is assembled further down.

# Columns that get one-hot encoded.
categorical_features = ['building_id']

# Columns whose missing values get filled by the imputer.
impute_features = ["cloud_base_agl:m", "ceiling_height_agl:m"]


# Building the pipeline

In [247]:
from skopt.space import Real, Categorical, Integer

# Search space for the XGBoost regressor, written with the plain XGBoost
# argument names.
_xgb_space = {
    'n_estimators': Integer(50, 500),
    'max_depth': Integer(3, 10),
    'learning_rate': Real(0.01, 0.5),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.5, 1.0),
    'gamma': Real(0, 5),
    'reg_alpha': Real(0, 5),
    'reg_lambda': Real(0, 5),
}

# The 'estimator__' prefix addresses the pipeline step named 'estimator'.
parameters = {
    f'estimator__{arg_name}': dimension
    for arg_name, dimension in _xgb_space.items()
}


In [248]:
# Size of the target vector that will be passed to the search.
y.shape

(82026,)

In [249]:
# Count NaNs left in the target after the per-building scaling; rows without
# a pv_measurement were already filtered out when Xy_observed was built.
y.isna().sum()

0

In [250]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Preprocessing step: mean-impute the columns listed in impute_features,
# one-hot encode the categorical ones, and pass every other column through.
columnTransformer = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='mean'), impute_features),
        # Categories not seen during fit are ignored rather than raising.
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',  # Include other columns
    n_jobs=-1,
)

# Full model: preprocessing followed by an XGBoost regressor. The step name
# 'estimator' is what the 'estimator__*' keys in `parameters` refer to.
pipeline = Pipeline(steps=[
    ('columnTransformer', columnTransformer),
    ('estimator', xgboost.XGBRegressor()),
])

# Bayesian hyper-parameter search: 100 iterations, 3-fold CV, scored by
# negative mean absolute error. Any failing fit raises immediately.
bayes_search_estimator = BayesSearchCV(
    pipeline,
    parameters,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_iter=100,
    # Spelled out because the code below depends on it: with refit=True the
    # best configuration is retrained on all of X, y at the end of the search.
    refit=True,
    error_score='raise',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search (and the final refit on the full data).
bayes_search_estimator.fit(X, y)

# Already fitted on the full training data by the refit above, so no further
# fit call is needed (it would only repeat the same training).
best_model = bayes_search_estimator.best_estimator_



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [251]:
# Report the outcome of the hyper-parameter search: best CV score, the
# settings that achieved it, and the mean CV score of every iteration.
search_summary = (
    ('Best score:', bayes_search_estimator.best_score_),
    ('Best parameters:', bayes_search_estimator.best_params_),
    ('All scores:', bayes_search_estimator.cv_results_['mean_test_score']),
)
for summary_label, summary_value in search_summary:
    print(summary_label, summary_value)


Best score: -113.50109339209769
Best parameters: OrderedDict([('estimator__colsample_bytree', 1.0), ('estimator__gamma', 4.719328384245778), ('estimator__learning_rate', 0.01), ('estimator__max_depth', 10), ('estimator__n_estimators', 500), ('estimator__reg_alpha', 4.150674618612672), ('estimator__reg_lambda', 2.9704809601841458), ('estimator__subsample', 0.8823683637661639)])
All scores: [-245.75317996 -133.16641911 -139.29007633 -131.99639738 -140.10388662
 -150.49637602 -157.31680197 -193.89879572 -208.84015006 -246.51922911
 -120.81184144 -126.91495004 -125.24056604 -118.6155655  -119.2277078
 -121.27076704 -118.66365506 -118.6519584  -118.72272348 -153.82600271
 -118.55618988 -116.09955251 -114.84771489 -114.87278677 -114.68838954
 -114.55949535 -114.4770843  -114.24096008 -114.07135782 -114.11214001
 -114.17734983 -114.11417592 -114.01896219 -113.98399731 -113.85088295
 -113.62830923 -113.78324783 -113.82907348 -113.84289685 -113.64992786
 -113.72484377 -113.91673024 -113.6726531

In [252]:
# Predict on exactly the columns (and column order) the model was trained on.
# X_test still carries columns that were dropped from X (timestamps, snow
# density); selecting X.columns makes the model input explicit instead of
# relying on the transformer to pick columns out of a wider frame.
y_test_pred = best_model.predict(X_test[X.columns])

# Attach the predictions by position as column 0, so the pairing does not
# depend on X_test having a 0..n-1 index.
test = X_test.copy()
test[0] = y_test_pred

# Undo the per-building scaling of the target: bring in each building's mean
# pv_measurement and multiply the scaled prediction by it.
test = pd.merge(test, mean_y_per_building, on='building_id', how='left')
y_pred = test[0] * test['pv_measurement']


In [253]:
# Convert the prediction Series into a two-column frame: the former index
# becomes 'id' and the values become 'prediction'.
y_pred = (
    y_pred
    .reset_index(name='prediction')
    .rename(columns={'index': 'id'})
)

# Write the raw predictions (negative values still included).
y_pred.to_csv('XGBOOST.csv', header=True, index=False)


In [254]:
# Replace every negative prediction with 0, leaving all other values as is,
# then write the adjusted predictions to their own file.
is_negative = y_pred['prediction'].lt(0)
y_pred['prediction'] = y_pred['prediction'].mask(is_negative, 0)
y_pred.to_csv('XGBOOST_zeroed_non_standardized.csv', header=True, index=False)


In [255]:
# Display the final prediction frame (columns: id, prediction) after negative
# predictions were set to 0.
y_pred

Unnamed: 0,id,prediction
0,0,674.145521
1,1,674.145521
2,2,674.145521
3,3,674.145521
4,4,674.145521
...,...,...
2155,2155,82.288069
2156,2156,82.288069
2157,2157,82.288069
2158,2158,82.288069


#