In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV, RFE
import xgboost

from skopt import BayesSearchCV

train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

# Adding building ID

In [2]:
train_c.shape

(32155, 2)

In [3]:

train_a['building_id'] = 'a'
train_b['building_id'] = 'b'
train_c['building_id'] = 'c'

X_train_estimated_a['building_id'] = 'a'
X_train_estimated_b['building_id'] = 'b'
X_train_estimated_c['building_id'] = 'c'

X_train_observed_a['building_id'] = 'a'
X_train_observed_b['building_id'] = 'b'
X_train_observed_c['building_id'] = 'c'

X_test_estimated_a['building_id'] = 'a'
X_test_estimated_b['building_id'] = 'b'
X_test_estimated_c['building_id'] = 'c'

# Aggregating data

In [184]:
X_test = pd.concat([X_test_estimated_a, X_test_estimated_b, X_test_estimated_c])
X_test['time'] = X_test['date_forecast'].dt.floor('H')
X_test = X_test.groupby(['building_id', 'time']).mean().reset_index()

X_test['delta_forecast'] = (X_test['time']-X_test['date_calc']).apply(lambda x: x.total_seconds() / 3600)

Xt2 = pd.DataFrame(X_test['delta_forecast'])

X_test.drop(['date_calc','delta_forecast'], axis=1, inplace=True)

X_observed = pd.concat([X_train_observed_a, X_train_observed_b, X_train_observed_c])
X_observed['time'] = X_observed['date_forecast'].dt.floor('H')

y_train = pd.concat([train_a, train_b, train_c])

X_observed_grouped = X_observed.groupby(['building_id', 'time']).mean()

# combine and remove rows with missing values in y
Xy_observed = pd.merge(X_observed_grouped, y_train, on=['time', 'building_id'], how='inner')
Xy_observed = Xy_observed[Xy_observed['pv_measurement'].notna()]


X = Xy_observed.drop(['pv_measurement'], axis=1)
y = Xy_observed['pv_measurement']

# Preprocessing

In [5]:

drop_cols = ['time', 'date_forecast', 'snow_density:kgm3']
X = X.drop(drop_cols, axis=1)

# get y mean per building id
mean_y_per_building = y.groupby(Xy_observed['building_id']).mean()

# divide y by mean per building id
y = y.groupby(Xy_observed['building_id']).transform(lambda x: x / mean_y_per_building[x.name]) 



In [6]:
# setting types of columns
categorical_features = [
    'building_id'
]

impute_features = [
    "cloud_base_agl:m",
    "ceiling_height_agl:m",
]


# Building the pipeline

In [7]:
from skopt.space import Real, Categorical, Integer

parameters = {
    'estimator__n_estimators': Integer(450,500),
    'estimator__max_depth': Integer(3, 10),
    'estimator__learning_rate': Real(0.01, 0.5),
    'estimator__subsample': Real(0.5, 1.0),
    'estimator__colsample_bytree': Real(0.5, 1.0),
    'estimator__gamma': Real(0, 5),
    'estimator__reg_alpha': Real(0, 5),
    'estimator__reg_lambda': Real(0, 5),
}


In [8]:
y.shape

(82026,)

In [9]:
y.isna().sum()

0

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# set column transformer
columnTransformer = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='mean'),impute_features),
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',  # Include other columns
    n_jobs=-1
)

# build the pipeline
pipeline = Pipeline(steps=[
    ('columnTransformer', columnTransformer),
    ('estimator', xgboost.XGBRegressor())
])

# create bayesian search estimator
bayes_search_estimator = BayesSearchCV(
    pipeline, parameters, scoring='neg_mean_absolute_error', cv=3, error_score='raise', n_jobs=-1, verbose=1, n_iter=100, random_state=42)

# fit the estimator on the data
bayes_search_estimator.fit(X, y)

# fit best model on the data
best_model = bayes_search_estimator.best_estimator_
best_model.fit(X, y)



Fitting 3 folds for each of 1 candidates, totalling 3 fits


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [11]:
# print the scores
print('Best score:', bayes_search_estimator.best_score_)
print('Best parameters:', bayes_search_estimator.best_params_)
print('All scores:', bayes_search_estimator.cv_results_['mean_test_score'])


Best score: -0.37468744936578596
Best parameters: OrderedDict([('estimator__colsample_bytree', 0.6872786118665584), ('estimator__gamma', 0.623464377174465), ('estimator__learning_rate', 0.02687088180940387), ('estimator__max_depth', 10), ('estimator__n_estimators', 500), ('estimator__reg_alpha', 5.0), ('estimator__reg_lambda', 5.0), ('estimator__subsample', 0.5)])
All scores: [-0.4221026  -0.39507542 -0.40100943 -0.39640834 -0.39534579 -0.40777012
 -0.39225509 -0.40346881 -0.42624648 -0.42648582 -0.39727678 -0.4026648
 -0.39428632 -0.43057517 -0.39408829 -0.38943497 -0.39392249 -0.41355665
 -0.39583224 -0.38658653 -0.39018153 -0.42352983 -0.39088456 -0.46128561
 -0.4286951  -0.39273645 -0.4609386  -0.40106421 -0.42215645 -0.38176678
 -0.39114733 -0.38809308 -0.4014927  -0.39473086 -0.42018432 -0.38879529
 -0.40245185 -0.38588384 -0.40293128 -0.39667133 -0.39524811 -0.39393822
 -0.39182151 -0.39105611 -0.40159423 -0.4159902  -0.39810992 -0.40006663
 -0.38390212 -0.42627133 -0.39642022 -

In [29]:
y_test_pred = best_model.predict(X_test)

test = pd.concat([X_test, pd.Series(y_test_pred)], axis=1)

# multiply by mean per building id
test = pd.merge(test, mean_y_per_building, on='building_id', how='left')
y_pred = test[0] * test['pv_measurement']


In [30]:
y_pred = y_pred.reset_index(name='prediction')
y_pred.rename(columns={'index': 'id'}, inplace=True)
y_pred.to_csv('XGBOOST.csv', header=True, index=False)


In [32]:
y_pred.loc[y_pred['prediction'] < 0,'prediction'] = 0
y_pred.to_csv('XGBOOST_zeroed_non_standardized.csv', header=True, index=False)


# Model2

In [193]:
X_estimated = pd.concat([X_train_estimated_a, X_train_estimated_b, X_train_estimated_c])
X_estimated['time'] = X_estimated['date_forecast'].dt.floor('H')
X_estimated['delta_forecast'] = (X_estimated['time']-X_estimated['date_calc']).apply(lambda x: x.total_seconds() / 3600)

X_estimated_grouped = X_estimated.groupby(['building_id', 'time']).mean()

# combine and remove rows with missing values in y
Xy_estimated = pd.merge(X_estimated_grouped, y_train, on=['time', 'building_id'], how='inner')
Xy_estimated = Xy_estimated[Xy_observed['pv_measurement'].notna()]


Xe = Xy_estimated.drop(['pv_measurement'], axis=1)
ye = Xy_estimated['pv_measurement']

  Xy_estimated = Xy_estimated[Xy_observed['pv_measurement'].notna()]


In [194]:
Xe2 = pd.DataFrame(Xe['delta_forecast'])

# divide y by mean per building id
ye = ye.groupby(Xy_estimated['building_id']).transform(lambda x: x / mean_y_per_building[x.name]) 


In [195]:
y_pred1 = best_model.predict(Xe)



In [196]:
Xe2['pred'] = y_pred1

## second as LinearRegression

In [197]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(Xe2,ye)

## second as XGBoost

In [218]:
parameters2 = {
    'estimator__n_estimators': Integer(20,21),
    'estimator__max_depth': Integer(3, 10),
    'estimator__learning_rate': Real(0.01, 0.5),
    'estimator__subsample': Real(0.5, 1.0),
    'estimator__colsample_bytree': Real(0.5, 1.0),
    'estimator__gamma': Real(0, 5),
    'estimator__reg_alpha': Real(0, 5),
    'estimator__reg_lambda': Real(0, 5),
}

In [219]:
bayes_search_estimator2 = BayesSearchCV(
    xgboost.XGBRegressor(), parameters2, scoring='neg_mean_absolute_error', cv=3, error_score='raise', n_jobs=-1, verbose=1, n_iter=20, random_state=42)

# fit the estimator on the data
bayes_search_estimator2.fit(Xe2, ye)

# fit best model on the data
best_model2 = bayes_search_estimator2.best_estimator_
best_model2.fit(Xe2, ye)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

Parameters: { "estimator__colsample_bytree", "estimator__gamma", "estimator__learning_rate", "estimator__max_depth", "estimator__n_estimators", "estimator__reg_alpha", "estimator__reg_lambda", "estimator__subsample" } are not used.

Parameters: { "estimator__colsample_bytree", "estimator__gamma", "estimator__learning_rate", "estimator__max_depth", "estimator__n_estimators", "estimator__reg_alpha", "estimator__reg_lambda", "estimator__subsample" } are not used.



In [221]:
# print the scores
print('Best score:', bayes_search_estimator2.best_score_)
print('Best parameters:', bayes_search_estimator2.best_params_)
print('All scores:', bayes_search_estimator2.cv_results_['mean_test_score'])


Best score: -0.05828693531431781
Best parameters: OrderedDict([('estimator__colsample_bytree', 0.705051979426657), ('estimator__gamma', 3.6386287158866253), ('estimator__learning_rate', 0.4671053194354386), ('estimator__max_depth', 5), ('estimator__n_estimators', 21), ('estimator__reg_alpha', 2.0705931624276923), ('estimator__reg_lambda', 1.7546566744957202), ('estimator__subsample', 0.8697521170952103)])
All scores: [-0.05828694 -0.05828694 -0.05828694 -0.05828694 -0.05828694 -0.05828694
 -0.05828694 -0.05828694 -0.05828694 -0.05828694 -0.05828694 -0.05828694
 -0.05828694 -0.05828694 -0.05828694 -0.05828694 -0.05828694 -0.05828694
 -0.05828694 -0.05828694]


# Final prevision

In [198]:
Xt2['pred'] = best_model.predict(X_test)

In [204]:
#y_test_pred = linear.predict(Xt2)
y_test_pred = best_model2.predict(Xt2)

test = pd.concat([X_test, pd.Series(y_test_pred)], axis=1)

# multiply by mean per building id
test = pd.merge(test, mean_y_per_building, on='building_id', how='left')
y_pred = test[0] * test['pv_measurement']

y_pred = y_pred.reset_index(name='prediction')
y_pred.rename(columns={'index': 'id'}, inplace=True)
#y_pred.to_csv('XGBOOST.csv', header=True, index=False)

y_pred.loc[y_pred['prediction'] < 0,'prediction'] = 0
y_pred.to_csv('XGBOOST_zeroed_non_standardized_double.csv', header=True, index=False)


#