In [62]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV, RFE
import xgboost

train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')


X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

# Adding building ID

In [63]:

train_a['building_id'] = 'a'
train_b['building_id'] = 'b'
train_c['building_id'] = 'c'

X_train_estimated_a['building_id'] = 'a'
X_train_estimated_b['building_id'] = 'b'
X_train_estimated_c['building_id'] = 'c'

X_train_observed_a['building_id'] = 'a'
X_train_observed_b['building_id'] = 'b'
X_train_observed_c['building_id'] = 'c'

X_test_estimated_a['building_id'] = 'a'
X_test_estimated_b['building_id'] = 'b'
X_test_estimated_c['building_id'] = 'c'

# Aggregating data

In [64]:
all_observed_X = pd.concat([X_train_observed_a,X_train_observed_b,X_train_observed_c])
all_estimated_X = pd.concat([X_train_estimated_a,X_train_estimated_b,X_train_estimated_c])

X_test = pd.concat([X_test_estimated_a,X_test_estimated_b,X_test_estimated_c])
X_train = pd.concat([all_observed_X,all_estimated_X])

y_train = pd.concat([train_a,train_b,train_c])

X_train['time'] = X_train['date_forecast'].dt.floor('H')
X_train_grouped = X_train.groupby(['building_id','time']).mean()

X_test['time'] = X_test['date_forecast'].dt.floor('H')
X_test_grouped = X_test.groupby(['building_id','time']).mean()

data_train = X_train_grouped.merge(y_train,on=['time','building_id'],how='left')


# Some basic preprocessing

In [65]:
#Removing samples where there is not pv_measurement
data_train = data_train.dropna(subset=['pv_measurement'])

# DateTime removal
#Keeping only month and hour because more relevant for pv_measurement
data_train['time_month'] = data_train['time'].dt.month
data_train['time_hour'] = data_train['time'].dt.hour

X_test = X_test_grouped.reset_index()
X_test['time_month'] = X_test['time'].dt.month
X_test['time_hour'] = X_test['time'].dt.hour


# Splitting observed and estimated (preprocess of date_calc)
data_train_o = data_train[data_train['date_calc'].isna()]
data_train_e = data_train[data_train['date_calc'].notna()]

pd.options.mode.chained_assignment = None
data_train_e['delta_forecasting'] = (data_train_e['time']-data_train_e['date_calc']).apply(lambda x: x.total_seconds() / 3600)
X_test['delta_forecasting'] = (X_test['time']-X_test['date_calc']).apply(lambda x: x.total_seconds() / 3600)

data_train_o = data_train_o.sort_values(by='date_forecast')

data_train_o.drop(columns=['time','date_forecast','date_calc'],inplace=True)
data_train_e.drop(columns=['time','date_forecast','date_calc'],inplace=True)
X_test.drop(columns=['time','date_forecast','date_calc'],inplace=True)

#Snow density
data_train_o.drop('snow_density:kgm3',axis=1, inplace=True)
data_train_e.drop('snow_density:kgm3',axis=1, inplace=True)
X_test.drop('snow_density:kgm3',axis=1, inplace=True)

#Elevation
data_train_o.drop('elevation:m',axis=1, inplace=True)
data_train_e.drop('elevation:m',axis=1, inplace=True)
X_test.drop('elevation:m',axis=1, inplace=True)

#Cloud and ceiling
data_train_o['cloud_base_agl:m'].fillna(data_train_o['cloud_base_agl:m'].mean(), inplace=True)
data_train_o['ceiling_height_agl:m'].fillna(data_train_o['ceiling_height_agl:m'].mean(), inplace=True)

data_train_e['cloud_base_agl:m'].fillna(data_train_e['cloud_base_agl:m'].mean(), inplace=True)
data_train_e['ceiling_height_agl:m'].fillna(data_train_e['ceiling_height_agl:m'].mean(), inplace=True)

X_test['cloud_base_agl:m'].fillna(X_test['cloud_base_agl:m'].mean(), inplace=True)
X_test['ceiling_height_agl:m'].fillna(X_test['ceiling_height_agl:m'].mean(), inplace=True)


#Resetting indexes
data_train_o.reset_index(inplace=True,drop=True)
data_train_e.reset_index(inplace=True,drop=True)
X_test.reset_index(inplace=True,drop=True)

#One hot encoding
#data_train_o = pd.get_dummies(data_train_o, columns=['building_id'], prefix=['id'],dtype=int)
#data_train_e = pd.get_dummies(data_train_e, columns=['building_id'], prefix=['id'],dtype=int)
#X_test = pd.get_dummies(X_test, columns=['building_id'], prefix=['id'],dtype=int)

# Correlated Feature


In [66]:
#Making assumption only on data_train_o, should be same also for other dataset
#corrmat = data_train_o.corr()

#groups
#groups = []
#assigned_features = set()
#for col in data_train_o.columns:
#    if col not in assigned_features:
#        highly_correlated_features = corrmat.index[corrmat[col].abs() > 0.6].tolist()
#        highly_correlated_features.remove(col)  # Remove col itself
#        if highly_correlated_features:
#            highly_correlated_group = [col] + highly_correlated_features
#            groups.append(highly_correlated_group)
#            assigned_features.update(highly_correlated_group)
            #if highly_correlated_group not in groups:
            #    groups.append(highly_correlated_group)

#for group in groups:
#    print(f"Group: {', '.join(group)}")

#groups that make sense
#Humidity Group: absolute_humidity_2m:gm3, air_density_2m:kgm3, dew_point_2m:K, t_1000hPa:K
#Clouds height Group: ceiling_height_agl:m, cloud_base_agl:m
#Sun Group: clear_sky_energy_1h:J, clear_sky_rad:W, diffuse_rad:W, diffuse_rad_1h:J, direct_rad:W, direct_rad_1h:J, is_day:idx, is_in_shadow:idx, sun_elevation:d
#Cloud cover Group: effective_cloud_cover:p, total_cloud_cover:p
#Snow Group: fresh_snow_12h:cm, fresh_snow_24h:cm, fresh_snow_3h:cm, fresh_snow_6h:cm, fresh_snow_1h:cm,
#Pressure Group: msl_pressure:hPa, pressure_100m:hPa, pressure_50m:hPa, sfc_pressure:hPa

#f, ax = plt.subplots(figsize=(12, 9))
#sns.heatmap(corrmat, vmax=.8, square=True)

In [67]:
#PCA function
def pca_analysis(df, features, feature_prefix, n_components, scalers):
    df = df.copy(deep=True)

    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df[features])

    pca_transformer = PCA(n_components=n_components)
    df_pca = pd.DataFrame(pca_transformer.fit_transform(df_scaled))
    df_pca.columns = [feature_prefix + "_" + str(i+1) for i in range(n_components)]

    df = df.drop(features, axis=1)
    #df = pd.concat([df, df_pca], axis=1, ignore_index=True)
    df = pd.concat([df, pd.DataFrame(df_pca)], axis=1)
    

    scalers |= {feature_prefix: {'scaler': scaler, 'pca_transformer': pca_transformer}}

    return df, scalers

In [68]:
#Performing PCA on correlated groups
# define groups of variables

humidity_features_list = ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'dew_point_2m:K', 't_1000hPa:K']

cloud_height_features_list = ['ceiling_height_agl:m', 'cloud_base_agl:m']

sun_features_list = ['clear_sky_energy_1h:J', 'clear_sky_rad:W', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J', 'is_day:idx', 'is_in_shadow:idx', 'sun_elevation:d']

cloud_cover_features_list = ['effective_cloud_cover:p', 'total_cloud_cover:p']

snow_features_list = ['fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm']

pressure_feature_list = ['msl_pressure:hPa', 'pressure_100m:hPa', 'pressure_50m:hPa', 'sfc_pressure:hPa']

time_hour_features_list = ['time_hour', 'sun_azimuth:d']

all_groups = [humidity_features_list,cloud_height_features_list,sun_features_list,cloud_cover_features_list,snow_features_list,pressure_feature_list,time_hour_features_list]
all_groups_names = ['humidity','cloud_height','sun','cloud_cover','snow','pressure','time_hour']
n_pca = [2,1,3,1,2,2,1]

data_train_o_pca = data_train_o.copy()
scalers = {}

In [69]:
#Applying PCA 
for group, name, n in zip(all_groups, all_groups_names, n_pca):
    data_train_o_pca, scalers = pca_analysis(data_train_o_pca, group, name, n, scalers)

In [70]:
"""corrmat = data_train_o_pca.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)"""

'corrmat = data_train_o_pca.corr(numeric_only=True)\nf, ax = plt.subplots(figsize=(12, 9))\nsns.heatmap(corrmat, vmax=.8, square=True)'

# Feature Importance

In [71]:
#data_train_o = data_train_o_pca.copy()
#random_forest = RandomForestRegressor(n_estimators=20, random_state=42)
#X = data_train_o.drop(columns=['pv_measurement'])
#y= data_train_o['pv_measurement']

#random_forest.fit(X, y)


In [72]:
"""
feature_importance_series = pd.Series(
    data=random_forest.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

plt.figure(figsize=(15,5))
plt.xticks(rotation=90)

sns.barplot(
    x=feature_importance_series.index,
    y=feature_importance_series.values
)"""

'\nfeature_importance_series = pd.Series(\n    data=random_forest.feature_importances_,\n    index=X.columns\n).sort_values(ascending=False)\n\nplt.figure(figsize=(15,5))\nplt.xticks(rotation=90)\n\nsns.barplot(\n    x=feature_importance_series.index,\n    y=feature_importance_series.values\n)'

In [73]:
y = data_train_o['pv_measurement']

# get y mean per building id
mean_y_per_building = y.groupby(data_train_o['building_id']).mean()

# divide y by mean per building id
y = y.groupby(data_train_o['building_id']).transform(lambda x: x / mean_y_per_building[x.name]) 

X = data_train_o.drop(columns=['pv_measurement'])


In [None]:
X = pd.get_dummies(X, columns=['building_id'], prefix=['id'],dtype=int)

In [78]:
#X_train_xgb = X[:57418]
#y_train_xgb = y[:57418]

#X_test_xgb = X[57418:]
#y_test_xgb = y[57418:]
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

parameters = {
    #'estimator__n_estimators': Integer(50, 500),
    'estimator__max_depth': [10],
    'estimator__learning_rate': [0.01],
    'estimator__subsample': [0.9],
    'estimator__colsample_bytree': [0.5],
    'estimator__gamma': [4.7],
    'estimator__reg_alpha': [4.1],
    'estimator__reg_lambda': [3],
}

model1 = xgboost.XGBRegressor(n_estimators = 450,
                              max_depth = 10, 
                              learning_rate = 0.01,
                              estimator_subsample = 0.9,
                              colsample_bytree = 0.5,
                              gamma = 4.7,
                              reg_alpha = 4.1,
                              reg_lambda = 3)

model1.fit(X,y, verbose=True)





Parameters: { "estimator_subsample" } are not used.

