# Imports and settings

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from skopt.space import Integer
#!pip install xgboost
import xgboost

# Loading Data

In [2]:
#Importing data
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

# Labeling data
train_a['building_id'] = 'a'
train_b['building_id'] = 'b'
train_c['building_id'] = 'c'

X_train_estimated_a['building_id'] = 'a'
X_train_estimated_b['building_id'] = 'b'
X_train_estimated_c['building_id'] = 'c'

X_train_observed_a['building_id'] = 'a'
X_train_observed_b['building_id'] = 'b'
X_train_observed_c['building_id'] = 'c'

X_test_estimated_a['building_id'] = 'a'
X_test_estimated_b['building_id'] = 'b'
X_test_estimated_c['building_id'] = 'c'

# Combining data
all_observed_X = pd.concat([X_train_observed_a,X_train_observed_b,X_train_observed_c])
all_estimated_X = pd.concat([X_train_estimated_a,X_train_estimated_b,X_train_estimated_c])
all_y = pd.concat([train_a, train_b, train_c])
X_test = pd.concat([X_test_estimated_a,X_test_estimated_b,X_test_estimated_c])

# add type of weather data
all_observed_X['isObserved'] = 1
all_estimated_X['isObserved'] = 0
X_test['isObserved'] = 0

# Combining training data
all_X = pd.concat([all_estimated_X,all_observed_X])

# Aggregating all_X
all_X_aggregated = all_X.copy()
all_X_aggregated['time_hour'] = all_X_aggregated['date_forecast'].dt.floor('H')
all_X_aggregated = all_X_aggregated.groupby(['building_id','time_hour','isObserved']).mean().reset_index()

# Aggregating X_test
X_test_aggregated = X_test.copy()
X_test_aggregated['time_hour'] = X_test_aggregated['date_forecast'].dt.floor('H')
X_test_aggregated = X_test_aggregated.groupby(['building_id','time_hour','isObserved']).mean().reset_index()

# Renaming time column in all_y
all_y = all_y.rename(columns={'time': 'time_hour'})

# Merging all_X_aggregated and all_y
all_train = pd.merge(all_X_aggregated,all_y,on=['building_id','time_hour'],how='right') # right join to keep all y values

#Encoding date
all_X_aggregated['sin_mon'] = np.sin((all_X_aggregated['time_hour'].dt.month - 1)*np.pi/11)
all_X_aggregated['cos_mon'] = np.cos((all_X_aggregated['time_hour'].dt.month - 1)*np.pi/11)

all_X_aggregated['sin_hr']= np.sin(all_X_aggregated['time_hour'].dt.hour*np.pi/23)
all_X_aggregated['cos_hr']= np.sin(all_X_aggregated['time_hour'].dt.hour*np.pi/23)

X_test_aggregated['sin_mon'] = np.sin((X_test_aggregated['time_hour'].dt.month - 1)*np.pi/11)
X_test_aggregated['cos_mon'] = np.cos((X_test_aggregated['time_hour'].dt.month - 1)*np.pi/11)

X_test_aggregated['sin_hr']= np.sin(X_test_aggregated['time_hour'].dt.hour*np.pi/23)
X_test_aggregated['cos_hr']= np.sin(X_test_aggregated['time_hour'].dt.hour*np.pi/23)
# 

# Data Analysis

## Preprocessing for Data Analysis

In [3]:
# define groups of variables

sun_features_list = ['clear_sky_energy_1h:J', 'clear_sky_rad:W', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J', 'is_day:idx', 'is_in_shadow:idx', 'sun_elevation:d']

humidity_features_list = ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'dew_point_2m:K', 't_1000hPa:K']

snow_features_list = ['fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm']

cloud_height_features_list = ['ceiling_height_agl:m', 'cloud_base_agl:m']

feature_groups = [
    {
        'name': 'sun',
        'features': sun_features_list
    },
    {
        'name': 'humidity',
        'features': humidity_features_list
    },
    {
        'name': 'snow',
        'features': snow_features_list
    },
    {
        'name': 'cloud_height',
        'features': cloud_height_features_list
    }
]

all_pca_features = sun_features_list + humidity_features_list + snow_features_list + cloud_height_features_list

In [4]:
all_data = all_X_aggregated.merge(all_y,on=['time_hour','building_id'],how='left')

# Pre-Processing

### Data cleaning

In [5]:
# drop non useful features
drop_features = [
    'snow_density:kgm3', # always 250 or nan
    'date_calc', # TODO put dates back in, only for testing
    'date_forecast', # TODO
    'time_hour', # TODO
    'elevation:m'
]

all_data = all_data.drop(columns=drop_features)

In [6]:
#fill as mean, (iterative imputer)
all_data['cloud_base_agl:m'].fillna(all_data['cloud_base_agl:m'].mean(), inplace=True)
all_data['ceiling_height_agl:m'].fillna(all_data['ceiling_height_agl:m'].mean(), inplace=True)


In [7]:
#same for test dataset
X_test_aggregated = X_test_aggregated.drop(['date_forecast','date_calc','snow_density:kgm3','time_hour','elevation:m'],axis=1) # TODO remove columns at better cell
X_test_aggregated['cloud_base_agl:m'].fillna(X_test_aggregated['cloud_base_agl:m'].mean(), inplace=True)
X_test_aggregated['ceiling_height_agl:m'].fillna(X_test_aggregated['ceiling_height_agl:m'].mean(), inplace=True)

# Modeling

In [8]:
estimators = [
    {
        "estimator": RandomForestRegressor(random_state=42, n_estimators=20, max_depth=20),
    },
    #{
    #    "estimator": AdaBoostRegressor(DecisionTreeRegressor(max_depth=10),random_state=42,n_estimators=200),
   # },
    #{
    #    "estimator": xgboost.XGBRegressor(n_estimators=200, max_depth= 20),
    #},
    #{
    #    "estimator": MLPRegressor(random_state=42)
    #},
]

In [17]:
# remove rows that are NaN in target column
all_data = all_data[~all_data['pv_measurement'].isna()]

# shuffle all_data to have approximately the same distribution of buildings and observed/estimated in each fold of CV
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True) # TODO turn back on?

X_test = X_test_aggregated

# define X, y and X_test
X = all_data.drop(['pv_measurement'], axis=1)
y = all_data['pv_measurement']

sun_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))
])
humidity_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca' , PCA(n_components=2))
])
snow_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=1))
])
cloud_height_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=1))
])

# set column transformer
columnTransformer = ColumnTransformer(
    transformers = [
        ('pca_sun', sun_pipeline, sun_features_list),
        ('pca_humidity', humidity_pipeline, humidity_features_list),
        ('pca_snow', snow_pipeline, snow_features_list),
        ('pca_cloud_height', cloud_height_pipeline, cloud_height_features_list),
        ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), ['building_id']),
    ],
    remainder='passthrough',  # Include other columns
)

# build the pipeline
pipelinePrep = Pipeline([
    ('columnTransformer', columnTransformer),
])

prep_model = pipelinePrep.fit(X)

X = pipelinePrep.fit_transform(X)
X_test = pipelinePrep.fit_transform(X_test)
prep_model.get_feature_names_out()

array(['pca_sun__pca0', 'pca_sun__pca1', 'pca_humidity__pca0',
       'pca_humidity__pca1', 'pca_snow__pca0', 'pca_cloud_height__pca0',
       'oneHotEncoder__building_id_a', 'oneHotEncoder__building_id_b',
       'oneHotEncoder__building_id_c', 'remainder__isObserved',
       'remainder__dew_or_rime:idx', 'remainder__effective_cloud_cover:p',
       'remainder__msl_pressure:hPa', 'remainder__precip_5min:mm',
       'remainder__precip_type_5min:idx', 'remainder__pressure_100m:hPa',
       'remainder__pressure_50m:hPa', 'remainder__prob_rime:p',
       'remainder__rain_water:kgm2',
       'remainder__relative_humidity_1000hPa:p',
       'remainder__sfc_pressure:hPa', 'remainder__snow_depth:cm',
       'remainder__snow_drift:idx', 'remainder__snow_melt_10min:mm',
       'remainder__snow_water:kgm2', 'remainder__sun_azimuth:d',
       'remainder__super_cooled_liquid_water:kgm2',
       'remainder__total_cloud_cover:p', 'remainder__visibility:m',
       'remainder__wind_speed_10m:ms', 'rem

In [60]:
"""cutoff = int(len(X)*0.8)

X_train = X[:cutoff]
X_val = X[cutoff:]
y_train = y[:cutoff]
y_val = y[cutoff:]

models = []

# main training function
for estimator in estimators:
  # set pca pipelines
  # build the pipeline
    pipeline_model = Pipeline([
        ('estimator', estimator.get("estimator"))
    ])

    model = pipeline_model.fit(X_train,y_train)
    models.append(model)"""

'cutoff = int(len(X)*0.8)\n\nX_train = X[:cutoff]\nX_val = X[cutoff:]\ny_train = y[:cutoff]\ny_val = y[cutoff:]\n\nmodels = []\n\n# main training function\nfor estimator in estimators:\n  # set pca pipelines\n  # build the pipeline\n    pipeline_model = Pipeline([\n        (\'estimator\', estimator.get("estimator"))\n    ])\n\n    model = pipeline_model.fit(X_train,y_train)\n    models.append(model)'

In [10]:
estimators = [
    {
        "estimator": RandomForestRegressor(random_state=42),
        "parameters": {
            'estimator__n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
            'estimator__max_depth': Integer(1, 32),
        }
    },
    {
        "estimator": AdaBoostRegressor(DecisionTreeRegressor(),random_state=42),
        "parameters": {
            'estimator__n_estimators': [500, 1000, 2000],
            'learning_rate':[.001,0.01,.1],

        }
    },
    {
        "estimator": xgboost.XGBRegressor(),
        "parameters": {
            'estimator__n_estimators': Integer(10, 500),
            'estimator__max_depth': Integer(2, 10),
            'estimator__eta': [0.01, 0.05, 0.1, 0.15, 0.2],
            'estimator__min_child_weight': [1,5,20,200],
            'estimator__max_depth': Integer(3,10)

        }
    },
    {
        "estimator": MLPRegressor(random_state=42),
        "parameters": {
            'estimator__max_iter': Integer(10, 1000),
            #'estimator__hidden_layer_sizes': [(150,100,50), (120,80,40), (100,50,30)], when knowing n_layers
            'estimator__n_layers': Integer(2, 6),
        }
    },
]


models_tuned = []
for estimator in estimators:
  # set pca pipelines
  # build the pipeline
    pipeline_model = Pipeline([
        ('estimator', estimator.get("estimator"))
    ])

    parameters = estimator.get("parameters") 

    bayes_search_estimator = BayesSearchCV(
        pipeline_model, parameters, scoring='neg_mean_absolute_error', cv=3, error_score='raise', n_jobs=-1, verbose=10, n_iter=5, random_state=42)
    
    bayes_search_estimator.fit(X, y)
    
    models_tuned.append(bayes_search_estimator)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [None]:
model_number = 0
with open('readme.txt', 'w') as f:
    for model in models_tuned:
        f.write("\nMODEL "+str(models_tuned[model_number].best_estimator_.named_steps)+"\n")
        f.write("best params: "+str(models_tuned[model_number].best_params_)+"\n")
        f.write("best score "+str(models_tuned[model_number].best_score_)+"\n")
        model_number+=1

In [None]:
y_pred_0 = models[0].predict(X_val)

mae = mean_absolute_error(y_val, y_pred_0)
mae

91.57987733486269

In [None]:
y_pred_1 = models[1].predict(X_val)

mae = mean_absolute_error(y_val, y_pred_1)
mae

IndexError: list index out of range

In [None]:
y_pred_2 = models[2].predict(X_val)

mae = mean_absolute_error(y_val, y_pred_2)
mae

146.378556846439

In [None]:
plt.figure()

#plt.plot(y_pred_0, color='blue')
#plt.plot(y_pred_1, color='red')
#plt.plot(np.array(y_val), color='green')
#plt.plot(y_pred_2, color='cyan')

plt.plot(np.array(y_val)-y_pred_2)


NameError: name 'y_pred_2' is not defined

<Figure size 640x480 with 0 Axes>

In [None]:
sample_submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')
test['prediction'] = models[-1].predict(X_test)
sample_submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('nn.csv', index=False)