# Submission

<br>

### Imports

In [1]:
import numpy as np
import pandas as pd

from evtdemand import data, feature, suite

In [12]:
# Run configuration passed to `suite.run_parameterised_model(**params)`; the
# `data_kwargs`, `features_kwargs` and `cols_subset` entries are also reused by
# the `data` / `feature` helper calls further down the notebook.
params = dict(
    data_dir='../data',
    # Both targets currently use the same estimator class and hyper-parameters.
    model_1='lightgbm.LGBMRegressor',
    model_2='lightgbm.LGBMRegressor',
    model_1_kwargs=dict(num_leaves=50, n_estimators=100, reg_lambda=0.1),
    model_2_kwargs=dict(num_leaves=50, n_estimators=100, reg_lambda=0.1),
    data_kwargs=dict(
        real_power_sub_dir='real_power',
        weather_sub_dir='weather',
        real_power_time_period='_pre_september',
        real_power_site='Staplegrove_CB905',
        weather_sites=['staplegrove'],  # 'mousehole' is currently left out
        weather_grid_points=None,
        weather_interpolate_method='interpolate',
        use_target_delta=False,
    ),
    # Target columns for model 1 and model 2 respectively.
    y1_col='value_max',
    y2_col='value_min',
    split_kwargs=dict(n_splits=5, shuffle=False),
    cols_subset=None,
    features_kwargs=dict(
        features=['temporal', 'dir_speed', 'lagged', 'demand', 'solar', 'roc'],
        feature_lags={
            'value': [1, 2, 3, 4, 5, 6, 48, 96, 336],
            # Only the `_1` weather series are lagged. The `_2` to `_5`
            # `solar_irradiance_staplegrove_*` / `temperature_staplegrove_*`
            # series are currently disabled (they would use the same [1, 2] lags).
            'solar_irradiance_staplegrove_1': [1, 2],
            'temperature_staplegrove_1': [1, 2],
        },
        roc_features={'value': 3},
        sites=['staplegrove'],
        grid_points=None,
    ),
)

In [13]:
%%time

# Run the parameterised model suite with the configuration in `params`, then
# unpack the four results it returns.
run_outputs = suite.run_parameterised_model(**params)
model_suite, error_metrics, df_pred, input_data = run_outputs

# Last expression of the cell: display the error metrics.
error_metrics

Wall time: 10.5 s


{'y1_rmse': 9.564830632306506,
 'y2_rmse': 10.61546968401076,
 'combined_rmse': 14.33432205518877,
 'skill_score': 0.46333919980844857}

In [21]:
# Build the training feature/target frames from the configuration in `params`.
data_dir = params['data_dir']

# Stage 1: baseline features and targets loaded via the `data` module.
baseline_train_features, df_train_target = data.construct_baseline_features_target_dfs(
    data_dir,
    **params['data_kwargs'],
)

# Stage 2: engineered features (only the first returned item is needed here).
enriched_train_features, *_ = feature.create_additional_features(
    baseline_train_features,
    **params['features_kwargs'],
)

# Stage 3: final processing, optionally restricted to `cols_subset`.
df_train_features = feature.process_features(
    enriched_train_features,
    cols_subset=params['cols_subset'],
)

In [22]:
# Build the feature matrix for the submission period (`time_period='_september'`).
#
# The site and sub-directory names are read from `params['data_kwargs']` instead of
# being repeated as literals, so the submission inputs cannot silently drift away
# from the configuration used to build the training data.
data_kwargs = params['data_kwargs']

df_observation_submission = data.load_real_power_dataset(
    f"{data_dir}/{data_kwargs['real_power_sub_dir']}",
    site=data_kwargs['real_power_site'],
    real_power_variable='observation_variable_half_hourly',
    time_period='_september',
)
df_weather = data.load_weather_df(
    f"{data_dir}/{data_kwargs['weather_sub_dir']}",
    sites=list(data_kwargs['weather_sites']),  # copy so the config list is never shared
)

# Keep only the timestamps present in both the observations and the weather data.
common_idxs = df_observation_submission.index.intersection(df_weather.index)

df_submission_features = df_observation_submission.loc[common_idxs].copy()
df_submission_features[df_weather.columns] = df_weather.loc[common_idxs].copy()

# The submission rows are stacked underneath the training rows before feature
# engineering and sliced back out afterwards.
# NOTE(review): presumably this lets lagged features at the start of the submission
# period look back into the training data - confirm in `feature.create_additional_features`.
df_train_features_subset, df_train_target_subset = data.construct_baseline_features_target_dfs(data_dir, **data_kwargs)

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat`
# is the drop-in equivalent (index preserved, columns unioned without sorting).
df_features_combined = pd.concat([df_train_features_subset, df_submission_features])

df_submission_features, *_ = feature.create_additional_features(
    df_features_combined,
    # Targets are reindexed onto the combined index (submission rows have no target rows).
    df_train_target_subset.reindex(df_features_combined.index),
    **params['features_kwargs'],
)
df_submission_features = feature.process_features(df_submission_features, cols_subset=params['cols_subset'])
df_submission_features = df_submission_features.loc[common_idxs]

X_submission = df_submission_features.values

# Last expression of the cell: display (n_rows, n_features) as a sanity check.
X_submission.shape

(1413, 78)

In [23]:
# Positional inputs for `run_submission`, in the order the call expects them:
# training targets, training features, submission-period features.
submission_inputs = (df_train_target, df_train_features, df_submission_features)

# NOTE: this rebinds `df_pred`, replacing the value returned earlier by
# `suite.run_parameterised_model`.
df_pred = model_suite.run_submission(*submission_inputs, save_submission=True)

# Last expression of the cell: preview the first predictions.
df_pred.head()

Unnamed: 0_level_0,value_max,value_min
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-01 00:00:00,0.638257,0.587787
2021-09-01 00:30:00,0.616864,0.565895
2021-09-01 01:00:00,0.578921,0.530042
2021-09-01 01:30:00,0.556713,0.489746
2021-09-01 02:00:00,0.531277,0.483053


In [26]:
# Optionally archive the parameters used for this run to a timestamped YAML file.
# Flip `save` to True to write the file; the path is built either way.
save = False

# Timestamp uses '-' rather than ':' in the time part of the file name.
timestamp = pd.Timestamp.now().strftime("%Y-%m-%d %H-%M-%S")
fp = f'../data/params/archive/params - {timestamp}.yml'

# Plain truthiness test instead of `== True` (PEP 8: don't compare booleans with ==).
if save:
    suite.save_params(input_data, fp)

In [None]:
# do feature permutation importance
# the `run_submission` method of the suite class should shuffle the indexes before training
# add in a long-term trend term
# get the settlement period feature working and compare with time-of-day
# try xgboost

# give the spread distribution params with a 12 month lag
# look into resampling later dates more - weighted bootstrapping?
# start visualising the model tuning/decisions (LIME?) - https://www.scikit-yb.org/en/latest/api/model_selection/validation_curve.html
# can we create a `feels like` temperature?
# try a stacked regressor - http://rasbt.github.io/mlxtend/user_guide/regressor/StackingRegressor/
# create an example pytorch model with skorch - https://github.com/skorch-dev/skorch
# get some automated model tuning running with skopt or sk-deap