# 1 Imports

In [None]:
import pandas as pd
from sklearn.feature_selection import f_regression, mutual_info_regression
from ev_load_fc.config import CFG, resolve_path
from ev_load_fc.data.preprocessing import get_holidays
from ev_load_fc.features.feature_creation import aggregate_features, date_features, lag_features, rolling_window_features

# --- Configuration ------------------------------------------------------
# Cut-off used further down to separate the training period from the rest.
split_date = CFG["data"]["preprocessing"]["split_date"]
# Holiday names used to build the holiday calendar features.
holiday_list = list(CFG["features"]["feature_engineering"]["holidays"])

# --- Load the combined dataset ------------------------------------------
processed = resolve_path(CFG["paths"]["processed_data"])
combined_file = CFG["files"]["combined_filename"]
combined = pd.read_csv(
    processed / combined_file,
    parse_dates=["timestamp"],
    index_col="timestamp",
)

# Time span covered by the data; reused when building the holiday calendar.
min_timestamp, max_timestamp = combined.index.min(), combined.index.max()

# Column used as the prediction target, and every other column as a candidate input.
target = "energy"
inputs = [name for name in combined.columns if name != target]

# 2 Feature Creation

## Assess base features

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every base column
combined.describe()

Unnamed: 0,energy,energy_outlier,fog_moderate_dur,fog_severe_dur,rain_heavy_dur,rain_light_dur,rain_moderate_dur,storm_severe_dur,temp,temp_imputed,...,distance_flow-incident_3,distance_flow-incident_4,duration_congestion_0,duration_congestion_1,duration_congestion_2,duration_flow-incident_0,duration_flow-incident_1,duration_flow-incident_2,duration_flow-incident_3,duration_flow-incident_4
count,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,...,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0,22632.0
mean,44.327834,0.211382,3.362778,0.527969,0.003535,4.109047,0.123186,0.077015,15.25258,0.078031,...,0.003662,0.000392,21.748087,46.412374,12.344992,2.04847,2.167078,1.645355,0.062873,0.002033
std,46.08896,0.408298,16.670334,6.261206,0.387701,19.099182,2.474323,2.021632,5.228711,0.268226,...,0.088655,0.041738,62.72729,94.944797,37.559739,11.307468,11.068575,9.766494,1.526671,0.226276
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.725,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,33.733423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,72.0995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.3,0.0,...,0.0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0
max,297.30527,1.0,180.0,167.0,56.0,180.0,95.0,60.0,40.6,1.0,...,4.4,4.44,909.332832,764.833208,526.833208,260.0,263.833208,195.833208,60.0,30.1


In [4]:
# Column names available before any feature creation
combined.columns

Index(['energy', 'energy_outlier', 'fog_moderate_dur', 'fog_severe_dur',
       'rain_heavy_dur', 'rain_light_dur', 'rain_moderate_dur',
       'storm_severe_dur', 'temp', 'temp_imputed', 'distance_congestion_0',
       'distance_congestion_1', 'distance_congestion_2',
       'distance_flow-incident_0', 'distance_flow-incident_1',
       'distance_flow-incident_2', 'distance_flow-incident_3',
       'distance_flow-incident_4', 'duration_congestion_0',
       'duration_congestion_1', 'duration_congestion_2',
       'duration_flow-incident_0', 'duration_flow-incident_1',
       'duration_flow-incident_2', 'duration_flow-incident_3',
       'duration_flow-incident_4'],
      dtype='object')

## Sum features 

In [5]:
# NOTE(review): if aggregate_features selects source columns by substring,
# re-running this cell would fold outputs such as 'rain_dur' back into
# themselves — confirm, and run it only once per kernel session.

# Aggregate weather event duration by weather event type
for weather_type in ('rain', 'fog'):
    combined = aggregate_features(df=combined, out_name=f'{weather_type}_dur', substr1=weather_type)

# Aggregate traffic event distance and duration by traffic event type
# (output column name, substring identifying the source columns)
for out_name, pattern in (
    ('dis_cong', 'distance_congestion'),
    ('dis_flow', 'distance_flow'),
    ('dur_cong', 'duration_congestion'),
    ('dur_flow', 'duration_flow'),
):
    combined = aggregate_features(df=combined, out_name=out_name, substr1=pattern)

# Aggregate traffic event distance and duration by severity level (suffixes _0 .. _4).
# Distance totals for all levels are created first, then duration totals,
# which keeps the resulting column order stable.
for prefix, measure in (('dis', 'distance'), ('dur', 'duration')):
    for level in range(5):
        combined = aggregate_features(
            df=combined,
            out_name=f'{prefix}_l{level}',
            substr1=f'_{level}',
            substr2=measure,
        )

## Date features

In [None]:
# Add sinusoidal (sin/cos) encodings of hour, weekday and month
combined = date_features(combined)

In [None]:
# One-hot encoded holiday calendar spanning the first to the last timestamp in the data.
holidays = get_holidays(holiday_list, min_timestamp, max_timestamp)

# Left-join on the timestamp index so every existing row of `combined` is kept.
combined = pd.merge(combined, holidays, how='left', left_index=True, right_index=True)

## Lagged features

In [None]:
# Lags to create for each column.
# NOTE(review): presumably rows are hourly, i.e. 1 h, 1 day, 2 days and 1 week — confirm.
lag_dict = {
    column: [1, 24, 48, 168]
    for column in ('energy', 'energy_outlier')
}

# Creates the specified lagged features for the given columns.
combined = lag_features(combined, lag_dict)


## Rolling window features

In [None]:
# Only consider columns that are not themselves generated lag / rolling-window
# features. On a fresh run no such columns match the substrings below, so the
# selection is unchanged; the guard makes the cell safe to re-run after the
# rolling-window cells, where generated "<col>_rw_<window>_<agg>" columns would
# otherwise be picked up and windows stacked on top of windows.
base_cols = [
    col for col in combined.columns
    if '_rw_' not in col and '_lag_' not in col
]

# All weather columns we want to produce rolling window features for
weather_cols = [
    col for col in base_cols
    if any(marker in col for marker in ('fog', 'rain', 'storm'))
]
# All traffic columns we want to produce rolling window features for
traffic_cols = [
    col for col in base_cols
    if any(marker in col for marker in ('cong', 'flow', 'dis_l', 'dur_l'))
]

Populate dictionary specifying window sizes for our chosen columns

In [None]:
# Rolling-sum window sizes keyed by column name: the target first, then every
# weather column, then every traffic column (later entries win on a key clash).
rolling_sum_dict = {
    'energy': [12, 24, 168, 24*14],
    **{w_col: [1, 3, 6, 12, 24, 48] for w_col in weather_cols},
    **{t_col: [1, 3, 6, 12, 24] for t_col in traffic_cols},
}

In [13]:
# Add rolling-sum features for the columns and window sizes in rolling_sum_dict
combined = rolling_window_features(combined, rolling_sum_dict, agg_func='sum')

  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"{col}_rw_{window}_{agg_func}"] = (
  df_rw[f"

In [14]:
# Rolling-mean window sizes keyed by column name.
rolling_mean_dict = dict(
    energy=[6, 12, 24, 48, 168],
    temp=[1, 3, 6, 12, 24],
    temp_imputed=[1, 3, 6, 12, 24],
)

In [15]:
# Add rolling-mean features for the columns and window sizes in rolling_mean_dict
combined = rolling_window_features(combined, rolling_mean_dict, 'mean')

# 3 Evaluate feature importance

## Restrict to generated features and window aware timeframe

In [None]:
# Name fragments that identify a generated feature column.
generated_markers = (
    '_sin',   # hour/weekday/month
    '_cos',   # hour/weekday/month
    '_lag_',  # lagged features
    '_rw_',   # rolling window (sum or mean) features
)
features = [
    col for col in combined.columns
    if any(marker in col for marker in generated_markers)
]
# Holiday calendar columns are added by name.
features.extend(holiday_list)

# Skip the first 24*14 rows, matching the largest rolling window configured
# above (energy sum over 24*14 rows).
X = combined[features].iloc[24*14:].copy()

In [17]:
# Training rows: everything strictly before the configured split date.
X_train = X[X.index < split_date]

# Build the target with the same operations, in the same order, as X
# (skip the first 24*14 warm-up rows, then apply the date filter).
# Filtering by date first and slicing afterwards only lines up with X when
# all warm-up rows precede split_date; mirroring X removes that assumption.
y_warm = combined[target].iloc[24*14:]
y_train = y_warm[y_warm.index < split_date]

# Fail loudly if features and target ever stop lining up row-for-row.
assert X_train.index.equals(y_train.index), "X_train and y_train are misaligned"

## Univariate F-test (linear regression)

In [18]:
# F-statistic and p-value of each feature against the target,
# ranked from strongest to weakest F-statistic.
f_test = f_regression(X_train, y_train)
f_stat, f_p = f_test
f_df = (
    pd.DataFrame({'feature': X_train.columns, 'f_stat': f_stat, 'p-value': f_p})
    .sort_values(by='f_stat', ascending=False)
    .reset_index(drop=True)
)

display(f_df)

Unnamed: 0,feature,f_stat,p-value
0,energy_lag_168,17366.414701,0.0
1,hour_cos,16866.105207,0.0
2,energy_lag_24,13347.289474,0.0
3,energy_lag_1,12201.678299,0.0
4,energy_lag_48,9434.321721,0.0
...,...,...,...
232,dur_l4_rw_12_sum,0.000000,1.0
233,dur_l4_rw_1_sum,0.000000,1.0
234,dur_l4_rw_6_sum,0.000000,1.0
235,dur_l4_rw_24_sum,0.000000,1.0


## Estimate mutual information 

In [19]:
# Estimated mutual information between each feature and the target,
# ranked from most to least informative.
# The estimator is stochastic, so a fixed seed is passed to keep the scores
# and the ranking reproducible across kernel restarts.
mut_info = mutual_info_regression(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({'feature': X_train.columns, 'mutual_info': mut_info})
mi_df = mi_df.sort_values(by='mutual_info', ascending=False).reset_index(drop=True)

display(mi_df)

Unnamed: 0,feature,mutual_info
0,energy_lag_168,0.490754
1,hour_cos,0.487379
2,energy_lag_24,0.435264
3,energy_lag_1,0.400762
4,energy_lag_48,0.386651
...,...,...
232,dur_l3_rw_1_sum,0.000000
233,dur_l3_rw_12_sum,0.000000
234,dur_l3_rw_24_sum,0.000000
235,temp_imputed_rw_6_mean,0.000000


# 99 Conclusions

- Energy-based lag/window features are highly predictive, as expected.
    - All lags in [1, 24, 48, 168] are strong, as are the energy rolling-window features

- Hour of day features (particularly cos) are highly predictive, with weekday and month features less predictive but not insignificant

- Temperature based features show good predictive power.
    - Particularly rolling-mean windows of 1, 3 and 6, implying recent temperatures are more predictive

- Traffic based features, specifically around congestion and severity level 0 & 1, show good predictive power.
    - Low severity congestion could indicate high volumes of cars on the road but in a way that does not impede travel too heavily
    - Window sizes of 1, 3 and 6 perform the best

- Flow-incident and high severity traffic features do not perform as well.
    - Flow incidents can be unrelated to the number of cars on the road and so may not be a good indication of how many EVs are being used and therefore require charging

- Weather-event-based features do not perform well on average; this could be due to California's relatively stable (if very hot) climate.
    - Weather effects like rain and fog have little lasting impact after they occur