In [107]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

In [108]:

import joblib
#model = joblib.load(open('demand_model_dec_2022_weather_lags.sav', 'rb'))

In [109]:
def add_datetime(df, column_name="start_time"):
    conversion_dict_hourly = dict(year= df[f'{column_name}_year'],
                                  month=df[f'{column_name}_month'],
                                  day=  df[f'{column_name}_day'],
                                  hour= df[f'{column_name}_hour']
                                 )
    conversion_dict_daily = dict(year= df[f'{column_name}_year'],
                           month=df[f'{column_name}_month'],
                           day=  df[f'{column_name}_day']
                                 )
    #print(conversion_dict_daily)
    df[f'{column_name}_hourly'] = pd.to_datetime(conversion_dict_hourly)

    df[f'{column_name}_daily'] = pd.to_datetime(conversion_dict_daily)


    return df

In [110]:

def get_prediction_results(test_file_name, target_scaler_file, scaler_file, 
                           time_feature = "start_time", target ='actual_demand',
                           period_start="2022-12-15", period_end="2022-12-30"):
    test_file = pd.read_csv(test_file_name)
    target_scaler = joblib.load(open(target_scaler_file, 'rb'))
    scaler = joblib.load(open(scaler_file, 'rb'))
    
    test_file['pred'] = target_scaler.inverse_transform(test_file[['pred']]) 
    test_file[target] = target_scaler.inverse_transform(test_file[[target]]) 
    
    
    rmse_score = np.sqrt(mean_squared_error(test_file[target], test_file['pred']))
    #mape_score = mean_absolute_percentage_error(test_file[target], test_file['pred'])
    mae_score = mean_absolute_error(test_file[target], test_file['pred'])
    
    features = scaler.get_feature_names_out()
    test_file[features] =  scaler.inverse_transform(test_file[features])
    
    df = add_datetime(test_file, time_feature)
    sorted_df = df.sort_values(f'{time_feature}_hourly').groupby(
                                                f"{time_feature}_daily")[[target, 'pred']].sum().reset_index()
    daily_fig = go.Figure()
    daily_fig.add_scatter(x=sorted_df[f'{time_feature}_daily'], y=sorted_df[target], mode='lines', name="Actual")
    daily_fig.add_scatter(x=sorted_df[f'{time_feature}_daily'], y=sorted_df['pred'], mode='lines', name="Prediction")
    daily_fig.update_layout(hovermode="x")
    #daily_fig.show()
    
    sorted_df = df.sort_values(f'{time_feature}_hourly').groupby(
                                                f"{time_feature}_hourly")[[target, 'pred']].sum().reset_index()

    sorted_df = sorted_df[(sorted_df[f'{time_feature}_hourly']>=pd.to_datetime(period_start))
                          & (sorted_df[f'{time_feature}_hourly']<=pd.to_datetime(period_end))]
    hourly_fig = go.Figure()
    hourly_fig.add_scatter(x=sorted_df[f'{time_feature}_hourly'], y=sorted_df[target], mode='lines', name="Actual")
    hourly_fig.add_scatter(x=sorted_df[f'{time_feature}_hourly'], y=sorted_df['pred'], mode='lines', name="Prediction")
    hourly_fig.update_layout(hovermode="x")
    #hourly_fig.show()
    
    print(rmse_score)
    print(mae_score)
    #print(mape_score)
    return df, daily_fig, hourly_fig, rmse_score, mae_score


def get_prediction_results_single_station(test_file_name, target_scaler_file, scaler_file, 
                           time_feature = "start_time", target ='actual_demand',
                           period_start="2022-12-15", period_end="2022-12-30", station_id=253.0):
    test_file = pd.read_csv(test_file_name).query(f"start_station_name=={station_id}")
    target_scaler = joblib.load(open(target_scaler_file, 'rb'))
    scaler = joblib.load(open(scaler_file, 'rb'))
    
    test_file['pred'] = target_scaler.inverse_transform(test_file[['pred']]) 
    test_file[target] = target_scaler.inverse_transform(test_file[[target]]) 
    
    
    rmse_score = np.sqrt(mean_squared_error(test_file[target], test_file['pred']))
    #mape_score = mean_absolute_percentage_error(test_file[target], test_file['pred'])
    mae_score = mean_absolute_error(test_file[target], test_file['pred'])
    
    features = scaler.get_feature_names_out()
    test_file[features] =  scaler.inverse_transform(test_file[features])
    
    df = add_datetime(test_file, time_feature)
    sorted_df = df.sort_values(f'{time_feature}_hourly').groupby(
                                                f"{time_feature}_daily")[[target, 'pred']].sum().reset_index()
    daily_fig = go.Figure()
    daily_fig.add_scatter(x=sorted_df[f'{time_feature}_daily'], y=sorted_df[target], mode='lines', name="Actual")
    daily_fig.add_scatter(x=sorted_df[f'{time_feature}_daily'], y=sorted_df['pred'], mode='lines', name="Prediction")
    daily_fig.update_layout(hovermode="x")
    #daily_fig.show()
    
    sorted_df = df.sort_values(f'{time_feature}_hourly').groupby(
                                                f"{time_feature}_hourly")[[target, 'pred']].sum().reset_index()

    sorted_df = sorted_df[(sorted_df[f'{time_feature}_hourly']>=pd.to_datetime(period_start))
                          & (sorted_df[f'{time_feature}_hourly']<=pd.to_datetime(period_end))]
    hourly_fig = go.Figure()
    hourly_fig.add_scatter(x=sorted_df[f'{time_feature}_hourly'], y=sorted_df[target], mode='lines', name="Actual")
    hourly_fig.add_scatter(x=sorted_df[f'{time_feature}_hourly'], y=sorted_df['pred'], mode='lines', name="Prediction")
    hourly_fig.update_layout(hovermode="x")
    #hourly_fig.show()
    
    print(rmse_score)
    print(mae_score)
    #print(mape_score)
    return df, daily_fig, hourly_fig, rmse_score, mae_score

In [111]:
def compare_outputs(test1, test2, period_start="2022-12-15", period_end="2022-12-19"):
    sorted_df = test1.sort_values('start_time_hourly').groupby(
                                                "start_time_daily")[['pred']].sum().reset_index()
    daily_fig = go.Figure()
    daily_fig.add_scatter(x=sorted_df['start_time_daily'], y=sorted_df['pred'], 
                          mode='lines', name="Prediction 2021-2022")
    
    sorted_df = test1.sort_values('start_time_hourly').groupby(
                                            "start_time_hourly")[['actual_demand', 'pred']].sum().reset_index()
    sorted_df = sorted_df[(sorted_df['start_time_hourly']>=pd.to_datetime(period_start))
                      & (sorted_df['start_time_hourly']<=pd.to_datetime(period_end))]
    hourly_fig = go.Figure()
    hourly_fig.add_scatter(x=sorted_df['start_time_hourly'], y=sorted_df['pred'], 
                          mode='lines', name="Prediction 2021-2022")
    
    sorted_df = test2.sort_values('start_time_hourly').groupby(
                                                "start_time_daily")[['actual_demand', 'pred']].sum().reset_index()
    
    daily_fig.add_scatter(x=sorted_df['start_time_daily'], y=sorted_df['pred'], 
                          mode='lines', name="Prediction 2022")
    
    daily_fig.add_scatter(x=sorted_df['start_time_daily'], y=sorted_df['actual_demand'], 
                          mode='lines', name="Actual Demand")
    
    sorted_df = test2.sort_values('start_time_hourly').groupby(
                                            "start_time_hourly")[['actual_demand', 'pred']].sum().reset_index()
    sorted_df = sorted_df[(sorted_df['start_time_hourly']>=pd.to_datetime(period_start))
                      & (sorted_df['start_time_hourly']<=pd.to_datetime(period_end))]
    hourly_fig.add_scatter(x=sorted_df['start_time_hourly'], y=sorted_df['pred'], 
                          mode='lines', name="Prediction 2021")
    hourly_fig.add_scatter(x=sorted_df['start_time_hourly'], y=sorted_df['actual_demand'], 
                      mode='lines', name="Actual Demand")
    
    daily_fig.update_layout(hovermode="x")    
    hourly_fig.update_layout(hovermode="x")
    
    return daily_fig, hourly_fig

# Demand Prediction

## Full Dataset

In [112]:
test_file_name

'test_predictions_dec_2022_weather_lags.csv'

In [113]:
test_file_name = "test_predictions_dec_2022_weather_lags.csv"
target_scaler_file = 'target_scaler_dec_2022_weather_lags.sav'
scaler_file = 'scaler_dec_2022_weather_lags.sav'

test_file, daily_fig, hourly_fig, rmse_score, mae_score = get_prediction_results(test_file_name, 
                                                                                 target_scaler_file, scaler_file,
                                                                                 period_start="2022-10-15",
                                                                                 period_end="2022-10-20")

1.4093498536497888
1.0116693142361406


In [114]:
daily_fig.update_layout(title="Demand Daily Prediction Output")

In [115]:
daily_fig.update_layout(title="Demand Daily Prediction Output").write_html("Plots/Demand Daily Prediction Output.html")

In [116]:
hourly_fig.update_layout(title="Demand Hourly Prediction Sample Output")   

In [117]:
hourly_fig.update_layout(title="Demand Hourly Prediction Sample Output").write_html("Plots/Demand Hourly Prediction Sample Output.html")

In [118]:
m = test_file.groupby(['start_station_name', 'start_time_month', 'start_time_day', 'start_time_year'])['actual_demand', 'pred'].sum().reset_index()

df_rmse_per_station = pd.DataFrame({
    'Station':[], 
    'Size':[], 
    'RMSE':[], 
       } 
    )

for i in m['start_station_name'].unique():
    n = m.query(f'start_station_name=={i}')
    RMSE = np.sqrt(mean_squared_error(n['actual_demand'], n['pred']))
    df_rmse_per_station.loc[len(df_rmse_per_station.index)] = [i, n.shape[0], RMSE]
    #print(n.shape[0])
    #print(RMSE)
    #print()
    
df_rmse_per_station.sort_values("Size")


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,Station,Size,RMSE
263,269.0,13.0,0.685116
310,316.0,18.0,0.525267
247,253.0,33.0,0.477716
374,381.0,33.0,0.598690
2,2.0,34.0,0.686787
...,...,...,...
104,106.0,89.0,7.835081
103,105.0,89.0,4.729971
98,100.0,89.0,5.243876
288,294.0,89.0,4.882654


In [125]:
np.sqrt(mean_squared_error(test_file['actual_demand'], test_file['pred']))

1.4093498536497888

In [126]:
np.sqrt(mean_squared_error(m['actual_demand'], m['pred']))

4.465958846130103

In [127]:
df_rmse_per_station['RMSE'].mean()

3.638663823894762

In [119]:


test_file_single, daily_fig_single, hourly_fig_single, rmse_score_single, mae_score_single \
= get_prediction_results_single_station(
    test_file_name, 
    target_scaler_file, 
    scaler_file,
    period_start="2022-10-15",
    period_end="2022-10-20",
    station_id=383.0
)

1.564061967361134
1.2323142613151445


In [120]:
daily_fig_single.update_layout(title="Demand Daily Prediction Station Output Example").write_html("Plots/Demand Daily Prediction Station Output.html")

In [121]:
hourly_fig_single.update_layout(title="Demand Hourly Prediction Sample Station Output Example").write_html("Plots/Demand Hourly Prediction Station Sample Station Output.html")

## 2022 Dataset

In [28]:
test_file_name = "test_predictions_dec_2022_weather_lags_2022_only.csv"
target_scaler_file = 'target_scaler_dec_2022_weather_lags_2022_only.sav'
scaler_file = 'scaler_dec_2022_weather_lags_2022_only.sav'

test_file_2022, daily_fig_2022, hourly_fig_2022, rmse_score, mae_score = get_prediction_results(test_file_name, 
                                                                                                target_scaler_file, 
                                                                                                scaler_file,
                                                                                                 period_start="2022-10-15",
                                                                                                 period_end="2022-10-20")

1.4394243932098059
1.0553609809028108


## Comparison

In [29]:
daily_fig, hourly_fig = compare_outputs(test_file, test_file_2022, period_start="2022-10-15", period_end="2022-10-20")

In [30]:
daily_fig

In [31]:
hourly_fig

# Returns Prediction

In [14]:
test_file_name = "test_predictionsreturns_Dec_2022_weather_lags.csv"
target_scaler_file = 'target_scalerreturns_Dec_2022_weather_lags.sav'
scaler_file = 'scalerreturns_Dec_2022_weather_lags.sav'

test_file_returns, daily_fig_returns, hourly_fig_returns, rmse_score, mae_score = \
                                            get_prediction_results(test_file_name, target_scaler_file, scaler_file, 
                                                                   time_feature = "end_time",target = "actual_returns",
                                                                                 period_start="2022-10-15",
                                                                                 period_end="2022-10-20")

1.4549880496610024
1.0164518470790125


In [15]:
daily_fig_returns.update_layout(title="Returns Daily Prediction Output")

In [16]:
hourly_fig_returns.update_layout(title="Returns Hourly Prediction Sample Output")

In [17]:
df = add_datetime(test_file, "start_time")

In [18]:
pd.to_datetime("2022-12-1")

Timestamp('2022-12-01 00:00:00')