In [44]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [24]:

import joblib
#model = joblib.load(open('demand_model_dec_2022_weather_lags.sav', 'rb'))

In [110]:
def add_datetime(df, column_name="start_time"):
    """Attach assembled timestamp columns to ``df`` and return it.

    The frame must already carry ``{column_name}_year``, ``{column_name}_month``,
    ``{column_name}_day`` and ``{column_name}_hour``. Two columns are written
    onto the frame that was passed in (no copy is made):

    * ``{column_name}_hourly`` - timestamp built from year/month/day/hour
    * ``{column_name}_daily``  - timestamp built from year/month/day only

    Returns the same ``df`` object.
    """
    units = ("year", "month", "day", "hour")
    # Read every component up front so both timestamps use the same inputs.
    components = {unit: df[f"{column_name}_{unit}"] for unit in units}

    df[f"{column_name}_hourly"] = pd.to_datetime(components)

    # Leaving the hour out gives a timestamp at midnight of the same day.
    date_only = {unit: components[unit] for unit in units[:3]}
    df[f"{column_name}_daily"] = pd.to_datetime(date_only)

    return df

In [111]:

def _load_joblib_artifact(path):
    """Load a joblib-serialised object from ``path`` and close the file afterwards."""
    # NOTE(review): joblib.load unpickles arbitrary objects - only point this at trusted files.
    with open(path, 'rb') as handle:
        return joblib.load(handle)


def _actual_vs_pred_figure(totals, time_column):
    """Return a line figure of ``actual_demand`` and ``pred`` plotted against ``time_column``."""
    fig = go.Figure()
    fig.add_scatter(x=totals[time_column], y=totals['actual_demand'], mode='lines', name="Actual Demand")
    fig.add_scatter(x=totals[time_column], y=totals['pred'], mode='lines', name="Prediction")
    fig.update_layout(hovermode="x")
    return fig


def get_prediction_results(test_file_name, target_scaler_file, scaler_file,
                           period_start="2022-12-15", period_end="2022-12-19"):
    """Load saved test predictions, undo the scaling, score them and build two figures.

    Parameters
    ----------
    test_file_name : path of a CSV with at least ``pred``, ``actual_demand``, the
        ``start_time_{year,month,day,hour}`` columns and every column named by
        ``scaler.get_feature_names_out()``.
    target_scaler_file : path of a joblib file holding the scaler applied to the
        target; its ``inverse_transform`` is applied to ``pred`` and ``actual_demand``.
    scaler_file : path of a joblib file holding the feature scaler (must provide
        ``get_feature_names_out`` and ``inverse_transform``).
        Presumably both are fitted scikit-learn scalers - confirm against the training code.
    period_start, period_end : bounds (anything ``pd.to_datetime`` accepts) of the
        window shown in the hourly figure. Both bounds are inclusive; a date-only
        string parses to midnight, so a date-only ``period_end`` includes just the
        first hour of that day.

    Returns
    -------
    (df, daily_fig, hourly_fig, rmse_score, mae_score)
        ``df`` is the inverse-transformed frame with ``start_time_hourly`` and
        ``start_time_daily`` added; the figures plot summed actual demand and
        prediction per day (whole frame) and per hour (selected window); the scores
        are computed row-wise on the inverse-transformed values.

    RMSE and MAE are also printed, in that order.
    """
    test_file = pd.read_csv(test_file_name)
    # The original passed bare open(...) handles to joblib.load and never closed them.
    target_scaler = _load_joblib_artifact(target_scaler_file)
    scaler = _load_joblib_artifact(scaler_file)

    # Bring predictions and ground truth back to the original target scale before scoring.
    test_file['pred'] = target_scaler.inverse_transform(test_file[['pred']])
    test_file['actual_demand'] = target_scaler.inverse_transform(test_file[['actual_demand']])

    rmse_score = np.sqrt(mean_squared_error(test_file['actual_demand'], test_file['pred']))
    mae_score = mean_absolute_error(test_file['actual_demand'], test_file['pred'])

    # Undo feature scaling so the start_time_* parts are real calendar values again.
    features = scaler.get_feature_names_out()
    test_file[features] = scaler.inverse_transform(test_file[features])

    df = add_datetime(test_file, "start_time")
    ordered = df.sort_values('start_time_hourly')

    # Daily totals over the whole test frame.
    daily_totals = ordered.groupby("start_time_daily")[['actual_demand', 'pred']].sum().reset_index()
    daily_fig = _actual_vs_pred_figure(daily_totals, 'start_time_daily')

    # Hourly totals, restricted to the requested window.
    hourly_totals = ordered.groupby("start_time_hourly")[['actual_demand', 'pred']].sum().reset_index()
    in_window = ((hourly_totals['start_time_hourly'] >= pd.to_datetime(period_start))
                 & (hourly_totals['start_time_hourly'] <= pd.to_datetime(period_end)))
    hourly_fig = _actual_vs_pred_figure(hourly_totals[in_window], 'start_time_hourly')

    print(rmse_score)
    print(mae_score)
    return df, daily_fig, hourly_fig, rmse_score, mae_score

In [112]:
def _grouped_totals(frame, time_column, value_columns):
    """Sum ``value_columns`` per distinct ``time_column`` value, after ordering rows by hour."""
    ordered = frame.sort_values('start_time_hourly')
    return ordered.groupby(time_column)[value_columns].sum().reset_index()


def _within_period(hourly_totals, period_start, period_end):
    """Keep rows whose ``start_time_hourly`` lies in [period_start, period_end] (inclusive)."""
    stamps = hourly_totals['start_time_hourly']
    keep = (stamps >= pd.to_datetime(period_start)) & (stamps <= pd.to_datetime(period_end))
    return hourly_totals[keep]


def compare_outputs(test1, test2, period_start="2022-12-15", period_end="2022-12-19"):
    """Overlay the predictions of two result frames on a daily and an hourly figure.

    Parameters
    ----------
    test1, test2 : frames carrying ``pred``, ``actual_demand``, ``start_time_hourly``
        and ``start_time_daily`` (the shape returned by ``get_prediction_results``).
        Trace labels are fixed: ``test1`` is drawn as "Prediction 2021-2022" and
        ``test2`` as "Prediction 2022"; the "Actual Demand" trace is taken from ``test2``.
    period_start, period_end : inclusive bounds (anything ``pd.to_datetime`` accepts)
        of the window shown in the hourly figure. The daily figure is not filtered.

    Returns
    -------
    (daily_fig, hourly_fig)
    """
    both = ['actual_demand', 'pred']

    daily_1 = _grouped_totals(test1, "start_time_daily", ['pred'])
    hourly_1 = _within_period(_grouped_totals(test1, "start_time_hourly", both),
                              period_start, period_end)
    daily_2 = _grouped_totals(test2, "start_time_daily", both)
    hourly_2 = _within_period(_grouped_totals(test2, "start_time_hourly", both),
                              period_start, period_end)

    daily_fig = go.Figure()
    daily_fig.add_scatter(x=daily_1['start_time_daily'], y=daily_1['pred'],
                          mode='lines', name="Prediction 2021-2022")
    daily_fig.add_scatter(x=daily_2['start_time_daily'], y=daily_2['pred'],
                          mode='lines', name="Prediction 2022")
    daily_fig.add_scatter(x=daily_2['start_time_daily'], y=daily_2['actual_demand'],
                          mode='lines', name="Actual Demand")

    hourly_fig = go.Figure()
    hourly_fig.add_scatter(x=hourly_1['start_time_hourly'], y=hourly_1['pred'],
                           mode='lines', name="Prediction 2021-2022")
    # Previously labelled "Prediction 2021", which contradicted the daily figure's
    # label for the same frame (test2).
    hourly_fig.add_scatter(x=hourly_2['start_time_hourly'], y=hourly_2['pred'],
                           mode='lines', name="Prediction 2022")
    hourly_fig.add_scatter(x=hourly_2['start_time_hourly'], y=hourly_2['actual_demand'],
                           mode='lines', name="Actual Demand")

    daily_fig.update_layout(hovermode="x")
    hourly_fig.update_layout(hovermode="x")

    return daily_fig, hourly_fig

In [113]:
# Inputs for the first evaluation run: the saved test predictions (CSV) and the
# joblib files holding the target scaler and the feature scaler.
test_file_name = "test_predictions_dec_2022_weather_lags.csv"
target_scaler_file = 'target_scaler_dec_2022_weather_lags.sav'
scaler_file = 'scaler_dec_2022_weather_lags.sav'

# period_start/period_end are left at their defaults; the call prints RMSE, then MAE.
test_file, daily_fig, hourly_fig, rmse_score, mae_score = get_prediction_results(test_file_name, target_scaler_file, scaler_file)

1.4093498536497888
1.0116693142361406


In [114]:
# Same evaluation for the "_2022_only" predictions and scalers.
# NOTE(review): test_file_name, target_scaler_file, scaler_file, rmse_score and
# mae_score are rebound in this cell, so the values from the previous run are no
# longer reachable under those names afterwards.
test_file_name = "test_predictions_dec_2022_weather_lags_2022_only.csv"
target_scaler_file = 'target_scaler_dec_2022_weather_lags_2022_only.sav'
scaler_file = 'scaler_dec_2022_weather_lags_2022_only.sav'

# Results are kept under *_2022 names (frame and figures); the call prints RMSE, then MAE.
test_file_2022, daily_fig_2022, hourly_fig_2022, rmse_score, mae_score = get_prediction_results(test_file_name, target_scaler_file, scaler_file)

1.4371691776547235
1.0530978184991604


In [106]:
# Inspect the frame returned by get_prediction_results; it includes the
# start_time_hourly / start_time_daily columns written by add_datetime.
# NOTE(review): a bare frame renders the truncated full table - test_file.head() would be lighter.
test_file

Unnamed: 0.1,Unnamed: 0,start_station_name,start_time_year,start_time_month,start_time_day,start_time_hour,start_time_week,start_time_quarter,start_time_dayofweek,is_holiday,...,rhum_lag_1_h,rhum_lag_2_h,rhum_lag_24_h,wspd_lag_1_h,wspd_lag_2_h,wspd_lag_24_h,actual_demand,pred,start_time_hourly,start_time_daily
0,1719264,190,2022.0,10.0,4.0,10.0,40.0,4.0,1.0,1,...,87.00,87.000000,79.500000,14.800000,11.2,0.0000,1.0,3.35,2022-10-04 10:00:00,2022-10-04
1,1719265,287,2022.0,10.0,4.0,10.0,40.0,4.0,1.0,1,...,87.00,87.000000,81.000000,14.800000,11.2,0.0000,2.0,2.31,2022-10-04 10:00:00,2022-10-04
2,1719266,383,2022.0,10.0,4.0,10.0,40.0,4.0,1.0,1,...,87.00,87.000000,81.000000,14.800000,11.2,0.0000,1.0,2.51,2022-10-04 10:00:00,2022-10-04
3,1719267,361,2022.0,10.0,4.0,10.0,40.0,4.0,1.0,1,...,87.00,87.000000,81.000000,14.800000,11.2,0.0000,1.0,2.14,2022-10-04 10:00:00,2022-10-04
4,1719268,9,2022.0,10.0,4.0,10.0,40.0,4.0,1.0,1,...,85.75,87.000000,68.333333,21.325000,11.2,10.0000,3.0,1.59,2022-10-04 10:00:00,2022-10-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230395,1949659,264,2022.0,12.0,31.0,23.0,52.0,4.0,5.0,1,...,89.00,83.714286,87.500000,24.100000,5.4,15.3625,2.0,2.58,2022-12-31 23:00:00,2022-12-31
230396,1949660,266,2022.0,12.0,31.0,23.0,52.0,4.0,5.0,1,...,69.50,93.000000,75.500000,7.400000,42.5,27.0000,4.0,2.29,2022-12-31 23:00:00,2022-12-31
230397,1949661,339,2022.0,12.0,31.0,23.0,52.0,4.0,5.0,1,...,74.00,57.000000,78.000000,18.400000,24.1,7.6000,1.0,1.62,2022-12-31 23:00:00,2022-12-31
230398,1949662,78,2022.0,12.0,31.0,23.0,52.0,4.0,5.0,1,...,79.00,86.000000,39.000000,4.000000,5.4,14.8000,1.0,2.17,2022-12-31 23:00:00,2022-12-31


In [118]:
# Overlay both runs, with the hourly figure limited to 2022-11-15 .. 2022-11-20.
# NOTE(review): this rebinds daily_fig and hourly_fig, replacing the figures that
# get_prediction_results returned for the first run.
daily_fig, hourly_fig = compare_outputs(test_file, test_file_2022, period_start="2022-11-15", period_end="2022-11-20")

In [119]:
# Render daily_fig as the cell output (bare last expression).
daily_fig

In [120]:
# Render hourly_fig as the cell output (bare last expression).
hourly_fig

In [121]:
# Show the daily figure of the 2022-only run, then daily_fig.
# NOTE(review): daily_fig was last assigned from compare_outputs, so the second
# figure is the comparison plot, not the first run's own daily figure.
daily_fig_2022.show()
daily_fig.show()

In [34]:
# NOTE(review): execution count 34 is older than the cells above, so this looks
# like a leftover cell. add_datetime writes its columns onto the frame it receives
# and returns that same frame, so df and test_file refer to one object here.
df = add_datetime(test_file, "start_time")

In [54]:
# Scratch check: how pd.to_datetime parses a date string without zero padding
# (the result is shown in the cell output).
pd.to_datetime("2022-12-1")

Timestamp('2022-12-01 00:00:00')