In [1]:
#imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
#Functions 
#Series stuff
def get_series(meter_id, type="prod", start=None, end=None, agg=None):
    """Build a timeslot-indexed kWh frame for a single meter.

    The rows are read from the module-level frames ``df_prod`` (for
    ``type="prod"``) or ``df_cons`` (for ``type="cons"``); only the frame that
    is actually requested is looked up, so the other one does not have to be
    loaded. The source frame is not modified.

    Args:
        meter_id (str): value to match against the ``meter_id`` column.
        type (str, optional): "prod" for production or "cons" for consumption.
            Defaults to "prod".
        start (str or datetime-like, optional): inclusive lower bound on
            ``timeslot``. Timezone-naive values are interpreted as UTC.
            Defaults to None (no lower bound).
        end (str or datetime-like, optional): inclusive upper bound on
            ``timeslot``. Timezone-naive values are interpreted as UTC.
            Defaults to None (no upper bound).
        agg (str, optional): sum the data per "day", "week" or "month".
            Defaults to None (no aggregation).

    Returns:
        pd.DataFrame or None: a frame with the single column ``num_kwh``,
        indexed by UTC ``timeslot`` in ascending order. None is returned
        (after printing a message) when ``agg`` is not a supported value.

    Raises:
        ValueError: if ``type`` is neither "prod" nor "cons".
    """
    print("Getting series for meter_id: {}".format(meter_id))

    if type == "prod":
        source = df_prod
    elif type == "cons":
        source = df_cons
    else:
        # Previously this fell through and crashed later on an unbound local.
        raise ValueError(
            "Unsupported type {!r}: expected 'prod' or 'cons'".format(type)
        )

    # The timeslot column is parsed as timezone-aware UTC below. The bounds
    # must be timezone-aware as well: pandas refuses to compare aware and
    # naive timestamps, which used to leave the data silently unfiltered.
    if start is not None:
        start = pd.to_datetime(start, utc=True)
    if end is not None:
        end = pd.to_datetime(end, utc=True)

    # .copy() makes the selection an independent frame, so the column
    # assignment below is not a write into a slice of the source frame.
    wanted_rows = source["meter_id"] == meter_id
    df_return = source.loc[wanted_rows, ["timeslot", "num_kwh"]].copy()
    df_return["timeslot"] = pd.to_datetime(df_return["timeslot"], utc=True)

    if start is not None and end is not None:
        print("Filtering on start and end: ", start, end)
    elif start is not None:
        print("Filtering on start: ", start)
    elif end is not None:
        print("Filtering on end: ", end)

    if start is not None:
        df_return = df_return[df_return["timeslot"] >= start]
    if end is not None:
        df_return = df_return[df_return["timeslot"] <= end]

    df_return = df_return.set_index("timeslot").sort_index()

    if agg is not None:
        resample_rules = {"day": "D", "week": "W", "month": "M"}
        if agg not in resample_rules:
            print("Aggregation not supported")
            return None
        df_return = df_return.resample(resample_rules[agg]).sum()

    return df_return

def moving_average(timeseries, window):
    """Return the centered rolling mean of a series.

    Args:
        timeseries: object exposing ``.rolling`` (e.g. a pandas Series or
            DataFrame).
        window: window specification forwarded to ``.rolling``.

    Returns:
        The result of ``.mean()`` over the centered rolling window; positions
        that do not have a full window around them come back as NaN.
    """
    centered_window = timeseries.rolling(window, center=True)
    return centered_window.mean()

In [42]:
#Load the data
# This cell loads the ground-truth frames and the saved random-forest outputs,
# then attaches the ground truth to the RF prediction frame.
# Only the RF production files are active; every other load is commented out.

#True values
# get_series reads df_prod / df_cons as module-level globals, so both frames
# must exist before it is called below.
df_prod = pd.read_csv("data/gridtx-dump-AGGREGATED-CLEANED-THRESHOLD-COVERAGE100-NORMALIZED-PROD.csv")
df_cons = pd.read_csv("data/gridtx-dump-AGGREGATED-CLEANED-THRESHOLD-COVERAGE100-NORMALIZED-CONS.csv")

# Full production and consumption history of one meter: no start/end filter
# and no aggregation.
series_prod = get_series("e882f9a7-f1de-4419-9869-7339be303281",
                        type="prod",
                        start=None,
                        end=None,
                        agg=None)
series_cons = get_series("e882f9a7-f1de-4419-9869-7339be303281",
                        type="cons",
                        start=None,
                        end=None,
                        agg=None)
# Integer row slice: everything after the first 24*365 rows.
# NOTE(review): presumably the first year of hourly data is the training
# period and the remainder is the evaluation period - confirm.
true_values_prod = series_prod[24*365:]
true_values_cons = series_cons[24*365:]

#RF

final_RF_predictions_prod         = pd.read_csv("data/final_RF_predictions_prod.csv")
#final_RF_predictions_cons         = pd.read_csv("data/final_RF_predictions_cons.csv")
final_RF_results_grid_prod        = pd.read_csv("data/final_RF_results_grid_prod.csv")
#final_RF_results_grid_cons        = pd.read_csv("data/final_RF_results_grid_cons.csv")

#SARIMA24

#final_SARIMA24_predictions_prod   = pd.read_csv("data/final_SARIMA24_predictions_prod.csv")
#final_SARIMA24_predictions_cons   = pd.read_csv("data/final_SARIMA24_predictions_cons.csv")
#final_SARIMA24_MSEs_prod          = pd.read_csv("data/final_SARIMA24_MSEs_prod.csv")
#final_SARIMA24_MSEs_cons          = pd.read_csv("data/final_SARIMA24_MSEs_cons.csv")
#final_SARIMA24_parameters_prod    = pd.read_csv("data/final_SARIMA24_parameters_prod.csv")
#final_SARIMA24_parameters_cons    = pd.read_csv("data/final_SARIMA24_parameters_cons.csv")

#SARIMA24x7

#final_SARIMA24x7_predictions_prod = pd.read_csv("data/final_SARIMA24x7_predictions_prod.csv")
#final_SARIMA24x7_predictions_cons = pd.read_csv("data/final_SARIMA24x7_predictions_cons.csv")
#final_SARIMA24x7_MSEs_prod        = pd.read_csv("data/final_SARIMA24x7_MSEs_prod.csv")
#final_SARIMA24x7_MSEs_cons        = pd.read_csv("data/final_SARIMA24x7_MSEs_cons.csv")
#final_SARIMA24x7_parameters_prod  = pd.read_csv("data/final_SARIMA24x7_parameters_prod.csv")
#final_SARIMA24x7_parameters_cons  = pd.read_csv("data/final_SARIMA24x7_parameters_cons.csv")

#Fix the indexes
# The 'timeslot' column of the RF production predictions is reused as the row
# index. It is taken as read from the CSV (read_csv is called without date
# parsing), so it is not converted to datetimes here.
indices = final_RF_predictions_prod['timeslot']

final_RF_predictions_prod.index         = indices
# NOTE(review): the next disabled line targets final_RF_results_grid_cons,
# while the surrounding lines target prediction frames - possibly meant to be
# final_RF_predictions_cons; check before re-enabling.
#final_RF_results_grid_cons.index        = indices
#final_SARIMA24_predictions_prod.index   = indices
#final_SARIMA24_predictions_cons.index   = indices
#final_SARIMA24x7_predictions_prod.index = indices
#final_SARIMA24x7_predictions_cons.index = indices

#Drop timeslot, as it's in indexes
# NOTE(review): the disabled drop calls below neither assign their result nor
# pass inplace=True, so re-enabling them as written would not remove the
# column from the frames.
#final_RF_predictions_prod.drop(['timeslot'], axis=1)
#final_RF_predictions_cons.drop(['timeslot'], axis=1)

#Put actual values into the RF prediction dataframes
# Assigning a Series to a column aligns on index labels. The frame's index
# comes from the CSV 'timeslot' column, while true_values_prod is indexed by
# the UTC datetimes built in get_series.
# NOTE(review): verify that 'actual' is populated (not all NaN) after this line.
final_RF_predictions_prod['actual'] = true_values_prod['num_kwh']
#final_RF_predictions_cons['actual'] = true_values_cons['num_kwh']


Getting series for meter_id: e882f9a7-f1de-4419-9869-7339be303281
Getting series for meter_id: e882f9a7-f1de-4419-9869-7339be303281
