In [1]:
import os
import math

import pandas as pd
import numpy as np

In [2]:
# Root folder that the trial CSV files are read from.
METER_DATA_DIR = "./data/customer_led_network_revolution/"

# Sub-folders of the root, derived from it so the common prefix is spelled once.
COND_DATA_DIR = METER_DATA_DIR + "cond_data/"
PREPROCESSED_DIR = METER_DATA_DIR + "preprocessed/"
PREPROCESSING_DIR = METER_DATA_DIR + "preprocessing/"  # where this notebook writes its .npy files

In [3]:
def load_data(csv_file_name, use_cols=["Location ID", "Date and Time of capture", "Measurement Description", "Parameter"], parse=True, engine="pyarrow"):
    """Read one CSV from ``METER_DATA_DIR`` into a frame indexed by capture time.

    Parameters
    ----------
    csv_file_name : str
        File name, joined onto ``METER_DATA_DIR``.
    use_cols : list of str
        Columns to read. "Date and Time of capture" must be among them
        because it is used as the index.
    parse : bool
        Passed to ``parse_dates``. When true, pandas parses the index using
        the day-first format ``%d/%m/%Y %H:%M:%S``.
    engine : str
        Parser engine handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, indexed by "Date and Time of capture".
    """
    read_options = dict(
        index_col="Date and Time of capture",
        parse_dates=parse,
        date_format='%d/%m/%Y %H:%M:%S',
        usecols=use_cols,
        engine=engine,
    )
    return pd.read_csv(os.path.join(METER_DATA_DIR, csv_file_name), **read_options)

In [4]:
def pivot_high_freq_df(df):
    """Average readings into 30-minute bins and lay them out one column per location.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index and at least the columns
        "Location ID" and "Parameter". It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the start of each 30-minute bin ("Period"), with one
        column per "Location ID" holding the mean "Parameter" of that bin.
    """
    binned = df.copy()
    # Label every reading with the start of the half hour it falls in.
    binned['Period'] = binned.index.floor('30min')

    per_location_mean = binned.groupby(['Period', 'Location ID'])['Parameter'].mean()
    long_form = per_location_mean.reset_index().set_index('Period')

    # Long -> wide: rows are periods, columns are locations.
    return long_form.pivot_table(index=long_form.index, columns='Location ID', values='Parameter')

# Regular meters

In [5]:
# Read with the C parser and leave the timestamps unparsed here;
# the index is converted to datetimes in the following cell.
reg_df = load_data(
    "TrialMonitoringDataHH.csv",
    use_cols=["Location ID", "Date and Time of capture", "Parameter"],
    parse=False,
    engine="c",
)

In [6]:
# Convert the raw day-first timestamp strings of the index into datetimes.
reg_df.index = pd.to_datetime(
    reg_df.index,
    format='%d/%m/%Y %H:%M:%S',
)

In [7]:
# Split the readings into two consecutive 365-day windows counted from the
# earliest timestamp. The boundaries are computed once instead of rescanning
# the index for every comparison.
# NOTE(review): the first window includes both of its end points while the
# second excludes its lower bound, so year one can contain one more timestamp
# than year two — confirm this is intended.
reg_start = reg_df.index.min()
reg_y1_end = reg_start + pd.Timedelta(days=365)
reg_y2_end = reg_start + pd.Timedelta(days=365) * 2

reg_df_y1 = reg_df[reg_df.index <= reg_y1_end]
reg_df_y2 = reg_df[(reg_df.index > reg_y1_end) & (reg_df.index <= reg_y2_end)]

In [8]:
# The two yearly slices are all that the following cells use; drop the full frame.
del reg_df

In [9]:
# Long -> wide: one row per timestamp, one column per Location ID.
# pivot_table averages any duplicate (timestamp, location) pairs by default.
reg_df_one_T = reg_df_y1.pivot_table(
    values='Parameter',
    index=reg_df_y1.index,
    columns='Location ID',
)
reg_df_two_T = reg_df_y2.pivot_table(
    values='Parameter',
    index=reg_df_y2.index,
    columns='Location ID',
)

In [10]:
# np.save does not create missing folders, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs(PREPROCESSING_DIR, exist_ok=True)

# Value matrices for each year; np.save appends the ".npy" extension.
# NOTE(review): only values and column labels are written — the timestamp
# index is not saved; confirm downstream code does not need it.
np.save(os.path.join(PREPROCESSING_DIR, "reg_one_df"), np.asarray(reg_df_one_T))
np.save(os.path.join(PREPROCESSING_DIR, "reg_two_df"), np.asarray(reg_df_two_T))

# Column labels (Location IDs) are stored separately because the plain
# arrays above carry no labels.
np.save(os.path.join(PREPROCESSING_DIR, "reg_one_df_cols"), np.asarray(reg_df_one_T.columns.tolist()))
np.save(os.path.join(PREPROCESSING_DIR, "reg_two_df_cols"), np.asarray(reg_df_two_T.columns.tolist()))

In [11]:
# Both wide frames have been written to disk; release them together.
del reg_df_one_T, reg_df_two_T

# HP

In [29]:
# Uses load_data's defaults: all four columns, parsed timestamps, pyarrow engine.
hp_df = load_data(csv_file_name="HPTrialMonitoringData.csv")

In [30]:
# Keep only the rows whose measurement is the whole-home power import.
hp_df = hp_df.loc[
    hp_df["Measurement Description"] == "whole home power import"
]

In [31]:
# Whole days between the first and last retained HP reading.
hp_span = hp_df.index.max() - hp_df.index.min()
hp_span.days

364

In [32]:
# 30-minute means, one column per Location ID.
hp_df_T = pivot_high_freq_df(df=hp_df)

In [33]:
# np.save does not create missing folders, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs(PREPROCESSING_DIR, exist_ok=True)

# Value matrix, then its column labels (Location IDs) as a separate array;
# np.save appends the ".npy" extension.
np.save(os.path.join(PREPROCESSING_DIR, "hp_df"), np.asarray(hp_df_T))
np.save(os.path.join(PREPROCESSING_DIR, "hp_df_cols"), np.asarray(hp_df_T.columns.tolist()))

In [34]:
# The HP arrays are on disk; release the long and wide frames together.
del hp_df, hp_df_T

# PV

In [18]:
# Uses load_data's defaults: all four columns, parsed timestamps, pyarrow engine.
pv_df = load_data(csv_file_name="PVTrialMonitoringData.csv")

In [20]:
# Keep only the rows whose measurement is the whole-home power import.
pv_df = pv_df.loc[
    pv_df["Measurement Description"] == "whole home power import"
]

In [22]:
# Whole days between the first and last retained PV reading.
pv_span = pv_df.index.max() - pv_df.index.min()
pv_span.days

668

The PV readings span 668 days — just shy of two years — so only the first 365 days are kept below.

In [23]:
# Keep the first 365 days counted from the earliest PV timestamp.
# NOTE(review): the upper bound is inclusive, so a reading exactly 365 days
# after the start is kept as well — confirm this is intended.
pv_first_year_end = pv_df.index.min() + pd.Timedelta(days=365)
pv_df = pv_df.loc[pv_df.index <= pv_first_year_end]

In [25]:
# 30-minute means, one column per Location ID.
pv_df_T = pivot_high_freq_df(df=pv_df)

In [27]:
# np.save does not create missing folders, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs(PREPROCESSING_DIR, exist_ok=True)

# Value matrix, then its column labels (Location IDs) as a separate array;
# np.save appends the ".npy" extension.
np.save(os.path.join(PREPROCESSING_DIR, "pv_df"), np.asarray(pv_df_T))
np.save(os.path.join(PREPROCESSING_DIR, "pv_df_cols"), np.asarray(pv_df_T.columns.tolist()))

In [28]:
# The PV arrays are on disk; release the wide and long frames together.
del pv_df_T, pv_df

# EV

In [12]:
# Uses load_data's defaults: all four columns, parsed timestamps, pyarrow engine.
ev_df = load_data(csv_file_name="EVTrialMonitoringData.csv")

In [13]:
# Keep only the rows whose measurement description is "House data".
ev_df = ev_df.loc[
    ev_df["Measurement Description"] == "House data"
]

In [14]:
# Whole days between the first and last retained EV reading.
ev_span = ev_df.index.max() - ev_df.index.min()
ev_span.days

279