In [3]:
%run requirements.ipynb
%run config.ipynb

In [5]:

def get_file_from_folder(path):
    """Load every regular file in ``path`` into a DataFrame.

    Each file is parsed with ``pd.read_csv`` as space-separated text without
    a header row; the columns are named MeterID, Datetime and Electricity.

    Args:
        path: Directory whose files should be read.

    Returns:
        dict mapping each file name (not the full path) to its DataFrame.
        Entries are inserted in sorted file-name order.
    """
    headers = ["MeterID", "Datetime", "Electricity"]
    data = {}

    # os.listdir order is arbitrary; sorting makes the result deterministic.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)

        # Sub-directories (e.g. Jupyter's .ipynb_checkpoints) cannot be read
        # as CSV and would make pd.read_csv raise, so they are skipped.
        if not os.path.isfile(file_path):
            continue

        data[file_name] = pd.read_csv(file_path, sep=" ", names=headers)

    return data

def transform_daycode_to_date(days_int):
    """Turn a combined day/time code into a date string.

    The first three characters of ``str(days_int)`` are read as a number of
    days, which is added to the module-level ``START_DATE`` (defined outside
    this block; presumably a date or datetime - confirm in the config).

    Returns:
        ``str()`` of the resulting date.
    """
    day_offset = int(str(days_int)[:3])
    return str(START_DATE + datetime.timedelta(days=day_offset))

def transform_dataframe(df):
    """Sort the readings and split the Datetime code into Day and Time.

    Rows are ordered by MeterID, then Datetime. ``Day`` is produced by
    ``transform_daycode_to_date`` and ``Time`` is the numeric value of
    everything after the third character of the Datetime code.

    Returns:
        A new DataFrame with the columns MeterID, Day, Time, Electricity and
        Datetime, in that order.
    """
    ordered = df.sort_values(by=['MeterID', 'Datetime'])
    codes = ordered["Datetime"]

    ordered["Day"] = codes.map(transform_daycode_to_date)

    # Characters from index 3 onward hold the time part of the code.
    time_text = codes.map(lambda code: str(code)[3:])
    ordered["Time"] = pd.to_numeric(time_text, downcast='integer')

    return ordered[['MeterID', 'Day', 'Time', 'Electricity', 'Datetime']]

def print_df_summary(df, all=False):
    """Print the distinct occurrence counts found in selected columns.

    For each reported column the values are counted with
    ``df.value_counts(column)`` and the unique counts are printed.

    Args:
        df: DataFrame with at least MeterID and Datetime columns.
        all: When equal to ``True``, Day, Time and Electricity are reported
            as well.
    """
    reported = ["MeterID", "Datetime"]
    # Equality test kept on purpose: only values equal to True enable the
    # extra columns.
    if all == True:
        reported = reported + ["Day", "Time", "Electricity"]

    for column_name in reported:
        distinct_counts = df.value_counts(column_name).unique()
        print(f"{column_name} count : {distinct_counts}")

# data cleaning functions
# drop every meter whose share of zero electricity consumption readings (rounded to 2 decimals) is 0.01 or more
def data_cleaning_drop_zero(pretty_data):
    """Remove meters that report too many zero Electricity readings.

    Rows are grouped by MeterID. A meter is kept only when the fraction of
    its Electricity values equal to 0, rounded to two decimals, is below
    0.01. The frame's shape is printed before and after filtering.

    Returns:
        DataFrame containing only the rows of the meters that were kept.
    """
    print(f'before drop zero: {pretty_data.shape}')

    def _few_zero_readings(meter_rows):
        readings = meter_rows['Electricity']
        zero_share = readings.eq(0).sum() / readings.count()
        return round(zero_share, 2) < 0.01

    kept = pretty_data.groupby("MeterID").filter(_few_zero_readings)
    print(f'after drop zero:  {kept.shape}')
    return kept

# drop rows whose Time value is greater than `time` (default 48)
def data_cleaning_drop_48(pretty_data, time=48):
    """Keep only the rows whose Time value does not exceed ``time``.

    The Time column is converted with ``pd.to_numeric`` before the
    comparison. The frame's shape is printed before and after filtering.

    Args:
        pretty_data: DataFrame with a Time column.
        time: Largest Time value that is kept (inclusive).

    Returns:
        The filtered DataFrame.
    """
    print(f'before drop Time: {pretty_data.shape}')
    slot_numbers = pd.to_numeric(pretty_data["Time"], downcast='integer')
    within_limit = slot_numbers <= time
    trimmed = pretty_data[within_limit]
    print(f'after drop Time:  {trimmed.shape}')
    return trimmed

# select data between min and max Datetime 
def data_cleaning_select_time(pretty_data, min_date, max_date):
    """Keep the rows whose Datetime lies in ``[min_date, max_date]``.

    Both bounds are inclusive. The frame's shape is printed before and
    after filtering.

    Returns:
        The filtered DataFrame.
    """
    print(f'before drop Datetime between {min_date} and {max_date}:  {pretty_data.shape}')
    stamps = pretty_data['Datetime']
    not_too_early = stamps >= min_date
    not_too_late = stamps <= max_date
    selected = pretty_data[not_too_early & not_too_late]
    print(f'after drop Datetime between {min_date} and {max_date}:  {selected.shape}')
    return selected

# drop the rows whose value in `column` is one of the entries of `value`
def drop_meter_from_df(pretty_data, column, value):
    """Return ``pretty_data`` without the rows matching ``value``.

    Args:
        pretty_data: DataFrame to filter.
        column: Name of the column to inspect.
        value: Collection of values passed to ``Series.isin``; rows whose
            ``column`` entry is in it are removed.
    """
    unwanted = pretty_data[column].isin(value)
    return pretty_data[~unwanted]

def plot_df_valuecounts_with_groupby(pretty_data, byColumn, column=False):
    """Print and bar-plot how often each per-group row count occurs.

    The frame is grouped by ``byColumn`` and the non-null entries per group
    are counted. The value counts of the ``Time`` column's counts are always
    printed. The bar chart uses ``column``'s counts when ``column`` is
    truthy, otherwise the value counts of the whole counted frame.
    """
    per_group = pretty_data.groupby(byColumn).count()
    print(per_group["Time"].value_counts())

    tally = per_group[column].value_counts() if column else per_group.value_counts()
    tally.plot.bar()
        
def replace_missing_value(df, meter_id, meter_day, replace_day, day_interval=1):
    """Fill a missing day of one meter by copying another day's readings.

    The rows of ``meter_id`` on ``meter_day`` are copied, relabelled as
    ``replace_day``, their Datetime code is shifted by ``day_interval * 100``
    and the copies are appended to the frame. The first two rows found for
    the target day are displayed before and after the operation.

    Args:
        df: DataFrame with MeterID, Day and Datetime columns.
        meter_id: Meter whose data is patched.
        meter_day: Day value that has data (the donor day).
        replace_day: Day value that is missing and gets filled.
        day_interval: Number of days between donor and missing day; may be
            negative. Multiplied by 100 before being added to Datetime
            (presumably because the last two digits hold the time slot -
            confirm against the Datetime encoding).

    Returns:
        A new DataFrame containing the original rows followed by the copies.
        The input frame is not modified and original index labels are kept.
    """
    is_meter = df["MeterID"] == meter_id

    # Show what (if anything) already exists for the day being filled.
    display(df[is_meter & (df["Day"] == replace_day)].head(2))

    # Copy so that the assignments below never touch the caller's frame.
    donor = df[is_meter & (df["Day"] == meter_day)].copy()
    # Every donor row has Day == meter_day, so a direct assignment relabels
    # the day without touching equal-looking values in other columns.
    donor["Day"] = replace_day
    donor["Datetime"] = donor["Datetime"].add(day_interval * 100)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([df, donor])

    display(df[(df["MeterID"] == meter_id) & (df["Day"] == replace_day)].head(2))
    return df


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300, image_path = IMAGES_PATH):
    """Save the current matplotlib figure as ``<fig_id>.<fig_extension>``.

    Args:
        fig_id: File name without extension; also echoed in the log line.
        tight_layout: Call ``plt.tight_layout()`` before saving when truthy.
        fig_extension: File extension, also passed as the output format.
        resolution: DPI passed to ``plt.savefig``.
        image_path: Target directory; defaults to ``IMAGES_PATH``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(image_path, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)
    
def plot_day_data(data, x="Day", rotation=45, y="Data quantity", title="data quantity for each day in residential meter",save=False):
    """Draw ``data`` as a line plot on a new 10x4 inch, 150 dpi figure.

    Args:
        data: Values handed to ``plt.plot``; ``len(data)`` determines the
            tick range.
        x: Label of the x axis.
        rotation: Rotation of the x tick labels in degrees.
        y: Label of the y axis.
        title: Figure title; also used as the file name when saving.
        save: When truthy, the figure is written with ``save_fig(title)``.
    """
    plt.figure(dpi=150, figsize=(10,4))
    plt.plot(data)

    # naming the x axis
    plt.xlabel(x)
    # One tick every 20 positions; the caller's rotation is honoured
    # (it used to be hard-coded to 45 regardless of the argument).
    plt.xticks(np.arange(0, len(data), 20), rotation=rotation)
    # naming the y axis
    plt.ylabel(y)

    # giving a title to the graph
    plt.title(title)
    if save:
        save_fig(title)

In [1]:
def read_smart_meter_csv(rng=10, path='CSV/smart_meter.csv'):
    """Read the smart-meter CSV and split it into per-area meter frames.

    The CSV is reshaped to ``(-1, 530, 48, 5)``, i.e. one leading entry per
    block of 530 * 48 = 25440 rows with 5 columns (presumably one meter with
    530 days of 48 readings - confirm against the CSV layout). Area ``i``
    takes entries ``i * 422`` to ``(i + 1) * 422``, except area 9, which
    takes entries 3798 to 4225 (427 entries).

    Args:
        rng: Number of areas to read, starting from area 0.
        path: Location of the CSV file.

    Returns:
        A list with one dict per area. Each dict has the keys ``"normal"``
        and ``"fdi"``, each holding a list of DataFrames of 25440 rows cut
        from the rows whose FDI value is False / True respectively.
    """
    column_names = ["MeterID", "Day", "Time", "Electricity", "FDI"]
    rows_per_chunk = 25440  # 530 * 48

    raw_table = pd.read_csv(path)
    per_meter = np.array(raw_table).reshape(-1, 530, 48, 5)

    def _cut_into_chunks(frame):
        # Only complete chunks are returned; a trailing remainder is dropped.
        chunk_count = int(frame.shape[0] / rows_per_chunk)
        return [
            frame[k * rows_per_chunk:(k + 1) * rows_per_chunk]
            for k in range(chunk_count)
        ]

    areas = []
    for area in range(rng):
        print("start area ", area)

        # Area 9 is the last one and holds 427 entries instead of 422.
        if area == 9:
            area_block = per_meter[4225 - 427:4225]
        else:
            area_block = per_meter[area * 422:(area + 1) * 422]

        table = pd.DataFrame(area_block.reshape(-1, 5), columns=column_names)

        # Separate normal and FDI rows, then cut each set into chunks.
        normal_rows = table[table['FDI'] == False]
        fdi_rows = table[table['FDI'] == True]

        areas.append({
            "normal": _cut_into_chunks(normal_rows),
            "fdi": _cut_into_chunks(fdi_rows),
        })

    return areas