In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pickle
import datetime as dt
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

from IdealDataInterface import IdealDataInterface
from IdealMetadataInterface import IdealMetadataInterface
from dateutil.relativedelta import relativedelta
from datetime import timedelta

### Specify the folder where your sensor data is stored

In [3]:
# Load the per-house data; the context manager closes the file handle
# (the bare open(...) passed to pickle.load was never closed).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("D:/Documents/MSc Project/data/reduced_data.p", 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# Number of rows held for each key of df
result = [len(df[key]) for key in df]

In [5]:
# Mean number of rows across the entries of df
np.mean(result)

11445.833333333334

In [6]:
# Display the loaded dict of DataFrames (full repr, so the output is long)
df

{59:                      electric-combined
 time                                  
 2016-10-06 09:00:00           0.135684
 2016-10-06 10:00:00           0.177023
 2016-10-06 11:00:00           0.587103
 2016-10-06 12:00:00           0.220286
 2016-10-06 13:00:00           0.143367
 ...                                ...
 2018-01-28 03:00:00           0.091263
 2018-01-28 04:00:00           0.077921
 2018-01-28 05:00:00           0.065573
 2018-01-28 06:00:00           0.061346
 2018-01-28 07:00:00           0.055258
 
 [11495 rows x 1 columns],
 61:                      electric-combined
 time                                  
 2016-10-06 12:00:00           0.094648
 2016-10-06 13:00:00           0.215611
 2016-10-06 14:00:00           0.289798
 2016-10-06 15:00:00           0.078180
 2016-10-06 16:00:00           0.074056
 ...                                ...
 2017-10-19 02:00:00           0.054154
 2017-10-19 03:00:00           0.000000
 2017-10-19 04:00:00           0.174604
 20

In [7]:
# Tag every row with the day of week taken from its timestamp index
for house_frame in df.values():
    house_frame['day'] = house_frame.index.dayofweek

In [8]:
def increment_month(date):
    """Advance ``date`` by one calendar month, keeping month-ends on month-ends.

    If ``date`` is the last day of its month, the last day of the following
    month is returned (e.g. 31 Jan -> 28/29 Feb -> 31 Mar). Any other date is
    shifted by exactly one month with ``relativedelta``.
    """
    one_day = timedelta(1)
    is_month_end = (date + one_day).month != date.month

    if is_month_end:
        # First day of the month after next, minus one day = last day of next month
        first_of_month_after_next = (date + relativedelta(months=2)).replace(day=1)
        return first_of_month_after_next - one_day

    return date + relativedelta(months=1)

In [9]:
def get_cut(data, start, end):
    """Return the rows of ``data`` between ``start`` and ``end`` (label slice).

    When that window is empty, or its last row is earlier than ``end``, the
    same window shifted back by one year is returned instead.

    ``data`` must carry a 'day' column; it is read to count the rows of the
    window.
    """
    window = data[start : end]
    n_rows = window['day'].size

    if n_rows > 0:
        # Timestamp of the final row in the requested window
        last_seen = window.index[n_rows - 1].to_pydatetime()
        if not last_seen < end:
            # The window reaches ``end``: use it as is
            return window

    # Incomplete (or empty) window: fall back to the same dates one year earlier
    one_year = relativedelta(years=1)
    return data[start - one_year : end - one_year]

In [10]:
def aggregate_days(data):
    """Average each hour of the day per month, split into weekdays and weekends.

    Parameters
    ----------
    data : pandas.DataFrame
        Readings on a datetime index with a 'day' column holding the day of
        week (values below 5 are treated as weekdays, values above 4 as
        weekend).

    Returns
    -------
    dict
        ``{month: {0: weekday_means, 1: weekend_means}}`` for ``month`` in
        0..11 (0 = January). Each value is the mean of the remaining columns
        grouped by hour of day.
    """
    # Data that starts after January 2017 uses 2018 as its base year.
    # `dt` is the notebook's own `import datetime as dt`; the bare `datetime`
    # module name is only available through the %pylab star-import.
    if data.index.min() > dt.datetime(2017, 1, 31):
        startdate = dt.datetime(2018, 1, 1)
        enddate = dt.datetime(2018, 1, 31)
    else:
        startdate = dt.datetime(2017, 1, 1)
        enddate = dt.datetime(2017, 1, 31)

    result = {}

    for month in range(12):
        # NOTE(review): enddate is midnight of the month's last day, so the
        # label slice in get_cut stops at 00:00 on that day - confirm intended.
        df_cut = get_cut(data, startdate, enddate)

        # drop() returns new frames, so nothing is modified in place on a
        # slice of df_cut (this is what raised SettingWithCopyWarning).
        weekday = df_cut.loc[df_cut['day'] < 5].drop(columns='day')
        weekend = df_cut.loc[df_cut['day'] > 4].drop(columns='day')

        result[month] = {
            0: weekday.groupby([weekday.index.hour]).mean(),
            1: weekend.groupby([weekend.index.hour]).mean(),
        }

        # Move both window edges to the next calendar month
        startdate = increment_month(startdate)
        enddate = increment_month(enddate)

    return result

In [11]:
def aggregate_house_data(data):
    """Run ``aggregate_days`` on every house and key the results by house."""
    return {house: aggregate_days(data[house]) for house in data}

In [12]:
def empty_df_check(data):
    """List every house / month / day-type slot whose aggregated frame is empty.

    Returns a list of ``{"house": ..., "month": ..., "day": ...}`` dicts in
    house, month (0..11), day-type (0..1) order; the list is empty when no
    frame is empty.
    """
    return [
        {"house": house, "month": month, "day": day_type}
        for house in data.keys()
        for month in range(12)
        for day_type in range(2)
        if data[house][month][day_type].empty
    ]

In [13]:
def get_means(data):
    """Average the aggregated profiles across houses.

    For every month (0..11) and day type (0..1) the frames of all houses are
    placed side by side and averaged row-wise.

    Returns ``means[month][day_type]``: a Series named 'mean' on the shared
    row index.
    """
    houses = list(data.keys())
    means = {}

    for month in range(12):
        means[month] = {}
        for day_type in range(2):
            side_by_side = pd.concat(
                [data[house][month][day_type] for house in houses], axis=1
            )
            means[month][day_type] = side_by_side.mean(axis=1).rename('mean')

    return means

In [14]:
def null_check(data):
    """Count the null values in each house's aggregated data.

    Parameters
    ----------
    data : dict
        ``data[house][month][day_type]`` -> DataFrame for ``month`` in 0..11
        and ``day_type`` in 0..1.

    Returns
    -------
    pandas.DataFrame
        One row per house and one column per data column, holding the number
        of nulls summed over all 24 frames of that house.
    """
    null_counts = {}
    for house in data.keys():
        # Sum of the per-column null counts over every month and day type
        null_counts[house] = sum(
            data[house][month][day_type].isnull().sum()
            for month in range(12)
            for day_type in range(2)
        )

    # Houses become rows. `.T` replaces DataFrame.swapaxes, which is
    # deprecated since pandas 2.1 in favour of transpose.
    return pd.DataFrame(null_counts).T

In [15]:
def fill_null(data, means):
    """Fill missing values with the cross-house means for the same slot.

    Every frame ``data[house][month][day_type]`` is modified in place: its
    null cells are overwritten positionally with the values of
    ``means[month][day_type]``. The same ``data`` object is returned.
    """
    for house_frames in data.values():
        for month in range(12):
            for day_type in range(2):
                frame = house_frames[month][day_type]
                # Column-shaped array of the mean profile for this slot
                replacement = pd.DataFrame(means[month][day_type]).values
                frame[frame.isna()] = replacement

    return data

In [16]:
def prep_data(data):
    """Prepare the per-house data for simulation.

    Adds a 'day' (day of week) column to every input frame in place,
    aggregates each house with ``aggregate_house_data`` and, if the aggregate
    contains nulls, fills them with the cross-house means.

    Returns the aggregated data, or the string "Empty dataframes present"
    when any aggregated frame is empty.
    """
    # Add column to indicate the day of the week
    for house_frame in data.values():
        house_frame['day'] = house_frame.index.dayofweek

    agg_data = aggregate_house_data(data)
    mean_data = get_means(agg_data)

    if empty_df_check(agg_data):
        return "Empty dataframes present"

    has_nulls = null_check(agg_data).sum().iloc[0] > 0
    return fill_null(agg_data, mean_data) if has_nulls else agg_data

In [17]:
# Build the aggregated data[house][month][day_type] structure.
# Note: prep_data returns the string "Empty dataframes present" instead of a
# dict when any aggregated frame is empty.
data = prep_data(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [78]:
# Weekday and weekend profiles of house 59 for the first three months
jan, feb, mar = (list(data[59][month].values()) for month in range(3))
# Flatten the three per-month lists into one list of frames
first = [*jan, *feb, *mar]
first

[      electric-combined
 time                   
 0              0.165627
 1              0.136813
 2              0.141431
 3              0.144035
 4              0.151827
 5              0.154330
 6              0.170215
 7              0.332099
 8              0.469156
 9              0.329522
 10             0.260124
 11             0.199457
 12             0.217513
 13             0.208343
 14             0.238792
 15             0.282465
 16             0.276351
 17             0.440827
 18             0.658822
 19             0.644671
 20             0.400439
 21             0.445979
 22             0.363952
 23             0.218736,
       electric-combined
 time                   
 0              0.179617
 1              0.124965
 2              0.120645
 3              0.135846
 4              0.138546
 5              0.131257
 6              0.142688
 7              0.269352
 8              0.245284
 9              0.222974
 10             0.237110
 11             0.359503

In [30]:
# NOTE(review): stale scratch cell - its execution count (30) predates the
# cell that builds `first`. With `first` holding DataFrames, `i[0]` / `i[1]`
# are column lookups for the labels 0 and 1, so this presumably fails on a
# fresh Restart & Run All. Confirm and remove; `result` is not read again in
# the cells visible here.
result = []
for i in first:
    result.append(i[0])
    result.append(i[1])

In [79]:
# Stack the house-59 frames collected in `first` row-wise.
# NOTE: this rebinds `df`, which until now held the dict loaded from the pickle.
df = pd.concat(first)

In [80]:
# Average the stacked rows that share an index label, giving one mean profile
df = df.groupby([df.index]).mean()
df

Unnamed: 0_level_0,electric-combined
time,Unnamed: 1_level_1
0,0.167179
1,0.138576
2,0.139784
3,0.144665
4,0.147862
5,0.143884
6,0.161917
7,0.280459
8,0.343713
9,0.341869


In [90]:
def seasonal(data):
    """Collapse the monthly weekday/weekend profiles into four seasonal ones.

    Parameters
    ----------
    data : dict
        ``data[house][month]`` is a dict of DataFrames (one per day type) for
        ``month`` in 0..11.

    Returns
    -------
    dict
        ``result[house][season]`` for ``season`` in 0..3, where season ``s``
        covers months ``3*s`` to ``3*s + 2``. Each value is the mean, per
        index label, over every frame (all day types) of those three months.
    """
    result = {}

    for house, monthly in data.items():
        result[house] = {}

        for first_month in range(0, 12, 3):
            # All frames of the three months that make up this season
            frames = [
                frame
                for month in range(first_month, first_month + 3)
                for frame in monthly[month].values()
            ]
            stacked = pd.concat(frames)

            # 0, 3, 6, 9 -> season keys 0, 1, 2, 3
            result[house][first_month // 3] = stacked.groupby([stacked.index]).mean()

    return result
            
            

In [92]:
# Collapse the monthly profiles into four three-month seasons per house
seasonal_data = seasonal(data)

In [93]:
# Persist the seasonal profiles (`seasonal_data`, not the monthly `data`) to
# seasonal_data.p; the context manager flushes and closes the file.
with open('D:/Documents/MSc Project/data/seasonal_data.p', 'wb') as out_file:
    pickle.dump(seasonal_data, out_file)