In [1]:
import pandas as pd
import numpy as np
import os
import glob
import random
from tqdm import tqdm_notebook as tqdm
import seaborn as sns
from fitter import Fitter, get_common_distributions, get_distributions
from sklearn.metrics import mean_squared_error
import pickle

from support import *
from features.feature_ts import genX
from experiment.algorithms.cluster_prep import *
from Gauss_fit_functions import extractFIT, extractToPs , gauss, straight_line

from synthetic_profiles_functions import *

c:\Users\Jason\AppData\Local\Programs\Python\Python39\lib\os.py
c:\Users\Jason\thesis_project


# Load the households with at least one year's worth of data

In [2]:
# Build the feature frame for the 1994-2014 range; drop_0=True is passed through to genX
# (the captured output below reads "dropping all zero rows").
# NOTE(review): X is not referenced by any later cell visible in this notebook.
X = genX([1994, 2014], drop_0=True)
X = X.reset_index()

# Household ids to keep. NOTE: read_pickle executes pickle data - only load trusted files.
ids = pd.read_pickle("Ids_of_users_with_atleast_365days_of_data.pkl")



dropping all zero rows


In [3]:
measured_profiles = pd.read_csv("Measured_Profiles_Missing_days_replaced_sorted_lenient.csv")

# Keep only the households listed in `ids`. The explicit .copy() makes the result an
# independent frame, so the 'date' assignment below does not write into a slice of the
# raw frame (which triggers SettingWithCopyWarning).
measured_profiles = measured_profiles[measured_profiles['ProfileID'].isin(ids)].copy()

# Parse the date strings once so the .dt accessors used by later cells work.
measured_profiles['date'] = pd.to_datetime(measured_profiles['date'])

# Narrow `ids` to the households that actually occur in the measured data
# (np.intersect1d returns the sorted, unique common values).
ids = np.intersect1d(measured_profiles.ProfileID.unique(), ids)

In [4]:
# The day-type split cells below read from X_filtered; it is the same object as
# measured_profiles (an alias, not a copy).
X_filtered = measured_profiles

# Split the users data into different daytypes

### High Season Weekdays

In [5]:
# Collect only the high-season (winter) weekday profiles from the dataset
df = X_filtered.copy()

# Label the season of every row: months 6-8 are 'winter', all other months 'summer'
df['month'] = df.date.dt.month
df['season'] = np.where(df['month'].isin([6, 7, 8]), 'winter', 'summer')

# .copy() gives an independent frame, so adding 'daytype' below does not write into a
# slice of df (the original in-place edits raised SettingWithCopyWarning)
df_winter = df[df['season'] == 'winter'].copy()

# Monday-Friday rows are tagged 'weekday'; Saturday/Sunday keep their own day name
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
day_names = df_winter.date.dt.day_name()
df_winter['daytype'] = day_names.where(~day_names.isin(weekdays), 'weekday')

# Keep the weekday rows and strip the helper columns; drop() returns a new frame
df_winter_weekdays = (
    df_winter.loc[df_winter['daytype'] == 'weekday']
    .drop(columns=['month', 'season', 'daytype'])
)

### High Season Weekends

In [6]:
# Collect only the high-season (winter) weekend profiles from the dataset
df = X_filtered.copy()

# Label the season of every row: months 6-8 are 'winter', all other months 'summer'
df['month'] = df.date.dt.month
df['season'] = np.where(df['month'].isin([6, 7, 8]), 'winter', 'summer')

# .copy() gives an independent frame, so adding 'daytype' below does not write into a
# slice of df (the original in-place edits raised SettingWithCopyWarning)
df_winter = df[df['season'] == 'winter'].copy()

# Saturday/Sunday rows are tagged 'weekend'; the other days keep their own day name
weekends = ['Sunday', 'Saturday']
day_names = df_winter.date.dt.day_name()
df_winter['daytype'] = day_names.where(~day_names.isin(weekends), 'weekend')

# Keep the weekend rows and strip the helper columns; drop() returns a new frame
df_winter_weekend = (
    df_winter.loc[df_winter['daytype'] == 'weekend']
    .drop(columns=['month', 'season', 'daytype'])
)

### Low Season Weekdays

In [7]:
# Collect only the low-season (summer) weekday profiles from the dataset
df = X_filtered.copy()

# Label the season of every row: months 6-8 are 'winter', all other months 'summer'
df['month'] = df.date.dt.month
df['season'] = np.where(df['month'].isin([6, 7, 8]), 'winter', 'summer')

# .copy() gives an independent frame, so adding 'daytype' below does not write into a
# slice of df (the original in-place edits raised SettingWithCopyWarning)
df_summer = df[df['season'] == 'summer'].copy()

# Monday-Friday rows are tagged 'weekday'; Saturday/Sunday keep their own day name
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
day_names = df_summer.date.dt.day_name()
df_summer['daytype'] = day_names.where(~day_names.isin(weekdays), 'weekday')

# Keep the weekday rows and strip the helper columns; drop() returns a new frame
df_summer_weekday = (
    df_summer.loc[df_summer['daytype'] == 'weekday']
    .drop(columns=['month', 'season', 'daytype'])
)

### Low Season Weekends

In [8]:
# Collect only the low-season (summer) weekend profiles from the dataset
df = X_filtered.copy()

# Label the season of every row: months 6-8 are 'winter', all other months 'summer'
df['month'] = df.date.dt.month
df['season'] = np.where(df['month'].isin([6, 7, 8]), 'winter', 'summer')

# .copy() gives an independent frame, so adding 'daytype' below does not write into a
# slice of df (the original in-place edits raised SettingWithCopyWarning)
df_summer = df[df['season'] == 'summer'].copy()

# Saturday/Sunday rows are tagged 'weekend'; the other days keep their own day name
weekends = ['Sunday', 'Saturday']
day_names = df_summer.date.dt.day_name()
df_summer['daytype'] = day_names.where(~day_names.isin(weekends), 'weekend')

# Keep the weekend rows and strip the helper columns; drop() returns a new frame
df_summer_weekends = (
    df_summer.loc[df_summer['daytype'] == 'weekend']
    .drop(columns=['month', 'season', 'daytype'])
)

# Take a random sample from dataframes

In [9]:
# Number of days drawn per household for each day type; set to "all" to keep every day.
# The value is also used in the output paths (CSV_Files/<sample>/...).
sample = 14

# Fixed seed so the random draw - and everything derived from it - is reproducible
# across kernel restarts.
SAMPLE_SEED = 42

if sample == "all":
    HWeekends_df = df_winter_weekend
    HWeekdays_df = df_winter_weekdays
    LWeekends_df = df_summer_weekends
    LWeekdays_df = df_summer_weekday
else:
    # Sampling is without replacement, so pandas raises ValueError if any household has
    # fewer than `sample` rows in a subset (the winter weekend subset is the smallest).
    HWeekends_df = df_winter_weekend.groupby('ProfileID').sample(n=sample, random_state=SAMPLE_SEED)
    HWeekdays_df = df_winter_weekdays.groupby('ProfileID').sample(n=sample, random_state=SAMPLE_SEED)
    LWeekends_df = df_summer_weekends.groupby('ProfileID').sample(n=sample, random_state=SAMPLE_SEED)
    LWeekdays_df = df_summer_weekday.groupby('ProfileID').sample(n=sample, random_state=SAMPLE_SEED)

# Extract Gaussian fit features

### High Season Weekdays

In [10]:
cols = ['ProfileID','H_offset','sigma1','sigma2','mu1','A1','sigma3','sigma4','mu2','A2']

# Collect one row of fit parameters per household and build the frame once at the end.
# This replaces the per-iteration DataFrame.append (quadratic, and removed in pandas 2.0)
# and the 'DROP_ROW' sentinel row that had to be dropped again afterwards.
fit_rows = []
for profile_id in tqdm(ids):
    H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2, check = extractFIT(HWeekdays_df, profile_id)

    # Households whose fit is flagged with check == False are left out of the table.
    # The comparison is kept exactly as before because extractFIT's contract is not
    # visible from this notebook.
    if check == False:
        continue

    fit_rows.append([profile_id, H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2])

gauss_df = pd.DataFrame(fit_rows, columns=cols).set_index('ProfileID')

# Store Gaussian Fit features (the next cell writes `temp` again, so the name is kept)
temp = gauss_df.copy()
os.makedirs(f'CSV_Files/{sample}', exist_ok=True)  # to_csv does not create missing folders
temp.to_csv(f'CSV_Files/{sample}/FitFeatures_HSeason_weekdays_sample_{sample}.csv')

  0%|          | 0/22 [00:00<?, ?it/s]

False
False
False
False
False
False
False


In [11]:
# Writes `temp` to the same path the previous cell already wrote to.
temp.to_csv(f'CSV_Files/{sample}/FitFeatures_HSeason_weekdays_sample_{sample}.csv')

### High Season Weekends

In [12]:
cols = ['ProfileID','H_offset','sigma1','sigma2','mu1','A1','sigma3','sigma4','mu2','A2']

# Collect one row of fit parameters per household and build the frame once at the end.
# This replaces the per-iteration DataFrame.append (quadratic, and removed in pandas 2.0)
# and the 'DROP_ROW' sentinel row that had to be dropped again afterwards.
fit_rows = []
for profile_id in tqdm(ids):
    H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2, check = extractFIT(HWeekends_df, profile_id)

    # Households whose fit is flagged with check == False are reported and left out.
    # The comparison is kept exactly as before because extractFIT's contract is not
    # visible from this notebook.
    if check == False:
        print("Check is FALSE")
        continue

    fit_rows.append([profile_id, H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2])

gauss_df = pd.DataFrame(fit_rows, columns=cols).set_index('ProfileID')

# Store Gaussian Fit features
temp = gauss_df.copy()
os.makedirs(f'CSV_Files/{sample}', exist_ok=True)  # to_csv does not create missing folders
temp.to_csv(f'CSV_Files/{sample}/FitFeatures_HSeason_weekends_sample_{sample}.csv')

  0%|          | 0/22 [00:00<?, ?it/s]

False
False
False
False
False
False
False
False
False
False
False
True
Check is FALSE
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


### Low Season Weekdays

In [13]:
cols = ['ProfileID','H_offset','sigma1','sigma2','mu1','A1','sigma3','sigma4','mu2','A2']

# Collect one row of fit parameters per household and build the frame once at the end.
# This replaces the per-iteration DataFrame.append (quadratic, and removed in pandas 2.0)
# and the 'DROP_ROW' sentinel row that had to be dropped again afterwards.
fit_rows = []
for profile_id in tqdm(ids):
    H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2, check = extractFIT(LWeekdays_df, profile_id)

    # Households whose fit is flagged with check == False are left out of the table.
    # The comparison is kept exactly as before because extractFIT's contract is not
    # visible from this notebook.
    if check == False:
        continue

    fit_rows.append([profile_id, H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2])

gauss_df = pd.DataFrame(fit_rows, columns=cols).set_index('ProfileID')

# Store Gaussian Fit features
temp = gauss_df.copy()
os.makedirs(f'CSV_Files/{sample}', exist_ok=True)  # to_csv does not create missing folders
temp.to_csv(f'CSV_Files/{sample}/FitFeatures_LSeason_weekdays_sample_{sample}.csv')

  0%|          | 0/22 [00:00<?, ?it/s]

False
False
False
False
False
False
False
False


### Low Season Weekends

In [14]:
cols = ['ProfileID','H_offset','sigma1','sigma2','mu1','A1','sigma3','sigma4','mu2','A2']

# Collect one row of fit parameters per household and build the frame once at the end.
# This replaces the per-iteration DataFrame.append (quadratic, and removed in pandas 2.0)
# and the 'DROP_ROW' sentinel row that had to be dropped again afterwards.
fit_rows = []
for profile_id in tqdm(ids):
    H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2, check = extractFIT(LWeekends_df, profile_id)

    # Households whose fit is flagged with check == False are left out of the table.
    # The comparison is kept exactly as before because extractFIT's contract is not
    # visible from this notebook.
    if check == False:
        continue

    fit_rows.append([profile_id, H_offset, sigma1, sigma2, mu1, A1, sigma3, sigma4, mu2, A2])

gauss_df = pd.DataFrame(fit_rows, columns=cols).set_index('ProfileID')

# Store Gaussian Fit features
temp = gauss_df.copy()
os.makedirs(f'CSV_Files/{sample}', exist_ok=True)  # to_csv does not create missing folders
temp.to_csv(f'CSV_Files/{sample}/FitFeatures_LSeason_weekends_sample_{sample}.csv')

  0%|          | 0/22 [00:00<?, ?it/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True


# Extract the amplitudes from the datasets

### High Season Weekdays

In [15]:
# Columns of the per-day peak table
cols = ['ProfileID','A1','A2','mu1','mu2']

# One record per sampled day; the frame is built once after the loop instead of growing
# it with DataFrame.append (quadratic, removed in pandas 2.0) around a 'DROP_ROW' sentinel.
peak_rows = []
for profile_id in tqdm(ids):
    # Filter this household's days once rather than once per row
    household_days = HWeekdays_df[HWeekdays_df['ProfileID'] == profile_id]
    for index in household_days.index:
        A1, A2, mu1, mu2, check = extractToPs(household_days.loc[index])
        # Days flagged with check == False are skipped (comparison kept as in the original
        # because extractToPs's contract is not visible here)
        if check == False:
            continue

        peak_rows.append([profile_id, A1, A2, mu1, mu2])

amplitudes_df = pd.DataFrame(peak_rows, columns=cols).set_index('ProfileID')
temporary = amplitudes_df.copy()
os.makedirs(f"CSV_Files/{sample}", exist_ok=True)  # to_csv does not create missing folders
temporary.to_csv(f"CSV_Files/{sample}/HWeekdays_amplitudes_sample_{sample}.csv")

# NOTE: the "_21" suffix is a leftover name - the frame holds the peaks for the current
# `sample` setting. The name is kept because later cells refer to it.
HWeekdays_amplitudes_sample_21 = temporary.copy()

  0%|          | 0/22 [00:00<?, ?it/s]

### High Season Weekends

In [16]:
# Columns of the per-day peak table
cols = ['ProfileID','A1','A2','mu1','mu2']

# One record per sampled day; the frame is built once after the loop instead of growing
# it with DataFrame.append (quadratic, removed in pandas 2.0) around a 'DROP_ROW' sentinel.
peak_rows = []
for profile_id in tqdm(ids):
    # Filter this household's days once rather than once per row
    household_days = HWeekends_df[HWeekends_df['ProfileID'] == profile_id]
    for index in household_days.index:
        A1, A2, mu1, mu2, check = extractToPs(household_days.loc[index])
        # Days flagged with check == False are skipped (comparison kept as in the original
        # because extractToPs's contract is not visible here)
        if check == False:
            continue

        peak_rows.append([profile_id, A1, A2, mu1, mu2])

amplitudes_df = pd.DataFrame(peak_rows, columns=cols).set_index('ProfileID')
temporary = amplitudes_df.copy()
os.makedirs(f"CSV_Files/{sample}", exist_ok=True)  # to_csv does not create missing folders
temporary.to_csv(f"CSV_Files/{sample}/HWeekends_amplitudes_sample_{sample}.csv")

# NOTE: the "_21" suffix is a leftover name - the frame holds the peaks for the current
# `sample` setting. The name is kept because later cells refer to it.
HWeekends_amplitudes_sample_21 = temporary.copy()

  0%|          | 0/22 [00:00<?, ?it/s]

### Low Season Weekdays

In [17]:
# Columns of the per-day peak table
cols = ['ProfileID','A1','A2','mu1','mu2']

# One record per sampled day; the frame is built once after the loop instead of growing
# it with DataFrame.append (quadratic, removed in pandas 2.0) around a 'DROP_ROW' sentinel.
peak_rows = []
for profile_id in tqdm(ids):
    # Filter this household's days once rather than once per row
    household_days = LWeekdays_df[LWeekdays_df['ProfileID'] == profile_id]
    for index in household_days.index:
        A1, A2, mu1, mu2, check = extractToPs(household_days.loc[index])
        # Days flagged with check == False are skipped (comparison kept as in the original
        # because extractToPs's contract is not visible here)
        if check == False:
            continue

        peak_rows.append([profile_id, A1, A2, mu1, mu2])

amplitudes_df = pd.DataFrame(peak_rows, columns=cols).set_index('ProfileID')
temporary = amplitudes_df.copy()
os.makedirs(f"CSV_Files/{sample}", exist_ok=True)  # to_csv does not create missing folders
temporary.to_csv(f"CSV_Files/{sample}/LWeekdays_amplitudes_sample_{sample}.csv")

# NOTE: the "_21" suffix is a leftover name - the frame holds the peaks for the current
# `sample` setting. The name is kept because later cells refer to it.
LWeekdays_amplitudes_sample_21 = temporary.copy()

  0%|          | 0/22 [00:00<?, ?it/s]

### Low Season Weekends

In [18]:
# Columns of the per-day peak table
cols = ['ProfileID','A1','A2','mu1','mu2']

# One record per sampled day; the frame is built once after the loop instead of growing
# it with DataFrame.append (quadratic, removed in pandas 2.0) around a 'DROP_ROW' sentinel.
peak_rows = []
for profile_id in tqdm(ids):
    # Filter this household's days once rather than once per row
    household_days = LWeekends_df[LWeekends_df['ProfileID'] == profile_id]
    for index in household_days.index:
        A1, A2, mu1, mu2, check = extractToPs(household_days.loc[index])
        # Days flagged with check == False are skipped (comparison kept as in the original
        # because extractToPs's contract is not visible here)
        if check == False:
            continue

        peak_rows.append([profile_id, A1, A2, mu1, mu2])

amplitudes_df = pd.DataFrame(peak_rows, columns=cols).set_index('ProfileID')
temporary = amplitudes_df.copy()
os.makedirs(f"CSV_Files/{sample}", exist_ok=True)  # to_csv does not create missing folders
temporary.to_csv(f"CSV_Files/{sample}/LWeekends_amplitudes_sample_{sample}.csv")

# NOTE: the "_21" suffix is a leftover name - the frame holds the peaks for the current
# `sample` setting. The name is kept because later cells refer to it.
LWeekends_amplitudes_sample_21 = temporary.copy()

  0%|          | 0/22 [00:00<?, ?it/s]

# Extract standard deviation from amplitudes

In [19]:
def standard_deviation(my_list):
    """Return the population standard deviation of ``my_list``.

    The variance is divided by N (not N - 1). The mean is computed once up front;
    the previous version recomputed it for every element, which made the function
    quadratic in the input length.

    Parameters
    ----------
    my_list : sized iterable of numbers (e.g. a list or a pandas Series)

    Returns
    -------
    float
        Square root of the mean squared deviation from the mean.

    Raises
    ------
    TypeError
        If ``my_list`` is not a sized iterable (e.g. a bare scalar).
    ZeroDivisionError
        If ``my_list`` is empty.
    """
    count = len(my_list)
    mean = sum(my_list) / count
    return (sum((x - mean) ** 2 for x in my_list) / count) ** 0.5

In [20]:

def determine_standard_deviation(consumption_data, amplitudes_df):
    """Compute per-household population standard deviations of the peak features.

    Parameters
    ----------
    consumption_data : pandas.DataFrame
        One row per day with a 'ProfileID' column. Each row's numeric columns are
        summed into a daily total.
        NOTE(review): assumes every numeric column is a consumption reading - confirm.
    amplitudes_df : pandas.DataFrame
        Indexed by ProfileID (one row per day) with columns 'A1', 'A2', 'mu1', 'mu2'.

    Returns
    -------
    pandas.DataFrame
        Indexed by the unique ids of ``amplitudes_df`` with columns 'A1_std',
        'A2_std', 'mu1_std', 'mu2_std' and 'DC_std'. A household for which
        ``standard_deviation`` raises TypeError (this happens when it has a single
        row, so the lookup yields a scalar) keeps NaN from the failing column onwards.
    """
    # Row-wise total per day. numeric_only=True excludes non-numeric columns explicitly
    # (e.g. a datetime 'date' column, if present) instead of relying on pandas silently
    # dropping them, which is deprecated in pandas 1.x and an error in 2.x.
    daily_totals = consumption_data.set_index("ProfileID").sum(axis=1, numeric_only=True)

    household_ids = amplitudes_df.index.unique()
    std_deviation_df = pd.DataFrame(index=household_ids)

    for id in tqdm(household_ids):
        try:
            for feature in ('A1', 'A2', 'mu1', 'mu2'):
                std_deviation_df.loc[id, feature + '_std'] = standard_deviation(amplitudes_df.loc[id, feature])
            std_deviation_df.loc[id, 'DC_std'] = standard_deviation(daily_totals.loc[id])
        except TypeError:
            print('TypeError')
            continue

    return std_deviation_df

In [21]:
# Per-household standard deviations for each of the four day types
HWeekday_std_deviation = determine_standard_deviation(
    consumption_data=HWeekdays_df, amplitudes_df=HWeekdays_amplitudes_sample_21)
HWeekend_std_deviation = determine_standard_deviation(
    consumption_data=HWeekends_df, amplitudes_df=HWeekends_amplitudes_sample_21)
LWeekday_std_deviation = determine_standard_deviation(
    consumption_data=LWeekdays_df, amplitudes_df=LWeekdays_amplitudes_sample_21)
LWeekend_std_deviation = determine_standard_deviation(
    consumption_data=LWeekends_df, amplitudes_df=LWeekends_amplitudes_sample_21)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

### Combine dataframe

In [22]:
# Reload the fit features written by the extraction cells for the current `sample` setting
HWeekdays_features = pd.read_csv(f'CSV_Files/{sample}/FitFeatures_HSeason_weekdays_sample_{sample}.csv', index_col='ProfileID')
HWeekends_features = pd.read_csv(f'CSV_Files/{sample}/FitFeatures_HSeason_weekends_sample_{sample}.csv', index_col='ProfileID')
LWeekdays_features = pd.read_csv(f'CSV_Files/{sample}/FitFeatures_LSeason_weekdays_sample_{sample}.csv', index_col='ProfileID')
LWeekends_features = pd.read_csv(f'CSV_Files/{sample}/FitFeatures_LSeason_weekends_sample_{sample}.csv', index_col='ProfileID')

In [23]:
# Join fit features and standard deviations on their ProfileID index (default inner join,
# so only households present in both frames remain)
HWeekdays_combined = HWeekdays_features.merge(HWeekday_std_deviation, left_index=True, right_index=True)
HWeekends_combined = HWeekends_features.merge(HWeekend_std_deviation, left_index=True, right_index=True)
LWeekdays_combined = LWeekdays_features.merge(LWeekday_std_deviation, left_index=True, right_index=True)
LWeekends_combined = LWeekends_features.merge(LWeekend_std_deviation, left_index=True, right_index=True)


In [24]:
# Display the high-season weekend fit features that were read back from CSV
HWeekends_features

Unnamed: 0_level_0,H_offset,sigma1,sigma2,mu1,A1,sigma3,sigma4,mu2,A2
ProfileID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
695,6.373399,1.886792,1.860465,9,12.93275,2.580645,1.923077,19,10.916768
710,1.885179,1.165049,1.481481,8,18.36619,2.0,1.621622,13,13.204821
4519,8.04644,0.581395,3.018868,8,13.136643,1.714286,1.73913,20,10.612607
4529,5.477339,0.900901,3.809524,9,4.24925,2.380952,1.470588,19,11.935976
4547,5.341839,0.75188,0.555556,8,2.857369,1.219512,2.758621,20,8.492167
4562,2.314571,1.075269,5.0,10,9.078935,1.481481,2.222222,19,10.250577
4564,7.05297,1.621622,2.380952,8,12.341917,2.272727,2.12766,19,14.998143
4566,4.679387,1.346154,2.777778,9,8.429744,1.428571,1.818182,20,11.776976
4596,5.994935,1.538462,1.666667,10,8.855256,0.816327,3.030303,19,19.669822
4608,2.466173,0.006025,0.754717,5,1.188149,1.111111,2.142857,19,12.35872


# Fit Distributions to data

In [25]:
# profiles_ids = combined_df.index.unique().values

def determine_distributions(amplitudes_df):
    """Fit a distribution to each household's A1 and A2 peak amplitudes.

    Parameters
    ----------
    amplitudes_df : pandas.DataFrame
        Indexed by ProfileID with (at least) the columns 'A1' and 'A2'.

    Returns
    -------
    pandas.DataFrame
        Indexed by the notebook-level ``ids`` with two-level columns
        ('A1' | 'A2') x ('Distribution', 'chi_square', 'params'), filled from the
        first three values returned by ``fit_distribution``. Households whose fit
        fails keep NaN in every column.

    Notes
    -----
    Reads the module-level ``ids``. The 0.99 / 0.01 arguments are passed through to
    ``fit_distribution`` unchanged; their meaning is defined by that helper.
    """
    cols = pd.MultiIndex.from_tuples([
        ('A1', 'Distribution'),
        ('A1', 'chi_square'),
        ('A1', 'params'),
        ('A2', 'Distribution'),
        ('A2', 'chi_square'),
        ('A2', 'params'),
    ])
    distributions_df = pd.DataFrame(index=ids, columns=cols)

    for id in tqdm(ids):
        try:
            household_peaks = amplitudes_df.loc[id]
            results1 = fit_distribution(household_peaks, 'A1', 0.99, 0.01)
            results2 = fit_distribution(household_peaks, 'A2', 0.99, 0.01)

            distributions_df.loc[id] = [
                results1.values[0], results1.values[1], results1.values[2],
                results2.values[0], results2.values[1], results2.values[2],
            ]
        except Exception as exc:
            # Still best-effort (the household keeps its NaN row), but the failure is
            # reported instead of being swallowed silently.
            print(f"Distribution fit failed for ProfileID {id}: {exc!r}")
            continue

    return distributions_df

In [26]:
# Best-fitting amplitude distributions per household, one frame per day type
HWeekdays_distributions = determine_distributions(amplitudes_df=HWeekdays_amplitudes_sample_21)
HWeekends_distributions = determine_distributions(amplitudes_df=HWeekends_amplitudes_sample_21)
LWeekdays_distributions = determine_distributions(amplitudes_df=LWeekdays_amplitudes_sample_21)
LWeekends_distributions = determine_distributions(amplitudes_df=LWeekends_amplitudes_sample_21)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

### Save the distributions for the configured sample size

In [27]:
# Persist the fitted distributions under the folder for the current `sample` setting
HWeekdays_distributions.to_csv(f"CSV_Files/{sample}/HWeekdays_distributions_sample_{sample}.csv")
HWeekends_distributions.to_csv(f"CSV_Files/{sample}/HWeekends_distributions_sample_{sample}.csv")
LWeekdays_distributions.to_csv(f"CSV_Files/{sample}/LWeekdays_distributions_sample_{sample}.csv")
LWeekends_distributions.to_csv(f"CSV_Files/{sample}/LWeekends_distributions_sample_{sample}.csv")

# Generate the synthetic profiles

### Generate synthetic peaks

In [30]:
def generate_synthetic_peaks(distributions_df, combined_df, size=300):
    """Draw synthetic peak values for every household from its fitted distribution.

    Parameters
    ----------
    distributions_df : pandas.DataFrame
        Indexed by household id with two-level columns; for every top-level column
        (e.g. 'A1', 'A2') the sub-columns 'Distribution' and 'params' are read.
    combined_df : pandas.DataFrame
        Indexed by household id. For every top-level column ``c`` the columns ``c``,
        ``c + '_std'`` and 'H_offset' are read: ``loc = c + H_offset`` and
        ``scale = c_std`` are passed to the sampler.
    size : int, default 300
        Number of values drawn per household and column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'ProfileID' with one column per top-level column of
        ``distributions_df``; rows containing NaN are dropped.

    Notes
    -----
    * Households missing from ``combined_df`` are reported as ``KeyError: <id>`` and
      skipped.
    * Households whose distribution name is not one of the supported ones (for
      example NaN after a failed fit) are now reported and skipped. Previously no
      branch matched, so the draws of the preceding household were silently reused
      (or a NameError was raised for the first household).
    * The distributions are looked up in ``scipy.stats`` explicitly.
      NOTE(review): presumably the bare names used before came from a star import of
      the same objects - confirm.
    """
    from scipy import stats

    # Supported distributions: the first group takes one shape parameter (params[0]),
    # the second group is parameterised by loc/scale only.
    with_shape = ('invgauss', 'weibull_min', 'lognorm', 'gamma')
    without_shape = ('expon', 'halflogistic')

    level_0 = distributions_df.columns.get_level_values(0).unique()
    house_frames = []

    for houseID in tqdm(distributions_df.index):
        try:
            house_features = combined_df.loc[houseID]
            draws = {'ProfileID': houseID}

            for column in level_0:
                fit_row = distributions_df[column].loc[houseID]
                dist_name = fit_row['Distribution']
                parameters = fit_row['params']

                loc = house_features[column] + house_features['H_offset']
                scale = house_features[column + '_std']

                if dist_name in with_shape:
                    draws[column] = getattr(stats, dist_name).rvs(
                        parameters[0], loc=loc, scale=scale, size=size)
                elif dist_name in without_shape:
                    draws[column] = getattr(stats, dist_name).rvs(
                        loc=loc, scale=scale, size=size)
                else:
                    print(f"Unsupported distribution for {houseID} ({column}): {dist_name!r}")
                    draws = None
                    break
        except KeyError:
            print(f"KeyError: {houseID}")
            continue

        if draws is not None:
            # The scalar ProfileID is broadcast against the arrays of drawn values
            house_frames.append(pd.DataFrame(draws))

    if not house_frames:
        # Nothing could be generated: return an empty frame with the expected layout
        return pd.DataFrame(columns=list(level_0), index=pd.Index([], name='ProfileID'))

    inv_data_df = pd.concat(house_frames).dropna().set_index('ProfileID')

    return inv_data_df

In [31]:
# Synthetic peak draws per household, one frame per day type
HWeekday_synthetic_peaks = generate_synthetic_peaks(
    distributions_df=HWeekdays_distributions, combined_df=HWeekdays_combined)
HWeekend_synthetic_peaks = generate_synthetic_peaks(
    distributions_df=HWeekends_distributions, combined_df=HWeekends_combined)
LWeekday_synthetic_peaks = generate_synthetic_peaks(
    distributions_df=LWeekdays_distributions, combined_df=LWeekdays_combined)
LWeekend_synthetic_peaks = generate_synthetic_peaks(
    distributions_df=LWeekends_distributions, combined_df=LWeekends_combined)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

KeyError: 4525


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

KeyError: 4610
KeyError: 12022948


### Remove outliers from peaks

In [32]:
def Remove_Outlier_Indices(df, lower_q=0.25, upper_q=0.95, whisker=1.5):
    """Return a boolean mask that is True for values inside the outlier fences.

    The fences are ``q_low - whisker * spread`` and ``q_high + whisker * spread``,
    where ``spread = q_high - q_low``. With the defaults the upper quantile is the
    0.95 quantile - not the 0.75 quartile of the textbook IQR rule - so the filter
    is deliberately more lenient than the classic one. Pass ``upper_q=0.75`` for the
    conventional rule.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Values to test (the notebook passes a single column).
    lower_q, upper_q : float
        Quantiles that define the spread. Defaults reproduce the original behaviour.
    whisker : float
        Multiple of the spread added beyond each quantile.

    Returns
    -------
    Same shape as ``df``, boolean: True = keep, False = outlier. NaN values compare
    False against both fences and are therefore kept (True).
    """
    q_low = df.quantile(lower_q)
    q_high = df.quantile(upper_q)
    spread = q_high - q_low
    is_outlier = (df < (q_low - whisker * spread)) | (df > (q_high + whisker * spread))
    return ~is_outlier

In [33]:
# Drop outlier draws: first on A1, then on A2 of the already-filtered frame.
# The fences are computed over all households of a day type together.
HWeekday_synthetic_peaks = HWeekday_synthetic_peaks.loc[Remove_Outlier_Indices(HWeekday_synthetic_peaks["A1"])]
HWeekday_synthetic_peaks = HWeekday_synthetic_peaks.loc[Remove_Outlier_Indices(HWeekday_synthetic_peaks["A2"])]

HWeekend_synthetic_peaks = HWeekend_synthetic_peaks.loc[Remove_Outlier_Indices(HWeekend_synthetic_peaks["A1"])]
HWeekend_synthetic_peaks = HWeekend_synthetic_peaks.loc[Remove_Outlier_Indices(HWeekend_synthetic_peaks["A2"])]

LWeekday_synthetic_peaks = LWeekday_synthetic_peaks.loc[Remove_Outlier_Indices(LWeekday_synthetic_peaks["A1"])]
LWeekday_synthetic_peaks = LWeekday_synthetic_peaks.loc[Remove_Outlier_Indices(LWeekday_synthetic_peaks["A2"])]

LWeekend_synthetic_peaks = LWeekend_synthetic_peaks.loc[Remove_Outlier_Indices(LWeekend_synthetic_peaks["A1"])]
LWeekend_synthetic_peaks = LWeekend_synthetic_peaks.loc[Remove_Outlier_Indices(LWeekend_synthetic_peaks["A2"])]

### Create the synthetic profiles

In [36]:
def create_synthetic_profiles(distributions_df, features_df, synthetic_peaks):
    """Build the synthetic profiles of every household listed in ``distributions_df``.

    For each unique index value of ``distributions_df`` the shape parameters
    (``H_offset``, ``mu1``, ``mu2``, ``sigma1`` .. ``sigma4``) are read from
    ``features_df`` and the peak amplitudes ``A1`` / ``A2`` from
    ``synthetic_peaks``. ``H_offset`` is subtracted from both amplitudes and
    everything is handed to ``generate_synth_profiles2``.

    Parameters
    ----------
    distributions_df : DataFrame
        Only its index is used; it supplies the household IDs to process.
    features_df : DataFrame
        Indexed by household ID, with the shape-parameter columns named above.
    synthetic_peaks : DataFrame
        Indexed by household ID, with columns ``A1`` and ``A2``.

    Returns
    -------
    DataFrame
        The per-household results of ``generate_synth_profiles2`` stacked
        row-wise; an empty DataFrame when no household could be processed.

    Notes
    -----
    A household that raises ``KeyError`` anywhere in its processing (for example
    because it is absent from ``features_df`` or ``synthetic_peaks``) is
    reported with a print and skipped.
    """
    # Collect per-household results and concatenate once at the end:
    # DataFrame.append copied the accumulated frame on every iteration and no
    # longer exists in pandas >= 2.0.
    household_frames = []

    for houseID in tqdm(distributions_df.index.unique()):
        try:
            features = features_df.loc[houseID]

            H_offset = features['H_offset']
            mu1 = features['mu1']
            mu2 = features['mu2']

            sigma1 = features['sigma1']
            sigma2 = features['sigma2']
            sigma3 = features['sigma3']
            sigma4 = features['sigma4']

            peaks = synthetic_peaks.loc[houseID]
            # Amplitudes are passed on relative to the household's offset.
            A1 = (pd.DataFrame(peaks['A1']) - H_offset).reset_index()
            A2 = (pd.DataFrame(peaks['A2']) - H_offset).reset_index()

            synth = generate_synth_profiles2(houseID,A1,A2,mu1,mu2,H_offset,sigma1,sigma2, sigma3,sigma4)

            # assumes generate_synth_profiles2 returns a DataFrame (pd.concat
            # only accepts pandas objects) - TODO confirm
            household_frames.append(synth)
        except KeyError:
            print(f"KeyError: {houseID}")
            continue

    if not household_frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    return pd.concat(household_frames)

In [37]:
# Build the synthetic profiles for each season/day-type combination, in the same
# order as before (high weekday, high weekend, low weekday, low weekend).
HWeekday_synthetic_profiles = create_synthetic_profiles(
    distributions_df=HWeekdays_distributions,
    features_df=HWeekdays_features,
    synthetic_peaks=HWeekday_synthetic_peaks,
)
HWeekend_synthetic_profiles = create_synthetic_profiles(
    distributions_df=HWeekends_distributions,
    features_df=HWeekends_features,
    synthetic_peaks=HWeekend_synthetic_peaks,
)
LWeekday_synthetic_profiles = create_synthetic_profiles(
    distributions_df=LWeekdays_distributions,
    features_df=LWeekdays_features,
    synthetic_peaks=LWeekday_synthetic_peaks,
)
LWeekend_synthetic_profiles = create_synthetic_profiles(
    distributions_df=LWeekends_distributions,
    features_df=LWeekends_features,
    synthetic_peaks=LWeekend_synthetic_peaks,
)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

KeyError: 695
KeyError: 4525


  0%|          | 0/22 [00:00<?, ?it/s]

KeyError: 12021804


  0%|          | 0/22 [00:00<?, ?it/s]

KeyError: 4610
KeyError: 12022948


# Save the synthetic sample profiles

In [38]:
# Write each day type's synthetic profiles to CSV_Files/<sample>/.
# `sample` must already be defined by an earlier cell, and the target directory
# must already exist (to_csv does not create it).
HWeekday_synthetic_profiles.to_csv(f"CSV_Files/{sample}/HWeekday_synthetic_profiles_sample_{sample}.csv")
HWeekend_synthetic_profiles.to_csv(f"CSV_Files/{sample}/HWeekend_synthetic_profiles_sample_{sample}.csv")
LWeekday_synthetic_profiles.to_csv(f"CSV_Files/{sample}/LWeekday_synthetic_profiles_sample_{sample}.csv")
LWeekend_synthetic_profiles.to_csv(f"CSV_Files/{sample}/LWeekend_synthetic_profiles_sample_{sample}.csv")

# Model evaluation with sample n = 30

In [39]:
# Reload the measured profiles from disk and keep only the households in `ids`.
measured_profiles = (
    pd.read_csv("Measured_Profiles_Missing_days_replaced_sorted_lenient.csv")
    .loc[lambda profiles: profiles['ProfileID'].isin(ids)]
)


In [40]:
# Parse the date strings and keep only the calendar date (datetime.date objects),
# then key the table by (ProfileID, date).
measured_profiles['date'] = pd.to_datetime(measured_profiles['date']).dt.date
measured_profiles.set_index(['ProfileID', 'date'], inplace = True)
measured_profiles

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
ProfileID,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
695,1996-01-01,0.985750,4.109417,1.074250,1.419250,3.643583,10.296583,10.082417,9.147667,10.835666,2.116750,...,11.612750,8.668333,3.388667,7.225583,8.197500,5.243833,4.677750,2.391833,2.068917,1.648250
695,1996-01-02,1.907667,1.904167,0.431250,0.745917,2.515500,8.386333,9.517416,14.535250,15.029833,7.886000,...,11.141500,14.060333,14.050333,5.163000,14.639417,11.439667,14.850416,14.425333,7.889250,1.879750
695,1996-01-03,1.217000,1.090000,1.981833,3.594417,1.455667,10.883000,15.531333,14.280333,13.538000,4.530500,...,10.132667,7.960417,5.208333,11.852750,13.416583,5.832083,9.138500,11.399667,2.311500,2.070083
695,1996-01-04,1.403833,1.487583,1.363167,2.727000,2.300917,10.251583,9.054167,6.212333,3.919417,4.546083,...,9.270917,10.222667,14.477667,11.548417,10.390000,10.334333,9.304167,11.652083,2.681083,2.354083
695,1996-01-05,1.239750,2.198667,1.280500,2.503500,2.171000,10.705250,10.645000,7.862417,6.695750,5.926417,...,11.801167,20.552417,9.668500,4.797333,5.144333,6.551500,5.032500,2.160417,2.428667,2.770167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12023147,2012-12-27,3.516667,1.666667,3.666667,1.583333,1.566667,1.616667,4.233333,4.816667,9.433333,7.500000,...,3.466667,4.400000,5.850000,10.133333,7.733333,6.850000,8.866667,8.550000,15.316667,6.450000
12023147,2012-12-28,3.033333,1.600000,1.483333,1.533333,1.533333,1.533333,4.466667,8.133333,3.833333,6.766667,...,12.583333,8.133333,6.616667,5.133333,5.000000,9.983333,7.016667,14.116667,7.800000,4.283333
12023147,2012-12-29,4.016667,1.850000,1.833333,1.650000,1.700000,2.266667,1.650000,3.116667,6.183333,5.300000,...,5.950000,12.850000,11.833333,4.416667,10.516667,10.866667,5.250000,8.483333,9.183333,5.666667
12023147,2012-12-30,2.250000,1.700000,0.483333,0.466667,0.516667,2.000000,2.550000,1.733333,8.700000,8.050000,...,7.683333,10.133333,10.250000,4.916667,8.833333,7.550000,3.933333,9.450000,10.033333,5.950000


### Create one year of synthetic profiles for each user

In [42]:
# Calendar skeleton: one row per measured (ProfileID, date), indexed by ProfileID
# only, tagged with the season ('HIGH'/'LOW') and day type ('WEEKDAY'/'WEEKEND').
temp_synth = pd.DataFrame(index = measured_profiles.index)
temp_synth.reset_index(['date'], inplace = True)

# An earlier cell stores `date` as datetime.date objects (object dtype), on which
# the `.dt` accessor raises AttributeError. Derive the tags from a datetime64
# copy so the `date` column itself keeps its type.
calendar_dates = pd.to_datetime(temp_synth['date'])
temp_synth['month'] = calendar_dates.dt.month
temp_synth['month'] = temp_synth['month'].apply(lambda x: 'HIGH' if x in [6, 7, 8] else 'LOW')
temp_synth['day_names'] = calendar_dates.dt.day_name()
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday']
temp_synth['day_names'] = temp_synth['day_names'].apply(lambda x: 'WEEKDAY' if x in weekdays else 'WEEKEND')


def _pair_with_synthetic(calendar_days, synthetic_pool):
    """Attach one randomly drawn row of ``synthetic_pool`` to every row of ``calendar_days``."""
    n_days = len(calendar_days)
    try:
        return pd.concat([calendar_days, synthetic_pool.sample(n_days)], axis = 1)
    except ValueError:
        # Typically: more days to fill than synthetic profiles available, so
        # sampling without replacement is impossible. Retry with replacement.
        print("ValueError")
        return pd.concat([calendar_days, synthetic_pool.sample(n_days, replace = True)], axis = 1)


def _synthetic_year_for(profile_id, calendar, daytype_pools):
    """Stack the paired calendar/synthetic rows of one household over all day types.

    Raises KeyError when the household is missing from any of the pools.
    """
    user_calendar = calendar.loc[profile_id]
    frames = []
    for season, daytype, pool_df in daytype_pools:
        is_daytype = (user_calendar['month'] == season) & (user_calendar['day_names'] == daytype)
        frames.append(_pair_with_synthetic(user_calendar[is_daytype], pool_df.loc[profile_id]))
    return pd.concat(frames)


# Processing order is kept as it was so the random draws are consumed in the
# same sequence.
daytype_pools = [
    ('LOW', 'WEEKEND', LWeekend_synthetic_profiles),
    ('LOW', 'WEEKDAY', LWeekday_synthetic_profiles),
    ('HIGH', 'WEEKDAY', HWeekday_synthetic_profiles),
    ('HIGH', 'WEEKEND', HWeekend_synthetic_profiles),
]

df_frames = []

for profile_id in tqdm(ids):
    try:
        df_frames.append(_synthetic_year_for(profile_id, temp_synth, daytype_pools))
    except KeyError:
        # Household has no synthetic profiles for at least one day type; skip it.
        print(f"KeyError: {profile_id}")

result2 = pd.concat(df_frames)

result2.reset_index(inplace = True)
result2.set_index(['ProfileID', 'date'], inplace = True)
result2 = result2.sort_index()
# The season / day-type tags were only needed for the pairing above.
result2.drop(['month', 'day_names'], axis = 1, inplace = True)

synthetic_profiles = result2

  0%|          | 0/22 [00:00<?, ?it/s]

KeyError: 695
KeyError: 4525
KeyError: 4610
KeyError: 12021804
KeyError: 12022948


In [43]:
# Write the assembled one-year synthetic profiles to CSV_Files/<sample>/.
# `sample` must already be defined by an earlier cell.
synthetic_profiles.to_csv(f'CSV_Files/{sample}/Synthetic_Profiles_sample_{sample}.csv')