## Data Collection

In [None]:
# including the project directory to the notebook level
import os
import sys
# Make the notebook's parent directory importable: resolve '..' to an absolute
# path and append it to sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# import modules
from alumni_scripts import data_process as dp
import json

In [None]:
# Pull a single 30-minute window (12:34:23 -> 13:04:23 on 2020-06-26) of trend
# '2681' and save it to one CSV file. The window/output settings below are
# merged into the arguments loaded from auths.json before the call.
time_args = dict(
    start_year=2020, start_month=6, start_day=26,
    start_hour=12, start_minute=34, start_second=23,
    end_year=2020, end_month=6, end_day=26,
    end_hour=13, end_minute=4, end_second=23,
    trend_id='2681',
    save_path='../data/raw_data/alumni_data.csv',
)

with open('../auths.json', 'r') as fp:
    api_args = json.load(fp)
api_args.update(time_args)

dp.pull_offline_data(**api_args)

In [None]:
# Download the historical data in four consecutive batches, one CSV per batch.
# Each entry holds the keyword arguments describing one request window and the
# file it is saved to.
# NOTE: the last window ends on 2020-06-15 although its file is named jan2jun2020.
time_args = [
    {
        'start_year': 2018, 'start_month': 7, 'start_day': 1,
        'start_hour': 0, 'start_minute': 0, 'start_second': 0,
        'end_year': 2018, 'end_month': 12, 'end_day': 31,
        'end_hour': 23, 'end_minute': 59, 'end_second': 59,
        'trend_id': '2681',
        'save_path': '../data/raw_data/alumni_data_jul2dec2018.csv'
    },
    {
        'start_year': 2019, 'start_month': 1, 'start_day': 1,
        'start_hour': 0, 'start_minute': 0, 'start_second': 0,
        'end_year': 2019, 'end_month': 6, 'end_day': 30,
        'end_hour': 23, 'end_minute': 59, 'end_second': 59,
        'trend_id': '2681',
        'save_path': '../data/raw_data/alumni_data_jan2jun2019.csv'
    },
    {
        'start_year': 2019, 'start_month': 7, 'start_day': 1,
        'start_hour': 0, 'start_minute': 0, 'start_second': 0,
        'end_year': 2019, 'end_month': 12, 'end_day': 31,
        'end_hour': 23, 'end_minute': 59, 'end_second': 59,
        'trend_id': '2681',
        'save_path': '../data/raw_data/alumni_data_jul2dec2019.csv'
    },
    {
        'start_year': 2020, 'start_month': 1, 'start_day': 1,
        'start_hour': 0, 'start_minute': 0, 'start_second': 0,
        'end_year': 2020, 'end_month': 6, 'end_day': 15,
        'end_hour': 23, 'end_minute': 59, 'end_second': 59,
        'trend_id': '2681',
        'save_path': '../data/raw_data/alumni_data_jan2jun2020.csv'
    }
]

# The contents of auths.json do not change between requests, so read the file
# once instead of re-opening and re-parsing it on every iteration.
with open('../auths.json', 'r') as fp:
    base_api_args = json.load(fp)

for window_args in time_args:
    # Build a fresh argument dict per request so that settings from one window
    # can never leak into the next one.
    api_args = {**base_api_args, **window_args}
    dp.pull_offline_data(**api_args)
    print("Success!")

### Deployment testing

In [None]:
# testing the deploy control thread
""" change path in the deploy control thread to save the file to appropriate location"""
import warnings
with warnings.catch_warnings():
    # catch_warnings() on its own only saves and restores the filter state;
    # installing the "ignore" filter is what actually silences the warnings
    # raised while the module is imported.
    warnings.simplefilter('ignore')
    from alumni_scripts import deploy_control as dc

# Arguments loaded from auths.json and the stored meta data are both handed to
# the observation fetch below.
with open('../auths.json', 'r') as fp:
    api_args = json.load(fp)
with open('../alumni_scripts/meta_data.json', 'r') as fp:
    meta_data_ = json.load(fp)
# Aliases of the variables requested as the observation space.
obs_space_vars = ['oat', 'oah', 'wbt', 'avg_stpt', 'sat']

df = dc.get_real_obs(api_args, meta_data_, obs_space_vars)

# Display the fetched observations.
df

## Create Offline Batch Time Series Database for Alumni Hall data

In [None]:
# including the project directory to the notebook level
import os
import sys
# Make the notebook's parent directory importable: resolve '..' to an absolute
# path and append it to sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# import modules
from alumni_scripts import data_process as dp
from alumni_scripts import alumni_data_utils as a_utils
import json
import numpy as np
import pandas as pd
from influxdb import DataFrameClient
from collections import OrderedDict
from CoolProp.HumidAirProp import HAPropsSI

### Collate the data from csv files

In [None]:
# Read every downloaded batch, index it by its parsed 'time' column, and merge
# the per-batch frames into one frame with the project helper.
file_names = ['jul2dec2018', 'jan2jun2019', 'jul2dec2019', 'jan2jun2020']
dflist = []
for fname in file_names:
    csv_path = '../data/raw_data/alumni_data_{}.csv'.format(fname)
    df_ = (
        pd.read_csv(csv_path)
        .assign(time=lambda frame: pd.to_datetime(frame['time']))
        .set_index('time')
    )
    dflist.append(df_)
df = a_utils.mergerows(dflist)

### Calculate Wet Bulb Temperature and add it to the dataframe

In [None]:
# Humidity column divided by 100 (presumably percent -> fraction), as a NumPy array.
rh = (df['WeatherDataProfile humidity'] / 100).to_numpy()
# Outdoor air temperature: the arithmetic is the Fahrenheit -> Kelvin conversion.
t_db = (5 * (df['AHU_1 outdoorAirTemp'] - 32) / 9 + 273.15).to_numpy()

In [None]:
# Ask CoolProp for 'T_wb' from the humidity array ('R'), the converted
# temperature array ('T') and a fixed 'P' of 101325, then map the result back
# with the inverse of the conversion used for t_db and store it as column 'wbt'.
T = HAPropsSI('T_wb', 'R', rh, 'T', t_db, 'P', 101325)
t_f = (T - 273.15) * 9 / 5 + 32
df['wbt'] = t_f

### Create meta data

In [None]:
# Build the meta data dictionary `d1` with, in this order:
#   column_names    - the column names of df before aliasing
#   column_agg_type - 'mean' or 'sum' for every aliased column
#   column_aliases  - alias -> original column name
#   column_stats    - describe() statistics per aliased column
# NOTE: this cell renames df's columns in place, so re-running it on an already
# aliased frame would record the aliases as 'column_names'.
d1 = {'column_names': list(df.columns)}
column_aliases = [
    'pchwst', 'vrf50', 'oat', 'sat', 'sat_stpt', 'oah', 'vrf67', 'pchw_flow',
    'hwe', 'vrf1', 'vrf30', 'vrf34', 'vrf74', 'cwe', 'hws_st_stpt', 'vrf60',
    'vrf63', 'hws_st', 'hws_vlv1', 'vrf77', 'vrf64', 'vrf10', 'ee', 'hws_rt',
    'vrf100', 'vrf40', 'hws_flow', 'vrf108', 'vrf20', 'wbt'
]

# zip() below would silently truncate on a length mismatch, so fail early with
# a clear message (the column assignment further down would raise anyway).
if len(column_aliases) != len(df.columns):
    raise ValueError(
        'expected {} column aliases, got {}'.format(len(df.columns), len(column_aliases))
    )

d1['column_agg_type'] = {
    "pchwst": "mean","vrf50": "mean","oat": "mean","sat": "mean","sat_stpt": "mean","oah": "mean",
    "vrf67": "mean","pchw_flow": "sum","hwe": "sum","vrf1": "mean","vrf30": "mean","vrf34": "mean",
    "vrf74": "mean","cwe": "sum","hws_st_stpt": "mean","vrf60": "mean","vrf63": "mean","hws_st": "mean",
    "hws_vlv1": "sum","vrf77": "mean", "vrf64": "mean","vrf10": "mean","ee": "sum","hws_rt": "mean",
    "vrf100": "mean","vrf40": "mean","hws_flow": "sum","vrf108": "mean","vrf20": "mean", 'wbt' : "mean"
}

# Create column alias: alias -> original column name, in column order
d2 = OrderedDict(zip(column_aliases, df.columns))
d1['column_aliases'] = d2

df.columns = column_aliases

# Create column stats.
# After the rename, describe() is already keyed by alias. Using its own keys
# (instead of zipping them positionally with column_aliases) keeps every
# statistic attached to the right column even if describe() leaves a column
# out, which it does for non-numeric columns by default.
stats = {alias: dict(col_stats) for alias, col_stats in df.describe().items()}
d1['column_stats'] = stats

### Create half hour stats for the data

In [None]:
# aggregate data
# Split the aliased columns by aggregation type: 'sum' columns get a windowed
# sum, every other column gets a windowed mean.
agg_types = d1['column_agg_type']
rolling_sum_target = [col for col, agg in agg_types.items() if agg == 'sum']
rolling_mean_target = [col for col, agg in agg_types.items() if agg != 'sum']

# Work on a copy so the un-aggregated frame stays available to later cells.
df_agg = df.copy()

df_agg[rolling_sum_target] = a_utils.window_sum(df_agg, window_size=6, column_names=rolling_sum_target)
df_agg[rolling_mean_target] = a_utils.window_mean(df_agg, window_size=6, column_names=rolling_mean_target)
df_agg = a_utils.dropNaNrows(df_agg)
# sample at half hour
df_agg = a_utils.sample_timeseries_df(df_agg, period=6)

# Create column stats for half hour data.
# df_agg is indexed by alias above, so describe() is already keyed by alias;
# use those keys directly rather than zipping positionally with
# column_aliases, which would mislabel statistics if describe() skipped a
# (non-numeric) column.
stats_halfhour = {alias: dict(col_stats) for alias, col_stats in df_agg.describe().items()}
d1['column_stats_half_hour'] = stats_halfhour

In [None]:
# Write the assembled meta data dictionary to disk as 4-space-indented JSON.
meta_data_file = '../alumni_scripts/meta_data.json'
with open(meta_data_file, mode='w') as fp:
    json.dump(d1, fp=fp, indent=4)

### Remove outliers

In [None]:
# Run the project's offline cleaning step on the aliased frame, pointing it at
# the meta data file on disk.
df_cleaned = dp.offline_batch_data_clean(
    df=df,
    meta_data_path='../alumni_scripts/meta_data.json',
)

In [None]:
# Label the cleaned frame's columns with the alias names.
# NOTE(review): presumably the cleaning step returns the columns in the same
# order as the input frame — confirm, since this assignment is positional.
df_cleaned.columns = column_aliases

### Push data to a database

In [None]:
"""
before the next steps launch influxd client at a cli
sudo influxd
"""
# launch python client for influxdb
client = DataFrameClient(host='localhost', port=8086)
# create a database inc case it's not there
client.create_database('bdx_batch_db')
# get list of database
client.get_list_database()
# switch to the databaase you want
client.switch_database('bdx_batch_db')
# write "dataframe" as "measurements"
client.write_points(dataframe=df_cleaned,
                    measurement='alumni_data_v2',
                    tags={
                        'data_cleaned': 'True',
                        'aggregated': False,
                        'time-interval': '5 minutes'
                    },
                    protocol='line',
                    batch_size=5000)
# see measurement added to curent db
client.get_list_measurements()
client.close()

### Read data from the database

In [None]:
"""
before the next steps launch influxd client at a cli
sudo influxd
"""
# launch python client for influxdb
client = DataFrameClient(host='localhost', port=8086)
# switch to the databaase you want
client.switch_database('bdx_batch_db')
results_obj = client.query(
    "select * from alumni_data_v2 \
    where time >= '2018-11-15 12:25:00' - 13w \
    and time < '2018-11-15 12:25:00'"
)
df2 = results_obj['alumni_data_v2']
df2

## Read the data from the database

In [None]:
# including the project directory to the notebook level
import os
import sys
# Make the notebook's parent directory importable: resolve '..' to an absolute
# path and append it to sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# import modules
from alumni_scripts import data_process as dp
from alumni_scripts import alumni_data_utils as utils
import json
import pandas as pd
from influxdb import DataFrameClient

In [None]:
"""
before the next steps launch influxd client at a cli
sudo influxd
"""
# launch python client for influxdb
client = DataFrameClient(host='localhost', port=8086)
# switch to the databaase you want
client.switch_database('bdx_batch_db')
results_obj = client.query(
    "select * from alumni_data_v2\
    where time >= '2018-08-07 00:00:00' \
    and time < '2019-02-07 00:00:00'" 
)
df2 = results_obj['alumni_data_v2']
df2.drop(columns=['aggregated','data_cleaned','time-interval'], inplace=True)

## Create meta_data : Demo

In [None]:
# including the project directory to the notebook level
import os
import sys
# Make the notebook's parent directory importable: resolve '..' to an absolute
# path and append it to sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# import modules
import pandas as pd
import json
import numpy as np

### Create column_aliases

In [None]:
# Load one raw batch (indexed by its 'time' column) and start the demo meta
# data with the raw column names.
df = pd.read_csv('../data/raw_data/alumni_data_jul2dec2018.csv', index_col='time')

d1 = {'column_names': list(df.columns)}
# Short alias per raw column, paired positionally with df.columns below.
column_aliases = [
    'pchwst', 'vrf50', 'oat', 'sat', 'sat_stpt', 'oah', 'vrf67', 'pchw_flow',
    'hwe', 'vrf1', 'vrf30', 'vrf34', 'vrf74', 'cwe', 'hws_st_stpt', 'vrf60',
    'vrf63', 'hws_st', 'hws_vlv1', 'vrf77', 'vrf64', 'vrf10', 'ee', 'hws_rt',
    'vrf100', 'vrf40', 'hws_flow', 'vrf108', 'vrf20'
]

# alias -> raw column name
d2 = {alias: raw_name for raw_name, alias in zip(df.columns, column_aliases)}
d1['column_aliases'] = d2

### Create column stats

In [None]:
# describe() statistics as nested plain dicts: column -> {statistic: value}.
d3 = dict(df.describe())
stats = {column: dict(summary) for column, summary in d3.items()}
d1['column_stats'] = stats

### Dump meta data

In [None]:
# Write the demo meta data dictionary to disk as 4-space-indented JSON.
meta_data_file = '../alumni_scripts/meta_data.json'
with open(meta_data_file, mode='w') as fp:
    json.dump(d1, fp=fp, indent=4)

### Read meta data

In [None]:
import copy

with open('../alumni_scripts/meta_data.json', 'r') as fp:
    meta_data_ = json.load(fp)
# dict.copy() is shallow: the nested 'column_stats' dicts would still be shared
# with meta_data_, so patching 'std' below would silently modify the loaded
# original too. Deep-copy so meta_data_ keeps exactly what was read from disk.
meta_data = copy.deepcopy(meta_data_)
for column_stats in meta_data['column_stats'].values():
    if column_stats['std'] == 0:
        column_stats['std'] = 0.0001  # add small std for constant values
# One column per entry of 'column_stats', one row per statistic.
stats = pd.DataFrame(meta_data['column_stats'])

## Plot data before and after cleaning: Demo

In [None]:
# including the project directory to the notebook level
import os
import sys
# Make the notebook's parent directory importable: resolve '..' to an absolute
# path and append it to sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# import modules
from alumni_scripts import data_process as dp
from alumni_scripts import alumni_data_utils as a_utils
import json
import pandas as pd

from collections import OrderedDict
from CoolProp.HumidAirProp import HAPropsSI

In [None]:
# Read every downloaded batch, index it by its parsed 'time' column, and merge
# the per-batch frames into one frame with the project helper.
file_names = ['jul2dec2018', 'jan2jun2019', 'jul2dec2019', 'jan2jun2020']
dflist = []
for fname in file_names:
    csv_path = '../data/raw_data/alumni_data_{}.csv'.format(fname)
    df_ = (
        pd.read_csv(csv_path)
        .assign(time=lambda frame: pd.to_datetime(frame['time']))
        .set_index('time')
    )
    dflist.append(df_)
df = a_utils.mergerows(dflist)

In [None]:
# Humidity column divided by 100 (presumably percent -> fraction), as a NumPy array.
rh = (df['WeatherDataProfile humidity'] / 100).to_numpy()
# Outdoor air temperature: the arithmetic is the Fahrenheit -> Kelvin conversion.
t_db = (5 * (df['AHU_1 outdoorAirTemp'] - 32) / 9 + 273.15).to_numpy()

In [None]:
# Ask CoolProp for 'T_wb' from the humidity array ('R'), the converted
# temperature array ('T') and a fixed 'P' of 101325, then map the result back
# with the inverse of the conversion used for t_db and store it as column 'wbt'.
T = HAPropsSI('T_wb', 'R', rh, 'T', t_db, 'P', 101325)
t_f = (T - 273.15) * 9 / 5 + 32
df['wbt'] = t_f

In [None]:
# Short alias for every column of the collated frame, listed in the order in
# which they are assigned to the frame's columns; the final entry names the
# wet-bulb column.
column_aliases = [
    'pchwst', 'vrf50', 'oat', 'sat', 'sat_stpt', 'oah',
    'vrf67', 'pchw_flow', 'hwe', 'vrf1', 'vrf30', 'vrf34',
    'vrf74', 'cwe', 'hws_st_stpt', 'vrf60', 'vrf63', 'hws_st',
    'hws_vlv1', 'vrf77', 'vrf64', 'vrf10', 'ee', 'hws_rt',
    'vrf100', 'vrf40', 'hws_flow', 'vrf108', 'vrf20', 'wbt',
]


In [None]:
# Rename the collated frame's columns to their aliases (positional, in place).
df.columns = column_aliases

In [None]:
# Load the stored meta data and hand it, already parsed, to the cleaning step.
meta_data_file = '../alumni_scripts/meta_data.json'
with open(meta_data_file, mode='r') as fp:
    meta_data_ = json.load(fp)
df_cleaned1 = dp.offline_batch_data_clean(df=df, meta_data_=meta_data_)

In [None]:
# describe() statistics of the cleaned frame as nested plain dicts
# (column -> {statistic: value}); they start a fresh meta data dictionary.
d2 = dict(df_cleaned1.describe())
stats = {column: dict(summary) for column, summary in d2.items()}
d1 = {'column_stats': stats}

In [None]:
# 'mean' or 'sum' aggregation type for every aliased column.
d3 = {'column_agg_type':
    {"pchwst": "mean","vrf50": "mean","oat": "mean","sat": "mean","sat_stpt": "mean","oah": "mean",
    "vrf67": "mean","pchw_flow": "sum","hwe": "sum","vrf1": "mean","vrf30": "mean","vrf34": "mean",
    "vrf74": "mean","cwe": "sum","hws_st_stpt": "mean","vrf60": "mean","vrf63": "mean","hws_st": "mean",
    "hws_vlv1": "sum","vrf77": "mean", "vrf64": "mean","vrf10": "mean","ee": "sum","hws_rt": "mean",
    "vrf100": "mean","vrf40": "mean","hws_flow": "sum","vrf108": "mean","vrf20": "mean", 'wbt' : "mean"
}}

# aggregate data
# 'sum' columns get a windowed sum, every other column a windowed mean.
agg_types = d3['column_agg_type']
rolling_sum_target = [col for col, agg in agg_types.items() if agg == 'sum']
rolling_mean_target = [col for col, agg in agg_types.items() if agg != 'sum']

# Work on a copy so the cleaned frame itself is not modified.
df_agg = df_cleaned1.copy()

df_agg[rolling_sum_target] = a_utils.window_sum(df_agg, window_size=6, column_names=rolling_sum_target)
df_agg[rolling_mean_target] = a_utils.window_mean(df_agg, window_size=6, column_names=rolling_mean_target)
df_agg = a_utils.dropNaNrows(df_agg)
# sample at half hour
df_agg = a_utils.sample_timeseries_df(df_agg, period=6)

# Create column stats for half hour data.
# df_agg is indexed by alias above, so describe() is already keyed by alias;
# use those keys directly rather than zipping positionally with
# column_aliases, which would mislabel statistics if describe() skipped a
# (non-numeric) column.
stats_halfhour = {alias: dict(col_stats) for alias, col_stats in df_agg.describe().items()}
d1['column_stats_half_hour'] = stats_halfhour



In [None]:
# Write the statistics of the cleaned data to a separate meta data file.
meta_data_file = '../alumni_scripts/meta_data1.json'
with open(meta_data_file, mode='w') as fp:
    json.dump(d1, fp=fp, indent=4)

In [None]:
# Plot every column before and after cleaning.
# This section imports the utilities as `a_utils` and names its cleaned frame
# `df_cleaned1`; the previous `utils` / `df_cleaned` names only resolved through
# leftovers of other sections in the kernel (and `df_cleaned` would then have
# been a different dataset).
for col_name in df.columns:
    a_utils.dataframeplot(df[[col_name]], lazy=False, legend=True)
    a_utils.dataframeplot(df_cleaned1[[col_name]], lazy=False, legend=True)

## Create Time Series Database: Demo only

In [None]:
# including the project directory to the notebook level
import os
import sys
# Make the notebook's parent directory importable: resolve '..' to an absolute
# path and append it to sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# import modules
from alumni_scripts import data_process as dp
from alumni_scripts import alumni_data_utils as utils
import json
import pandas as pd
from influxdb import DataFrameClient

In [None]:
# Load one raw batch, index it by its parsed 'time' column, and clean it using
# the meta data file on disk.
df = (
    pd.read_csv('../data/raw_data/alumni_data_jul2dec2018.csv')
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
)
df_cleaned = dp.offline_batch_data_clean(
    df=df,
    meta_data_path='../alumni_scripts/meta_data.json',
)

In [None]:
"""
before the next steps launch influxd client at a cli
sudo influxd
"""
# launch python client for influxdb
client = DataFrameClient(host='localhost', port=8086)
# create a database inc case it's not there
client.create_database('demo_alumni')
# get list of database
client.get_list_database()
# switch to the databaase you want
client.switch_database('demo_alumni')
# write "dataframe" as "measurements"
client.write_points(dataframe=df_cleaned, measurement='alumni_jul2dec2018', protocol='line', batch_size=5000)
# see measurement added to curent db
client.get_list_measurements()

In [None]:
# Fetch the rows of a 5-minute window and display them.
window_query = "select * from alumni_jul2dec2018 where time >= '2018-11-15 12:00:00' and time < '2018-11-15 12:05:00'"
results_obj = client.query(window_query)
df2 = results_obj['alumni_jul2dec2018']
df2

In [None]:
# Fetch the row stored at one exact timestamp and display it.
point_query = "select * from alumni_jul2dec2018 where time = '2018-11-15 12:00:00'"
results_obj2 = client.query(point_query)
df3 = results_obj2['alumni_jul2dec2018']
df3

In [None]:
# Clean up after the demo: remove the demo database, then display the
# databases that remain.
demo_db = 'demo_alumni'
client.drop_database(demo_db)
client.get_list_database()

In [None]:
# Close the InfluxDB client that was opened for the demo cells above.
client.close()

## Code cemetery

In [None]:
from datetime import datetime, timedelta

# Parse a timestamp string, render it back to text, and show both forms.
time_str = '2018-11-15 13:12:00'
time_now = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
time_now_str = time_now.isoformat(sep=' ')
print(time_now)
print(time_now_str)
# A datetime object never compares equal to a string, so this displays False.
time_now == time_now_str

In [None]:
# Load the stored meta data and display the aggregation type of 'pchwst'.
meta_data_file = '../alumni_scripts/meta_data.json'
with open(meta_data_file, mode='r') as fp:
    meta_data_ = json.load(fp)
meta_data_['column_agg_type']['pchwst']

In [None]:
# Display the aggregation types stored in the meta data (values only, without the column names).
meta_data_['column_agg_type'].values()

In [None]:
# Split the column names by aggregation type: 'sum' columns in one list,
# every other column in the second.
agg_types = meta_data_['column_agg_type']
rolling_sum_target = [col for col, agg in agg_types.items() if agg == 'sum']
rolling_mean_target = [col for col, agg in agg_types.items() if agg != 'sum']

In [None]:
# Display the columns whose aggregation type is 'sum'.
rolling_sum_target

In [None]:
# Append one record to a JSON-lines log file, then read the whole file back
# and print every record it contains.
q = {'c': '11', 'b': ['1f3', 'a']}
log_file = '../logs/cwe_test_info.txt'
with open(log_file, 'a') as ifile:
    ifile.write('{}\n'.format(json.dumps(q)))
with open(log_file) as f:
    for line in f:
        document = json.loads(line)
        print(document)

In [None]:
# `np` and `a_utils` are used below; import them here so the cell does not
# depend on names left behind by other sections of the notebook.
# (The project package still requires the parent directory on sys.path.)
import numpy as np
import pandas as pd
from CoolProp.HumidAirProp import HAPropsSI
from dateutil import tz

from alumni_scripts import alumni_data_utils as a_utils

df_ = pd.read_csv('../data/trend_data/alumni_data_train.csv', )
df_['time'] = pd.to_datetime(df_['time'])
to_zone = tz.tzlocal()
df_['time'] = df_['time'].apply(lambda x: x.astimezone(to_zone)) # convert time to local timezone
df_.set_index(keys='time',inplace=True, drop = True)
df_ = a_utils.dropNaNrows(df_)

# Humidity column divided by 100 (presumably percent -> fraction).
rh = df_['WeatherDataProfile humidity']/100
rh = rh.to_numpy()
# The arithmetic is the Fahrenheit -> Kelvin conversion.
t_db = 5*(df_['AHU_1 outdoorAirTemp']-32)/9 + 273.15
t_db = t_db.to_numpy()

# Two-column array: column 0 is t_db, column 1 is rh.
tdb_rh = np.concatenate((t_db.reshape(-1,1), rh.reshape(-1,1)), axis=1)

In [None]:
import multiprocessing
import psutil
# Split the two-column array row-wise into one piece per CPU; every work item
# is (column 0 of the piece, column 1 of the piece, index of the piece).
n_workers = multiprocessing.cpu_count()
chunks = []
for cpu_id, sub_arr in enumerate(np.array_split(tdb_rh, n_workers, axis=0)):
    chunks.append((sub_arr[:, 0].flatten(), sub_arr[:, 1].flatten(), cpu_id))

def unpacking_apply_along_axis(all_args):
    """Compute 'T_wb' for one work item after pinning the process to one CPU.

    Args:
        all_args: a 3-tuple ``(t_db, rh, cpu_id)``; the first two are passed to
            ``HAPropsSI`` as the 'T' and 'R' inputs, the third is the CPU index
            the current process is restricted to.

    Returns:
        Whatever ``HAPropsSI('T_wb', ...)`` returns for the given inputs at a
        fixed 'P' of 101325.
    """
    dry_bulb, rel_humidity, core = all_args

    # Restrict the process executing this call to the requested CPU index.
    psutil.Process().cpu_affinity([core])

    return HAPropsSI('T_wb', 'R', rel_humidity, 'T', dry_bulb, 'P', 101325)

# Evaluate all work items in a process pool and stitch the per-chunk results
# back together in their original order.
pool = multiprocessing.Pool()
try:
    individual_results = pool.map(unpacking_apply_along_axis, chunks)
finally:
    # Freeing the workers — also when one of the chunks raises, so that no
    # worker processes are left behind.
    pool.close()
    pool.join()
final_T = np.concatenate(individual_results)

## Plotly example

In [None]:
import numpy as np
import plotly.graph_objects as go

In [None]:
# Load the saved cwe train/validation arrays and plot every variable of the
# training set as its own line.
basepath = '../tmp/cwe_data/cwe'
labelnames = ['sat-oat', 'oah', 'wbt', 'pchw_flow', 'cwe']
X_train, y_train, X_val, y_val = (
    np.load(basepath + suffix)
    for suffix in ('_X_train.npy', '_y_train.npy', '_X_val.npy', '_y_val.npy')
)

# Join inputs and targets along the last axis and keep index 0 of axis 1,
# leaving a 2-D array with one column per label.
train = np.concatenate((X_train, y_train), axis=-1)[:, 0, :]

fig = go.Figure()
for column_index, series in enumerate(train.T):
    fig.add_trace(go.Scatter(y=series, mode='lines', name=labelnames[column_index]))

# Edit the layout
fig.update_layout(
    title='Different Variables',
    xaxis_title='Time Points',
    yaxis_title='Scaled Values',
    font=dict(family='Times New Roman', size=15),
)

fig.show()

In [None]:
# Load the saved hwe train/validation arrays and plot every variable of the
# training set as its own line.
basepath = '../tmp/hwe_data/hwe'
labelnames = ['oat', 'oah', 'wbt', 'sat-oat', 'hwe']
X_train, y_train, X_val, y_val = (
    np.load(basepath + suffix)
    for suffix in ('_X_train.npy', '_y_train.npy', '_X_val.npy', '_y_val.npy')
)

# Join inputs and targets along the last axis and keep index 0 of axis 1,
# leaving a 2-D array with one column per label.
train = np.concatenate((X_train, y_train), axis=-1)[:, 0, :]

fig = go.Figure()
for column_index, series in enumerate(train.T):
    fig.add_trace(go.Scatter(y=series, mode='lines', name=labelnames[column_index]))

# Edit the layout
fig.update_layout(
    title='Different Variables',
    xaxis_title='Time Points',
    yaxis_title='Scaled Values',
    font=dict(family='Times New Roman', size=15),
)

fig.show()

In [None]:
# Load the saved vlv train/validation arrays and plot every variable of the
# training set as its own line.
basepath = '../tmp/vlv_data/vlv'
labelnames = ['oat', 'oah', 'wbt', 'sat-oat', 'vlv_off', 'vlv_on']
X_train, y_train, X_val, y_val = (
    np.load(basepath + suffix)
    for suffix in ('_X_train.npy', '_y_train.npy', '_X_val.npy', '_y_val.npy')
)

# Join inputs and targets along the last axis and keep index 0 of axis 1,
# leaving a 2-D array with one column per label.
train = np.concatenate((X_train, y_train), axis=-1)[:, 0, :]

fig = go.Figure()
for column_index, series in enumerate(train.T):
    fig.add_trace(go.Scatter(y=series, mode='lines', name=labelnames[column_index]))

# Edit the layout
fig.update_layout(
    title='Different Variables',
    xaxis_title='Time Points',
    yaxis_title='Scaled Values',
    font=dict(family='Times New Roman', size=15),
)

fig.show()