# Esperanza First Development Notebook

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import prophet as Prophet

In [None]:
# Read the raw export for floors 2, 3 and 4; each file's 'Unnamed: 0' column
# becomes the frame index.
floor_2, floor_3, floor_4 = (
    pd.read_csv(f'../data/raw/merged_all_{level}F.csv', index_col='Unnamed: 0')
    for level in (2, 3, 4)
)

In [None]:
floor_2 = floor_2.drop(['Unnamed: 0.1', 'Unnamed: 0.1.1'], axis = 1)

In [None]:
floor_2.head(5)

In [None]:
floor_3 = floor_3.drop(['Unnamed: 0.1'], axis = 1)

In [None]:
floor_3.head(5)

In [None]:
floor_4 = floor_4.drop(['Unnamed: 0.1'], axis = 1)

In [None]:
floor_4.head(5)

In [None]:
# Tag every row with the floor its file describes so the frames can be stacked.
for floor_number, floor_frame in ((2, floor_2), (3, floor_3), (4, floor_4)):
    floor_frame['floor'] = floor_number

In [None]:
combined_floors = pd.concat([floor_2, floor_3, floor_4]).reset_index(drop = True)

In [None]:
combined_floors.head(5)

In [None]:
combined_floors['time_transformed'] = combined_floors['time'].apply(lambda x: pd.Timestamp(x))

### Graph Analyses

In [None]:
# One grid of histograms instead of eleven copy-pasted cells. DataFrame.hist
# draws one panel per column and titles it with the column name, so every
# distribution is labelled; the trailing ';' suppresses the Axes-array repr.
hist_columns = [
    'Common Setpoint',
    'Actual Sup Flow SP',
    'Zone Temperature',
    'Zone Temperature Next',
    'Actual Supply Flow',
    'Actual Supply Flow Next',
    'energy',
    'energy Next',
    'Outside Air Temp',
    'Humidity',
    'Humidity Next',
]
combined_floors[hist_columns].hist(figsize=(16, 12));

In [None]:
# Summary statistics for the stacked frame.
combined_floors.describe()

In [None]:
# (rows, columns) of the stacked frame.
combined_floors.shape

### Turning Time Values into different columns and evaluating

In [None]:
combined_floors['year'] = combined_floors['time_transformed'].transform(lambda x: x.year)

In [None]:
combined_floors['month'] = combined_floors['time_transformed'].transform(lambda x: x.month)

In [None]:
combined_floors['month'].value_counts()
# no values from March - May in any of the years ?

In [None]:
combined_floors['day'] = combined_floors['time_transformed'].transform(lambda x: x.day)

In [None]:
# data is supposed to be from July 2017 to the end of June 2018, but we have a little January 2019 and are missing months

### Splitting Dates for a 70/30 train/test split

In [None]:
# Calendar day of every reading, as a timezone-naive midnight Timestamp.
# `Timestamp.date` is a method: accessing it without calling it stores bound
# methods in the Series, which cannot be compared with pd.Timestamp cut-offs.
# Normalising to midnight and dropping the timezone yields values that compare
# cleanly against pd.Timestamp('YYYY-MM-DD') and passes NaT through unchanged.
dates = combined_floors['time_transformed'].apply(lambda ts: ts.normalize().tz_localize(None))

In [None]:
dates_2019 = (dates >= pd.Timestamp('2019-01-01'))

In [None]:
onwards_2019 = combined_floors.loc[dates_2019, :]

In [None]:
pre_2019 = combined_floors.loc[~dates_2019, :]

In [None]:
onwards_2019.shape

In [None]:
pre_2019.shape

This spans July 2017 to beginning of January 2019.

This is not enough data to segment by year. If we want to go for a 70/30 split:

With about a year and a half of data, 50% of it would be 3/4 of a year and 75% would be 1 1/8 years, so a 70% training share works out to roughly one year of data (estimating).



In [None]:
dates_test = (dates >= pd.Timestamp('2018-08-01'))

In [None]:
onwards_test_date = combined_floors.loc[dates_test, :]
pre_test_date = combined_floors.loc[~dates_test, :]


In [None]:
pre_test_date.shape

In [None]:
onwards_test_date.shape

In [None]:
# Share of rows that land in the held-out (test) window, computed from the
# frames themselves so it stays correct if the data or the cut-off date changes.
len(onwards_test_date) / (len(onwards_test_date) + len(pre_test_date))

That's pretty close to a 70%, 30% split so I'll use that for the split. We could do the split at the 7th/8th of the month to try and match days better but I think this is fine.

### Trying out simple Linear and Decision Tree Models

In [None]:
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import tree

In [None]:
# Predictor columns shared by the training and test matrices; 'energy' is the target.
feature_columns = [
    'Zone Num',
    'Common Setpoint',
    'Actual Sup Flow SP',
    'Zone Temperature',
    'Actual Supply Flow',
    'Outside Air Temp',
    'Humidity',
    'floor',
    'time_transformed',
]

pre_test_date_X = pre_test_date.loc[:, feature_columns]
pre_test_date_y = pre_test_date['energy']

onwards_test_date_X = onwards_test_date.loc[:, feature_columns]
onwards_test_date_y = onwards_test_date['energy']

In [None]:
pre_test_date_X['month'] = pre_test_date_X['time_transformed'].transform(lambda x: x.month)
pre_test_date_X['year'] = pre_test_date_X['time_transformed'].transform(lambda x: x.year)
pre_test_date_X['day'] = pre_test_date_X['time_transformed'].transform(lambda x: x.day)
pre_test_date_X['weekday'] = pre_test_date_X['time_transformed'].transform(lambda x: x.weekday)
pre_test_date_X['hour'] = pre_test_date_X['time_transformed'].transform(lambda x: x.hour)
pre_test_date_X['minute'] = pre_test_date_X['time_transformed'].transform(lambda x: x.minute)
pre_test_date_X['second'] = pre_test_date_X['time_transformed'].transform(lambda x: x.second)

In [None]:
onwards_test_date_X['month'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.month)
onwards_test_date_X['year'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.year)
onwards_test_date_X['day'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.day)
onwards_test_date_X['weekday'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.weekday)
onwards_test_date_X['hour'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.hour)
onwards_test_date_X['minute'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.minute)
onwards_test_date_X['second'] = onwards_test_date_X['time_transformed'].transform(lambda x: x.second)

In [None]:
# Remove the raw timestamp column from both feature matrices.
pre_test_date_X = pre_test_date_X.drop(columns=['time_transformed'])
onwards_test_date_X = onwards_test_date_X.drop(columns=['time_transformed'])


#### Linear

In [None]:
regr = linear_model.LinearRegression()
regr.fit(pre_test_date_X, pre_test_date_y)

In [None]:
pre_y_pred = regr.predict(pre_test_date_X)


In [None]:
print("Mean squared error: %.2f" % mean_squared_error(pre_test_date_y, pre_y_pred))

In [None]:
onw_y_pred = regr.predict(onwards_test_date_X)

In [None]:
print("Mean squared error: %.2f" % mean_squared_error(onwards_test_date_y, onw_y_pred))

This doesn't really involve prediction, though; it mostly relies on having all of the data and evaluating. Our goal is to predict ahead.

#### Decision Tree

In [None]:
# Shallow regression tree fitted on the training window. random_state is pinned
# because scikit-learn permutes the features at every split, so ties between
# equally good splits could otherwise be broken differently from run to run.
decTree = tree.DecisionTreeRegressor(max_depth = 5, min_samples_split = 5, random_state = 0)
decTree.fit(pre_test_date_X, pre_test_date_y)

In [None]:
preds_y_decTree = decTree.predict(pre_test_date_X)

In [None]:
print("Mean squared error: %.2f" % mean_squared_error(pre_test_date_y, preds_y_decTree))

In [None]:
preds_y_decTree_onw = decTree.predict(onwards_test_date_X)

In [None]:
print("Mean squared error: %.2f" % mean_squared_error(onwards_test_date_y, preds_y_decTree_onw))

Changed the max_depth and min_samples_split values based on initial overfitting. Again, this doesn't fix the issue that we're not really able to predict ahead of time.

Some potential options to look into for sklearn:
- https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html#sphx-glr-auto-examples-applications-plot-cyclical-feature-engineering-py
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

### Prophet Model

#### Need to rename columns and remove timezones to run Prophet

In [None]:
prophet_test_pre = pre_test_date.loc[:, ['time', 'energy']].rename({'time': 'ds', 'energy': 'y'}, axis = 1)

In [None]:
prophet_test_pre['ds'] = prophet_test_pre['ds'].transform(lambda x: x[0:-6])

Running the model.

In [None]:
# `Prophet` is the module alias created by `import prophet as Prophet`, so
# `Prophet.Prophet()` instantiates the model class with its default settings.
prophet_model = Prophet.Prophet()

In [None]:
# Fit on the training frame (columns 'ds' and 'y').
prophet_model.fit(prophet_test_pre)

In [None]:
# Forecast exactly at the held-out timestamps. A future frame of 365 hourly
# periods only reaches about 15 days past the last training row, on a regular
# hourly grid, so an exact-timestamp join against the held-out readings can
# come back empty. The last six characters (the offset) are stripped the same
# way as for the training frame so 'ds' stays timezone-naive.
holdout_ds = onwards_test_date['time'].apply(lambda stamp: pd.Timestamp(stamp[:-6]))
prophet_future = pd.DataFrame({'ds': holdout_ds.drop_duplicates().sort_values().to_numpy()})
prophet_forecast = prophet_model.predict(prophet_future)


In [None]:
prophet_forecast_reduced = prophet_forecast.loc[:, ['ds', 'yhat']]

In [None]:
prophet_forecast_reduced['timestamp_changed'] = prophet_forecast_reduced['ds'].transform(lambda x: pd.Timestamp(x))

In [None]:
# Held-out actuals reshaped like the forecast frame: offset-stripped timestamp
# string in 'ds', observed energy in 'yhat' (the name clashes with the forecast
# column, so a merge suffixes both), and a parsed Timestamp join key.
onwards_compare = (
    onwards_test_date.loc[:, ['time', 'energy']]
    .assign(time=lambda frame: frame['time'].apply(lambda stamp: stamp[:-6]))
    .rename(columns={'time': 'ds', 'energy': 'yhat'})
    .assign(timestamp_changed=lambda frame: frame['ds'].apply(lambda stamp: pd.Timestamp(stamp)))
)

In [None]:
merge_test = prophet_forecast_reduced.merge(onwards_compare, left_on = 'timestamp_changed', right_on = 'timestamp_changed')

In [None]:
merge_test

There should be rows in this merge. If it comes back empty, the forecast timestamps and the held-out timestamps do not line up exactly, so we have to re-evaluate how we work with timestamps.