In [None]:
import pandas as pd
import numpy as np
import COPP_Utils

%matplotlib inline

In [None]:
# Load the raw datasets. Each CSV is indexed by its parsed 'datetime' column so
# the pieces can be appended to one another on the time index later on.
sen = pd.read_csv('sensor_data.csv', parse_dates=True, index_col='datetime')
occ = pd.read_csv('occupancy_data.csv', parse_dates=True, index_col='datetime')
imgs = pd.read_csv('image_variations.csv', parse_dates=True, index_col='datetime')
door = pd.read_csv('door_data.csv', parse_dates=True, index_col='datetime', dtype={'door_status':'category'})

# Drop irrelevant sensor columns. Be sure to remove columns that would be
# duplicated when concatenating (such as location).
# It was demonstrated that 'bluetooth_devices' and
# 'bluetooth_non_personal_devices' are highly correlated, so the latter is
# dropped as well.
sen = sen.drop(columns=['location', 'loc_CO2', 'bluetooth_non_personal_devices'])

# Keep only the occupancy count, under a proper name. The rename is done
# explicitly on the selected Series: assigning to `occ.count_total.name` only
# works while pandas hands back a cached column object, and is silently lost
# otherwise.
occ = occ['count_total'].rename('occupancy_count')

# Keep only the image histogram change, under a proper name (same reasoning).
imgs = imgs['rolling_rms'].rename('image_hist_change')

# Drop duplicated timestamps from the door data, keeping the last one seen:
# two events may occur in the same second (door quickly opening and closing),
# so this dataset may contain "duplicated" datetimes.
door = door.loc[~door.index.duplicated(keep='last'), 'door_status']

In [None]:
# Remove sentinel/outlier readings from the sensor data, reporting the row
# count before and after each filter.

# Temperature: drop every row whose reading is the -999 placeholder.
print("before temp outliers removal:", len(sen))
sen = sen.loc[sen['temperature'].ne(-999)]
print("after temp outliers removal:", len(sen))

# CO2: drop every row whose reading is exactly 2.
print("----\nbefore co2 outliers removal:", len(sen))
sen = sen.loc[sen['co2'].ne(2)]
print("after co2 outliers removal:", len(sen))

In [None]:
# Interpolate the missing-data gaps in the sensor readings using the project
# helper COPP_Utils.interpolateByDay, and keep the result as the new `sen`.
# NOTE(review): the original note says the 'pchip' method is used; that choice
# is made inside the helper, not in this cell -- confirm against COPP_Utils.
sen = COPP_Utils.interpolateByDay(sen)

In [None]:
# Add derived features to the sensor data and put the columns in their final
# order. `assign` returns a new frame, so the derived columns are computed from
# the current `sen` and selected in one expression.
sen = sen.assign(
    # temperature in Fahrenheit degrees
    temperature_f=9.0 / 5.0 * sen['temperature'] + 32,
    # light switch on/off as 1/0; selected threshold: a reading < 700 is on,
    # otherwise it is off
    light_status=(sen['light'] < 700).astype(int),
)[['temperature',
   'temperature_f',
   'humidity', 'co2', 'light',
   'light_status',
   'noise', 'bluetooth_devices']]

In [None]:
# First, append the image comparison feature (the 'rolling_rms' series, which
# is expected to arrive as the 'image_hist_change' column).
extended_set = COPP_Utils.appendFeature(sen, imgs, 'temperature')

# Readings where the image change is exactly zero are set to NaN and then
# back-filled from the next available value.
# The column is rewritten by label in a single assignment; the previous chained
# form (`frame.col[mask] = ...`) writes into a temporary and is not guaranteed
# to update the frame.
extended_set['image_hist_change'] = (
    extended_set['image_hist_change']
    .mask(extended_set['image_hist_change'] == 0)
    .bfill()
)

extended_set.head(3)

In [None]:
# Next, append the door information to the feature set built so far.
# NOTE(review): the last two arguments are interpreted by
# COPP_Utils.appendFeature; presumably 'temperature' names a reference column
# and 'closed' is the value used where no door event is available -- confirm
# against the helper.
extended_set = COPP_Utils.appendFeature(extended_set, door, 'temperature', 'closed')

# Preview the first three rows of the result.
extended_set.head(3)

In [None]:
# Finally, append Y (the occupancy series `occ`) to obtain the full dataset.
# NOTE(review): the last two arguments are interpreted by
# COPP_Utils.appendFeature; presumably 'temperature' names a reference column
# and 0 is the value used where no occupancy reading is available -- confirm
# against the helper.
dataset = COPP_Utils.appendFeature(extended_set, occ, 'temperature', 0)
# Preview the first three rows of the result.
dataset.head(3)

In [None]:
# Derive an extra, categorical Y from the occupancy count using bins:
#   0 - 8   : Very Low occupation
#   9 - 15  : Low occupation
#   16 - 24 : Fair occupation
#   25 +    : High occupation (31 is the maximum observed so far)
# pd.cut builds right-closed intervals, so the edges below read as
# (-1, 8], (8, 15], (15, 24], (24, inf). The last edge is left open-ended so a
# count above the current maximum is labelled 'high' instead of becoming NaN.
bins = [-1, 8, 15, 24, np.inf]
group_names = ['very-low', 'low', 'fair', 'high']
dataset['occupancy_category'] = pd.cut(dataset['occupancy_count'], bins, labels=group_names)

# Set light status as a category at this point (doing it before raised a
# SettingWithCopyWarning). Any value > 0 is 'light-on', everything else is
# 'light-off'.
dataset['light_status'] = (
    (dataset['light_status'] > 0)
    .map({True: 'light-on', False: 'light-off'})
    .astype('category')
)

# DATASET READY:
dataset.head()

In [None]:
# Plot every column for a single day (2017-03-25). Rows are selected with
# `.loc`, since selecting rows by a date string through plain `[]` is treated
# as a column lookup by current pandas.
dataset.loc['20170325'].plot(figsize=(16,14))

In [None]:
# Generate the minute-by-minute version of the dataset ('T' is passed to the
# helper as the target frequency). The helper is called through its module,
# because only `COPP_Utils` itself is imported in this notebook.
dataset_1M = COPP_Utils.interpolateByDay(dataset, 'T')
# Show only the first rows rather than the whole frame.
dataset_1M.head()

In [None]:
# Write both resolutions of the dataset to CSV: the original sampling goes to
# 'dataset-5sec.csv' and the minute-by-minute version to 'dataset-1min.csv'.
for frame, target in ((dataset, 'dataset-5sec.csv'), (dataset_1M, 'dataset-1min.csv')):
    frame.to_csv(target)