In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path to sanitized data
path_metadata = 'data/metadata_sanitized.csv'
path_weather ='data/weather_sanitized.csv'
path_meter_cleaned = 'data/meter_sanitized.csv'

In [3]:
# load data
metadata = pd.read_csv(path_metadata)
weather = pd.read_csv(path_weather)
meter_all = pd.read_csv(path_meter_cleaned)

In [4]:
# verify that no values are missing
print('metadata missing values: \n', metadata.isnull().sum() * 100 / len(metadata))
print('weather missing values: \n', weather.isnull().sum() * 100 / len(weather))
print('meter missing values: \n', meter_all.isnull().sum() * 100 / len(meter_all))

metadata missing values: 
 building_id              0.0
site_id                  0.0
sqm                      0.0
primaryspaceusage        0.0
sub_primaryspaceusage    0.0
lat                      0.0
lng                      0.0
dtype: float64
weather missing values: 
 timestamp         0.0
site_id           0.0
airTemperature    0.0
dewTemperature    0.0
seaLvlPressure    0.0
windDirection     0.0
windSpeed         0.0
dtype: float64
meter missing values: 
 timestamp        0.0
building_id      0.0
meter_reading    0.0
dtype: float64


In [5]:
# create a small training set with 10 random buildings
metadata = metadata.sample(n=10, random_state=42)
metadata

Unnamed: 0,building_id,site_id,sqm,primaryspaceusage,sub_primaryspaceusage,lat,lng
677,Bear_education_Gavin,Bear,7158.6,Education,Education,37.871903,-122.260729
1589,Cockatoo_lodging_Lana,Cockatoo,4541.2,Lodging/residential,Dormitory,42.459837,-76.485292
984,Gator_assembly_Roy,Gator,2461.5,Entertainment/public assembly,Sports Facility,0.0,0.0
607,Rat_public_Sana,Rat,37161.2,Public services,Library,38.903504,-77.005349
1392,Hog_office_Miriam,Hog,1418.5,Office,Office,44.978782,-93.255398
1033,Bull_education_Shona,Bull,1352.8,Education,College Laboratory,30.2672,-97.7431
1582,Cockatoo_lodging_Javier,Cockatoo,12411.3,Lodging/residential,Dormitory,42.459837,-76.485292
1309,Hog_education_Odell,Hog,14537.7,Education,College Laboratory,44.978782,-93.255398
1626,Cockatoo_assembly_Mimi,Cockatoo,1046.6,Entertainment/public assembly,Gymnasium,42.459837,-76.485292
271,Fox_public_Bart,Fox,9407.5,Public services,Library,33.424425,-111.92814


In [6]:
# get the meter data for the selected buildings
meter = meter_all[meter_all['building_id'].isin(metadata['building_id'])]

In [7]:
# add site_id to the meter data by getting first word of building_id
meter['site_id'] = meter['building_id'].apply(lambda x: x.split('_')[0])
meter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meter['site_id'] = meter['building_id'].apply(lambda x: x.split('_')[0])


Unnamed: 0,timestamp,building_id,meter_reading,site_id
3649152,2016-01-01 00:00:00,Fox_public_Bart,46.290,Fox
3649153,2016-01-01 01:00:00,Fox_public_Bart,36.790,Fox
3649154,2016-01-01 02:00:00,Fox_public_Bart,36.520,Fox
3649155,2016-01-01 03:00:00,Fox_public_Bart,37.520,Fox
3649156,2016-01-01 04:00:00,Fox_public_Bart,38.090,Fox
...,...,...,...,...
26508979,2017-12-31 19:00:00,Cockatoo_lodging_Javier,46.425,Cockatoo
26508980,2017-12-31 20:00:00,Cockatoo_lodging_Javier,47.200,Cockatoo
26508981,2017-12-31 21:00:00,Cockatoo_lodging_Javier,47.475,Cockatoo
26508982,2017-12-31 22:00:00,Cockatoo_lodging_Javier,47.700,Cockatoo


In [8]:
weather

Unnamed: 0,timestamp,site_id,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed
0,2016-01-01 00:00:00,Panther,19.4,19.4,1019.4,0.0,0.0
1,2016-01-01 01:00:00,Panther,21.1,21.1,1019.4,0.0,0.0
2,2016-01-01 02:00:00,Panther,21.1,21.1,1019.4,210.0,1.5
3,2016-01-01 03:00:00,Panther,20.6,20.0,1019.4,0.0,0.0
4,2016-01-01 04:00:00,Panther,21.1,20.6,1019.4,290.0,1.5
...,...,...,...,...,...,...,...
331161,2017-12-31 19:00:00,Mouse,8.5,4.8,992.3,210.0,8.2
331162,2017-12-31 20:00:00,Mouse,8.5,4.5,992.1,210.0,7.2
331163,2017-12-31 21:00:00,Mouse,8.2,4.0,992.1,230.0,10.3
331164,2017-12-31 22:00:00,Mouse,7.5,4.3,993.7,260.0,12.9


In [9]:
# add weather data to meter data based on site_id
meter = meter.merge(weather, on=['timestamp', 'site_id'], how='left')
meter

Unnamed: 0,timestamp,building_id,meter_reading,site_id,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed
0,2016-01-01 00:00:00,Fox_public_Bart,46.290,Fox,7.2,-5.6,1017.3,0.0,0.0
1,2016-01-01 01:00:00,Fox_public_Bart,36.790,Fox,7.2,-6.7,1017.0,50.0,1.5
2,2016-01-01 02:00:00,Fox_public_Bart,36.520,Fox,6.1,-5.0,1016.9,0.0,0.0
3,2016-01-01 03:00:00,Fox_public_Bart,37.520,Fox,5.0,-5.6,1017.4,0.0,0.0
4,2016-01-01 04:00:00,Fox_public_Bart,38.090,Fox,4.4,-5.6,1017.2,0.0,0.0
...,...,...,...,...,...,...,...,...,...
175435,2017-12-31 19:00:00,Cockatoo_lodging_Javier,46.425,Cockatoo,,,,,
175436,2017-12-31 20:00:00,Cockatoo_lodging_Javier,47.200,Cockatoo,-16.7,-20.0,1028.7,270.0,5.7
175437,2017-12-31 21:00:00,Cockatoo_lodging_Javier,47.475,Cockatoo,-16.1,-18.9,1028.5,290.0,5.1
175438,2017-12-31 22:00:00,Cockatoo_lodging_Javier,47.700,Cockatoo,-15.6,-17.8,1028.3,290.0,5.7


In [10]:
# check if there are any missing values
meter.isnull().sum() * 100 / len(meter)

timestamp         0.000000
building_id       0.000000
meter_reading     0.000000
site_id           0.000000
airTemperature    0.988372
dewTemperature    0.988372
seaLvlPressure    0.988372
windDirection     0.988372
windSpeed         0.988372
dtype: float64

In [11]:
# interpolate missing values
meter = meter.interpolate(limit_direction='both')

In [12]:
# save the small dataset
meter.to_csv('data/dataset_small.csv', index=False)