In [1]:
import pandasql as ps
import pandas as pd
import chart_studio.plotly as py
import plotly.graph_objs as go
# Offline mode
from plotly.offline import init_notebook_mode, iplot, plot
init_notebook_mode(connected='true')


In [2]:
buildings_metadata_full = pd.read_csv('./FirstApproach/data/building_metadata.csv')
buildings_metadata_full = buildings_metadata_full.dropna()
buildings_metadata_full = buildings_metadata_full.reset_index(drop=True)
weather_train_full = pd.read_csv('./FirstApproach/data/weather_train.csv')
weather_test_full = pd.read_csv('./FirstApproach/data/weather_test.csv')
train_full = pd.read_csv('./FirstApproach/data/train.csv')
test_full = pd.read_csv('./FirstApproach/data/test.csv')
leak_df = pd.read_csv('./FirstApproach/data/leak_df.csv')
leaked_test_target = pd.read_csv('./FirstApproach/data/leaked_test_target.csv')

------

# 2. Data Visualization


## 2.1 Interactive Plot Exploration

We'll plot compatible variables toghether along with the energy consumption series in order to observe the behavior and have a rough idea of how they are correlated. Later on we'll do further statistical analysis and plotting on these same series.

In [34]:
air_temp = go.Scatter(x=train_set['air_temperature'].index, y=train_set['air_temperature'].values, name = 'Air temperature', line=dict(color='royalblue', width=0.7), yaxis='y')
dew_temp = go.Scatter(x=train_set['dew_temperature'].index, y=train_set['dew_temperature'].values, name = 'Dew temperature', line=dict(color='lightblue', width=0.7), yaxis='y')
pressure = go.Scatter(x=train_set['sea_level_pressure'].index, y=train_set['sea_level_pressure'].values, name = 'Sea level pressure', line=dict(color='orange'), yaxis='y')
wind_dir = go.Scatter(x=train_set['wind_direction'].index, y=train_set['wind_direction'].values, name = 'Wind direction', line=dict(color='chocolate'), yaxis='y')
wind_spd = go.Scatter(x=train_set['wind_speed'].index, y=train_set['wind_speed'].values, name = 'Wind speed', line=dict(color='blueviolet'), yaxis='y')
electric = go.Scatter(x=train_set['electricity'].index, y=train_set['electricity'].values, name = 'Electricity', line=dict(color='mediumvioletred', width=0.7), yaxis='y2')

In [35]:
layout_temp = go.Layout(title='Temperature and Electric Consumption', xaxis=dict(title='Date'),
                   yaxis=dict(title='ºC', color='royalblue', overlaying='y2'),
                   yaxis2=dict(title='kBTU', color='purple', side='right')    )

In [36]:
fig = go.Figure(data=[air_temp, dew_temp, electric], layout=layout_temp)
fig.show()

In [37]:
layout_speed = go.Layout(title='Wind Speed', xaxis=dict(title='Date'),
                   yaxis=dict(title='m/s', color='royalblue', overlaying='y2'),
                   yaxis2=dict(title='kBTU', color='purple', side='right'))

In [38]:
fig = go.Figure(data=[wind_spd], layout=layout_speed)
fig.show()

In [39]:
layout_direction = go.Layout(title='Wind Direction', xaxis=dict(title='Date'),
                   yaxis=dict(title='Compass º', color='royalblue', overlaying='y2'),
                   yaxis2=dict(title='kBTU', color='purple', side='right'))

In [40]:
fig = go.Figure(data=[wind_dir], layout=layout_direction)
fig.show()

In [41]:
layout_pressure = go.Layout(title='Sea-Level Pressure', xaxis=dict(title='Date'),
                   yaxis=dict(title='mBar', color='royalblue', overlaying='y2'),
                   yaxis2=dict(title='kBTU', color='purple', side='right'))

In [42]:
fig = go.Figure(data=[pressure], layout=layout_pressure)
fig.show()

Not all of this weather data appears to have an evident correlation with the electricity consumption. Some of them appear to have some sort of correlation with each other (for example wind speed and the pressure). We would expect our model to learn all the hidden dependencies between all of the different weather data.

The graphs above shows how the temperature has a trend to increase in summer and decrease in invernal season, so as expected. That one would be a non-linear trend.

The meter reading for electricity show an inverse trend as the temperatures. It's understandable that for colder days, more electricity gets used.

For the wind direction and speed no evident trend pattern shows up.

## 2.2 Decomposing Data into Components

In [43]:
#ref: https://machinelearningmastery.com/decompose-time-series-data-trend-seasonality/
from statsmodels.tsa.seasonal import seasonal_decompose
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display

Time series can be divided into several components for a further analysis. These include: base level, trend, seasonality and error.
There are two kinds of time series within that matter: additive and multiplicative time series.
On additive time series, the components are sumed up to aproximate the expected value.
On mutiplicative, these components are multiplied by each other.

- Additive time series:
    $\\
    value = base\ level + trend + seasonality + error$
    
- Multiplicative time series:
    $\\
    value = base\ level\ \times\ trend\ \times\ seasonality\ \times\ error$

### 2.2.1 Additive components decomposition

When we perform a component decomposition we can choose a particular period to tell the funtion how we would like it to exctract the different behaviours. It can also make it automatically, but it won't necesarily do it as well as it could be done. therefore, we'll fine tune this paramenter in order to extract the behaviour of our interest.

#### 2.2.1.1 air_temperature

In [44]:
@interact(p_t=(1,int(366/2),1), p_s1=(1,int(366/2),1), p_s2=(1,int(366/2),1), p_s3=(1,int(366/2),1), p_n=(1,int(366/2),1))
def decompose_air_temperature(p_t=90, p_s=2, p_n=1):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-44-d55a2e6d10ce>, line 3)

In [None]:
decompose_day = seasonal_decompose(x = train_set['electricity'], model='additive', extrapolate_trend='freq', period=1*24) 
decompose_week = seasonal_decompose(x = train_set['electricity'], model='additive', extrapolate_trend='freq', period=7*24) 
decompose_month = seasonal_decompose(x = train_set['electricity'], model='additive', extrapolate_trend='freq', period=30*24) 
decompose_bimestral = seasonal_decompose(x = train_set['electricity'], model='additive', extrapolate_trend='freq', period=60*24)

In [None]:
#We're interested on daily fluctuations. We'll keep the seasonal time series of 1 day period.
day_seasonal = decompose_day.seasonal
#In order to extract weekend behaviour:
week_seasonal = decompose_week.seasonal
wEnd_seasonal = week_seasonal - day_seasonal
month_seasonal = decompose_month.seasonal

#additive_sum = additive_t.trend + additive_s.seasonal + additive_n.resid
#difference = additive_t.observed - additive_sum

#additive_df = pd.concat([additive_t.observed, additive_t.trend, additive_s.seasonal, additive_n.resid, additive_sum , difference], axis=1)
#additive_df.columns = ['Actual values', 'Trend Decomposition', 'Seasonal Decomposition', 'Residue/Noise Deomposition', 'Sumed Values', 'Difference' ]
#additive_df.plot(subplots = True, figsize=(10,10))
#additive_df.head()

In [None]:
day = go.Scatter(x=day_seasonal.index, y=day_seasonal.values, name = 'daily', line=dict(color='mediumvioletred'), yaxis='y')
week = go.Scatter(x=week_seasonal.index, y=week_seasonal.values, name = 'weekly', line=dict(color='royalblue'), yaxis='y')
wEnd = go.Scatter(x=wEnd_seasonal.index, y=wEnd_seasonal.values, name = 'week_ends', line=dict(color='crimson'), yaxis='y')
month = go.Scatter(x=month_seasonal.index, y=month_seasonal.values, name = 'monthly', line=dict(color='lightblue'), yaxis='y')

layout = go.Layout(title='Electricity Seasonal Decomposition', xaxis=dict(title='Date'),
                   yaxis=dict(title='ºC'))
fig = go.Figure(data=[day, week, wEnd, month], layout=layout)
fig.show()

#### 2.2.1.1 dew_temperature

In [None]:
additive = seasonal_decompose(x = train_clean['dew_temperature'], model='additive', extrapolate_trend='freq', period=48 )
additive_df = pd.concat([additive.observed, additive.trend, additive.seasonal, additive.resid], axis=1)
additive_df.columns = ['actual_values', 'trend', 'seasonal', 'resid' ]
additive_df.plot(subplots = True, figsize=(10,10))
#additive_df.head()

#### 2.2.1.1 wind_direction

In [None]:
additive = seasonal_decompose(x = train_clean['wind_direction'], model='additive', extrapolate_trend='freq', period=int(len(train_clean['air_temperature'])/365) )
additive_df = pd.concat([additive.observed, additive.trend, additive.seasonal, additive.resid], axis=1)
additive_df.columns = ['actual_values', 'trend', 'seasonal', 'resid' ]
additive_df.plot(subplots = True, figsize=(10,10))
#additive_df.head()

#### 2.2.1.1 wind_speed

In [None]:
additive = seasonal_decompose(x = train_clean['wind_speed'], model='additive', extrapolate_trend='freq', period=48 )
additive_df = pd.concat([additive.observed, additive.trend, additive.seasonal, additive.resid], axis=1)
additive_df.columns = ['actual_values', 'trend', 'seasonal', 'resid' ]
additive_df.plot(subplots = True, figsize=(10,10))
#additive_df.head()

#### 2.2.1.1 meter_reading = electricity

In [None]:
additive = seasonal_decompose(x = train_clean['meter_reading'], model='additive', extrapolate_trend='freq', period=168 )
additive_df = pd.concat([additive.observed, additive.trend, additive.seasonal, additive.resid], axis=1)
additive_df.columns = ['actual_values', 'trend', 'seasonal', 'resid' ]
additive_df.plot(subplots = True, figsize=(10,10))
#additive_df.head()