# What is done?

#### 1. Downsampling to 5 min interval
#### 2. Add lags to history and future

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

from utils import *

### 1. Load the data

In [2]:
# Read the input dataset from parquet (the file name suggests an already-cleaned
# export -- TODO confirm provenance).
data = pd.read_parquet('./data/dataset_cleared_v1.parquet')

In [3]:
# Name of the column holding the heater (mode) signal; read by add_mode_seq.
mode_col = 'HEATER'
# Name reserved for a smoothed copy of the mode column; in the visible cells it
# is only referenced from commented-out code.
mode_col_smooth = 'HEATER_SM'

In [4]:
# smooth mode to reduce the downsampling errors
# data[mode_col_smooth] = data[mode_col].shift(-6).rolling(6).mean()
# data[mode_col_smooth] = (data[mode_col].rolling(6).mean() + data[mode_col_smooth]) / 2

# def detect_ix(data, window=63):
#    return int((data[data[mode_col] != 0].index[0] + window / 2) % window)

# # debugging 
# start = 10 * 90
# end = 10 * 100
# shift = detect_ix(data, window=63)
# plt.figure(figsize=(10,5))
# plt.plot(range(start, end), data[mode_col].iloc[start:end], c='g')
# plt.plot(range(start, end), data[mode_col_smooth].iloc[start:end], c='r')
# #plt.plot(data.MODE.iloc[0:N:60], c='r')
# plt.plot(range(shift + start, end + shift, 63), data[mode_col_smooth].iloc[start + shift: end+ shift:63].values, c='b')
# plt.show()

Downsample to `freq = 5 min`:

In [24]:
def downsample(dataframe, freq='1min'):
    """Re-index ``dataframe`` onto a regular ``DATE`` grid.

    A grid of timestamps spaced ``freq`` apart is generated from the earliest
    to the latest ``DATE`` found in ``dataframe``; the original rows are then
    left-joined onto that grid on ``DATE``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``DATE`` column.
    freq : str, default '1min'
        Grid spacing, passed straight to ``pandas.date_range``.

    Returns
    -------
    pandas.DataFrame
        One row per grid timestamp (more if ``DATE`` has duplicates); grid
        points with no exactly matching ``DATE`` hold NaN in the other columns.
    """
    first_ts = dataframe['DATE'].min()
    last_ts = dataframe['DATE'].max()
    grid = pd.DataFrame({'DATE': pd.date_range(start=first_ts, end=last_ts, freq=freq)})
    # Left join keeps every grid point, matched or not.
    return grid.merge(dataframe, how='left', on='DATE')

In [25]:
# # detect the center of positive impuls and move time series to that point
# # after that -> downsample

# freq = '10 min'
# # data_dw = downsample(data.iloc[detect_ix(data, window=63):], freq=freq)

# Bin size in rows; per the original note one row corresponds to one minute.
freq = 5 # 1 = 1 min
# Rows whose index shares the same integer quotient fall into the same bin.
data['GROUP_ID'] = data.index // freq

# HEATER is summed per bin, DATE keeps the bin's last value, everything else is averaged.
mean_cols = ['SMALL_LAMP', 'BIG_LAMP', 'EXT_T', 'TOP_T', 'MIDDLE_T',
             'ENERGY_PULSES', 'ENERGY_WH']
agg_spec = {'HEATER': 'sum'}
agg_spec.update({col: 'mean' for col in mean_cols})
agg_spec['DATE'] = 'last'

# drop=True discards the GROUP_ID index instead of turning it into a column.
data_dw = data.groupby('GROUP_ID').agg(agg_spec).reset_index(drop=True)

### 2. Create time lags and add different statistics

In [26]:
# Work on a copy of the downsampled frame and give it a 0..n-1 positional key
# (INDEX) that the lag helpers merge on.
df = data_dw.assign(INDEX=np.arange(len(data_dw)))

In [27]:
# Preview the first rows, including the INDEX column added above.
df.head()

Unnamed: 0,HEATER,SMALL_LAMP,BIG_LAMP,EXT_T,TOP_T,MIDDLE_T,ENERGY_PULSES,ENERGY_WH,DATE,INDEX
0,1.0,0.0,0.0,-0.62,47.5,43.36,8.8,1.4,2019-01-31 00:04:23,0
1,1.0,0.0,0.0,-0.6,46.94,43.18,34.8,5.6,2019-01-31 00:09:23,1
2,3.0,0.0,0.0,-0.5,46.1,42.28,36.6,5.8,2019-01-31 00:14:23,2
3,0.0,0.0,0.0,-0.48,44.86,41.6,15.8,2.6,2019-01-31 00:19:23,3
4,4.0,0.0,0.0,-0.42,43.74,40.9,27.0,4.2,2019-01-31 00:24:23,4


In [28]:
def add_lags(df, lags, forecast_period, original_columns, columns_to_lag, time_step=1):
    """Append lagged copies of selected columns, aligned on ``INDEX``.

    For every ``lag`` in ``lags`` the offset
    ``k = lag * time_step + forecast_period`` is computed, and each column
    ``c`` in ``columns_to_lag`` is added as ``c_M<k>``: the value ``c`` had in
    the row whose ``INDEX`` is ``k`` smaller. Rows without such a predecessor
    get NaN (left join).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INDEX`` and every column in ``original_columns``.
    lags : iterable of int
        Lag multipliers; may be empty, in which case a plain copy is returned.
    forecast_period : int
        Constant added to every offset.
    original_columns : list of str
        Columns copied into the shifted frame; must include ``'INDEX'``.
        Columns listed here but absent from ``columns_to_lag`` are not renamed
        and will therefore be suffixed ``_x``/``_y`` by the merge.
    columns_to_lag : list of str
        Columns that receive a lagged copy.
    time_step : int, default 1
        Multiplier applied to each lag.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    result = df.copy()
    # The source columns never change between iterations, so slice them once.
    base = df[original_columns]
    for lag in tqdm_notebook(lags):
        offset = lag * time_step + forecast_period
        rename_map = {col_name: col_name + '_M' + str(offset) for col_name in columns_to_lag}
        # Moving INDEX forward by `offset` makes row i pick up the values of row i - offset.
        shifted = base.rename(columns=rename_map).assign(INDEX=base['INDEX'] + offset)
        result = pd.merge(result, shifted, how='left', on=['INDEX'])
    return result

In [29]:
# Lag multipliers; add_lags turns each into the offset lag * time_step + forecast_period.
lags = [0, 1, 2, 3, 4, 5, 10, 15]
# Constant added to every lag offset (the forecast horizon, in downsampled steps).
forecast_period = 2
# Multiplier applied to each lag before the horizon is added.
time_step = 1
# Columns copied into the shifted frame; includes the INDEX merge key.
original_columns = ['INDEX', 'EXT_T', 'TOP_T', 'MIDDLE_T', 'HEATER']
# Columns that receive a `_M<offset>` lagged copy.
columns_to_lag = ['EXT_T', 'TOP_T', 'MIDDLE_T', 'HEATER']

In [30]:
# Attach `<col>_M<k>` history columns for every lag, with k = lag * time_step + forecast_period.
df_lags = add_lags(df, lags, forecast_period, original_columns, columns_to_lag, time_step)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [31]:
# Columns for which differences between consecutive lags are computed (HEATER is left out).
columns_to_diff = ['EXT_T', 'TOP_T', 'MIDDLE_T']

In [32]:
def add_diffs(df, lags, forecast_period, original_columns, columns_to_diff, time_step):
    """Append differences between consecutive lagged columns.

    For each pair of neighbouring entries ``(lag, next_lag)`` in ``lags`` and
    each column ``c`` in ``columns_to_diff`` the column ``c_diff_M<k>`` is
    added, where ``k = lag * time_step + forecast_period``,
    ``k_next = next_lag * time_step + forecast_period`` and
    ``c_diff_M<k> = c_M<k> - c_M<k_next>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain the ``c_M<k>`` columns for every lag in ``lags``.
    lags : list of int
        Lag multipliers in the order in which differences are taken. With
        fewer than two entries nothing is added.
    forecast_period : int
        Constant added to every offset.
    original_columns : list of str
        Unused; kept so existing calls keep working.
    columns_to_diff : list of str
        Base names of the columns to difference.
    time_step : int
        Multiplier applied to each lag.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    result = df.copy()
    # A concrete list (rather than a bare iterator) lets the progress bar show a total.
    lag_pairs = list(zip(lags[:-1], lags[1:]))
    for lag, next_lag in tqdm_notebook(lag_pairs):
        near = str(lag * time_step + forecast_period)
        far = str(next_lag * time_step + forecast_period)
        near_cols = [col_name + '_M' + near for col_name in columns_to_diff]
        far_cols = [col_name + '_M' + far for col_name in columns_to_diff]
        diff_cols = [col_name + '_diff_M' + near for col_name in columns_to_diff]
        # Subtract raw arrays so the two blocks are matched by position, not by column label.
        deltas = pd.DataFrame(data=result[near_cols].values - result[far_cols].values,
                              index=result.index, columns=diff_cols)
        result = result.join(deltas)
    return result

In [33]:
# Add `<col>_diff_M<k>` = `<col>_M<k>` - `<col>_M<k_next>` for each pair of consecutive lags.
df_lags = add_diffs(df_lags, lags, forecast_period, original_columns, columns_to_diff, time_step)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [35]:
# Base columns whose lagged copies are summarised row-wise by add_stats.
cols_to_stat = ['EXT_T', 'TOP_T', 'MIDDLE_T']
# Window lengths: number of consecutive lagged columns pooled into each statistic.
stat_indices = [2, 4, 6]

In [36]:
def add_stats(df, stat_indices, forecast_period, original_columns, cols_to_stat, time_step):
    """Append row-wise window statistics over lagged columns.

    For each window length ``ix`` in ``stat_indices`` and each column ``c`` in
    ``cols_to_stat`` the lagged columns ``c_M<k>`` with
    ``k = forecast_period, forecast_period + time_step, ...`` (``ix`` of them)
    are pooled per row into five new float32 columns, each suffixed with
    ``ix * time_step``:

    * ``c_mean_<n>``, ``c_median_<n>``, ``c_min_<n>``, ``c_max_<n>``
    * ``c_mean_decay_<n>``: the *sum* of the window values weighted by
      ``0.9 ** 0, 0.9 ** time_step, ...`` (not normalised by the weight sum).

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain every ``c_M<k>`` column addressed by the windows.
    stat_indices : iterable of int
        Window lengths; may be empty, in which case a plain copy is returned.
    forecast_period : int
        Offset of the first lagged column in every window.
    original_columns : list of str
        Unused; kept so existing calls keep working.
    cols_to_stat : list of str
        Base names of the columns to summarise.
    time_step : int
        Spacing between the offsets of neighbouring window columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    result = df.copy()
    for ix in tqdm_notebook(stat_indices):
        span = ix * time_step
        suffix = str(span)
        offsets = np.arange(forecast_period, forecast_period + span, step=time_step)
        # One weight per window column; identical for every base column, so build it once.
        weights = np.power(0.9, np.arange(span, step=time_step))
        for col_name in cols_to_stat:
            # Plain Python strings avoid the removed `np.str` alias and the
            # deprecated `np.core.defchararray` helper.
            window_cols = [col_name + '_M' + str(offset) for offset in offsets]
            window = result[window_cols]
            result[col_name + '_mean_' + suffix] = window.mean(axis=1).astype(np.float32)
            result[col_name + '_mean_decay_' + suffix] = (
                window.mul(weights, axis=1).sum(axis=1).astype(np.float32)
            )
            result[col_name + '_median_' + suffix] = window.median(axis=1).astype(np.float32)
            result[col_name + '_min_' + suffix] = window.min(axis=1).astype(np.float32)
            result[col_name + '_max_' + suffix] = window.max(axis=1).astype(np.float32)
    return result

In [37]:
# For each column in cols_to_stat, add row-wise mean / decayed sum / median / min / max
# over windows of `stat_indices` lagged columns starting at the forecast horizon.
df_lags = add_stats(df_lags, stat_indices, forecast_period, original_columns, cols_to_stat, time_step)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




### 3. Add the `MODE` (`HEATER`) sequence up to the forecast horizon to the features

In [38]:
def add_mode_seq(df, forecast_period, time_step, mode_column=None):
    """Append copies of the mode column for offsets below the forecast horizon.

    For every ``lag`` in ``range(1, forecast_period + 1, time_step)`` the
    offset ``k = forecast_period - lag * time_step`` is computed and the mode
    column is added as ``<mode>_M<k>``: the value it had in the row whose
    ``INDEX`` is ``k`` smaller (NaN where no such row exists). With
    ``time_step = 1`` this yields offsets ``forecast_period - 1, ..., 0``.

    NOTE(review): ``time_step`` is used both as the range step and as a
    multiplier, so for ``time_step != 1`` offsets can become negative --
    confirm the intended behaviour before using other step sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INDEX`` and the mode column.
    forecast_period : int
        Forecast horizon; values below 1 add nothing.
    time_step : int
        Step of the lag range and multiplier of each lag.
    mode_column : str, optional
        Column to replicate. Defaults to the notebook-level ``mode_col``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    if mode_column is None:
        mode_column = mode_col
    result = df.copy()
    # The source columns never change between iterations, so slice them once.
    base = df[['INDEX', mode_column]]
    for lag in tqdm_notebook(range(1, forecast_period + 1, time_step)):
        offset = -lag * time_step + forecast_period
        # Rename the selected column directly instead of going through an unrelated column list.
        lagged_name = mode_column + '_M' + str(offset)
        shifted = base.rename(columns={mode_column: lagged_name}).assign(INDEX=base['INDEX'] + offset)
        result = pd.merge(result, shifted, how='left', on=['INDEX'])
    return result

In [39]:
# Add `HEATER_M<k>` columns for offsets k below the forecast horizon
# (k = 1 and k = 0 with forecast_period = 2, time_step = 1).
df_lags_seq = add_mode_seq(df_lags, forecast_period, time_step)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




### 4. Drop rows with NaN in the lagged history:

In [40]:
# Offset of the deepest lag column, built the same way add_lags names it
# (lag * time_step + forecast_period) so the lookup also works for time_step != 1.
str_max_lag = max(lags) * time_step + forecast_period
print('Init. shape: ', df_lags_seq.shape)
# Rows lacking the deepest lag have incomplete history; keep only rows where it is present.
has_deepest_lag = df_lags_seq['EXT_T_M%s' % str_max_lag].notnull()
df_lags_seq = df_lags_seq[has_deepest_lag].copy()
print('Final shape: ', df_lags_seq.shape)

Init. shape:  (1512, 110)
Final shape:  (1495, 110)


Cast `HEATER` columns to int8:

In [41]:
# Every column whose name contains 'HEATER' (original and lagged copies) is stored as int8.
int_cols = [name for name in df_lags_seq.columns if 'HEATER' in name]
heater_as_int8 = df_lags_seq[int_cols].astype(np.int8)
df_lags_seq[int_cols] = heater_as_int8

In [43]:
df_lags_seq.to_parquet('./data/dataset_featured_v1.parquet')
# dataset_featured_v1.parquet - time step = 5 min, forecast horizon = 2 x time step