# What is done?

#### 1. Downsampling to 1 min interval
#### 2. Add lagged features: history at `lag + forecast_period` minutes back, plus the `MODE` sequence inside the forecast period

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

from utils import *

### 1. Load the data

In [2]:
# Raw time series; the path is relative to the notebook's working directory.
data = pd.read_parquet('./data/raw_ts_10s_v1.parquet')

In [3]:
# Inspect column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30571 entries, 0 to 30570
Data columns (total 6 columns):
DATE    30571 non-null datetime64[ns]
T1      30571 non-null float64
T2      30571 non-null float64
T3      30571 non-null float64
T4      30571 non-null float64
MODE    30571 non-null int8
dtypes: datetime64[ns](1), float64(4), int8(1)
memory usage: 1.4 MB


Downsample to `freq = 1 min`:

In [4]:
def downsample(dataframe, freq='1min'):
    """Pick the rows of ``dataframe`` that fall on a regular time grid.

    A grid of timestamps spaced ``freq`` apart is built from the earliest to
    the latest value of ``dataframe['DATE']`` and the input rows are
    left-joined onto it. No aggregation takes place: only rows whose ``DATE``
    matches a grid point exactly are kept, and grid points without a matching
    row come back with NaN in every other column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; must contain a ``DATE`` column.
    freq : str, default '1min'
        Grid spacing, passed straight to ``pandas.date_range``.

    Returns
    -------
    pandas.DataFrame
        ``DATE`` followed by the remaining input columns, ordered by the grid.
    """
    stamps = dataframe['DATE']
    grid = pd.date_range(start=stamps.min(), end=stamps.max(), freq=freq)
    skeleton = pd.DataFrame({'DATE': grid})
    return skeleton.merge(dataframe, how='left', on='DATE')

In [5]:
# Default freq='1min': keep the rows that fall exactly on a one-minute grid.
data_1m = downsample(data)

### 2. Create time lags and add different statistics

In [6]:
# Work on a copy so `data_1m` itself stays unchanged.
df = data_1m.copy()

In [7]:
def add_lags(df, lags, forecast_period, original_columns, columns_to_lag):
    """Append lagged copies of selected columns, matched on ``DATE``.

    For every ``lag`` in ``lags`` and every column ``c`` in ``columns_to_lag``
    a new column ``c_M<k>`` is added, with ``k = lag + forecast_period``. The
    value in a row is the value ``c`` had in the row whose ``DATE`` is ``k``
    minutes earlier; rows with no such earlier timestamp get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DATE`` column and the columns named in
        ``columns_to_lag``. It is not modified.
    lags : iterable of int
        Lag offsets in minutes, before ``forecast_period`` is added.
    forecast_period : int
        Number of minutes added to every lag.
    original_columns : list of str
        Not used any more; kept so existing calls stay valid. The columns to
        shift are now taken from ``columns_to_lag`` alone, which avoids
        ``_x``/``_y`` suffix clashes for columns that are not being lagged.
    columns_to_lag : list of str
        Columns to create lagged copies of (must not include ``DATE``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the lag columns appended in the order of ``lags``.
        An empty ``lags`` returns an unchanged copy.
    """
    result = df.copy()
    lag_cols = list(columns_to_lag)
    for lag in tqdm_notebook(lags):
        shift = lag + forecast_period
        suffix = '_M' + str(shift)
        # Moving DATE forward by `shift` minutes lines each value up with the
        # row that lies `shift` minutes after it.
        shifted = result[['DATE'] + lag_cols].copy()
        shifted['DATE'] += pd.Timedelta(minutes=shift)
        shifted = shifted.rename(columns={name: name + suffix for name in lag_cols})
        result = result.merge(shifted, how='left', on='DATE')
    return result

In [8]:
# Lag offsets in minutes; add_lags shifts each of them by a further `forecast_period`.
lags = [0, 1, 2, 3, 4, 5, 10, 20, 30, 60, 120, 180, 240, 300, 360]
# Minutes added to every lag; also bounds the MODE sequence built in section 3.
forecast_period = 30
# Columns of the downsampled frame (DATE plus the value columns).
original_columns = list(df.columns)
columns_to_lag = ['T1', 'T2', 'T3', 'T4', 'MODE']

In [9]:
# Adds one <col>_M<lag + forecast_period> column per lag and lagged column.
df_lags = add_lags(df, lags, forecast_period, original_columns, columns_to_lag)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [10]:
# Columns to difference between consecutive lags; MODE is not included.
columns_to_diff = ['T1', 'T2', 'T3', 'T4']

In [11]:
def add_diffs(df, lags, forecast_period, original_columns, columns_to_diff):
    """Append differences between consecutive lag columns.

    For each pair of neighbouring entries ``(lag, next_lag)`` in ``lags`` and
    each column ``c`` in ``columns_to_diff`` this adds

        c_diff_M<lag + forecast_period>
            = c_M<lag + forecast_period> - c_M<next_lag + forecast_period>

    The lag columns must already exist in ``df`` (see ``add_lags``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``<col>_M<k>`` lag columns. It is not modified.
    lags : sequence of int
        The lag offsets used to build the lag columns, in the same order.
    forecast_period : int
        Offset that was added to every lag in the column names.
    original_columns : list of str
        Not used; kept so existing calls stay valid.
    columns_to_diff : list of str
        Base names of the columns to difference.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the diff columns appended. With fewer than two
        lags there is nothing to difference and an unchanged copy is returned.
    """
    result = df.copy()
    # Materialise the pairs so the progress bar knows its total.
    lag_pairs = list(zip(lags[:-1], lags[1:]))
    for lag, next_lag in tqdm_notebook(lag_pairs):
        near = '_M' + str(lag + forecast_period)
        far = '_M' + str(next_lag + forecast_period)
        near_cols = [name + near for name in columns_to_diff]
        far_cols = [name + far for name in columns_to_diff]
        diff_cols = [name + '_diff' + near for name in columns_to_diff]
        # Subtract the raw arrays: the two column sets have different labels,
        # so a label-aligned DataFrame subtraction would not pair them up.
        deltas = pd.DataFrame(data=result[near_cols].values - result[far_cols].values,
                              index=result.index, columns=diff_cols)
        result = result.join(deltas)
    return result

In [12]:
# Differences between consecutive lag columns. NOTE(review): this overwrites
# `df_lags`, so re-running this cell on its own is expected to fail because the
# *_diff_* columns already exist when add_diffs joins them; re-run from the
# add_lags cell instead.
df_lags = add_diffs(df_lags, lags, forecast_period, original_columns, columns_to_diff)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
cols_to_stat = ['T1', 'T2', 'T3', 'T4']
# Window lengths: a value n aggregates the n columns
# <col>_M<forecast_period> ... <col>_M<forecast_period + n - 1>,
# so `lags` has to contain 0 .. n - 1 for every n listed here.
stat_indices = [2, 4]

In [14]:
def add_stats(df, stat_indices, forecast_period, original_columns, cols_to_stat):
    """Append row-wise window statistics over the most recent lag columns.

    For every window length ``n`` in ``stat_indices`` and every column ``c``
    in ``cols_to_stat`` the ``n`` columns ``c_M<forecast_period>`` ...
    ``c_M<forecast_period + n - 1>`` are aggregated per row into:

    * ``c_mean_<n>``       - arithmetic mean
    * ``c_mean_decay_<n>`` - weighted sum with weights ``0.9 ** j`` for the
      j-th column of the window (not divided by the weight total)
    * ``c_median_<n>``, ``c_min_<n>``, ``c_max_<n>``

    All results are stored as float32. NaN entries are skipped by the pandas
    reductions; as a consequence a row whose window is entirely NaN gets NaN
    for mean/median/min/max but 0.0 for the decayed sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``<col>_M<k>`` lag columns. It is not modified.
    stat_indices : iterable of int
        Window lengths (number of consecutive one-minute lag columns).
    forecast_period : int
        Offset of the first lag column of each window.
    original_columns : list of str
        Not used; kept so existing calls stay valid.
    cols_to_stat : list of str
        Base names of the columns to aggregate.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the statistic columns appended. Empty
        ``stat_indices`` or ``cols_to_stat`` returns an unchanged copy.
    """
    result = df.copy()
    for window in tqdm_notebook(stat_indices):
        tag = '_' + str(window)
        # Decay weights depend only on the window length, not on the column.
        weights = np.power(0.9, np.arange(window))
        for col_name in cols_to_stat:
            # Plain string building instead of np.core.defchararray / np.str,
            # which are deprecated or removed in current NumPy releases.
            window_cols = [col_name + '_M' + str(k)
                           for k in range(forecast_period, forecast_period + window)]
            block = result[window_cols]
            result[col_name + '_mean' + tag] = block.mean(axis=1).astype(np.float32)
            result[col_name + '_mean_decay' + tag] = (
                block.mul(weights, axis=1).sum(axis=1).astype(np.float32)
            )
            result[col_name + '_median' + tag] = block.median(axis=1).astype(np.float32)
            result[col_name + '_min' + tag] = block.min(axis=1).astype(np.float32)
            result[col_name + '_max' + tag] = block.max(axis=1).astype(np.float32)
    return result

In [15]:
# Row-wise mean / decayed sum / median / min / max over the most recent lags.
# NOTE(review): the warning fragment in the cell output presumably comes from
# rows whose window is entirely NaN - confirm.
df_lags = add_stats(df_lags, stat_indices, forecast_period, original_columns, cols_to_stat)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

  r = func(a, **kwargs)





### 3. Add `MODE` sequence to features

In [16]:
def add_mode_seq(df, forecast_period):
    """Append the ``MODE`` values of the minutes just before each row.

    Adds the columns ``MODE_M<forecast_period - 1>`` down to ``MODE_M1`` (in
    that order), where ``MODE_M<k>`` is the ``MODE`` of the row whose ``DATE``
    is ``k`` minutes earlier. Rows with no such earlier timestamp get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``DATE`` and ``MODE`` columns. It is not modified.
    forecast_period : int
        Exclusive upper bound of the shifts, in minutes.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``MODE_M<k>`` columns appended. For
        ``forecast_period <= 1`` there is nothing to add and an unchanged
        copy is returned.
    """
    result = df.copy()
    for lag in tqdm_notebook(range(1, forecast_period)):
        shift = forecast_period - lag
        # Moving DATE forward by `shift` minutes lines each MODE value up with
        # the row that lies `shift` minutes after it.
        shifted = result[['DATE', 'MODE']].copy()
        shifted['DATE'] += pd.Timedelta(minutes=shift)
        shifted = shifted.rename(columns={'MODE': 'MODE_M' + str(shift)})
        result = result.merge(shifted, how='left', on='DATE')
    return result

In [17]:
# Adds MODE_M<forecast_period - 1> ... MODE_M1: MODE in the minutes just before each row.
df_lags_seq = add_mode_seq(df_lags, forecast_period)

HBox(children=(IntProgress(value=0, max=29), HTML(value='')))




### 4. Drop rows with NaN in the lag history:

In [18]:
# Keep only rows whose deepest T1 lag is present. The column name is derived
# from the configuration (max(lags) + forecast_period minutes back) instead of
# being hard-coded, so it stays valid when `lags` or `forecast_period` change.
print('Init. shape: ', df_lags_seq.shape)
df_lags_seq = df_lags_seq[
    df_lags_seq['T1_M' + str(max(lags) + forecast_period)].notnull()
].copy()
print('Final shape: ', df_lags_seq.shape)

Init. shape:  (5096, 206)
Final shape:  (4706, 206)


Cast `MODE` columns to int8:

In [20]:
# Every column whose name contains 'MODE': the original plus all shifted copies.
int_cols = [s for s in df_lags_seq.columns if 'MODE' in s]
# The left merges fill unmatched rows with NaN, which turns the int8 MODE values
# into floats; cast them back. Note that astype(np.int8) raises if any NaN is
# still present in these columns.
df_lags_seq[int_cols] = df_lags_seq[int_cols].astype(np.int8)

In [21]:
# Save the engineered feature table; the path is relative to the working directory.
df_lags_seq.to_parquet('./data/ds_1min_30min_v1.parquet')