# Chapter 3 - Time Series Data Preparation

## Python for time series data

## Common data preparation operations for time series

### Timestamps vs. Time Spans

In [None]:
import pandas as pd
import datetime as dt
import numpy as np

In [None]:
pd.Timestamp(dt.datetime(2014, 5, 1))

In [None]:
pd.Timestamp("2014-06-01")

In [None]:
pd.Timestamp(2014, 6, 1)

In [None]:
pd.Period("2014-06")

In [None]:
pd.Period("2014-06", freq="D")

In [None]:
# Three consecutive daily timestamps that will index a toy series.
dates = [pd.Timestamp(day) for day in ("2014-06-01", "2014-06-02", "2014-06-03")]

# One random value per timestamp; the Timestamp list is passed as the index.
ts_data = pd.Series(np.random.randn(len(dates)), index=dates)

# Show which index class pandas built from the Timestamp list.
type(ts_data.index)

In [None]:
ts_data.index

In [None]:
ts_data

In [None]:
# Three consecutive periods that will index a toy series.
periods = [pd.Period(month) for month in ("2014-01", "2014-02", "2014-03")]

# One random value per period; the Period list is passed as the index.
ts_data = pd.Series(np.random.randn(len(periods)), index=periods)

# Show which index class pandas built from the Period list.
type(ts_data.index)

In [None]:
ts_data.index

In [None]:
ts_data

### Converting to timestamps

In [None]:
pd.to_datetime(pd.Series(["Jul 31, 2012", "2012-01-10", None]))

In [None]:
pd.to_datetime(["2012/11/23", "2012.12.31"])

In [None]:
pd.to_datetime(["04-01-2014 10:00"], dayfirst=True)

In [None]:
pd.to_datetime(["14-01-2014", "01-14-2012"], dayfirst=True)

### Providing a Format Argument

In [None]:
pd.to_datetime("2018/11/12", format="%Y/%m/%d")

In [None]:
pd.to_datetime("11-11-2018 00:00", format="%d-%m-%Y %H:%M")

### Indexing

In [None]:
import os
import shutil
from common.utils import download_file, extract_data, load_data

pd.options.display.float_format = "{:,.2f}".format
np.set_printoptions(precision=2)

In [None]:
data_dir = "./data"

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: there is no
# window between the check and the creation, and nested paths are created too.
os.makedirs(data_dir, exist_ok=True)

# Fetch and unpack the archive only when energy.csv is not already present.
if not os.path.exists(os.path.join(data_dir, "energy.csv")):
    download_file("https://mlftsfwp.blob.core.windows.net/mlftsfwp/GEFCom2014.zip")
    # NOTE(review): download_file presumably saves the archive into the
    # current working directory, hence the move into data_dir; confirm.
    shutil.move("GEFCom2014.zip", os.path.join(data_dir, "GEFCom2014.zip"))
    extract_data(data_dir)

In [None]:
ts_data_load = load_data(data_dir)[["load"]]
ts_data_load.head()

In [None]:
ts_data_load.index

In [None]:
ts_data_load[:5].index

In [None]:
ts_data_load[::2].index

In [None]:
# ts_data_load is a DataFrame here, so plain [] with a string is a column
# lookup; the legacy fallback to row selection was removed in pandas 2.0.
# .loc selects every row that falls on the given day.
ts_data_load.loc["2012-6-01"]

In [None]:
ts_data_load["2012-1":"2012-2-28"]

In [None]:
ts_data_load["2012-1":"2012-1-2 00:00:00"]

In [None]:
ts_data_load.truncate(before="2013-11-01", after="2013-11-02")

### Frequency conversion

In [None]:
ts_data = load_data(data_dir)
ts_data.head(10)

In [None]:
daily_ts_data = ts_data.asfreq(pd.offsets.BDay())
daily_ts_data.head(5)

In [None]:
# asfreq returns a new object; without the assignment the padded result is
# thrown away and head() would display the unpadded frame again.
daily_ts_data = ts_data.asfreq(pd.offsets.BDay(), method="pad")
daily_ts_data.head(5)

# Time series exploration and understanding

## How to get started with time series data analysis

In [None]:
import warnings
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import statsmodels.api as sm

%matplotlib inline

warnings.filterwarnings("ignore")

In [None]:
ts_data.isna().sum()

In [None]:
ts_data.dtypes

In [None]:
ts_data.describe()

In [None]:
ts_data_load = ts_data["load"]

# rcParams are read when a figure is created, so the size has to be set
# before decomposition.plot() for it to apply to this figure.
matplotlib.rcParams["figure.figsize"] = [10.0, 6.0]

# Additive decomposition of the second half of 2012.
decomposition = sm.tsa.seasonal_decompose(
    ts_data_load["2012-07-01":"2012-12-31"], model="additive"
)

fig = decomposition.plot()

In [None]:
# Additive decomposition of the complete load series.
decomposition = sm.tsa.seasonal_decompose(ts_data_load, model="additive")

fig, ax = plt.subplots()

# Locators and formatters: one major tick per year, one minor tick per month.
year = mdates.YearLocator(month=1)
month = mdates.MonthLocator(interval=1)
year_format = mdates.DateFormatter("%Y")
month_format = mdates.DateFormatter("%m")

ax.xaxis.set_major_locator(year)
ax.xaxis.set_major_formatter(year_format)
ax.xaxis.set_minor_locator(month)

# Grid on the major ticks plus the monthly minor ticks of the x axis.
ax.grid(True)
ax.xaxis.grid(True, which="minor")

# Raw load in blue with the extracted trend component drawn over it in yellow.
ax.plot(ts_data_load.index, ts_data_load, c="blue")
ax.plot(decomposition.trend.index, decomposition.trend, c="yellow")

## Data Cleaning of Missing Values in the Time Series

In [None]:
ts_data_load.interpolate(limit=8, method="linear", limit_direction="both")

In [None]:
from scipy import stats

In [None]:
# Most frequent observed temperature. np.asscalar was removed in NumPy 1.23,
# and scipy's stats.mode propagates NaN by default, which would defeat the
# imputation. Series.mode() skips NaN and returns the modes sorted ascending,
# so .iloc[0] picks the smallest one on ties.
temp_mode = ts_data["temp"].mode().iloc[0]
ts_data["temp"] = ts_data["temp"].fillna(temp_mode)

# Confirm that no missing values remain.
ts_data.isnull().sum()

### Time Series Data Normalization and Standardization

In [None]:
from pandas import Series
from sklearn.preprocessing import MinMaxScaler

# scikit-learn scalers expect a 2-D array: one column, one row per observation.
values = ts_data_load.values
values = values.reshape((len(values), 1))

scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(values)

# data_min_ / data_max_ are per-feature arrays; index the single feature
# explicitly because implicit array-to-scalar conversion is deprecated in NumPy.
print("Min: %f, Max: %f" % (scaler.data_min_[0], scaler.data_max_[0]))

In [None]:
# Scale the values, then preview the first rows. Slicing instead of indexing
# with range(5) avoids an IndexError when fewer than five rows exist.
normalized = scaler.transform(values)
for row in normalized[:5]:
    print(row)

# Map the scaled values back to the original units and preview them.
inversed = scaler.inverse_transform(normalized)
for row in inversed[:5]:
    print(row)

from math import sqrt

In [None]:
from sklearn.preprocessing import StandardScaler

# scikit-learn scalers expect a 2-D array: one column, one row per observation.
values = ts_data_load.values
values = values.reshape((len(values), 1))

scaler = StandardScaler()
scaler = scaler.fit(values)

# mean_ / var_ are per-feature arrays; index the single feature explicitly
# (implicit array-to-scalar conversion is deprecated in NumPy) and use np.sqrt
# so this cell does not depend on an import made in a different cell.
print(
    "Mean: %f, StandardDeviation: %f" % (scaler.mean_[0], np.sqrt(scaler.var_[0]))
)

In [None]:
# Standardize the values, then preview the first rows. Slicing instead of
# indexing with range(5) avoids an IndexError when fewer than five rows exist.
normalized = scaler.transform(values)
for row in normalized[:5]:
    print(row)

# Map the standardized values back to the original units and preview them.
inversed = scaler.inverse_transform(normalized)
for row in inversed[:5]:
    print(row)

## Time series feature engineering

### Date Time Features

In [None]:
# Calendar features read directly from the datetime index (vectorized, no
# per-row Python loop).
ts_data["hour"] = ts_data.index.hour
ts_data["month"] = ts_data.index.month
# .dayofweek (Monday=0 ... Sunday=6) is what the column name promises; the
# previous code stored .day, i.e. the day of the month.
ts_data["dayofweek"] = ts_data.index.dayofweek
print(ts_data.head(5))

### Lagged Features

#### Shift function with DateOffset class and offset alias

In [None]:
ts_data = load_data(data_dir)
ts_data.head(10)

In [None]:
ts_data_shift = ts_data.shift(4, freq=pd.offsets.BDay())
ts_data_shift.head(5)

In [None]:
# tshift was deprecated in pandas 1.1 and removed in 2.0; shift with a freq
# argument does the same thing: it moves the index by six days and leaves the
# values untouched.
ts_data_shift_2 = ts_data.shift(6, freq="D")
ts_data_shift_2.head(5)

#### Shift() function

In [None]:
def generated_lagged_features(ts_data, var, max_lag):
    """Add hourly lag columns for ``var`` to ``ts_data`` in place.

    For every lag ``t`` from 1 to ``max_lag`` a column named
    ``"<var>_lag<t>"`` is added. Its value at a timestamp is the value of
    ``var`` exactly ``t`` hours earlier, or NaN when the frame has no row at
    that earlier timestamp.

    Args:
        ts_data: DataFrame with a datetime index; modified in place.
        var: Name of the column to lag.
        max_lag: Largest lag, in hours, to generate. Values below 1 add
            no columns.

    Returns:
        None. The new columns are written into ``ts_data``.
    """
    # An explicit offset object avoids the "1H" string alias, which newer
    # pandas releases deprecate.
    one_hour = pd.offsets.Hour()
    for t in range(1, max_lag + 1):
        # Shifting with a freq moves the index rather than the values; the
        # column assignment then realigns on the original index, so gaps in
        # the data produce NaN instead of a wrong neighbour.
        ts_data[f"{var}_lag{t}"] = ts_data[var].shift(t, freq=one_hour)

In [None]:
generated_lagged_features(ts_data, "load", 8)
generated_lagged_features(ts_data, "temp", 8)
print(ts_data.head(5))

### Rolling Window Statistics

In [None]:
from pandas import concat

# Lag the load by one step so each window only sees earlier observations.
load_val = ts_data[["load"]]
shifted = load_val.shift(1)

# Mean over a window of six lagged observations; leading rows stay NaN until
# the window is full.
window = shifted.rolling(window=6)
means = window.mean()

# Place the rolling mean next to the raw load under descriptive column names.
new_dataframe = pd.concat(
    [means.rename(columns={"load": "load_rol_mean"}), load_val], axis=1
)

print(new_dataframe.head(10))

In [None]:
from pandas import concat

# Shift the load by (width - 1) steps, then summarize windows of `width` rows.
load_val = ts_data[["load"]]
width = 4
shifted = load_val.shift(width - 1)
window = shifted.rolling(window=width)

# Min / mean / max of each window, followed by the raw load for comparison.
new_dataframe = pd.concat(
    [
        window.min().rename(columns={"load": "min"}),
        window.mean().rename(columns={"load": "mean"}),
        window.max().rename(columns={"load": "max"}),
        load_val,
    ],
    axis=1,
)

print(new_dataframe.head(10))

### Expanding Window Statistics

In [None]:
from pandas import concat

# Expanding window: every row summarizes all observations up to and
# including that row.
load_val = ts_data[["load"]]
window = load_val.expanding()

# Running min / mean / max next to the next step's load ("load+1").
new_dataframe = pd.concat(
    [
        window.min().rename(columns={"load": "min"}),
        window.mean().rename(columns={"load": "mean"}),
        window.max().rename(columns={"load": "max"}),
        load_val.shift(-1).rename(columns={"load": "load+1"}),
    ],
    axis=1,
)
print(new_dataframe.head(10))