# M4 Datasets Exploration

The M4 dataset can be downloaded from [Kaggle](https://www.kaggle.com/yogesh94/m4-forecasting-competition-dataset).

More details on the dataset are available [here](https://www.sciencedirect.com/science/article/pii/S0169207019301128).

In [None]:
import os
import json
from enum import Enum

from numpy.random import randn
import numpy as np
np.random.seed(123)
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_rows = 20
import seaborn as sns
from IPython.display import display

In [None]:
# Directory that holds the M4 "<Frequency>-train.csv" / "<Frequency>-test.csv" files
INPUT_DIR = "../input"

# One member per M4 sampling frequency explored in this notebook
Datasets = Enum('Datasets', 'Hourly Daily Monthly Quarterly')


def load_dataset(dataset):
    """Read the train and test CSV files for one ``Datasets`` member.

    The file names are built from ``dataset.name`` and looked up in
    ``INPUT_DIR``. The first row of each file is used as the header and the
    first column as the index.

    Returns a ``(train, test)`` tuple of DataFrames.
    """
    def read_split(suffix):
        # Both splits share the same layout, so they are parsed identically.
        csv_path = os.path.join(INPUT_DIR, dataset.name + suffix)
        return pd.read_csv(csv_path, sep=',', header=0, index_col=0, engine='python')

    train_frame = read_split("-train.csv")
    test_frame = read_split("-test.csv")
    return train_frame, test_frame

In [None]:
# Read the (train, test) pair for each sampling frequency
hourly_train, hourly_test = load_dataset(dataset=Datasets.Hourly)
daily_train, daily_test = load_dataset(dataset=Datasets.Daily)
monthly_train, monthly_test = load_dataset(dataset=Datasets.Monthly)
quarterly_train, quarterly_test = load_dataset(dataset=Datasets.Quarterly)

In [None]:
# (rows, columns) of the hourly train and test frames
tuple(frame.shape for frame in (hourly_train, hourly_test))

In [None]:
#daily_train.shape, daily_test.shape

In [None]:
#monthly_train.shape, monthly_test.shape

In [None]:
#quarterly_train.shape, quarterly_test.shape

In [None]:
# Peek at the first five hourly training series
hourly_train.head(n=5)

In [None]:
# First ten training series, one line each (rows are series, so transpose
# before plotting) -- lots of seasonality in there
hourly_train.iloc[:10].T.plot()

In [None]:
# Smallest value in the whole hourly training frame: per-column minima first,
# then the minimum of those (pandas reductions skip NaN by default)
hourly_train.min().min()

In [None]:
# Largest value in the whole hourly training frame: per-column maxima first,
# then the maximum of those (pandas reductions skip NaN by default)
hourly_train.max().max()

In [None]:
# The time series do not all have the same length, so NaN is used as padding
# at the end; count the NaN cells of each series (one row per series)
hourly_train.isna().sum(axis=1)

In [None]:
# Test data is clean, with no end padding

In [None]:
# Total number of NaN cells in the hourly test frame
hourly_test.isna().sum().sum()

In [None]:
# First ten test series, one line each
hourly_test.iloc[:10].T.plot()

In [None]:
# Column label of the first non-NaN value in the first hourly series
hourly_train.iloc[0].first_valid_index()

In [None]:
 # Column label of the last non-NaN value in the first hourly series
 hourly_train.iloc[0].last_valid_index()

In [None]:
def trim(df, index):
    """Return the time series at index, with the end NaN padding removed (not all M4 TS are the same length).

    Parameters:
        df: DataFrame holding one time series per row.
        index: positional (``iloc``) row number of the series to extract.

    Returns:
        The row as a Series, cut after its last non-NaN value. Leading or
        interior NaN values are kept. A row that contains no valid value at
        all yields an empty Series.
    """
    row = df.iloc[index]
    last_label = row.last_valid_index()
    if last_label is None:
        # No valid observation: ``row.loc[:None]`` would hand back the whole
        # NaN-only row, so return an empty slice instead.
        return row.iloc[:0]
    # Label-based slicing is inclusive, so the last valid value is kept.
    return row.loc[:last_label]

# First hourly series without its trailing NaN padding
trim(df=hourly_train, index=0)

In [None]:
# For performance, confirm that this is returning a view, not a copy.
# `s` only exists inside trim(), so build the trimmed series here first;
# without this the comparison below raises NameError.
s = trim(hourly_train, 0)
hourly_train.values.base is s.values.base

In [None]:
# Now check all data for NaN once trimmed
# NOTE(review): these frames were already loaded in an earlier cell; the
# reload only matters if that cell was skipped.
daily_train, daily_test = load_dataset(dataset=Datasets.Daily)
monthly_train, monthly_test = load_dataset(dataset=Datasets.Monthly)
quarterly_train, quarterly_test = load_dataset(dataset=Datasets.Quarterly)

In [None]:
# Every train/test frame, in the order they are checked
dfs = [
    hourly_train,
    hourly_test,
    daily_train,
    daily_test,
    monthly_train,
    monthly_test,
    quarterly_train,
    quarterly_test,
]

# After trimming the end padding, no series may contain a NaN
for df in dfs:
    for i in range(len(df)):
        assert not trim(df, i).isnull().any()