# 05.07 - Modeling Setup - Data Splitting & Re-Combining

## Imports & setup

In [12]:
import pathlib
from datetime import datetime
import math
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('grayscale')
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from skoot.feature_selection import FeatureFilter
from skoot.preprocessing import SelectiveRobustScaler

sys.path.append("..")
from src.models.models import SetTempAsPower, SK_Prophet
from src.utils.utils import bound_precision, AnnualTimeSeriesSplit, run_cross_val
from src.visualization.visualize import (plot_prediction,
                                         plot_joint_plot,
                                         residual_plots,
                                         print_residual_stats)

%matplotlib inline

# The project root is taken to be the parent of the current working directory;
# the cleaned datasets are read from <root>/data/05-clean.
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
CLEAN_DATA_DIR = PROJECT_DIR.joinpath('data', '05-clean')

# Raise pandas' display limits so wide frames are not truncated when shown.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Load Test Data

## Splitting Data (e.g. Monday Model, Tuesday Model ... )

In [2]:
# Read the cleaned dataset, using its first column as the (date-parsed) index,
# and keep only the rows labelled 1994 through 2013.
df = (
    pd.read_csv(CLEAN_DATA_DIR / 'clean-cut.csv', index_col=0, parse_dates=True)
    .loc['1994':'2013']
)

# X is the same object as df: popping 'daily_peak' removes the target column
# from the feature frame and returns it as y.
X = df
y = X.pop('daily_peak')

In [3]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,14422.0
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,13845.0
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,13372.0
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,13025.0
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,12869.0


In [4]:
def temporal_split(X, y, splitter_col = 'day_of_week'):
    """Partition features and target into one subset per value of a column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain ``splitter_col``.
    y : pandas.Series
        Target values belonging to the rows of ``X``.
    splitter_col : str, default 'day_of_week'
        Column of ``X`` whose distinct values define the subsets.

    Returns
    -------
    X_splits : list of pandas.DataFrame
        One frame per distinct non-missing value of ``splitter_col``, ordered
        by ascending value. Rows keep their original relative order.
    y_splits : list of pandas.Series
        The target rows matching each entry of ``X_splits``.

    Raises
    ------
    KeyError
        If ``splitter_col`` is not a column of ``X``.

    Notes
    -----
    Neither ``X`` nor ``y`` is modified; boolean selection returns new objects.
    Rows whose splitter value is missing (NaN) are not assigned to any subset.
    """
    flags = X[splitter_col]

    # When y lines up with X row for row we can reuse the boolean mask by
    # position. This keeps X and y subsets the same length even if the index
    # contains repeated labels (a label lookup would repeat such rows).
    same_index = y.index.equals(X.index)

    X_splits = []
    y_splits = []
    # NaN never compares equal to itself and has no defined sort position, so
    # it is excluded before ordering the split values.
    for split_flag in sorted(flags.dropna().unique()):
        mask = (flags == split_flag).to_numpy()
        X_split = X.loc[mask]
        X_splits.append(X_split)
        if same_index:
            y_splits.append(y.loc[mask])
        else:
            # y is ordered or indexed differently from X: match by label.
            y_splits.append(y.loc[X_split.index])

    return X_splits, y_splits

In [6]:
X_splits, y_splits = temporal_split(X, y, splitter_col='day_of_week')

# Show the first seven rows of the first split only, as a sanity check that
# features and target line up. Slicing to one element keeps this a no-op when
# there are no splits.
for X_split, y_split in zip(X_splits[:1], y_splits[:1]):
    print(X_split[['temp', 'day_of_week', 'week_of_year']].head(7))
    print(y_split.head(7))

                     temp  day_of_week  week_of_year
1994-01-03 00:00:00 -14.8          0.0           1.0
1994-01-03 01:00:00 -15.4          0.0           1.0
1994-01-03 02:00:00 -16.3          0.0           1.0
1994-01-03 03:00:00 -16.0          0.0           1.0
1994-01-03 04:00:00 -15.0          0.0           1.0
1994-01-03 05:00:00 -14.7          0.0           1.0
1994-01-03 06:00:00 -13.8          0.0           1.0
1994-01-03 00:00:00    21923.0
1994-01-03 01:00:00    21923.0
1994-01-03 02:00:00    21923.0
1994-01-03 03:00:00    21923.0
1994-01-03 04:00:00    21923.0
1994-01-03 05:00:00    21923.0
1994-01-03 06:00:00    21923.0
Name: daily_peak, dtype: float64
