# 05.03 - Modeling Setup - Time Series Cross Validation

## Imports & setup

In [1]:
import pathlib
from datetime import datetime
import math
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('grayscale')
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

from sklearn.metrics import mean_absolute_error

sys.path.append("..")
from src.models.models import SetTempAsPower, SK_Prophet
from src.utils.utils import bound_precision

%matplotlib inline

# Project root, taken as the parent of the current working directory
# (assumes the notebook is launched from a first-level subfolder of the project - TODO confirm)
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
# Folder holding the cleaned data files read below
CLEAN_DATA_DIR = PROJECT_DIR / 'data' / '05-clean'

# Raise the pandas display limits so wide frames are shown with all of their columns
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Load Test Data

In [2]:
# Read the cleaned data, keep 1994-2013 and reduce every column to its daily maximum.
# Just select a reasonable subset of data to test the model wrappers.
df = (
    pd.read_csv(CLEAN_DATA_DIR / 'clean-cut.csv', parse_dates=True, index_col=0)
    .loc['1994':'2013']
    .resample('D')
    .max()
    .loc[:, ['temp', 'dew_point_temp', 'week_of_year', 'daily_peak']]
    .rename(columns={'temp': 'temp_max'})
)

# Split off the target; what remains in the frame are the features
y = df.pop('daily_peak')
X = df

X.head()

Unnamed: 0,temp_max,dew_point_temp,week_of_year
1994-01-01,2.8,1.1,52.0
1994-01-02,1.7,0.5,52.0
1994-01-03,-10.3,-12.6,1.0
1994-01-04,-7.4,-11.5,1.0
1994-01-05,-7.2,-10.7,1.0


In [3]:
y.head()

1994-01-01    16892.0
1994-01-02    18947.0
1994-01-03    21923.0
1994-01-04    21457.0
1994-01-05    22082.0
Freq: D, Name: daily_peak, dtype: float64

In [4]:
y.tail()

2013-12-27    18611.0
2013-12-28    17651.0
2013-12-29    17853.0
2013-12-30    19997.0
2013-12-31    19748.0
Freq: D, Name: daily_peak, dtype: float64

## Custom Time Series Cross Validation

### Scikit Learn Time Series Cross Validation
Not suitable here: a minimum training-set size cannot be specified, and the validation windows do not line up with calendar years.

In [5]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=10)

# Print the first and last date of the train and validation window of every fold
for train_indx, val_indx in tscv.split(X):
    print('Train', *(X.index[[pos]].date for pos in (train_indx[0], train_indx[-1])))
    print('Validate', *(X.index[[pos]].date for pos in (val_indx[0], val_indx[-1])), '\n')

Train [datetime.date(1994, 1, 1)] [datetime.date(1995, 10, 27)]
Validate [datetime.date(1995, 10, 28)] [datetime.date(1997, 8, 21)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(1997, 8, 21)]
Validate [datetime.date(1997, 8, 22)] [datetime.date(1999, 6, 16)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(1999, 6, 16)]
Validate [datetime.date(1999, 6, 17)] [datetime.date(2001, 4, 10)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2001, 4, 10)]
Validate [datetime.date(2001, 4, 11)] [datetime.date(2003, 2, 3)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2003, 2, 3)]
Validate [datetime.date(2003, 2, 4)] [datetime.date(2004, 11, 28)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2004, 11, 28)]
Validate [datetime.date(2004, 11, 29)] [datetime.date(2006, 9, 23)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2006, 9, 23)]
Validate [datetime.date(2006, 9, 24)] [datetime.date(2008, 7, 18)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2008, 7, 18)]
Validate [d

### Time Series Cross Validation - Fixed Start


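We are looking for an annual splitting scheme that works like the blue and green blocks in the figure below: every fold trains on all data from the start of the series and validates on the single calendar year that follows.

The scheme is implemented in the cell below.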

![Custom cross Validation](images/custom-cross-validation.PNG "Custom Cross Validation")

In [6]:
class AnnualTimeSeriesSplit():
    """
    Expanding-window cross validator that holds out one calendar year per fold.

    Every fold trains on all rows from the start of the data up to the end of
    a given year and validates on the year that follows. The validation years
    are the last ``n_splits`` years of the index, visited oldest first.

    Parameters
    ----------
    n_splits : int
        Number of folds. Must be at least 1 and smaller than the number of
        distinct years in the index, so the first fold still has training data.
    """
    def __init__(self, n_splits):
        self.n_splits = n_splits

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Return the number of folds.

        Provided so the splitter can be handed to scikit-learn utilities that
        ask a ``cv`` object for its fold count; the arguments are ignored.
        """
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """
        Yield one ``(train_indices, test_indices)`` pair per fold.

        Parameters
        ----------
        X : pandas object with a DatetimeIndex
            Must be sorted ascending and cover consecutive calendar years.
        y, groups : ignored
            Accepted for compatibility with the scikit-learn splitter signature.

        Yields
        ------
        train_indices, test_indices : list of int
            Positional row indices into ``X``.

        Raises
        ------
        ValueError
            If ``n_splits`` is smaller than 1 or leaves no year to train on.
        """
        years = X.index.year.unique()

        if not 1 <= self.n_splits < len(years):
            raise ValueError(
                f'n_splits={self.n_splits} must be between 1 and '
                f'{len(years) - 1} for data covering {len(years)} year(s)'
            )

        last_year = years[-1]

        for fold in range(self.n_splits):
            final_train_year = last_year - self.n_splits + fold

            # Looking up a year string on a sorted DatetimeIndex returns the
            # slice of positions covered by that year; .stop is one past its last row
            train_final_index = X.index.get_loc(str(final_train_year)).stop
            test_final_index = X.index.get_loc(str(final_train_year + 1)).stop

            train_indices = list(range(0, train_final_index))
            test_indices = list(range(train_final_index, test_final_index))

            yield train_indices, test_indices

In [7]:
atscv = AnnualTimeSeriesSplit(n_splits=10)

# Print the first and last date of the train and validation window of every fold
for train_indx, val_indx in atscv.split(X):
    print('Train', *(X.index[[pos]].date for pos in (train_indx[0], train_indx[-1])))
    print('Validate', *(X.index[[pos]].date for pos in (val_indx[0], val_indx[-1])), '\n')

Train [datetime.date(1994, 1, 1)] [datetime.date(2003, 12, 31)]
Validate [datetime.date(2004, 1, 1)] [datetime.date(2004, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2004, 12, 31)]
Validate [datetime.date(2005, 1, 1)] [datetime.date(2005, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2005, 12, 31)]
Validate [datetime.date(2006, 1, 1)] [datetime.date(2006, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2006, 12, 31)]
Validate [datetime.date(2007, 1, 1)] [datetime.date(2007, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2007, 12, 31)]
Validate [datetime.date(2008, 1, 1)] [datetime.date(2008, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2008, 12, 31)]
Validate [datetime.date(2009, 1, 1)] [datetime.date(2009, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2009, 12, 31)]
Validate [datetime.date(2010, 1, 1)] [datetime.date(2010, 12, 31)] 

Train [datetime.date(1994, 1, 1)] [datetime.date(2010, 12, 31)]
Valid

### Time Series Cross Validation - Rolling Window


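We are looking for an annual splitting scheme that works like the blue and green blocks in the figure below: every fold trains on a fixed number of the most recent years and validates on the single calendar year that follows.

The scheme is implemented in the cell below.

![Rolling cross validation](images/rolling-cross-validation.PNG "Rolling Cross Validation")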

In [8]:
class RollingAnnualTimeSeriesSplit():
    """
    Rolling-window cross validator that holds out one calendar year per fold.

    Every fold trains on the ``goback_years`` calendar years that immediately
    precede the validation year, so the training window keeps a constant
    length and slides forward one year per fold. The validation years are the
    last ``n_splits`` years of the index, visited oldest first.

    Parameters
    ----------
    n_splits : int
        Number of folds, at least 1.
    goback_years : int, default 5
        Number of calendar years in every training window, at least 1.
        ``n_splits + goback_years`` must not exceed the number of distinct
        years in the index.
    """
    def __init__(self, n_splits, goback_years=5):
        self.n_splits = n_splits
        self.goback_years = goback_years

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Return the number of folds.

        Provided so the splitter can be handed to scikit-learn utilities that
        ask a ``cv`` object for its fold count; the arguments are ignored.
        """
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """
        Yield one ``(train_indices, test_indices)`` pair per fold.

        Parameters
        ----------
        X : pandas object with a DatetimeIndex
            Must be sorted ascending and cover consecutive calendar years.
        y, groups : ignored
            Accepted for compatibility with the scikit-learn splitter signature.

        Yields
        ------
        train_indices, test_indices : list of int
            Positional row indices into ``X``.

        Raises
        ------
        ValueError
            If ``n_splits`` or ``goback_years`` is smaller than 1, or if the
            earliest training window would start before the first year of data.
        """
        years = X.index.year.unique()

        if self.n_splits < 1 or self.goback_years < 1:
            raise ValueError(
                f'n_splits={self.n_splits} and goback_years={self.goback_years} '
                f'must both be at least 1'
            )
        if self.n_splits + self.goback_years > len(years):
            raise ValueError(
                f'n_splits={self.n_splits} plus goback_years={self.goback_years} '
                f'exceeds the {len(years)} year(s) of data available'
            )

        last_year = years[-1]

        for fold in range(self.n_splits):
            final_train_year = last_year - self.n_splits + fold
            start_train_year = final_train_year - self.goback_years + 1

            # Looking up a year string on a sorted DatetimeIndex returns the
            # slice of positions covered by that year (.start inclusive, .stop exclusive)
            train_start_index = X.index.get_loc(str(start_train_year)).start
            train_final_index = X.index.get_loc(str(final_train_year)).stop
            test_final_index = X.index.get_loc(str(final_train_year + 1)).stop

            train_indices = list(range(train_start_index, train_final_index))
            test_indices = list(range(train_final_index, test_final_index))

            yield train_indices, test_indices