In [1]:
import io
import pickle
import time
from datetime import datetime
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [2]:
# Load the scaled inference frame. `df` and `df_infer_scaled` are two names
# bound to the same DataFrame object returned by read_hdf.
df_infer_scaled = df = pd.read_hdf(
    path_or_buf='../data/05_model_input/df_infer_scaled.hdf', 
    key='df_infer_scaled'
)

# Load the stored CV split positions from a timestamp-versioned pickle path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles produced by this project.
with open('../data/05_model_input/splits_positions.pkl/2020-10-01T14.00.54.003Z/splits_positions.pkl', 'rb') as pkl_file:
    splits_positions = pickle.load(pkl_file)
    
# Read the project parameters.
# NOTE(review): yaml.FullLoader can construct more than plain data types;
# yaml.safe_load is the conservative choice if the file is plain config.
with open(r'../conf/base/parameters.yml') as file:
    params = yaml.load(file, Loader=yaml.FullLoader)
    
# Sub-sections of the parameters used throughout the notebook.
modeling = params['modeling']
cv_params = params['cv']

## Non-Essential

In [None]:
# Quick look at the first three rows of the scaled inference frame.
df_infer_scaled.head(3)

In [None]:
# Select the single column 'DEF0C'; the double brackets keep the result a DataFrame.
df_infer_scaled[['DEF0C']]

In [None]:
# Label-based row selection between two dates; .loc label slices include both endpoints.
df_infer_scaled.loc['2013-01-01': '2015-12-22']

In [None]:
# Same date window expressed as a reusable slice object and applied through
# DataFrame.__getitem__.
train = slice('2013-01-01', '2015-12-22')
df_infer_scaled[train]

## Defining CV Splits

In [3]:
# Expanding-window CV expressed as row positions: every pass trains on rows
# [0, train_end) and is evaluated on the following `forecasting_window_size`
# rows, i.e. [train_end, train_end + forecasting_window_size).
window_size_first_pass = cv_params['window_size_first_pass']
window_size_last_pass = cv_params['window_size_last_pass']
if window_size_last_pass == 'complete inference window':
    # Sentinel value from the config: grow the last training window to the
    # number of rows in `df`.
    window_size_last_pass = len(df)
n_passes = cv_params['n_passes']
forecasting_window_size = cv_params['forecasting_window_size']

# With a single pass there is nothing to grow towards. Guarding here avoids the
# ZeroDivisionError that the `n_passes - 1` denominator would otherwise raise.
if n_passes > 1:
    window_size_increment = int((window_size_last_pass - window_size_first_pass) / (n_passes - 1))
else:
    window_size_increment = 0

cv_splits_dict = {}
for p in range(n_passes):
    pass_id = 'pass_' + str(p + 1)
    train_end = window_size_first_pass + p * window_size_increment
    cv_splits_dict[pass_id] = {
        'train_idx': [0, train_end],
        'test_idx': [train_end, train_end + forecasting_window_size],
    }
    
cv_splits_dict

{'pass_1': {'train_idx': [0, 730], 'test_idx': [730, 737]},
 'pass_2': {'train_idx': [0, 816], 'test_idx': [816, 823]},
 'pass_3': {'train_idx': [0, 902], 'test_idx': [902, 909]}}

In [None]:
# Expanding-window CV expressed as index-label slices (usable as `df[split]`).
# This rebinds `cv_splits_dict` with the keys 'train' / 'val'.
window_size_first_pass = cv_params['window_size_first_pass']
window_size_last_pass = cv_params['window_size_last_pass']
if window_size_last_pass == 'complete inference window':
    # Sentinel value from the config: grow the last training window to the
    # number of rows in `df`.
    window_size_last_pass = len(df)
n_passes = cv_params['n_passes']
forecasting_window_size = cv_params['forecasting_window_size']

# A single pass has no increment; this also avoids dividing by zero.
if n_passes > 1:
    window_size_increment = int((window_size_last_pass - window_size_first_pass) / (n_passes - 1))
else:
    window_size_increment = 0

n_rows = len(df)
cv_splits_dict = {}
for p in range(n_passes):
    pass_id = 'pass_' + str(p + 1)
    # Exclusive end positions of the training and validation windows.
    train_end = window_size_first_pass + p * window_size_increment
    val_end = train_end + forecasting_window_size
    if train_end < 1 or val_end > n_rows:
        raise IndexError(
            f'{pass_id}: rows [{train_end}, {val_end}) fall outside the {n_rows} available rows'
        )
    # Label slices include BOTH endpoints, so each window must end on the label
    # of its last row. Ending 'train' on the first validation label would leak
    # that row into training and make 'val' one row too long.
    # NOTE(review): assumes df.index is sorted and unique -- confirm.
    cv_splits_dict[pass_id] = {
        'train': slice(df.index[0], df.index[train_end - 1]),
        'val': slice(df.index[train_end], df.index[val_end - 1]),
    }

cv_splits_dict

In [None]:
# ignore all vars we don't want to model
# Rebinds `df` to the configured target columns only; `df_infer_scaled` keeps
# referring to the full frame.
targets = modeling['targets']
df = df[targets]

In [None]:
# Inspect the split positions loaded from the pickle file.
splits_positions

In [None]:
# Turn the stored start/end positions of the first pass into a slice object.
first_pass_bounds = splits_positions['pass_1']['train_idx']
train_idx = slice(first_pass_bounds[0], first_pass_bounds[1])

In [None]:
# Positional row selection of the first pass' training window.
df.iloc[train_idx]

In [None]:
# Label slice from the first index label to the label at position 730
# (730 equals the pass_1 train end shown in the split output above).
# NOTE(review): label slices include both endpoints, so the row at position
# 730 is part of the selection -- confirm that is intended.
train = slice(
    df.index[0],
    df.index[730]
)

df[train]

## Training Split-Wise

In [None]:
def split_data(df: pd.DataFrame, modeling):
    """Cut `df` into a training and a test frame.

    The window boundaries are read from `modeling['train_window']` and
    `modeling['test_window']`, each a mapping with the keys 'start' and 'end'.

    :param df: frame to split; indexed with `df[slice(start, end)]`
    :param modeling: settings holding the two window definitions
    :return: dict with the keys 'df_train' and 'df_test'
    """
    def _window(key):
        # Build the slice object for one configured window.
        bounds = modeling[key]
        return slice(bounds['start'], bounds['end'])

    train_window = _window('train_window')
    test_window = _window('test_window')

    return dict(df_train=df[train_window], df_test=df[test_window])

In [None]:
class MakeStrictlyPositive(TransformerMixin, BaseEstimator):
    '''Shift every column by a constant so that it stops taking negative values.

    The shift learned in `fit` is `abs(column minimum) + margin`.

    With the default `margin=0`, a column whose minimum is <= 0 is shifted so
    that its minimum lands exactly on 0, which is non-negative but not strictly
    positive. Pass a positive `margin` when downstream code needs values that
    are strictly greater than zero.

    :param margin: extra constant added on top of `abs(minimum)`; the default 0
        leaves the shift at exactly `abs(minimum)`
    '''

    def __init__(self, margin=0):
        # Stored under the parameter's own name so BaseEstimator.get_params /
        # set_params can discover it.
        self.margin = margin

    def fit(self, X, y=None):
        '''Remember the per-column minimum of `X`; returns `self`.'''
        self.offset_ = X.min(axis=0)
        return self

    def _shift(self):
        # Constant added in transform() and subtracted again in inverse_transform().
        return abs(self.offset_) + self.margin

    def transform(self, X, y=None):
        '''Return `X` shifted upwards by the learned constant.'''
        return X + self._shift()

    def inverse_transform(self, X, y=None):
        '''Undo `transform` by subtracting the same constant.'''
        return X - self._shift()

In [None]:
# Registry of available preprocessing steps, keyed by step name.
# The values are transformer instances (not classes), so every pipeline built
# from this registry refers to the same objects.
TRANSFORMERS = dict(
    get_quantile_equivalent_normal_dist=QuantileTransformer(
        random_state=0,
        output_distribution='normal',
    ),
    make_strictly_positive=MakeStrictlyPositive(),
)

In [None]:
# Hard-coded two-step pipeline: quantile-map every feature to a normal
# distribution, then apply MakeStrictlyPositive.
# NOTE: the following cell rebinds `preprocessing_pipeline` to a version built
# from the configuration.
preprocessing_pipeline = make_pipeline(
    QuantileTransformer(
        output_distribution='normal', 
        random_state=0,
    ),
    MakeStrictlyPositive(),
)

In [None]:
# Assemble the pipeline from the step names listed under
# modeling['preprocessing'], in the order given there.
configured_steps = (TRANSFORMERS[step] for step in modeling['preprocessing'])
preprocessing_pipeline = make_pipeline(*configured_steps)

In [None]:
# Split into the configured train/test windows.
# NOTE(review): `df_spatiotemporal` is not defined in any cell visible above --
# confirm where it comes from, otherwise this cell fails on a fresh kernel.
train_test_split = split_data(df_spatiotemporal, modeling)

In [None]:
# Pull out the training frame and preview what is stored under 'temporal'.
df_train = train_test_split['df_train']

df_train['temporal'].head()

In [None]:
# Independent deep copy of the training frame, so later writes to it do not
# touch `df_train`.
df_train_preprocessed = df_train.copy(deep=True)

In [None]:
# Fit the preprocessing pipeline on the untouched training data and write the
# transformed values into the working copy.
#
# The previous version read from `df_train_copy`, a name no visible cell
# defines. It also called DataFrame.update() with the raw transform() output
# on a frame obtained via `df_train_preprocessed['temporal']`. update() aligns
# on index/column labels, which a bare array does not carry, and the frame it
# was called on is not guaranteed to be a view of `df_train_preprocessed`.
# Reading from `df_train` also keeps the cell idempotent on re-runs.
temporal_train = df_train['temporal']

preprocessing_pipeline = preprocessing_pipeline.fit(temporal_train)

# Assign positionally: the transformed block has the same shape as the
# 'temporal' block it replaces.
temporal_scaled = preprocessing_pipeline.transform(temporal_train)
df_train_preprocessed.loc[:, 'temporal'] = np.asarray(temporal_scaled)


In [None]:
# Sanity check on the preprocessed data: number of negative entries under
# 'temporal' (should be 0 when the pipeline ends with MakeStrictlyPositive).
# `df_train_copy` is not defined in the visible cells; the frame that receives
# the preprocessed values is `df_train_preprocessed`.
(df_train_preprocessed['temporal'] < 0).sum().sum()

In [None]:
# Hand-written CV settings for experimentation.
# NOTE: this rebinds `cv_params`, replacing the values read from parameters.yml.
cv_params = dict(
    cv_type='expanding_windows',
    window_size_first_pass=365,
    window_size_last_pass=540,
    n_passes=3,
    forecasting_window_size=7,
)

In [None]:
def define_cvsplits(cv_pars: Dict) -> Dict[str, Any]:
    """
    Build the expanding-window cross-validation splits as row positions.

    Every pass trains on rows ``[0, train_end)`` and is evaluated on the
    ``forecasting_window_size`` rows that follow. ``train_end`` starts at
    ``window_size_first_pass`` and grows by a constant increment per pass so
    that the last pass ends at (or, because the increment is truncated to an
    integer, slightly before) ``window_size_last_pass``.

    Example of Cross-Validation Splits Dictionary:

    cv_splits_dict = {
        'pass_1': {
            'train_idx': [0, 365],
            'eval_idx': [365, 465],
        }
    }

    :param cv_pars: settings with the integer entries 'window_size_first_pass',
        'window_size_last_pass', 'n_passes' and 'forecasting_window_size'
    :return: dict mapping 'pass_<k>' (k starting at 1) to a dict holding
        'train_idx' and 'eval_idx', each a ``[start, end]`` position pair
    """
    window_size_first_pass = cv_pars['window_size_first_pass']
    window_size_last_pass = cv_pars['window_size_last_pass']
    n_passes = cv_pars['n_passes']
    forecasting_window_size = cv_pars['forecasting_window_size']

    # A single pass has no increment; without this guard the division below
    # raises ZeroDivisionError for n_passes == 1.
    if n_passes > 1:
        window_size_increment = int((window_size_last_pass - window_size_first_pass) / (n_passes - 1))
    else:
        window_size_increment = 0

    cv_splits_dict = {}
    for p in range(n_passes):
        pass_id = 'pass_' + str(p + 1)
        train_end = window_size_first_pass + p * window_size_increment
        cv_splits_dict[pass_id] = {
            'train_idx': [0, train_end],
            'eval_idx': [train_end, train_end + forecasting_window_size],
        }
    return cv_splits_dict

In [None]:
def _split_train_val(df: pd.DataFrame, cv_splits_dict: dict, pass_id: str):
    train_idx_start = cv_splits_dict[pass_id]['train_idx'][0]
    train_idx_end = cv_splits_dict[pass_id]['train_idx'][1]

    test_idx_start = cv_splits_dict[pass_id]['test_idx'][0]
    test_idx_end = cv_splits_dict[pass_id]['test_idx'][1]

    return {
        'train': df.iloc[train_idx_start:train_idx_end, :],
        'val': df.iloc[test_idx_start:test_idx_end, :],
    }


class ForecastingModel:
    """Fit and query forecasting models for the configured target series.

    Settings read from `modeling`:

    * 'approach': 'HW-ES' fits one ExponentialSmoothing model per target;
      'RNN-ES' and 'GWNet' are placeholders that only set `model_ = None`.
    * 'mode': 'temporal' trains on `y_train['temporal']`; any other value
      trains on `y_train` as given.
    * 'target_timeseries': list of columns to model, or 'all_available' for
      every column of the training frame.
    * 'hyperpars': keyword arguments passed on to the model constructor.

    :param y_train: training frame
    :param modeling: settings mapping as described above
    """

    def __init__(self, y_train, modeling):

        # model artifacts (metadata) for training
        self.modeling_settings = modeling
        self.y_train_columns = y_train.columns

        # DataFrame.info() writes its report to a buffer and returns None, so
        # capture the text instead of storing the return value.
        info_buffer = io.StringIO()
        y_train.info(buf=info_buffer)
        self.y_train_info = info_buffer.getvalue()

        # An empty `hyperpars:` entry in a YAML file loads as None.
        self.hyperpars = modeling['hyperpars'] or {}

        # Keep the training data on the instance so fit() can reach it.
        self.y_train_ = y_train  # i.e. all districts at once (spatio-temporal)
        if self.modeling_settings['mode'] == 'temporal':  # i.e. districtwise
            self.y_train_ = y_train['temporal']

        # Resolve the targets against the frame that is actually trained on, so
        # every target is a valid column key in fit().
        self.targets_list = self.modeling_settings['target_timeseries']
        if isinstance(self.targets_list, str) and self.targets_list == 'all_available':
            self.targets_list = self.y_train_.columns

    def fit(self, district=None):
        """Train the model(s) for the configured approach.

        :param district: currently unused; kept for interface compatibility
        :return: self
        :raises NotImplementedError: for an unknown modeling approach
        """
        self.datetime_start = datetime.now()
        time_start = time.time()

        approach = self.modeling_settings['approach']
        if approach == 'HW-ES':
            # One independent model per target column; the hyperparameters are
            # keyword arguments, hence ** rather than *.
            self.submodels_ = {
                target: ExponentialSmoothing(
                    endog=self.y_train_[target],
                    **self.hyperpars,
                ).fit() for target in self.targets_list
            }

        elif approach == 'RNN-ES':
            self.model_ = None

        elif approach == 'GWNet':
            self.model_ = None

        else:
            raise NotImplementedError(f'Invalid modeling approach {approach}')

        # NOTE(review): the spec "2.00E" parses as width 2 / precision 0; if two
        # decimals were intended it should read ".2E" -- confirm.
        self.training_duration = format( time.time() - time_start, "2.00E" ) + ' secs'

        return self

    def predict(self, start, end, transformer):
        """Forecast the window `start`..`end` and undo the preprocessing.

        :param start: first point to forecast, passed on to the model's predict()
        :param end: last point to forecast, passed on to the model's predict()
        :param transformer: fitted object exposing `inverse_transform`
        :return: DataFrame of forecasts on the original scale, with one column
            per column of the training frame; columns without a fitted
            submodel hold NaN
        :raises RuntimeError: if fit() has not been called
        :raises NotImplementedError: if the fitted approach has no model yet
        """
        submodels = getattr(self, 'submodels_', None)
        if submodels is not None:
            # one forecast series per fitted submodel
            y_hat = pd.DataFrame({
                target: submodel.predict(start=start, end=end)
                for target, submodel in submodels.items()
            })
        else:
            if not hasattr(self, 'model_'):
                raise RuntimeError('fit() must be called before predict()')
            if self.model_ is None:
                raise NotImplementedError(
                    f'No model available for approach {self.modeling_settings["approach"]}'
                )
            y_hat = pd.DataFrame(self.model_.predict(start=start, end=end))

        # Use the column layout of the training frame so a transformer fitted
        # on that frame sees a matching number of columns.
        y_hat = y_hat.reindex(columns=self.y_train_.columns)

        y_hat_unscaled = transformer.inverse_transform(y_hat)
        # inverse_transform may return a bare array; restore the labels.
        return pd.DataFrame(
            np.asarray(y_hat_unscaled),
            index=y_hat.index,
            columns=y_hat.columns,
        )

    
def cv_train(df_train_preprocessed: pd.DataFrame,
             modeling: Dict[str, Any],
             cv_splits_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Train one ForecastingModel per cross-validation pass.

    :param df_train_preprocessed: preprocessed training frame to split
    :param modeling: modeling settings handed to every ForecastingModel
    :param cv_splits_dict: mapping of pass id to its split definition; each
        pass needs a 'train_idx' ``[start, end]`` position pair
    :return: dict with 'intermediate_models' (pass id -> fitted model) and
        'model' (the model of the pass with the longest training window)
    :raises ValueError: if `cv_splits_dict` contains no pass
    """
    if not cv_splits_dict:
        # Previously this surfaced as an unbound `pass_id` after the loop.
        raise ValueError('cv_splits_dict must define at least one pass')

    model = {}
    for pass_id in cv_splits_dict:

        # splitting
        y = _split_train_val(df_train_preprocessed, cv_splits_dict, pass_id)

        # training
        model[pass_id] = ForecastingModel(y['train'], modeling).fit()

    def _train_length(pid):
        start, end = cv_splits_dict[pid]['train_idx'][0], cv_splits_dict[pid]['train_idx'][1]
        return end - start

    # Pick the pass by its actual training-window length rather than by
    # iteration order. Scanning in reverse makes ties resolve to the last pass,
    # which is the pass the previous implementation always returned.
    longest_pass_id = max(reversed(list(cv_splits_dict)), key=_train_length)
    return {
        'intermediate_models': model,
        'model': model[longest_pass_id]
    }


def evaluate(model, para):
    """Placeholder for model evaluation; not implemented yet.

    The header previously had no colon or body, which is a SyntaxError that
    prevented every definition in this cell from loading.

    :param model: model to evaluate
    :param para: evaluation settings (meaning not yet defined)
    :raises NotImplementedError: always
    """
    raise NotImplementedError('evaluate() has not been implemented yet')

In [None]:
# Forecast one week with the model of the third pass and map the result back
# to the original scale.
# predict() names its third parameter `transformer`; passing `scaler=` raises
# a TypeError.
# NOTE(review): `model` is not bound at notebook level in the cells above (it is
# a local of cv_train, which returns it under 'intermediate_models') -- confirm
# where it comes from.
model['pass_3'].predict(
    start='2015-06-21',
    end='2015-06-27',
    transformer=preprocessing_pipeline,  # TODO: populate all districts columns, then predict
)