In [11]:
import pickle
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import matplotlib.pyplot as plt
from typing import Any, Dict, List
import time 
from datetime import datetime  
import yaml

In [41]:
df_infer_scaled = df = pd.read_hdf(
    path_or_buf='../data/05_model_input/df_infer_scaled.hdf', 
    key='df_infer_scaled'
)

with open('../data/05_model_input/splits_positions.pkl/2020-10-01T14.00.54.003Z/splits_positions.pkl', 'rb') as pkl_file:
    splits_positions = pickle.load(pkl_file)
    
with open(r'../conf/base/parameters.yml') as file:
    params = yaml.load(file, Loader=yaml.FullLoader)
    
modeling = params['modeling']
cv_params = params['cv']

## Non-Essential

In [4]:
df_infer_scaled.head(3)

Unnamed: 0_level_0,DE145,DE114,DE146,DE132,DE12A,DE133,DE12C,DE11C,DE118,DE119,...,DEG01,DEG0F,DE275,DE21C,DE234,DE251,DE276,DE278,DE718,DE943
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,6.030707,6.332357,6.507886,6.628375,6.878469,6.701568,6.856132,6.266206,6.590818,6.636133,...,6.701568,6.824102,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08
2013-01-02,5.232691,5.199338,5.104712,4.3913,5.32752,4.303404,5.254946,5.191001,5.556967,5.489596,...,6.141913,5.918106,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08
2013-01-03,6.133282,6.227721,6.755229,5.601776,6.190702,6.046516,6.107795,6.370123,6.856132,6.913633,...,6.856132,6.555127,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08,1e-08


In [7]:
df_infer_scaled[['DEF0C']]

Unnamed: 0_level_0,DEF0C
date,Unnamed: 1_level_1
2013-01-01,5.598764
2013-01-02,5.521634
2013-01-03,6.475852
2013-01-04,6.613113
2013-01-05,4.885811
...,...
2015-06-18,5.834230
2015-06-19,5.896682
2015-06-20,5.280015
2015-06-21,4.082019


In [8]:
df_infer_scaled.loc['2013-01-01': '2015-12-22']

Unnamed: 0_level_0,DE145,DE114,DE146,DE132,DE12A,DE133,DE12C,DE11C,DE118,DE119,...,DEG01,DEG0F,DE275,DE21C,DE234,DE251,DE276,DE278,DE718,DE943
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,6.030707,6.332357,6.507886,6.628375,6.878469,6.701568,6.856132,6.266206,6.590818,6.636133,...,6.701568,6.824102,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-02,5.232691,5.199338,5.104712,4.391300,5.327520,4.303404,5.254946,5.191001,5.556967,5.489596,...,6.141913,5.918106,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-03,6.133282,6.227721,6.755229,5.601776,6.190702,6.046516,6.107795,6.370123,6.856132,6.913633,...,6.856132,6.555127,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-04,6.074704,6.107795,6.427164,4.601411,5.817322,4.814915,5.719279,6.301226,6.386830,6.359163,...,6.676286,6.736847,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-05,6.793656,6.590818,5.793944,5.235472,6.018986,5.031819,5.965703,6.736847,6.195255,6.054500,...,6.628375,6.783829,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-06-18,5.999696,5.925321,6.172693,5.751363,5.662824,5.716101,5.577783,5.903787,5.622961,5.900230,...,5.932573,5.807263,6.867197e+00,6.925839e+00,6.527769e+00,6.745973e+00,6.889959e+00,6.834593e+00,6.727848e+00,6.427164e+00
2015-06-19,5.925321,5.598764,5.009229,5.296755,5.466483,5.000733,5.406452,5.709759,5.026180,5.352781,...,5.748129,5.764355,6.301226e+00,6.398156e+00,6.227721e+00,6.534515e+00,6.359163e+00,6.398156e+00,6.370123e+00,6.475852e+00
2015-06-20,5.366856,5.224351,4.754337,4.841708,4.832807,4.414199,4.871182,5.277227,4.992223,5.115872,...,5.709759,5.635152,6.342981e+00,6.375656e+00,5.932573e+00,6.415438e+00,6.195255e+00,6.223017e+00,6.120464e+00,6.291084e+00
2015-06-21,5.160422,4.986541,5.787325,5.738463,5.583760,5.589751,5.586754,5.079556,5.204896,5.554006,...,5.127021,5.082355,6.286055e+00,6.403876e+00,5.988264e+00,6.321859e+00,6.332357e+00,6.392474e+00,6.168239e+00,6.168239e+00


In [9]:
train = slice('2013-01-01', '2015-12-22')
df_infer_scaled[train]

Unnamed: 0_level_0,DE145,DE114,DE146,DE132,DE12A,DE133,DE12C,DE11C,DE118,DE119,...,DEG01,DEG0F,DE275,DE21C,DE234,DE251,DE276,DE278,DE718,DE943
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,6.030707,6.332357,6.507886,6.628375,6.878469,6.701568,6.856132,6.266206,6.590818,6.636133,...,6.701568,6.824102,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-02,5.232691,5.199338,5.104712,4.391300,5.327520,4.303404,5.254946,5.191001,5.556967,5.489596,...,6.141913,5.918106,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-03,6.133282,6.227721,6.755229,5.601776,6.190702,6.046516,6.107795,6.370123,6.856132,6.913633,...,6.856132,6.555127,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-04,6.074704,6.107795,6.427164,4.601411,5.817322,4.814915,5.719279,6.301226,6.386830,6.359163,...,6.676286,6.736847,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
2013-01-05,6.793656,6.590818,5.793944,5.235472,6.018986,5.031819,5.965703,6.736847,6.195255,6.054500,...,6.628375,6.783829,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08,1.000000e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-06-18,5.999696,5.925321,6.172693,5.751363,5.662824,5.716101,5.577783,5.903787,5.622961,5.900230,...,5.932573,5.807263,6.867197e+00,6.925839e+00,6.527769e+00,6.745973e+00,6.889959e+00,6.834593e+00,6.727848e+00,6.427164e+00
2015-06-19,5.925321,5.598764,5.009229,5.296755,5.466483,5.000733,5.406452,5.709759,5.026180,5.352781,...,5.748129,5.764355,6.301226e+00,6.398156e+00,6.227721e+00,6.534515e+00,6.359163e+00,6.398156e+00,6.370123e+00,6.475852e+00
2015-06-20,5.366856,5.224351,4.754337,4.841708,4.832807,4.414199,4.871182,5.277227,4.992223,5.115872,...,5.709759,5.635152,6.342981e+00,6.375656e+00,5.932573e+00,6.415438e+00,6.195255e+00,6.223017e+00,6.120464e+00,6.291084e+00
2015-06-21,5.160422,4.986541,5.787325,5.738463,5.583760,5.589751,5.586754,5.079556,5.204896,5.554006,...,5.127021,5.082355,6.286055e+00,6.403876e+00,5.988264e+00,6.321859e+00,6.332357e+00,6.392474e+00,6.168239e+00,6.168239e+00


## Defining CV Splits

In [44]:
window_size_first_pass = cv_params['window_size_first_pass']
window_size_last_pass = cv_params['window_size_last_pass']
if window_size_last_pass == 'complete inference window':
    window_size_last_pass = len(df)
n_passes = cv_params['n_passes']
forecasting_window_size = cv_params['forecasting_window_size']

cv_splits_dict = {}
window_size_increment = int((window_size_last_pass - window_size_first_pass) / (n_passes - 1))
for p in range(n_passes):
    pass_id = 'pass_' + str(p + 1)
    cv_splits_dict[pass_id] = {
        'train_idx': [
            0,
            window_size_first_pass + p * window_size_increment
        ],
        'test_idx': [
            window_size_first_pass + p * window_size_increment,
            window_size_first_pass + p * window_size_increment + forecasting_window_size,
        ],
    }
    
cv_splits_dict

{'pass_1': {'train_idx': [0, 730], 'test_idx': [730, 737]},
 'pass_2': {'train_idx': [0, 816], 'test_idx': [816, 823]},
 'pass_3': {'train_idx': [0, 902], 'test_idx': [902, 909]}}

In [None]:
window_size_first_pass = cv_params['window_size_first_pass']
window_size_last_pass = cv_params['window_size_last_pass']
if window_size_last_pass == 'complete inference window':
    window_size_last_pass = len(df)
n_passes = cv_params['n_passes']
forecasting_window_size = cv_params['forecasting_window_size']

cv_splits_dict = {}
window_size_increment = int((window_size_last_pass - window_size_first_pass) / (n_passes - 1))
for p in range(n_passes):
    pass_id = 'pass_' + str(p + 1)
    cv_splits_dict[pass_id] = {
        'train': slice(
            df.index[0],
            df.index[ window_size_first_pass + p * window_size_increment ]
        ),
        'val': slice(
            df.index[ window_size_first_pass + p * window_size_increment ],
            df.index[ window_size_first_pass + p * window_size_increment + forecasting_window_size ]
        ),
    }

cv_splits_dict

In [23]:
# ignore all vars we don't want to model
targets = modeling['targets']
df = df[targets]

In [26]:
splits_positions

{'pass_1': {'train_idx': [0, 730], 'test_idx': [730, 737]},
 'pass_2': {'train_idx': [0, 816], 'test_idx': [816, 823]},
 'pass_3': {'train_idx': [0, 902], 'test_idx': [902, 909]}}

In [34]:
train_idx = slice( 
    splits_positions['pass_1']['train_idx'][0],
    splits_positions['pass_1']['train_idx'][1],
    )

In [35]:
df.iloc[train_idx]

Unnamed: 0_level_0,DEF0C,DE111
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,5.598764,6.727848
2013-01-02,5.521634,5.282803
2013-01-03,6.475852,5.524562
2013-01-04,6.613113,6.415438
2013-01-05,4.885811,6.398156
...,...,...
2014-12-27,5.003567,5.958269
2014-12-28,4.949433,6.074704
2014-12-29,5.193780,6.026787
2014-12-30,5.243816,6.112002


In [40]:
train = slice(
    df.index[0],
    df.index[730]
)

df[train]

Unnamed: 0_level_0,DEF0C,DE111
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,5.598764,6.727848
2013-01-02,5.521634,5.282803
2013-01-03,6.475852,5.524562
2013-01-04,6.613113,6.415438
2013-01-05,4.885811,6.398156
...,...,...
2014-12-28,4.949433,6.074704
2014-12-29,5.193780,6.026787
2014-12-30,5.243816,6.112002
2014-12-31,5.380964,4.137365


## Training Split-Wise

In [8]:
def split_data(df: pd.DataFrame, modeling):
    train = slice(
        modeling['train_window']['start'],
        modeling['train_window']['end']
    )
    
    test = slice(
        modeling['test_window']['start'],
        modeling['test_window']['end']
    )
    
    return {
        'df_train': df[train],
        'df_test': df[test]
    }

In [104]:
class MakeStrictlyPositive(TransformerMixin, BaseEstimator):
    '''Add constant to variable so that it only assumes positive values.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.offset_ = X.min(axis=0)
        return self 
    
    def transform(self, X, y=None):
        return X + abs(self.offset_)
    
    def inverse_transform(self, X, y=None):
        return X - abs(self.offset_)      

In [None]:
TRANSFORMERS = {
    'get_quantile_equivalent_normal_dist': QuantileTransformer(
                                                output_distribution='normal', 
                                                random_state=0,
                                            ),
    'make_strictly_positive': MakeStrictlyPositive(),
}

In [106]:
preprocessing_pipeline = make_pipeline(
    QuantileTransformer(
        output_distribution='normal', 
        random_state=0,
    ),
    MakeStrictlyPositive(),
)

In [107]:
preprocessing_pipeline = make_pipeline(
    *[ TRANSFORMERS[ step ] for step in modeling['preprocessing'] ]
)

In [108]:
train_test_split = split_data(df_spatiotemporal, modeling)

In [109]:
df_train = train_test_split['df_train']

df_train['temporal'].head()

district,DE111,DE114,DE115,DE116,DE118,DE119,DE11A,DE11B,DE11C,DE11D,...,DEG0E,DEG0F,DEG0G,DEG0I,DEG0J,DEG0K,DEG0L,DEG0M,DEG0N,DEG0P
var,power,power,power,power,power,power,power,power,power,power,...,power,power,power,power,power,power,power,power,power,power
2013-01-01,0.178783,0.269458,0.35113,0.184231,0.357407,0.410291,0.362935,0.344386,0.28414,0.360815,...,0.187667,0.363388,0.393872,0.482197,0.551744,0.435241,0.372426,0.464629,0.393286,0.497463
2013-01-02,0.030363,0.063571,0.103089,0.030433,0.07874,0.108224,0.105199,0.093845,0.083709,0.093448,...,0.07391,0.13393,0.226143,0.205528,0.284261,0.219417,0.215123,0.272299,0.203758,0.24802
2013-01-03,0.041567,0.229298,0.182997,0.090303,0.46015,0.516133,0.410587,0.371699,0.320095,0.357628,...,0.107586,0.297686,0.4512,0.379433,0.536884,0.463397,0.424858,0.528489,0.426369,0.502846
2013-01-04,0.128148,0.20583,0.347322,0.144561,0.264061,0.31902,0.357322,0.220658,0.295479,0.364025,...,0.124282,0.342673,0.427804,0.447503,0.505166,0.311061,0.373373,0.438596,0.329357,0.34778
2013-01-05,0.126854,0.346798,0.274581,0.188337,0.212976,0.22108,0.343575,0.272172,0.459154,0.448515,...,0.294815,0.35746,0.321136,0.415867,0.361462,0.349354,0.301562,0.275772,0.331298,0.337855


In [110]:
df_train_preprocessed = df_train.copy(deep=True)

In [111]:
preprocessing_pipeline = preprocessing_pipeline.fit(
    df_train_copy['temporal']
)

df_train_preprocessed['temporal'].update(
    preprocessing_pipeline.transform(
        df_train_copy['temporal']
    )
)


In [112]:
(df_train_copy['temporal'] < 0).sum().sum()

0

In [113]:
cv_params = {
    'cv_type': 'expanding_windows',
    'window_size_first_pass': 365,
    'window_size_last_pass': 540,
    'n_passes': 3,
    'forecasting_window_size': 7,
}

In [114]:
def define_cvsplits(cv_pars: Dict) -> Dict[str, Any]:  # Dict[str, List[pd.date_range, List[str]]]:
    """
    Example of Cross-Validation Splits Dictionary:

    cv_splits_dict = {
        'pass_1': {
            'train_idx': [0, 365],
            'eval_idx': [365, 465],
        }
    }

    :param window_size_first_pass:
    :param window_size_last_pass:
    :param n_passes:
    :param forecasting_window_size:
    :return:
    """
    window_size_first_pass = cv_pars['window_size_first_pass']
    window_size_last_pass = cv_pars['window_size_last_pass']
    n_passes = cv_pars['n_passes']
    forecasting_window_size = cv_pars['forecasting_window_size']

    cv_splits_dict = {}
    window_size_increment = int( (window_size_last_pass - window_size_first_pass) / (n_passes-1) )
    for p in range(n_passes):
        pass_id = 'pass_' + str(p + 1)
        cv_splits_dict[pass_id] = {
                'train_idx': [
                    0,
                    window_size_first_pass + p * window_size_increment
                ],
                'eval_idx': [
                    window_size_first_pass + p * window_size_increment,
                    window_size_first_pass + p * window_size_increment + forecasting_window_size,
                ],
        }
    return cv_splits_dict

In [159]:
def _split_train_val(df: pd.DataFrame, cv_splits_dict: dict, pass_id: str):
    train_idx_start = cv_splits_dict[pass_id]['train_idx'][0]
    train_idx_end = cv_splits_dict[pass_id]['train_idx'][1]

    test_idx_start = cv_splits_dict[pass_id]['test_idx'][0]
    test_idx_end = cv_splits_dict[pass_id]['test_idx'][1]

    return {
        'train': df.iloc[train_idx_start:train_idx_end, :],
        'val': df.iloc[test_idx_start:test_idx_end, :],
    }


class ForecastingModel:
    def __init__(self, y_train, modeling):
        
        # model artifacts (metadata) for training
        self.modeling_settings = modeling
        self.y_train_info = y_train.info()
        self.y_train_columns = y_train.columns
        
        self.hyperpars = modeling['hyperpars']
        
        self.targets_list = self.modeling['target_timeseries']
        if self.targets_list == 'all_available':
            self.targets_list = y_train.columns 
        
        y_train_ = y_train  # i.e. all districts at once (spatio-temporal)
        if self.modeling_settings['mode'] == 'temporal':  # i.e. districtwise
            y_train_ = y_train['temporal']

            
    def fit(self, district=None):      
        self.datetime_start = datetime.now()
        time_start = time.time()
        
        if self.modeling_settings['approach'] == 'HW-ES':
            self.submodels_ = { 
                district: ExponentialSmoothing( 
                    endog=y_train_[district], 
                    *self.hyperpars,
                ).fit() for district in self.targets_list 
            } 

        elif self.modeling_settings['approach'] == 'RNN-ES':
            self.model_ = None

        elif self.modeling_settings['approach'] == 'GWNet':
            self.model_ = None
        
        else: 
            return NotImplementedError(f'Invalid modeling approach {self.modeling_settings["approach"]}')
        
        self.training_duration = format( time.time() - time_start, "2.00E" ) + ' secs' 

        return self
    
    
    def predict(self, start, end, transformer):               
        y_hat = pd.DataFrame(
            data=None,
            columns=self.y_train_columns,
        )
        
        if self.modeling_settings['mode'] == 'temporal': # i.e. districtwise
            y_hat.update(
                data = {
                    self.submodels_[district].predict(
                        start=start,
                        end=start,)
                for district in self.y_train_columns}, 
                copy=False,
            )
        else: 
            y_hat.update(
                data = {
                    self.model_.predict(
                        start=start,
                        end=start,
                    )
                }
                copy=False,
            )
        
        y_hat_unscaled = transformer.inverse_transform(y_hat)
        return y_hat_unscaled

    
def cv_train(df_train_preprocessed: pd.DataFrame,
             modeling: Dict[str, Any],
             cv_splits_dict: Dict[str, Any]) -> Dict[str, Any]:

    model = {}
    for pass_id in cv_splits_dict.keys():

        # splitting
        y = _split_train_val(df_train_preprocessed, cv_splits_dict, pass_id)  # cv_splits_dict[pass_id]

        # training
        model[pass_id] = ForecastingModel(y['train'], modeling).fit()   
    
    longest_pass_id = pass_id
    return {
        'intermediate_models': model,
        'model': model[longest_pass_id]
    }


def evaluate(model, para)

In [117]:
model['pass_3'].predict(
    start='2015-06-21',
    end='2015-06-27',
    scaler=preprocessing_pipeline,  # TODO: populate all districts columns, then predict
)

ValueError: operands could not be broadcast together with shapes (7,) (292,) 