# 05.04 - Modeling Setup - Pipelines

## Imports & setup

In [1]:
import pathlib
from datetime import datetime
import math
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('grayscale')
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from skoot.feature_selection import FeatureFilter
from skoot.preprocessing import SelectiveRobustScaler

sys.path.append("..")
from src.models.models import SetTempAsPower, SK_Prophet
from src.utils.utils import bound_precision, AnnualTimeSeriesSplit

%matplotlib inline

# The project directory is taken to be the parent of the current working
# directory; the cleaned data lives in <project>/data/05-clean.
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
CLEAN_DATA_DIR = PROJECT_DIR.joinpath('data', '05-clean')

# Let pandas show wide frames without truncating columns or wrapping early.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)

## Load Test Data

In [2]:
# Build a small daily frame used only to exercise the model wrappers:
#   1. read the cleaned data with the first column parsed as the index,
#   2. keep 1994 through 1998,
#   3. collapse to one row per day using the daily maximum of every column,
#   4. keep a reasonable subset of columns,
#   5. rename `temp` to `temp_max`, since after step 3 it holds the daily max.
df = (
    pd.read_csv(CLEAN_DATA_DIR / 'clean-cut.csv', parse_dates=True, index_col=0)
    .loc['1994': '1998']
    .resample('D')
    .max()
    [['temp', 'dew_point_temp', 'week_of_year', 'daily_peak']]
    .rename(columns={'temp': 'temp_max'})
)

# Split off the target; `pop` removes `daily_peak` from `df`, so `X` (the same
# object as `df`) is left holding only the feature columns.
y = df.pop('daily_peak')
X = df

X.head()

Unnamed: 0,temp_max,dew_point_temp,week_of_year
1994-01-01,2.8,1.1,52.0
1994-01-02,1.7,0.5,52.0
1994-01-03,-10.3,-12.6,1.0
1994-01-04,-7.4,-11.5,1.0
1994-01-05,-7.2,-10.7,1.0


In [3]:
# Show the first rows of the target series `y` (the popped `daily_peak` column).
y.head()

1994-01-01    16892.0
1994-01-02    18947.0
1994-01-03    21923.0
1994-01-04    21457.0
1994-01-05    22082.0
Freq: D, Name: daily_peak, dtype: float64

In [4]:
# Show the last rows of the target series `y` to confirm where the selected range ends.
y.tail()

1998-12-27    17984.0
1998-12-28    18648.0
1998-12-29    19467.0
1998-12-30    20886.0
1998-12-31    19709.0
Freq: D, Name: daily_peak, dtype: float64

### Pipeline

In [5]:
# Number of cross-validation folds handed to the splitter below.
n_splits = 3

# Prophet wrapper with two extra regressors; each is registered with an empty
# tuple of options (NOTE(review): presumably "use defaults" - confirm in
# src.models.models.SK_Prophet).
sk_prophet = SK_Prophet(
    regressors={
        'dew_point_temp': (),
        'week_of_year': (),
    }
)

# Splitter used later to produce the train/validation index pairs.
tatscv = AnnualTimeSeriesSplit(n_splits=n_splits)

# Three stages, applied in order:
#   dropper       - FeatureFilter configured with the `temp_max` column
#   robust_scaler - SelectiveRobustScaler on `dew_point_temp`, writing the
#                   result back under the same column name
#   sk_prophet    - the final estimator
steps = [
    ('dropper', FeatureFilter(cols=['temp_max'])),
    (
        'robust_scaler',
        SelectiveRobustScaler(
            cols=['dew_point_temp'],
            trans_col_name=['dew_point_temp'],
        ),
    ),
    ('sk_prophet', sk_prophet),
]
pipeline = Pipeline(steps)

### Cross-Validation & Scoring

In [6]:
# One score per fold. The first pair is measured on the rows the pipeline was
# fitted on, the second pair on the held-out rows of the same fold.
cv_mae_scores_train, cv_bound_prec_scores_train = [], []
cv_mae_scores, cv_bound_prec_scores_test = [], []

for train_indx, val_indx in tatscv.split(X):
    # Positional indices from the splitter select matching rows of X and y.
    X_train, y_train = X.iloc[train_indx], y.iloc[train_indx]
    X_val, y_val = X.iloc[val_indx], y.iloc[val_indx]

    # Fit the full pipeline on this fold's training rows only.
    pipeline.fit(X_train, y_train)

    # In-sample scores: predict on the same rows that were used for fitting.
    y_pred = pipeline.predict(X_train)
    cv_mae_scores_train.append(mean_absolute_error(y_train, y_pred))
    cv_bound_prec_scores_train.append(bound_precision(y_train, y_pred))

    # Out-of-sample scores: predict on the fold's held-out rows.
    y_pred = pipeline.predict(X_val)
    cv_mae_scores.append(mean_absolute_error(y_val, y_pred))
    cv_bound_prec_scores_test.append(bound_precision(y_val, y_pred))

# Training scores first, then a blank line, then the held-out scores.
print(cv_mae_scores_train, cv_bound_prec_scores_train, sep='\n')
print()
print(cv_mae_scores, cv_bound_prec_scores_test, sep='\n')

INFO:numexpr.utils:NumExpr defaulting to 4 threads.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



[742.9669035078234, 675.6945217815219, 683.5281096770452]
[0.4, 0.6, 0.6]

[5353.916951308222, 875.9532672417328, 895.8241614289749]
[0.0, 0.2, 0.0]
