## Using time to model a linear trend
* Creating a feature from time to capture the trend of a time series.


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import make_pipeline

from statsmodels.tsa.deterministic import DeterministicProcess
from sktime.transformations.series.time_since import TimeSince

## Data

The air passengers dataset is the monthly totals of international airline passengers, from 1949 to 1960, in units of 1000s.

In [2]:
# Read the monthly passenger totals; the `ds` column is parsed as dates and used as the index.
data = pd.read_csv(
    '../../Datasets/example_air_passengers.csv',
    parse_dates=['ds'],
    index_col=['ds'],
)
data.plot(figsize=(15, 4))

<img src='./plots/air-passengers-data.png'>

### OPTION : 1  Creating the time feature to capture trend (i.e., time since the start)

In [3]:
df = data.copy()

# Count whole calendar months elapsed since the first observation.
# Dividing a timedelta by np.timedelta64(1, "M") is avoided on purpose: a month
# is not a fixed duration, and newer pandas versions reject the "M" unit for
# timedeltas. Year/month arithmetic is exact and needs no rounding.
start = df.index.min()
months_elapsed = (df.index.year - start.year) * 12 + (df.index.month - start.month)

# Stored as float so the column matches what the rounded ratio used to produce.
df['time-feature'] = np.asarray(months_elapsed, dtype=float)

df.head()

Unnamed: 0_level_0,y,time-feature
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
1949-01-01,112,0.0
1949-02-01,118,1.0
1949-03-01,132,2.0
1949-04-01,129,3.0
1949-05-01,121,4.0


### OPTION : 2 Using deterministic process from statsmodels

In [27]:
# order=1 requests a linear trend term built from the series' own index.
dp = DeterministicProcess(index=data.index, order=1)

df = data.copy()
df['time-feature'] = dp.in_sample()

df.head()

Unnamed: 0_level_0,y,time-feature
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
1949-01-01,112,1.0
1949-02-01,118,2.0
1949-03-01,132,3.0
1949-04-01,129,4.0
1949-05-01,121,5.0


### OPTION : 3 Using TimeSince from Sktime

* `start` 
    * Here you can specify the start-date to count from
    * Because we specified `start` there are no learned parameters from the data itself. 
    * When we use `.transform()` the transformer will compute for each date in `start`: `df.index - start_date` (i.e., the time since the specified start date)

<br>

* `to_numeric`
    * Convert time-object to numeric 

<br>

* `freq`
    * If the data we are passing does not have a `freq` specified, you have to specify it here

<br>

* `keep_original_columns` 
    * Boolean; whether to keep the other columns in the dataframe

<br>

* `positive_only`
    * convert negative values to zero


In [4]:
# Build the "time since 1949-01-01" feature with an explicit start date,
# numeric output, and the original columns kept next to the new one.
ts = TimeSince(
    start=['1949-01-01'], 
    to_numeric=True, 
    freq='MS', 
    keep_original_columns=True, 
    positive_only=True
    )

# `data` is left untouched; the transformed frame is bound to `df`.
df = ts.fit_transform(data)

df.head()

Unnamed: 0_level_0,y,time_since_1949-01-01 00:00:00
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
1949-01-01,112,0
1949-02-01,118,1
1949-03-01,132,2
1949-04-01,129,3
1949-05-01,121,4


## Let's build a forecast with just the time feature

In [5]:
# Hold out the final `holdout_size` rows for testing; everything before them is training data.
holdout_size = 24
df = data.copy()
df_train, df_test = df.iloc[:-holdout_size], df.iloc[-holdout_size:]

In [7]:
def model_pipe(model=None):
    """Return a pipeline that feeds a ``TimeSince`` feature into ``model``.

    The transformer is built with ``freq='MS'`` and
    ``keep_original_columns=False``; ``model`` becomes the final pipeline step.
    """
    steps = (TimeSince(freq='MS', keep_original_columns=False), model)
    return make_pipeline(*steps)

In [34]:
def plot_perdiction(model=None, figsize=(15, 4)):
    """Fit ``model`` on the time feature and plot actuals against predictions.

    The pipeline from ``model_pipe`` is fitted on the notebook-level
    ``df_train`` frame (target column ``'y'``). Train/test actuals and the
    matching predictions are then drawn on a single axes.

    Parameters
    ----------
    model : estimator, optional
        Final regressor of the pipeline. Defaults to ``LinearRegression()``
        when omitted.
    figsize : tuple, optional
        Figure size forwarded to the first pandas ``plot`` call.
    """
    # The previous `LinearRegression() if None else model` always evaluated to
    # `model` (the condition was the constant None), so the default never applied.
    if model is None:
        model = LinearRegression()

    pipe = model_pipe(model)
    pipe.fit(df_train, df_train['y'])

    y_preds_train = pd.DataFrame(pipe.predict(df_train), columns=['y_pred_train'], index=df_train.index)
    y_preds_test = pd.DataFrame(pipe.predict(df_test), columns=['y_pred_test'], index=df_test.index)

    # Plot order must match the legend labels below.
    ax = df_train.plot(figsize=figsize)
    y_preds_train.plot(ax=ax)
    df_test.plot(ax=ax)
    y_preds_test.plot(ax=ax)
    ax.legend(['train', 'train_pred', 'test', 'test_pred'])

## Linear models | Linear Regression | able to Extrapolate 

In [79]:
# Linear baseline: fit on the time feature and plot train/test predictions.
model = LinearRegression()
plot_perdiction(model)

<img src='./plots/linear-reg-air-passenger-trend-modeling.png'>

### Tree Based models | Random Forest | Tree models can't extrapolate

In [80]:
# Fixed seed so the forest's random sampling, and therefore the plotted fit,
# is the same on every run of the notebook.
model = RandomForestRegressor(max_depth=4, random_state=0)
plot_perdiction(model)

<img src='./plots/random-forest-max-depth-4-air-passenger-trend-modeling.png'>

### Tree models have the capacity to overfit the training data, but because they cannot extrapolate, their performance on unseen data is poor

In [81]:
# Deeper forest; fixed seed so the forest's random sampling, and therefore the
# plotted fit, is the same on every run of the notebook.
model = RandomForestRegressor(max_depth=10, random_state=0)
plot_perdiction(model)

<img src='./plots/random-forest-max-depth-10-air-passenger-trend-modeling.png'>

### Decision Tree

In [82]:
# Shallow decision tree: fit on the time feature and plot train/test predictions.
# The call was previously commented out, so this cell produced no figure.
model = DecisionTreeRegressor(max_depth=4)
plot_perdiction(model)

<img src='./plots/decision-tree-depth-4-air-passenger-trend-modeling.png'>

In [83]:
# Deeper decision tree (max_depth=8): fit on the time feature and plot train/test predictions.
model = DecisionTreeRegressor(max_depth=8)
plot_perdiction(model)

<img src='./plots/decision-tree-depth-8-air-passenger-trend-modeling.png'>

### Gradient Boosting

In [84]:
# Histogram-based gradient boosting: fit on the time feature and plot train/test predictions.
model = HistGradientBoostingRegressor(max_depth=4)
plot_perdiction(model)

<img src='./plots/gradient-boosted-tree-depth-4-air-passenger-trend-modeling.png'>

### Forecasting with linear models

In [58]:
# Latest timestamp in the observed series.
data.index.max()

Timestamp('1960-12-01 00:00:00')

In [55]:
# A one-month date offset.
pd.DateOffset(months=1)

<DateOffset: months=1>

In [57]:

# One month past the last observation.
data.index.max() + pd.DateOffset(months=1)

Timestamp('1961-01-01 00:00:00')

In [70]:
# Forecast 12 periods into the future: month-start dates beginning one month
# after the last observation.
forecast_horizon = pd.date_range(
    start=data.index.max() + pd.DateOffset(months=1),
    periods=12,
    freq='MS',
)

# Empty frame carrying only the future index.
fh = pd.DataFrame(index=forecast_horizon)

fh

1961-01-01
1961-02-01
1961-03-01
1961-04-01
1961-05-01
1961-06-01
1961-07-01
1961-08-01
1961-09-01
1961-10-01
1961-11-01
1961-12-01


In [75]:
# Fit on the full history, then predict in-sample and over the forecast horizon.
model = model_pipe(model=LinearRegression())
model.fit(data, data['y'])

# Wrap each prediction array in a frame indexed by the dates it refers to.
y_preds_train = pd.DataFrame(
    data=model.predict(data),
    index=data.index,
    columns=['forecast-train'],
)
y_preds_test = pd.DataFrame(
    data=model.predict(fh),
    index=fh.index,
    columns=['forecast-test'],
)

In [78]:
# Actuals first, then the in-sample fit, then the dashed out-of-sample forecast on the same axes.
ax = data.plot(figsize=(15, 4))
y_preds_train.plot(y=['forecast-train'], ax=ax)
y_preds_test.plot(y=['forecast-test'], linestyle='--', ax=ax);

<img src='./plots/Linear-model-can-extraploate-fh-12.png'>