In [None]:
import warnings
warnings.filterwarnings(action="ignore")

import pandas as pd 
import numpy as np 
import statsmodels.api as sm
import math
import itertools

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

% matplotlib inline 
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (16, 6)

In [None]:
def load_airline_data():
    """Load the airline passenger counts as a datetime-indexed Series.

    Reads ``data/international-airline-passengers.csv`` with ``Month`` as the
    index, discards the final row of the file, and returns the single data
    column renamed to ``passengers_thousands`` with its index converted to
    datetimes.
    """
    csv_path = 'data/international-airline-passengers.csv'

    # the last row of the file is not kept
    frame = pd.read_csv(csv_path, index_col='Month').iloc[:-1]
    frame.columns = ['passengers_thousands']

    passengers = frame['passengers_thousands']
    passengers.index = pd.to_datetime(passengers.index)
    return passengers

In [None]:
def plot_predictions(series_, pred_):
    """Plot observed values against predictions with a shaded confidence band.

    `series_` is drawn as the 'observed' line and `pred_.predicted_mean` as
    the 'predicted' line; the first two columns of `pred_.conf_int()` are
    used as the lower and upper edges of the shaded interval.
    """
    predicted = pred_.predicted_mean
    interval = pred_.conf_int()
    lower_bound, upper_bound = interval.iloc[:, 0], interval.iloc[:, 1]

    series_.plot(label='observed')
    predicted.plot(label='predicted', alpha=.7)

    plt.fill_between(interval.index, lower_bound, upper_bound, color='k', alpha=.2)
    plt.legend()
    plt.show()

In [None]:
def get_aic(series_, params):
    """Fit a SARIMAX model to `series_` and return its AIC.

    `params` is indexed positionally: items 0-2 are the non-seasonal
    (p, d, q) order and items 3-6 the seasonal (P, D, Q, S) order.
    Stationarity and invertibility are not enforced.
    """
    order = (params[0], params[1], params[2])
    seasonal_order = (params[3], params[4], params[5], params[6])

    fitted = sm.tsa.statespace.SARIMAX(
        series_,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False,
    ).fit()

    return fitted.aic


def get_best_params(series_, inputs):
    """Grid-search SARIMAX parameter sets and return the one with the lowest AIC.

    Parameters
    ----------
    series_ : passed unchanged to `get_aic` for every candidate.
    inputs : sequence of 7-item parameter sets ``[p, d, q, P, D, Q, S]``.

    Returns
    -------
    pandas.Series
        Row of the best candidate with entries ``p, d, q, P, D, Q, S`` and
        ``aic``; its name is the position of that candidate in `inputs`.

    Raises
    ------
    ValueError
        If none of the candidates could be fitted.
    """
    aic_scores = {}
    params_index = {}
    n_failed = 0
    last_error = None
    total = len(inputs)

    for i, param_set in enumerate(inputs):
        # this just prints a kind of progress bar, you can use tqdm if you like
        print('%0.1f%%' % (i / total * 100), end=', ')
        try:
            aic_scores[i] = get_aic(series_, param_set)
        except Exception as e:
            # A candidate that cannot be fitted is skipped, but counted so the
            # failure is reported instead of disappearing silently.
            n_failed += 1
            last_error = e
            continue
        params_index[i] = param_set

    if n_failed:
        print('\n%d of %d parameter sets could not be fitted and were skipped'
              % (n_failed, total))

    if not params_index:
        # Without this guard the column assignment below fails with an
        # unrelated-looking "Length mismatch" error.
        raise ValueError(
            'none of the %d parameter sets could be fitted' % total
        ) from last_error

    temp = pd.DataFrame(params_index).T
    temp.columns = ['p', 'd', 'q', 'P', 'D', 'Q', 'S']
    temp['aic'] = pd.Series(aic_scores)

    return temp.loc[temp.aic.idxmin()]

In [None]:
def get_inputs():
    """Build the SARIMAX search grid.

    Returns a list of 64 lists ``[p, d, q, P, D, Q, 7]`` covering every 0/1
    combination of the six orders, in `itertools.product` order, with the
    seasonal period fixed at 7.
    """
    seasonal_period = 7
    binary_grid = itertools.product(range(2), repeat=6)
    return [[*combo, seasonal_period] for combo in binary_grid]

We already know this dataset! 

In [None]:
airlines = load_airline_data()

In [None]:
airlines.head()

In [None]:
airlines.plot();

### Split the dataset into train and test sets (the test set is everything after 1957)

### Exercise

In [None]:
airlines = load_airline_data()[:'1957'] # train 
airlines_test = load_airline_data()['1958':] # test

### Q1. Fit your SARIMAX model and get in sample predictions, starting from the first period of the training dataset (and not the test dataset)

* Use (p,d,q) = (0,1,1)
* Use seasonal_order = (1,1,1,12)
* enforce_stationarity=False  
* enforce_invertibility=True

### Exercise

In [None]:
# order = (p,d,q)
# seasonal_order = (P,D,Q,s)

# model = # call your SARIMAX
# results = # fit your model

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# get in sample predictions

# pred = # 
# mean_predictions = # 


# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert order == (0, 1, 1)
assert seasonal_order == (1, 1, 1, 12)
assert math.isclose(mean_predictions.sum(), 24838.36, abs_tol=0.5)

In [None]:
# plot this
airlines.plot(label='observed', figsize=(16, 4))
mean_predictions.plot(label='One-step ahead Forecast with dynamic=False', alpha=.7)
plt.legend()

### Q1.1: Get confidence intervals and plot it

In [None]:
# pred_ci = # get the confidence interval for the predictions

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
airlines.plot(label='observed')
mean_predictions.plot(label='One-step ahead Forecast with dynamic=False', alpha=.7)

plt.fill_between(pred_ci.index,
                 pred_ci['lower passengers_thousands'],
                 pred_ci['upper passengers_thousands'], 
                 color='k', 
                 alpha=.2)

plt.ylim([0, 700])
plt.legend()
plt.show()

In [None]:
# .iloc picks the lower/upper bound by position; plain [0]/[1] on this
# string-labelled Series relies on a deprecated positional fallback.
assert math.isclose(pred_ci.mean().iloc[0], -24.61, abs_tol=0.5)
assert math.isclose(pred_ci.mean().iloc[1], 484.58, abs_tol=0.5)

### Q2: Predict the future! Forecast 36 months ahead and plot it against the test set 

### Exercise

In [None]:
# forecast = # get your forecast object
# forecast_pred = # get your predictions
# forecast_ci = # get your confidence interval for the forecast

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
airlines.plot(label='train')
forecast_pred.plot(label='predicted')
airlines_test.plot(label='test')
plt.legend()
plt.show()

In [None]:
# mean of the upper confidence bound, selected by position
forecast_ci.mean().iloc[1]

In [None]:
assert math.isclose(forecast_pred.sum(), 15445.9, abs_tol=0.5)
# .iloc picks the lower/upper bound by position; plain [0]/[1] on this
# string-labelled Series relies on a deprecated positional fallback.
assert math.isclose(forecast_ci.mean().iloc[0], 338.2, abs_tol=0.5)
assert math.isclose(forecast_ci.mean().iloc[1], 519.8, abs_tol=0.5)

In [None]:
# plot this 

plot_predictions(series_=airlines, pred_=forecast)

### Q3: Calculate the $R^{2}$ of your forecast against `airlines_test`

#### Exercise

In [None]:
# y_pred = 
# y_true = 

# r2 = # use sklearn r2_score

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(y_pred.sum(), 15445.91, abs_tol=0.5)
# R^2 is at most 1, so a tolerance of 0.5 would accept almost any score;
# keep the tolerance small enough for the check to be meaningful.
assert math.isclose(r2, 0.9232, abs_tol=0.01)

Ok all good for now but let's see what we can do with timeseries without using timeseries tools.

## Workflow

### Q4

In [None]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### Q4.1: Get a quick benchmark using the last day's sales of each store, scored with the mean absolute error

In [None]:
# get your multi-index and store
# train['Date'] = # get your datetime 
# train = # set the index, first the Date then Store
# train = # sort it! 
# idx = # create your index slicer

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# last row, first column, both by position
assert train.iloc[-1, 0] == 11354

#### split train test: use the last 4 days as test

In [None]:
# train test split

# new_train = # train without the last 4 days
# new_test = # the last 4 days 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert new_train.shape[0] == 20400
assert new_test.shape[0] == 400

#### make a quick benchmark with the sales of the last day of each store

In [None]:
# get a quick benchmark

# last_day = # gets the sale of each store on the last day of our training dataset
# new_test['predictions'] = # set it to zero
# days_in_test_set = # list of the unique values of the dates to predict

# for day in days_in_test_set: 
    # new_test.loc[idx[day, :], 'predictions'] = # assign it to the Sales' last day

# y_true = # get the true Sales values from the test
# y_pred = # get your predictions
# mae_benchmark = # get the mean absolute error between y_true and y_pred (do not name it `mean_absolute_error`: that would shadow the sklearn function you need again in Q4.3)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert y_pred[idx['2015-07-24', 1]][0] == 3769
assert y_true[idx['2015-07-24', 1]][0] == 3706

### Q4.2: Use SARIMAX with grid search to predict the sales of store 4 for the same days you were predicting in Q4.1

In [None]:
# store_4 = filter your store with the number 4
# store_4.index = drop the level "Store" 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert store_4.sum() == 1657224

#### for the SARIMAX model, start with the following arguments: 
* order=(0, 1, 0)
* seasonal_order=(1, 1, 1, 7) 
* enforce_stationarity=False  
* enforce_invertibility=False

In [None]:
# model = create your SARIMAX model
# results = fit your model
# aic = # get the aic

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(results.fittedvalues.sum(), 1661410.194, abs_tol=0.5)
assert math.isclose(aic, 3618.186, abs_tol=0.5)

#### For the grid search, use the function `get_best_params` and `get_inputs` that are defined at the beginning of the notebook 

In [None]:
# grid search

# inputs = # get the inputs
# best_params = # get the best_params for the SARIMAX

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(best_params.aic, 3471.621, abs_tol=5)

In [None]:
# fit the new model
# model = # sarimax with the new parameters
# results = # fit the model
# aic = get the aic. This should be the same as before 
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(best_params.aic, 3471.621, abs_tol=0.5)
# the refitted model must reproduce the grid-search AIC ("the same as before")
assert math.isclose(aic, best_params.aic, abs_tol=0.5)
assert math.isclose(results.fittedvalues.sum(), 1581705.679, abs_tol=0.5)

In [None]:
# store_4_preds = # get SARIMAX predictions for store 4
# store_4_forecast = # get the forecast for the 4 days we are testing

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(store_4_preds['2015-01-03'], 1106.3, abs_tol=0.5)
assert math.isclose(store_4_forecast['2015-07-25'], 10017.3, abs_tol=0.5)

### Q4.3: Get a prediction for the first `10 stores` for the new test (the final 4 days)

In [None]:
def predict_with_sarimax(df_, store_nr, steps=4): 
    """Forecast the next `steps` values of ``Sales`` for one store.

    Parameters
    ----------
    df_ : DataFrame with a ``Sales`` column and a two-level index whose
        second level is named ``Store``.
    store_nr : value of the ``Store`` level to select.
    steps : number of periods to forecast (default 4).

    Returns
    -------
    The ``predicted_mean`` of the SARIMAX forecast, fitted with
    ``order=(1, 0, 1)`` and ``seasonal_order=(1, 1, 1, 7)`` and without
    enforcing stationarity or invertibility.
    """
    # Build the slicer here instead of relying on a notebook-global `idx`
    # that only exists once an earlier exercise cell has been run.
    store_ = df_.loc[pd.IndexSlice[:, store_nr], 'Sales']

    # keep only the remaining index level for the model
    store_.index = store_.index.droplevel('Store')

    model = sm.tsa.statespace.SARIMAX(store_,             
                              order=(1, 0, 1),             
                              seasonal_order=(1, 1, 1, 7),
                              enforce_stationarity=False,  
                              enforce_invertibility=False) 

    results = model.fit()
    return results.get_forecast(steps=steps).predicted_mean

##### This part can be tricky! Have a look at the learning notebook if you need! 

* We wrote some of the parts for you, you just have to uncomment them. Others you will have to write some code but, then again, in case of troubles, check the notebooks! 

In [None]:
# just uncomment
# stores = train.index.get_level_values('Store').unique()[:10]

# just uncomment
# new_test = new_test.loc[idx[:, stores], :]
# res = {}

# just uncomment
# i = 0
# for store_nr in stores:
    # i += 1
    # print('%0.0f%%'% (i/len(stores)*100), end=',')
    
    # need to do
    # res[store_nr] = # use predict sarimax to get predictions for each store
    
# just uncomment
# results = pd.DataFrame(res).unstack().reset_index()
# results.columns = ['Store', 'Date', 'Sales']
# results = results.set_index(['Date', 'Store']).sort_index()

# just uncomment
# days_in_test_set = new_test.index.get_level_values('Date').unique()

# just uncomment
# for day in days_in_test_set:
    # new_test.loc[idx[day, :], 'predictions'] = results.loc[idx[day, :], 'Sales'].values
    

# need to do
# mean_absolute_error_final = # get the mean absolute error

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(mean_absolute_error_final, 569.4, abs_tol=0.5)

# OPTIONAL

### Prepare a dataframe with the following:
- Target: should be the value `n` months after each month
    - Where `n` will be the number of periods ahead we are trying to predict
- Features: diff1 and diff2 

In [None]:
passenger = load_airline_data() # fresh start 

In [None]:
# build useful functions
# build_target should give you a target variable
# build_features should give features with the diff1 and diff2
# prepare_preds gets them together and splits in X_train and y_train

def build_target(_series, periods): 
    """Wrap a copy of `_series` in a DataFrame meant to hold the target column.

    Exercise skeleton: the ``target`` column is only created once the
    commented line below is completed; until then `periods` is unused and the
    returned frame contains just the copied series.
    """
    _series = _series.copy()
    _df = pd.DataFrame(_series)
    # _df['target'] = # create your target variable
    return _df

def build_features(_df, feature): 
    """Return a copy of `_df` with rows containing NaN removed.

    Exercise skeleton: the ``diff1`` and ``diff2`` columns are only created
    once the commented lines below are completed; until then `feature` is
    unused.
    """
    _df = _df.copy()
    # _df['diff1'] = # create the feature with the difference of 1 period
    # _df['diff2'] = # create the feature with the difference of 2 period
    return _df.dropna()

def prepare_preds(_series, periods): 
    """Return ``(X_train, y_train)`` built from `_series`.

    Exercise skeleton: `X_train` and `y_train` are not assigned inside the
    function as written, so the commented steps below must be implemented
    before it can return them.
    """
    # name of the input series
    col_name = _series.name
    
    # _df = # build your dataframe with target and features
    
    # features = # get a list of features
    # _df = # Get a _df without NaN
    
    # X_train = # get your X_train 
    # y_train = # get your y_train 

    
    return X_train, y_train

# YOUR CODE HERE
raise NotImplementedError()

### Q5: Split your dataframe into X_train and y_train, fit a linear regression to predict 2 months ahead, and check the $R^{2}$ score

In [None]:
# X_train, y_train = use your prepare_preds
# lr = # Linear regression
# fit your Linear Regression 
# r2_score = calculate the score of r2

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert math.isclose(r2_score, 0.81, abs_tol=0.1)